module/zcommon/zfs_fletcher.c

   1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
  25  */
  26 /*
  27  * Copyright 2013 Saso Kiselkov. All rights reserved.
  28  */
  29
  30 /*
  31  * Fletcher Checksums
  32  * ------------------
  33  *
  34  * ZFS's 2nd and 4th order Fletcher checksums are defined by the following
  35  * recurrence relations:
  36  *
  37  *      a  = a    + f
  38  *       i    i-1    i-1
  39  *
  40  *      b  = b    + a
  41  *       i    i-1    i
  42  *
  43  *      c  = c    + b           (fletcher-4 only)
  44  *       i    i-1    i
  45  *
  46  *      d  = d    + c           (fletcher-4 only)
  47  *       i    i-1    i
  48  *
  49  * Where
  50  *      a_0 = b_0 = c_0 = d_0 = 0
  51  * and
  52  *      f_0 .. f_(n-1) are the input data.
  53  *
  54  * Using standard techniques, these translate into the following series:
  55  *
  56  *           __n_                            __n_
  57  *           \   |                           \   |
  58  *      a  =  >     f                   b  =  >     i * f
  59  *       n   /___|   n - i               n   /___|       n - i
  60  *           i = 1                           i = 1
  61  *
  62  *
  63  *           __n_                            __n_
  64  *           \   |  i*(i+1)                  \   |  i*(i+1)*(i+2)
  65  *      c  =  >     ------- f           d  =  >     ------------- f
  66  *       n   /___|     2     n - i       n   /___|        6        n - i
  67  *           i = 1                           i = 1
  68  *
  69  * For fletcher-2, the f_is are 64-bit, and [ab]_i are 64-bit accumulators.
  70  * Since the additions are done mod (2^64), errors in the high bits may not
  71  * be noticed.  For this reason, fletcher-2 is deprecated.
  72  *
  73  * For fletcher-4, the f_is are 32-bit, and [abcd]_i are 64-bit accumulators.
  74  * A conservative estimate of how big the buffer can get before we overflow
  75  * can be estimated using f_i = 0xffffffff for all i:
  76  *
  77  * % bc
  78  *  f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*4
  79  * 2264
  80  *  quit
  81  * %
  82  *
  83  * So blocks of up to 2k will not overflow.  Our largest block size is
  84  * 128k, which has 32k 4-byte words, so we can compute the largest possible
  85  * accumulators, then divide by 2^64 to figure the max amount of overflow:
  86  *
  87  * % bc
  88  *  a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c }
  89  *  a/2^64;b/2^64;c/2^64;d/2^64
  90  * 0
  91  * 0
  92  * 1365
  93  * 11186858
  94  *  quit
  95  * %
  96  *
  97  * So a and b cannot overflow.  To make sure each bit of input has some
  98  * effect on the contents of c and d, we can look at what the factors of
  99  * the coefficients in the equations for c_n and d_n are.  The number of 2s
 100  * in the factors determines the lowest set bit in the multiplier.  Running
 101  * through the cases for n*(n+1)/2 reveals that the highest power of 2 is
 102  * 2^14, and for n*(n+1)*(n+2)/6 it is 2^15.  So while some data may overflow
 103  * the 64-bit accumulators, every bit of every f_i affects every accumulator,
 104  * even for 128k blocks.
 105  *
 106  * If we wanted to make a stronger version of fletcher4 (fletcher4c?),
 107  * we could do our calculations mod (2^32 - 1) by adding in the carries
 108  * periodically, and store the number of carries in the top 32-bits.
 109  *
 110  * --------------------
 111  * Checksum Performance
 112  * --------------------
 113  *
 114  * There are two interesting components to checksum performance: cached and
 115  * uncached performance.  With cached data, fletcher-2 is about four times
 116  * faster than fletcher-4.  With uncached data, the performance difference is
 117  * negligible, since the cost of a cache fill dominates the processing time.
 118  * Even though fletcher-4 is slower than fletcher-2, it is still a pretty
 119  * efficient pass over the data.
 120  *
 121  * In normal operation, the data which is being checksummed is in a buffer
 122  * which has been filled either by:
 123  *
 124  *      1. a compression step, which will be mostly cached, or
 125  *      2. a bcopy() or copyin(), which will be uncached (because the
 126  *         copy is cache-bypassing).
 127  *
 128  * For both cached and uncached data, both fletcher checksums are much faster
 129  * than sha-256, and slower than 'off', which doesn't touch the data at all.
 130  */
 131
 132 #include <sys/types.h>
 133 #include <sys/sysmacros.h>
 134 #include <sys/byteorder.h>
 135 #include <sys/spa.h>
 136 #include <sys/zio_checksum.h>
 137 #include <sys/zfs_context.h>
 138 #include <zfs_fletcher.h>
 139
 140
 141 static void fletcher_4_scalar_init(zio_cksum_t *zcp);
 142 static void fletcher_4_scalar_native(const void *buf, uint64_t size,
 143     zio_cksum_t *zcp);
 144 static void fletcher_4_scalar_byteswap(const void *buf, uint64_t size,
 145     zio_cksum_t *zcp);
 146 static boolean_t fletcher_4_scalar_valid(void);
 147
 148 static const fletcher_4_ops_t fletcher_4_scalar_ops = {
 149         .init_native = fletcher_4_scalar_init,
 150         .compute_native = fletcher_4_scalar_native,
 151         .init_byteswap = fletcher_4_scalar_init,
 152         .compute_byteswap = fletcher_4_scalar_byteswap,
 153         .valid = fletcher_4_scalar_valid,
 154         .name = "scalar"
 155 };
 156
 157 static fletcher_4_ops_t fletcher_4_fastest_impl = {
 158         .name = "fastest",
 159         .valid = fletcher_4_scalar_valid
 160 };
 161
 162 static const fletcher_4_ops_t *fletcher_4_impls[] = {
 163         &fletcher_4_scalar_ops,
 164 #if defined(HAVE_SSE2)
 165         &fletcher_4_sse2_ops,
 166 #endif
 167 #if defined(HAVE_SSE2) && defined(HAVE_SSSE3)
 168         &fletcher_4_ssse3_ops,
 169 #endif
 170 #if defined(HAVE_AVX) && defined(HAVE_AVX2)
 171         &fletcher_4_avx2_ops,
 172 #endif
 173 #if defined(__x86_64) && defined(HAVE_AVX512F)
 174         &fletcher_4_avx512f_ops,
 175 #endif
 176 };
 177
 178 /* Hold all supported implementations */
 179 static uint32_t fletcher_4_supp_impls_cnt = 0;
 180 static fletcher_4_ops_t *fletcher_4_supp_impls[ARRAY_SIZE(fletcher_4_impls)];
 181
 182 /* Select fletcher4 implementation */
 183 #define IMPL_FASTEST    (UINT32_MAX)
 184 #define IMPL_CYCLE      (UINT32_MAX - 1)
 185 #define IMPL_SCALAR     (0)
 186
 187 static uint32_t fletcher_4_impl_chosen = IMPL_FASTEST;
 188
 189 #define IMPL_READ(i)    (*(volatile uint32_t *) &(i))
 190
 191 static struct fletcher_4_impl_selector {
 192         const char      *fis_name;
 193         uint32_t        fis_sel;
 194 } fletcher_4_impl_selectors[] = {
 195 #if !defined(_KERNEL)
 196         { "cycle",      IMPL_CYCLE },
 197 #endif
 198         { "fastest",    IMPL_FASTEST },
 199         { "scalar",     IMPL_SCALAR }
 200 };
 201
 202 static kstat_t *fletcher_4_kstat;
 203
 204 static struct fletcher_4_kstat {
 205         uint64_t native;
 206         uint64_t byteswap;
 207 } fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1];
 208
 209 /* Indicate that benchmark has been completed */
 210 static boolean_t fletcher_4_initialized = B_FALSE;
 211
 212 /*ARGSUSED*/
 213 void
 214 fletcher_2_native(const void *buf, uint64_t size,
 215     const void *ctx_template, zio_cksum_t *zcp)
 216 {
 217         const uint64_t *ip = buf;
 218         const uint64_t *ipend = ip + (size / sizeof (uint64_t));
 219         uint64_t a0, b0, a1, b1;
 220
 221         for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
 222                 a0 += ip[0];
 223                 a1 += ip[1];
 224                 b0 += a0;
 225                 b1 += a1;
 226         }
 227
 228         ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
 229 }
 230
 231 /*ARGSUSED*/
 232 void
 233 fletcher_2_byteswap(const void *buf, uint64_t size,
 234     const void *ctx_template, zio_cksum_t *zcp)
 235 {
 236         const uint64_t *ip = buf;
 237         const uint64_t *ipend = ip + (size / sizeof (uint64_t));
 238         uint64_t a0, b0, a1, b1;
 239
 240         for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) {
 241                 a0 += BSWAP_64(ip[0]);
 242                 a1 += BSWAP_64(ip[1]);
 243                 b0 += a0;
 244                 b1 += a1;
 245         }
 246
 247         ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1);
 248 }
 249
 250 static void
 251 fletcher_4_scalar_init(zio_cksum_t *zcp)
 252 {
 253         ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
 254 }
 255
 256 static void
 257 fletcher_4_scalar_native(const void *buf, uint64_t size, zio_cksum_t *zcp)
 258 {
 259         const uint32_t *ip = buf;
 260         const uint32_t *ipend = ip + (size / sizeof (uint32_t));
 261         uint64_t a, b, c, d;
 262
 263         a = zcp->zc_word[0];
 264         b = zcp->zc_word[1];
 265         c = zcp->zc_word[2];
 266         d = zcp->zc_word[3];
 267
 268         for (; ip < ipend; ip++) {
 269                 a += ip[0];
 270                 b += a;
 271                 c += b;
 272                 d += c;
 273         }
 274
 275         ZIO_SET_CHECKSUM(zcp, a, b, c, d);
 276 }
 277
 278 static void
 279 fletcher_4_scalar_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp)
 280 {
 281         const uint32_t *ip = buf;
 282         const uint32_t *ipend = ip + (size / sizeof (uint32_t));
 283         uint64_t a, b, c, d;
 284
 285         a = zcp->zc_word[0];
 286         b = zcp->zc_word[1];
 287         c = zcp->zc_word[2];
 288         d = zcp->zc_word[3];
 289
 290         for (; ip < ipend; ip++) {
 291                 a += BSWAP_32(ip[0]);
 292                 b += a;
 293                 c += b;
 294                 d += c;
 295         }
 296
 297         ZIO_SET_CHECKSUM(zcp, a, b, c, d);
 298 }
 299
 300 static boolean_t
 301 fletcher_4_scalar_valid(void)
 302 {
 303         return (B_TRUE);
 304 }
 305
 306 int
 307 fletcher_4_impl_set(const char *val)
 308 {
 309         int err = -EINVAL;
 310         uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
 311         size_t i, val_len;
 312
 313         val_len = strlen(val);
 314         while ((val_len > 0) && !!isspace(val[val_len-1])) /* trim '\n' */
 315                 val_len--;
 316
 317         /* check mandatory implementations */
 318         for (i = 0; i < ARRAY_SIZE(fletcher_4_impl_selectors); i++) {
 319                 const char *name = fletcher_4_impl_selectors[i].fis_name;
 320
 321                 if (val_len == strlen(name) &&
 322                     strncmp(val, name, val_len) == 0) {
 323                         impl = fletcher_4_impl_selectors[i].fis_sel;
 324                         err = 0;
 325                         break;
 326                 }
 327         }
 328
 329         if (err != 0 && fletcher_4_initialized) {
 330                 /* check all supported implementations */
 331                 for (i = 0; i < fletcher_4_supp_impls_cnt; i++) {
 332                         const char *name = fletcher_4_supp_impls[i]->name;
 333
 334                         if (val_len == strlen(name) &&
 335                             strncmp(val, name, val_len) == 0) {
 336                                 impl = i;
 337                                 err = 0;
 338                                 break;
 339                         }
 340                 }
 341         }
 342
 343         if (err == 0) {
 344                 atomic_swap_32(&fletcher_4_impl_chosen, impl);
 345                 membar_producer();
 346         }
 347
 348         return (err);
 349 }
 350
 351 static inline const fletcher_4_ops_t *
 352 fletcher_4_impl_get(void)
 353 {
 354         fletcher_4_ops_t *ops = NULL;
 355         const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
 356
 357         switch (impl) {
 358         case IMPL_FASTEST:
 359                 ASSERT(fletcher_4_initialized);
 360                 ops = &fletcher_4_fastest_impl;
 361                 break;
 362 #if !defined(_KERNEL)
 363         case IMPL_CYCLE: {
 364                 ASSERT(fletcher_4_initialized);
 365                 ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
 366
 367                 static uint32_t cycle_count = 0;
 368                 uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt;
 369                 ops = fletcher_4_supp_impls[idx];
 370         }
 371         break;
 372 #endif
 373         default:
 374                 ASSERT3U(fletcher_4_supp_impls_cnt, >, 0);
 375                 ASSERT3U(impl, <, fletcher_4_supp_impls_cnt);
 376
 377                 ops = fletcher_4_supp_impls[impl];
 378                 break;
 379         }
 380
 381         ASSERT3P(ops, !=, NULL);
 382
 383         return (ops);
 384 }
 385
 386 void
 387 fletcher_4_incremental_native(const void *buf, uint64_t size,
 388     zio_cksum_t *zcp)
 389 {
 390         ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
 391
 392         fletcher_4_scalar_native(buf, size, zcp);
 393 }
 394
 395 void
 396 fletcher_4_incremental_byteswap(const void *buf, uint64_t size,
 397     zio_cksum_t *zcp)
 398 {
 399         ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
 400
 401         fletcher_4_scalar_byteswap(buf, size, zcp);
 402 }
 403
 404 static inline void
 405 fletcher_4_native_impl(const fletcher_4_ops_t *ops, const void *buf,
 406         uint64_t size, zio_cksum_t *zcp)
 407 {
 408         ops->init_native(zcp);
 409         ops->compute_native(buf, size, zcp);
 410         if (ops->fini_native != NULL)
 411                 ops->fini_native(zcp);
 412 }
 413
 414 /*ARGSUSED*/
 415 void
 416 fletcher_4_native(const void *buf, uint64_t size,
 417     const void *ctx_template, zio_cksum_t *zcp)
 418 {
 419         const fletcher_4_ops_t *ops;
 420         uint64_t p2size = P2ALIGN(size, 64);
 421
 422         ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
 423
 424         if (size == 0) {
 425                 ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
 426         } else if (p2size == 0) {
 427                 ops = &fletcher_4_scalar_ops;
 428                 fletcher_4_native_impl(ops, buf, size, zcp);
 429         } else {
 430                 ops = fletcher_4_impl_get();
 431                 fletcher_4_native_impl(ops, buf, p2size, zcp);
 432
 433                 if (p2size < size)
 434                         fletcher_4_incremental_native((char *)buf + p2size,
 435                             size - p2size, zcp);
 436         }
 437 }
 438
 439 void
 440 fletcher_4_native_varsize(const void *buf, uint64_t size, zio_cksum_t *zcp)
 441 {
 442         fletcher_4_native_impl(&fletcher_4_scalar_ops, buf, size, zcp);
 443 }
 444
 445 static inline void
 446 fletcher_4_byteswap_impl(const fletcher_4_ops_t *ops, const void *buf,
 447         uint64_t size, zio_cksum_t *zcp)
 448 {
 449         ops->init_byteswap(zcp);
 450         ops->compute_byteswap(buf, size, zcp);
 451         if (ops->fini_byteswap != NULL)
 452                 ops->fini_byteswap(zcp);
 453 }
 454
 455 /*ARGSUSED*/
 456 void
 457 fletcher_4_byteswap(const void *buf, uint64_t size,
 458     const void *ctx_template, zio_cksum_t *zcp)
 459 {
 460         const fletcher_4_ops_t *ops;
 461         uint64_t p2size = P2ALIGN(size, 64);
 462
 463         ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t)));
 464
 465         if (size == 0) {
 466                 ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);
 467         } else if (p2size == 0) {
 468                 ops = &fletcher_4_scalar_ops;
 469                 fletcher_4_byteswap_impl(ops, buf, size, zcp);
 470         } else {
 471                 ops = fletcher_4_impl_get();
 472                 fletcher_4_byteswap_impl(ops, buf, p2size, zcp);
 473
 474                 if (p2size < size)
 475                         fletcher_4_incremental_byteswap((char *)buf + p2size,
 476                             size - p2size, zcp);
 477         }
 478 }
 479
 480 static int
 481 fletcher_4_kstat_headers(char *buf, size_t size)
 482 {
 483         ssize_t off = 0;
 484
 485         off += snprintf(buf + off, size, "%-17s", "implementation");
 486         off += snprintf(buf + off, size - off, "%-15s", "native");
 487         (void) snprintf(buf + off, size - off, "%-15s\n", "byteswap");
 488
 489         return (0);
 490 }
 491
 492 static int
 493 fletcher_4_kstat_data(char *buf, size_t size, void *data)
 494 {
 495         struct fletcher_4_kstat *fastest_stat =
 496             &fletcher_4_stat_data[fletcher_4_supp_impls_cnt];
 497         struct fletcher_4_kstat *curr_stat = (struct fletcher_4_kstat *) data;
 498         ssize_t off = 0;
 499
 500         if (curr_stat == fastest_stat) {
 501                 off += snprintf(buf + off, size - off, "%-17s", "fastest");
 502                 off += snprintf(buf + off, size - off, "%-15s",
 503                     fletcher_4_supp_impls[fastest_stat->native]->name);
 504                 off += snprintf(buf + off, size - off, "%-15s\n",
 505                     fletcher_4_supp_impls[fastest_stat->byteswap]->name);
 506         } else {
 507                 ptrdiff_t id = curr_stat - fletcher_4_stat_data;
 508
 509                 off += snprintf(buf + off, size - off, "%-17s",
 510                     fletcher_4_supp_impls[id]->name);
 511                 off += snprintf(buf + off, size - off, "%-15llu",
 512                             (u_longlong_t) curr_stat->native);
 513                 off += snprintf(buf + off, size - off, "%-15llu\n",
 514                             (u_longlong_t) curr_stat->byteswap);
 515         }
 516
 517         return (0);
 518 }
 519
 520 static void *
 521 fletcher_4_kstat_addr(kstat_t *ksp, loff_t n)
 522 {
 523         if (n <= fletcher_4_supp_impls_cnt)
 524                 ksp->ks_private = (void *) (fletcher_4_stat_data + n);
 525         else
 526                 ksp->ks_private = NULL;
 527
 528         return (ksp->ks_private);
 529 }
 530
 531 #define FLETCHER_4_FASTEST_FN_COPY(type, src)                             \
 532 {                                                                         \
 533         fletcher_4_fastest_impl.init_ ## type = src->init_ ## type;       \
 534         fletcher_4_fastest_impl.fini_ ## type = src->fini_ ## type;       \
 535         fletcher_4_fastest_impl.compute_ ## type = src->compute_ ## type; \
 536 }
 537
 538 #define FLETCHER_4_BENCH_NS     (MSEC2NSEC(50))         /* 50ms */
 539
 540 static void
 541 fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size)
 542 {
 543
 544         struct fletcher_4_kstat *fastest_stat =
 545             &fletcher_4_stat_data[fletcher_4_supp_impls_cnt];
 546         hrtime_t start;
 547         uint64_t run_bw, run_time_ns, best_run = 0;
 548         zio_cksum_t zc;
 549         uint32_t i, l, sel_save = IMPL_READ(fletcher_4_impl_chosen);
 550
 551         zio_checksum_func_t *fletcher_4_test = native ? fletcher_4_native :
 552             fletcher_4_byteswap;
 553
 554         for (i = 0; i < fletcher_4_supp_impls_cnt; i++) {
 555                 struct fletcher_4_kstat *stat = &fletcher_4_stat_data[i];
 556                 uint64_t run_count = 0;
 557
 558                 /* temporary set an implementation */
 559                 fletcher_4_impl_chosen = i;
 560
 561                 kpreempt_disable();
 562                 start = gethrtime();
 563                 do {
 564                         for (l = 0; l < 32; l++, run_count++)
 565                                 fletcher_4_test(data, data_size, NULL, &zc);
 566
 567                         run_time_ns = gethrtime() - start;
 568                 } while (run_time_ns < FLETCHER_4_BENCH_NS);
 569                 kpreempt_enable();
 570
 571                 run_bw = data_size * run_count * NANOSEC;
 572                 run_bw /= run_time_ns;  /* B/s */
 573
 574                 if (native)
 575                         stat->native = run_bw;
 576                 else
 577                         stat->byteswap = run_bw;
 578
 579                 if (run_bw > best_run) {
 580                         best_run = run_bw;
 581
 582                         if (native) {
 583                                 fastest_stat->native = i;
 584                                 FLETCHER_4_FASTEST_FN_COPY(native,
 585                                     fletcher_4_supp_impls[i]);
 586                         } else {
 587                                 fastest_stat->byteswap = i;
 588                                 FLETCHER_4_FASTEST_FN_COPY(byteswap,
 589                                     fletcher_4_supp_impls[i]);
 590                         }
 591                 }
 592         }
 593
 594         /* restore original selection */
 595         atomic_swap_32(&fletcher_4_impl_chosen, sel_save);
 596 }
 597
 598 void
 599 fletcher_4_init(void)
 600 {
 601         static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */
 602         fletcher_4_ops_t *curr_impl;
 603         char *databuf;
 604         int i, c;
 605
 606         /* move supported impl into fletcher_4_supp_impls */
 607         for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) {
 608                 curr_impl = (fletcher_4_ops_t *) fletcher_4_impls[i];
 609
 610                 if (curr_impl->valid && curr_impl->valid())
 611                         fletcher_4_supp_impls[c++] = curr_impl;
 612         }
 613         membar_producer();      /* complete fletcher_4_supp_impls[] init */
 614         fletcher_4_supp_impls_cnt = c;  /* number of supported impl */
 615
 616 #if !defined(_KERNEL)
 617         /* Skip benchmarking and use last implementation as fastest */
 618         memcpy(&fletcher_4_fastest_impl,
 619             fletcher_4_supp_impls[fletcher_4_supp_impls_cnt-1],
 620             sizeof (fletcher_4_fastest_impl));
 621         fletcher_4_fastest_impl.name = "fastest";
 622         membar_producer();
 623
 624         fletcher_4_initialized = B_TRUE;
 625
 626         /* Use 'cycle' math selection method for userspace */
 627         VERIFY0(fletcher_4_impl_set("cycle"));
 628         return;
 629 #endif
 630         /* Benchmark all supported implementations */
 631         databuf = vmem_alloc(data_size, KM_SLEEP);
 632         for (i = 0; i < data_size / sizeof (uint64_t); i++)
 633                 ((uint64_t *)databuf)[i] = (uintptr_t)(databuf+i); /* warm-up */
 634
 635         fletcher_4_benchmark_impl(B_FALSE, databuf, data_size);
 636         fletcher_4_benchmark_impl(B_TRUE, databuf, data_size);
 637
 638         vmem_free(databuf, data_size);
 639
 640         /* install kstats for all implementations */
 641         fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc",
 642                 KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL);
 643         if (fletcher_4_kstat != NULL) {
 644                 fletcher_4_kstat->ks_data = NULL;
 645                 fletcher_4_kstat->ks_ndata = UINT32_MAX;
 646                 kstat_set_raw_ops(fletcher_4_kstat,
 647                     fletcher_4_kstat_headers,
 648                     fletcher_4_kstat_data,
 649                     fletcher_4_kstat_addr);
 650                 kstat_install(fletcher_4_kstat);
 651         }
 652
 653         /* Finish initialization */
 654         fletcher_4_initialized = B_TRUE;
 655 }
 656
 657 void
 658 fletcher_4_fini(void)
 659 {
 660         if (fletcher_4_kstat != NULL) {
 661                 kstat_delete(fletcher_4_kstat);
 662                 fletcher_4_kstat = NULL;
 663         }
 664 }
 665
 666 #if defined(_KERNEL) && defined(HAVE_SPL)
 667 #include <linux/mod_compat.h>
 668
 669 static int
 670 fletcher_4_param_get(char *buffer, zfs_kernel_param_t *unused)
 671 {
 672         const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen);
 673         char *fmt;
 674         int i, cnt = 0;
 675
 676         /* list fastest */
 677         fmt = (impl == IMPL_FASTEST) ? "[%s] " : "%s ";
 678         cnt += sprintf(buffer + cnt, fmt, "fastest");
 679
 680         /* list all supported implementations */
 681         for (i = 0; i < fletcher_4_supp_impls_cnt; i++) {
 682                 fmt = (i == impl) ? "[%s] " : "%s ";
 683                 cnt += sprintf(buffer + cnt, fmt,
 684                     fletcher_4_supp_impls[i]->name);
 685         }
 686
 687         return (cnt);
 688 }
 689
 690 static int
 691 fletcher_4_param_set(const char *val, zfs_kernel_param_t *unused)
 692 {
 693         return (fletcher_4_impl_set(val));
 694 }
 695
 696 /*
 697  * Choose a fletcher 4 implementation in ZFS.
 698  * Users can choose "cycle" to exercise all implementations, but this is
 699  * for testing purpose therefore it can only be set in user space.
 700  */
 701 module_param_call(zfs_fletcher_4_impl,
 702     fletcher_4_param_set, fletcher_4_param_get, NULL, 0644);
 703 MODULE_PARM_DESC(zfs_fletcher_4_impl, "Select fletcher 4 implementation.");
 704
 705 EXPORT_SYMBOL(fletcher_4_init);
 706 EXPORT_SYMBOL(fletcher_4_fini);
 707 EXPORT_SYMBOL(fletcher_2_native);
 708 EXPORT_SYMBOL(fletcher_2_byteswap);
 709 EXPORT_SYMBOL(fletcher_4_native);
 710 EXPORT_SYMBOL(fletcher_4_native_varsize);
 711 EXPORT_SYMBOL(fletcher_4_byteswap);
 712 EXPORT_SYMBOL(fletcher_4_incremental_native);
 713 EXPORT_SYMBOL(fletcher_4_incremental_byteswap);
 714 #endif