]>
Commit | Line | Data |
---|---|---|
34dc7c2f BB |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
9babb374 | 22 | * Copyright 2009 Sun Microsystems, Inc. All rights reserved. |
34dc7c2f | 23 | * Use is subject to license terms. |
fc897b24 | 24 | * Copyright (C) 2016 Gvozden Nešković. All rights reserved. |
34dc7c2f | 25 | */ |
3c67d83a TH |
26 | /* |
27 | * Copyright 2013 Saso Kiselkov. All rights reserved. | |
28 | */ | |
34dc7c2f | 29 | |
9babb374 BB |
30 | /* |
31 | * Fletcher Checksums | |
32 | * ------------------ | |
33 | * | |
34 | * ZFS's 2nd and 4th order Fletcher checksums are defined by the following | |
35 | * recurrence relations: | |
36 | * | |
37 | * a_i = a_{i-1} + f_{i-1} | |
38 | * | |
39 | * | |
40 | * b_i = b_{i-1} + a_i | |
41 | * | |
42 | * | |
43 | * c_i = c_{i-1} + b_i (fletcher-4 only) | |
44 | * | |
45 | * | |
46 | * d_i = d_{i-1} + c_i (fletcher-4 only) | |
47 | * | |
48 | * | |
49 | * Where | |
50 | * a_0 = b_0 = c_0 = d_0 = 0 | |
51 | * and | |
52 | * f_0 .. f_(n-1) are the input data. | |
53 | * | |
54 | * Using standard techniques, these translate into the following series: | |
55 | * | |
56 | * a_n = \sum_{i=1}^{n} f_{n-i} | |
57 | * | |
58 | * | |
59 | * b_n = \sum_{i=1}^{n} i * f_{n-i} | |
60 | * | |
61 | * | |
62 | * c_n = \sum_{i=1}^{n} (i*(i+1) / 2) * f_{n-i} | |
63 | * | |
64 | * | |
65 | * d_n = \sum_{i=1}^{n} (i*(i+1)*(i+2) / 6) * f_{n-i} | |
66 | * | |
67 | * | |
68 | * | |
69 | * For fletcher-2, the f_is are 64-bit, and [ab]_i are 64-bit accumulators. | |
70 | * Since the additions are done mod (2^64), errors in the high bits may not | |
71 | * be noticed. For this reason, fletcher-2 is deprecated. | |
72 | * | |
73 | * For fletcher-4, the f_is are 32-bit, and [abcd]_i are 64-bit accumulators. | |
74 | * A conservative estimate of how big the buffer can get before we overflow | |
75 | * can be estimated using f_i = 0xffffffff for all i: | |
76 | * | |
77 | * % bc | |
78 | * f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*4 | |
79 | * 2264 | |
80 | * quit | |
81 | * % | |
82 | * | |
83 | * So blocks of up to 2k will not overflow. Our largest block size is | |
84 | * 128k, which has 32k 4-byte words, so we can compute the largest possible | |
85 | * accumulators, then divide by 2^64 to figure the max amount of overflow: | |
86 | * | |
87 | * % bc | |
88 | * a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c } | |
89 | * a/2^64;b/2^64;c/2^64;d/2^64 | |
90 | * 0 | |
91 | * 0 | |
92 | * 1365 | |
93 | * 11186858 | |
94 | * quit | |
95 | * % | |
96 | * | |
97 | * So a and b cannot overflow. To make sure each bit of input has some | |
98 | * effect on the contents of c and d, we can look at what the factors of | |
99 | * the coefficients in the equations for c_n and d_n are. The number of 2s | |
100 | * in the factors determines the lowest set bit in the multiplier. Running | |
101 | * through the cases for n*(n+1)/2 reveals that the highest power of 2 is | |
102 | * 2^14, and for n*(n+1)*(n+2)/6 it is 2^15. So while some data may overflow | |
103 | * the 64-bit accumulators, every bit of every f_i affects every accumulator, | |
104 | * even for 128k blocks. | |
105 | * | |
106 | * If we wanted to make a stronger version of fletcher4 (fletcher4c?), | |
107 | * we could do our calculations mod (2^32 - 1) by adding in the carries | |
108 | * periodically, and store the number of carries in the top 32-bits. | |
109 | * | |
110 | * -------------------- | |
111 | * Checksum Performance | |
112 | * -------------------- | |
113 | * | |
114 | * There are two interesting components to checksum performance: cached and | |
115 | * uncached performance. With cached data, fletcher-2 is about four times | |
116 | * faster than fletcher-4. With uncached data, the performance difference is | |
117 | * negligible, since the cost of a cache fill dominates the processing time. | |
118 | * Even though fletcher-4 is slower than fletcher-2, it is still a pretty | |
119 | * efficient pass over the data. | |
120 | * | |
121 | * In normal operation, the data which is being checksummed is in a buffer | |
122 | * which has been filled either by: | |
123 | * | |
124 | * 1. a compression step, which will be mostly cached, or | |
125 | * 2. a bcopy() or copyin(), which will be uncached (because the | |
126 | * copy is cache-bypassing). | |
127 | * | |
128 | * For both cached and uncached data, both fletcher checksums are much faster | |
129 | * than sha-256, and slower than 'off', which doesn't touch the data at all. | |
130 | */ | |
34dc7c2f BB |
131 | |
132 | #include <sys/types.h> | |
133 | #include <sys/sysmacros.h> | |
134 | #include <sys/byteorder.h> | |
135 | #include <sys/spa.h> | |
fc897b24 | 136 | #include <sys/zio_checksum.h> |
1eeb4562 JX |
137 | #include <sys/zfs_context.h> |
138 | #include <zfs_fletcher.h> | |
139 | ||
fc897b24 | 140 | |
1eeb4562 | 141 | static void fletcher_4_scalar_init(zio_cksum_t *zcp); |
fc897b24 | 142 | static void fletcher_4_scalar_native(const void *buf, uint64_t size, |
1eeb4562 JX |
143 | zio_cksum_t *zcp); |
144 | static void fletcher_4_scalar_byteswap(const void *buf, uint64_t size, | |
145 | zio_cksum_t *zcp); | |
146 | static boolean_t fletcher_4_scalar_valid(void); | |
147 | ||
148 | static const fletcher_4_ops_t fletcher_4_scalar_ops = { | |
fc897b24 GN |
149 | .init_native = fletcher_4_scalar_init, |
150 | .compute_native = fletcher_4_scalar_native, | |
151 | .init_byteswap = fletcher_4_scalar_init, | |
1eeb4562 JX |
152 | .compute_byteswap = fletcher_4_scalar_byteswap, |
153 | .valid = fletcher_4_scalar_valid, | |
154 | .name = "scalar" | |
155 | }; | |
156 | ||
fc897b24 GN |
157 | static fletcher_4_ops_t fletcher_4_fastest_impl = { |
158 | .name = "fastest", | |
159 | .valid = fletcher_4_scalar_valid | |
160 | }; | |
161 | ||
162 | static const fletcher_4_ops_t *fletcher_4_impls[] = { | |
1eeb4562 | 163 | &fletcher_4_scalar_ops, |
35a76a03 TS |
164 | #if defined(HAVE_SSE2) |
165 | &fletcher_4_sse2_ops, | |
166 | #endif | |
167 | #if defined(HAVE_SSE2) && defined(HAVE_SSSE3) | |
168 | &fletcher_4_ssse3_ops, | |
169 | #endif | |
1eeb4562 JX |
170 | #if defined(HAVE_AVX) && defined(HAVE_AVX2) |
171 | &fletcher_4_avx2_ops, | |
172 | #endif | |
70b258fc GN |
173 | #if defined(__x86_64) && defined(HAVE_AVX512F) |
174 | &fletcher_4_avx512f_ops, | |
175 | #endif | |
1eeb4562 JX |
176 | }; |
177 | ||
fc897b24 GN |
178 | /* Hold all supported implementations */ |
179 | static uint32_t fletcher_4_supp_impls_cnt = 0; | |
180 | static fletcher_4_ops_t *fletcher_4_supp_impls[ARRAY_SIZE(fletcher_4_impls)]; | |
181 | ||
182 | /* Select fletcher4 implementation */ | |
183 | #define IMPL_FASTEST (UINT32_MAX) | |
184 | #define IMPL_CYCLE (UINT32_MAX - 1) | |
185 | #define IMPL_SCALAR (0) | |
186 | ||
187 | static uint32_t fletcher_4_impl_chosen = IMPL_FASTEST; | |
188 | ||
189 | #define IMPL_READ(i) (*(volatile uint32_t *) &(i)) | |
1eeb4562 JX |
190 | |
191 | static struct fletcher_4_impl_selector { | |
fc897b24 GN |
192 | const char *fis_name; |
193 | uint32_t fis_sel; | |
1eeb4562 | 194 | } fletcher_4_impl_selectors[] = { |
1eeb4562 | 195 | #if !defined(_KERNEL) |
fc897b24 | 196 | { "cycle", IMPL_CYCLE }, |
1eeb4562 | 197 | #endif |
fc897b24 GN |
198 | { "fastest", IMPL_FASTEST }, |
199 | { "scalar", IMPL_SCALAR } | |
1eeb4562 JX |
200 | }; |
201 | ||
1eeb4562 JX |
202 | static kstat_t *fletcher_4_kstat; |
203 | ||
fc897b24 GN |
204 | static struct fletcher_4_kstat { |
205 | uint64_t native; | |
206 | uint64_t byteswap; | |
207 | } fletcher_4_stat_data[ARRAY_SIZE(fletcher_4_impls) + 1]; | |
208 | ||
209 | /* Indicate that benchmark has been completed */ | |
210 | static boolean_t fletcher_4_initialized = B_FALSE; | |
34dc7c2f | 211 | |
3c67d83a | 212 | /*ARGSUSED*/ |
34dc7c2f | 213 | void |
3c67d83a TH |
214 | fletcher_2_native(const void *buf, uint64_t size, |
215 | const void *ctx_template, zio_cksum_t *zcp) | |
34dc7c2f BB |
216 | { |
217 | const uint64_t *ip = buf; | |
218 | const uint64_t *ipend = ip + (size / sizeof (uint64_t)); | |
219 | uint64_t a0, b0, a1, b1; | |
220 | ||
221 | for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) { | |
222 | a0 += ip[0]; | |
223 | a1 += ip[1]; | |
224 | b0 += a0; | |
225 | b1 += a1; | |
226 | } | |
227 | ||
228 | ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); | |
229 | } | |
230 | ||
3c67d83a | 231 | /*ARGSUSED*/ |
34dc7c2f | 232 | void |
3c67d83a TH |
233 | fletcher_2_byteswap(const void *buf, uint64_t size, |
234 | const void *ctx_template, zio_cksum_t *zcp) | |
34dc7c2f BB |
235 | { |
236 | const uint64_t *ip = buf; | |
237 | const uint64_t *ipend = ip + (size / sizeof (uint64_t)); | |
238 | uint64_t a0, b0, a1, b1; | |
239 | ||
240 | for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) { | |
241 | a0 += BSWAP_64(ip[0]); | |
242 | a1 += BSWAP_64(ip[1]); | |
243 | b0 += a0; | |
244 | b1 += a1; | |
245 | } | |
246 | ||
247 | ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); | |
248 | } | |
249 | ||
fc897b24 GN |
250 | static void |
251 | fletcher_4_scalar_init(zio_cksum_t *zcp) | |
34dc7c2f | 252 | { |
1eeb4562 | 253 | ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); |
34dc7c2f BB |
254 | } |
255 | ||
1eeb4562 | 256 | static void |
fc897b24 | 257 | fletcher_4_scalar_native(const void *buf, uint64_t size, zio_cksum_t *zcp) |
34dc7c2f BB |
258 | { |
259 | const uint32_t *ip = buf; | |
260 | const uint32_t *ipend = ip + (size / sizeof (uint32_t)); | |
261 | uint64_t a, b, c, d; | |
262 | ||
1eeb4562 JX |
263 | a = zcp->zc_word[0]; |
264 | b = zcp->zc_word[1]; | |
265 | c = zcp->zc_word[2]; | |
266 | d = zcp->zc_word[3]; | |
267 | ||
268 | for (; ip < ipend; ip++) { | |
269 | a += ip[0]; | |
34dc7c2f BB |
270 | b += a; |
271 | c += b; | |
272 | d += c; | |
273 | } | |
274 | ||
275 | ZIO_SET_CHECKSUM(zcp, a, b, c, d); | |
276 | } | |
277 | ||
1eeb4562 JX |
278 | static void |
279 | fletcher_4_scalar_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) | |
34dc7c2f BB |
280 | { |
281 | const uint32_t *ip = buf; | |
282 | const uint32_t *ipend = ip + (size / sizeof (uint32_t)); | |
283 | uint64_t a, b, c, d; | |
284 | ||
285 | a = zcp->zc_word[0]; | |
286 | b = zcp->zc_word[1]; | |
287 | c = zcp->zc_word[2]; | |
288 | d = zcp->zc_word[3]; | |
289 | ||
290 | for (; ip < ipend; ip++) { | |
1eeb4562 | 291 | a += BSWAP_32(ip[0]); |
34dc7c2f BB |
292 | b += a; |
293 | c += b; | |
294 | d += c; | |
295 | } | |
296 | ||
297 | ZIO_SET_CHECKSUM(zcp, a, b, c, d); | |
298 | } | |
299 | ||
1eeb4562 JX |
300 | static boolean_t |
301 | fletcher_4_scalar_valid(void) | |
302 | { | |
303 | return (B_TRUE); | |
304 | } | |
305 | ||
306 | int | |
307 | fletcher_4_impl_set(const char *val) | |
308 | { | |
fc897b24 GN |
309 | int err = -EINVAL; |
310 | uint32_t impl = IMPL_READ(fletcher_4_impl_chosen); | |
311 | size_t i, val_len; | |
1eeb4562 JX |
312 | |
313 | val_len = strlen(val); | |
314 | while ((val_len > 0) && !!isspace(val[val_len-1])) /* trim '\n' */ | |
315 | val_len--; | |
316 | ||
fc897b24 | 317 | /* check mandatory implementations */ |
1eeb4562 JX |
318 | for (i = 0; i < ARRAY_SIZE(fletcher_4_impl_selectors); i++) { |
319 | const char *name = fletcher_4_impl_selectors[i].fis_name; | |
320 | ||
321 | if (val_len == strlen(name) && | |
322 | strncmp(val, name, val_len) == 0) { | |
fc897b24 GN |
323 | impl = fletcher_4_impl_selectors[i].fis_sel; |
324 | err = 0; | |
1eeb4562 JX |
325 | break; |
326 | } | |
327 | } | |
1eeb4562 | 328 | |
fc897b24 GN |
329 | if (err != 0 && fletcher_4_initialized) { |
330 | /* check all supported implementations */ | |
331 | for (i = 0; i < fletcher_4_supp_impls_cnt; i++) { | |
332 | const char *name = fletcher_4_supp_impls[i]->name; | |
1eeb4562 | 333 | |
fc897b24 GN |
334 | if (val_len == strlen(name) && |
335 | strncmp(val, name, val_len) == 0) { | |
336 | impl = i; | |
337 | err = 0; | |
338 | break; | |
339 | } | |
340 | } | |
341 | } | |
1eeb4562 | 342 | |
fc897b24 GN |
343 | if (err == 0) { |
344 | atomic_swap_32(&fletcher_4_impl_chosen, impl); | |
345 | membar_producer(); | |
346 | } | |
347 | ||
348 | return (err); | |
1eeb4562 JX |
349 | } |
350 | ||
351 | static inline const fletcher_4_ops_t * | |
352 | fletcher_4_impl_get(void) | |
353 | { | |
fc897b24 GN |
354 | fletcher_4_ops_t *ops = NULL; |
355 | const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen); | |
356 | ||
357 | switch (impl) { | |
358 | case IMPL_FASTEST: | |
359 | ASSERT(fletcher_4_initialized); | |
360 | ops = &fletcher_4_fastest_impl; | |
361 | break; | |
1eeb4562 | 362 | #if !defined(_KERNEL) |
fc897b24 GN |
363 | case IMPL_CYCLE: { |
364 | ASSERT(fletcher_4_initialized); | |
365 | ASSERT3U(fletcher_4_supp_impls_cnt, >, 0); | |
366 | ||
367 | static uint32_t cycle_count = 0; | |
368 | uint32_t idx = (++cycle_count) % fletcher_4_supp_impls_cnt; | |
369 | ops = fletcher_4_supp_impls[idx]; | |
1eeb4562 | 370 | } |
fc897b24 | 371 | break; |
1eeb4562 | 372 | #endif |
fc897b24 GN |
373 | default: |
374 | ASSERT3U(fletcher_4_supp_impls_cnt, >, 0); | |
375 | ASSERT3U(impl, <, fletcher_4_supp_impls_cnt); | |
376 | ||
377 | ops = fletcher_4_supp_impls[impl]; | |
378 | break; | |
379 | } | |
380 | ||
381 | ASSERT3P(ops, !=, NULL); | |
382 | ||
383 | return (ops); | |
384 | } | |
385 | ||
386 | void | |
387 | fletcher_4_incremental_native(const void *buf, uint64_t size, | |
388 | zio_cksum_t *zcp) | |
389 | { | |
390 | ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t))); | |
391 | ||
392 | fletcher_4_scalar_native(buf, size, zcp); | |
393 | } | |
394 | ||
395 | void | |
396 | fletcher_4_incremental_byteswap(const void *buf, uint64_t size, | |
397 | zio_cksum_t *zcp) | |
398 | { | |
399 | ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t))); | |
400 | ||
401 | fletcher_4_scalar_byteswap(buf, size, zcp); | |
402 | } | |
403 | ||
404 | static inline void | |
405 | fletcher_4_native_impl(const fletcher_4_ops_t *ops, const void *buf, | |
406 | uint64_t size, zio_cksum_t *zcp) | |
407 | { | |
408 | ops->init_native(zcp); | |
409 | ops->compute_native(buf, size, zcp); | |
410 | if (ops->fini_native != NULL) | |
411 | ops->fini_native(zcp); | |
1eeb4562 JX |
412 | } |
413 | ||
3c67d83a | 414 | /*ARGSUSED*/ |
1eeb4562 | 415 | void |
3c67d83a TH |
416 | fletcher_4_native(const void *buf, uint64_t size, |
417 | const void *ctx_template, zio_cksum_t *zcp) | |
1eeb4562 | 418 | { |
0dab2e84 | 419 | const fletcher_4_ops_t *ops; |
fc897b24 | 420 | uint64_t p2size = P2ALIGN(size, 64); |
0dab2e84 | 421 | |
fc897b24 GN |
422 | ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t))); |
423 | ||
424 | if (size == 0) { | |
425 | ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); | |
426 | } else if (p2size == 0) { | |
0dab2e84 | 427 | ops = &fletcher_4_scalar_ops; |
fc897b24 GN |
428 | fletcher_4_native_impl(ops, buf, size, zcp); |
429 | } else { | |
430 | ops = fletcher_4_impl_get(); | |
431 | fletcher_4_native_impl(ops, buf, p2size, zcp); | |
1eeb4562 | 432 | |
fc897b24 GN |
433 | if (p2size < size) |
434 | fletcher_4_incremental_native((char *)buf + p2size, | |
435 | size - p2size, zcp); | |
436 | } | |
437 | } | |
438 | ||
439 | void | |
440 | fletcher_4_native_varsize(const void *buf, uint64_t size, zio_cksum_t *zcp) | |
441 | { | |
442 | fletcher_4_native_impl(&fletcher_4_scalar_ops, buf, size, zcp); | |
443 | } | |
444 | ||
445 | static inline void | |
446 | fletcher_4_byteswap_impl(const fletcher_4_ops_t *ops, const void *buf, | |
447 | uint64_t size, zio_cksum_t *zcp) | |
448 | { | |
449 | ops->init_byteswap(zcp); | |
450 | ops->compute_byteswap(buf, size, zcp); | |
451 | if (ops->fini_byteswap != NULL) | |
452 | ops->fini_byteswap(zcp); | |
1eeb4562 JX |
453 | } |
454 | ||
3c67d83a | 455 | /*ARGSUSED*/ |
1eeb4562 | 456 | void |
3c67d83a TH |
457 | fletcher_4_byteswap(const void *buf, uint64_t size, |
458 | const void *ctx_template, zio_cksum_t *zcp) | |
1eeb4562 | 459 | { |
0dab2e84 | 460 | const fletcher_4_ops_t *ops; |
fc897b24 | 461 | uint64_t p2size = P2ALIGN(size, 64); |
0dab2e84 | 462 | |
fc897b24 GN |
463 | ASSERT(IS_P2ALIGNED(size, sizeof (uint32_t))); |
464 | ||
465 | if (size == 0) { | |
466 | ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0); | |
467 | } else if (p2size == 0) { | |
0dab2e84 | 468 | ops = &fletcher_4_scalar_ops; |
fc897b24 GN |
469 | fletcher_4_byteswap_impl(ops, buf, size, zcp); |
470 | } else { | |
471 | ops = fletcher_4_impl_get(); | |
472 | fletcher_4_byteswap_impl(ops, buf, p2size, zcp); | |
1eeb4562 | 473 | |
fc897b24 GN |
474 | if (p2size < size) |
475 | fletcher_4_incremental_byteswap((char *)buf + p2size, | |
476 | size - p2size, zcp); | |
477 | } | |
1eeb4562 JX |
478 | } |
479 | ||
fc897b24 GN |
/*
 * kstat raw-ops callback: write the column header line into buf.
 *
 * buf	destination buffer
 * size	capacity of buf in bytes
 *
 * The three columns are emitted with a single bounded snprintf() so that
 * a short buffer only truncates the line.  Advancing an offset by the
 * return value of each partial snprintf() is unsafe here: on truncation
 * that value exceeds the space left, and "size - off" then wraps around
 * to a huge length.
 *
 * Always returns 0.
 */
static int
fletcher_4_kstat_headers(char *buf, size_t size)
{
	(void) snprintf(buf, size, "%-17s%-15s%-15s\n",
	    "implementation", "native", "byteswap");

	return (0);
}
491 | ||
fc897b24 GN |
492 | static int |
493 | fletcher_4_kstat_data(char *buf, size_t size, void *data) | |
34dc7c2f | 494 | { |
fc897b24 GN |
495 | struct fletcher_4_kstat *fastest_stat = |
496 | &fletcher_4_stat_data[fletcher_4_supp_impls_cnt]; | |
497 | struct fletcher_4_kstat *curr_stat = (struct fletcher_4_kstat *) data; | |
498 | ssize_t off = 0; | |
499 | ||
500 | if (curr_stat == fastest_stat) { | |
501 | off += snprintf(buf + off, size - off, "%-17s", "fastest"); | |
502 | off += snprintf(buf + off, size - off, "%-15s", | |
503 | fletcher_4_supp_impls[fastest_stat->native]->name); | |
504 | off += snprintf(buf + off, size - off, "%-15s\n", | |
505 | fletcher_4_supp_impls[fastest_stat->byteswap]->name); | |
506 | } else { | |
507 | ptrdiff_t id = curr_stat - fletcher_4_stat_data; | |
508 | ||
509 | off += snprintf(buf + off, size - off, "%-17s", | |
510 | fletcher_4_supp_impls[id]->name); | |
511 | off += snprintf(buf + off, size - off, "%-15llu", | |
512 | (u_longlong_t) curr_stat->native); | |
513 | off += snprintf(buf + off, size - off, "%-15llu\n", | |
514 | (u_longlong_t) curr_stat->byteswap); | |
515 | } | |
516 | ||
517 | return (0); | |
1eeb4562 | 518 | } |
34dc7c2f | 519 | |
fc897b24 GN |
520 | static void * |
521 | fletcher_4_kstat_addr(kstat_t *ksp, loff_t n) | |
1eeb4562 | 522 | { |
fc897b24 GN |
523 | if (n <= fletcher_4_supp_impls_cnt) |
524 | ksp->ks_private = (void *) (fletcher_4_stat_data + n); | |
525 | else | |
526 | ksp->ks_private = NULL; | |
527 | ||
528 | return (ksp->ks_private); | |
529 | } | |
530 | ||
/*
 * Copy the init/fini/compute function pointers of the given kind
 * ("native" or "byteswap") from the ops pointed to by src into
 * fletcher_4_fastest_impl.
 *
 * Wrapped in do { } while (0) so that the invocation plus its trailing
 * semicolon forms exactly one statement, and src is parenthesized so
 * that any pointer expression may be passed.
 */
#define	FLETCHER_4_FASTEST_FN_COPY(type, src)				  \
do {									  \
	fletcher_4_fastest_impl.init_ ## type = (src)->init_ ## type;	  \
	fletcher_4_fastest_impl.fini_ ## type = (src)->fini_ ## type;	  \
	fletcher_4_fastest_impl.compute_ ## type = (src)->compute_ ## type; \
} while (0)

/* Minimum time spent benchmarking each implementation */
#define	FLETCHER_4_BENCH_NS	(MSEC2NSEC(50))		/* 50ms */
34dc7c2f | 539 | |
fc897b24 GN |
540 | static void |
541 | fletcher_4_benchmark_impl(boolean_t native, char *data, uint64_t data_size) | |
542 | { | |
543 | ||
544 | struct fletcher_4_kstat *fastest_stat = | |
545 | &fletcher_4_stat_data[fletcher_4_supp_impls_cnt]; | |
546 | hrtime_t start; | |
547 | uint64_t run_bw, run_time_ns, best_run = 0; | |
548 | zio_cksum_t zc; | |
549 | uint32_t i, l, sel_save = IMPL_READ(fletcher_4_impl_chosen); | |
550 | ||
551 | zio_checksum_func_t *fletcher_4_test = native ? fletcher_4_native : | |
552 | fletcher_4_byteswap; | |
1eeb4562 | 553 | |
fc897b24 GN |
554 | for (i = 0; i < fletcher_4_supp_impls_cnt; i++) { |
555 | struct fletcher_4_kstat *stat = &fletcher_4_stat_data[i]; | |
556 | uint64_t run_count = 0; | |
1eeb4562 | 557 | |
fc897b24 GN |
558 | /* temporary set an implementation */ |
559 | fletcher_4_impl_chosen = i; | |
1eeb4562 JX |
560 | |
561 | kpreempt_disable(); | |
562 | start = gethrtime(); | |
1eeb4562 | 563 | do { |
fc897b24 | 564 | for (l = 0; l < 32; l++, run_count++) |
3c67d83a | 565 | fletcher_4_test(data, data_size, NULL, &zc); |
fc897b24 GN |
566 | |
567 | run_time_ns = gethrtime() - start; | |
568 | } while (run_time_ns < FLETCHER_4_BENCH_NS); | |
1eeb4562 JX |
569 | kpreempt_enable(); |
570 | ||
fc897b24 GN |
571 | run_bw = data_size * run_count * NANOSEC; |
572 | run_bw /= run_time_ns; /* B/s */ | |
573 | ||
574 | if (native) | |
575 | stat->native = run_bw; | |
576 | else | |
577 | stat->byteswap = run_bw; | |
578 | ||
579 | if (run_bw > best_run) { | |
580 | best_run = run_bw; | |
581 | ||
582 | if (native) { | |
583 | fastest_stat->native = i; | |
584 | FLETCHER_4_FASTEST_FN_COPY(native, | |
585 | fletcher_4_supp_impls[i]); | |
586 | } else { | |
587 | fastest_stat->byteswap = i; | |
588 | FLETCHER_4_FASTEST_FN_COPY(byteswap, | |
589 | fletcher_4_supp_impls[i]); | |
590 | } | |
1eeb4562 | 591 | } |
fc897b24 GN |
592 | } |
593 | ||
594 | /* restore original selection */ | |
595 | atomic_swap_32(&fletcher_4_impl_chosen, sel_save); | |
596 | } | |
1eeb4562 | 597 | |
fc897b24 GN |
598 | void |
599 | fletcher_4_init(void) | |
600 | { | |
601 | static const size_t data_size = 1 << SPA_OLD_MAXBLOCKSHIFT; /* 128kiB */ | |
602 | fletcher_4_ops_t *curr_impl; | |
603 | char *databuf; | |
604 | int i, c; | |
605 | ||
606 | /* move supported impl into fletcher_4_supp_impls */ | |
607 | for (i = 0, c = 0; i < ARRAY_SIZE(fletcher_4_impls); i++) { | |
608 | curr_impl = (fletcher_4_ops_t *) fletcher_4_impls[i]; | |
609 | ||
610 | if (curr_impl->valid && curr_impl->valid()) | |
611 | fletcher_4_supp_impls[c++] = curr_impl; | |
34dc7c2f | 612 | } |
fc897b24 GN |
613 | membar_producer(); /* complete fletcher_4_supp_impls[] init */ |
614 | fletcher_4_supp_impls_cnt = c; /* number of supported impl */ | |
34dc7c2f | 615 | |
fc897b24 GN |
616 | #if !defined(_KERNEL) |
617 | /* Skip benchmarking and use last implementation as fastest */ | |
618 | memcpy(&fletcher_4_fastest_impl, | |
619 | fletcher_4_supp_impls[fletcher_4_supp_impls_cnt-1], | |
620 | sizeof (fletcher_4_fastest_impl)); | |
621 | fletcher_4_fastest_impl.name = "fastest"; | |
622 | membar_producer(); | |
1eeb4562 | 623 | |
fc897b24 | 624 | fletcher_4_initialized = B_TRUE; |
1eeb4562 | 625 | |
fc897b24 GN |
626 | /* Use 'cycle' math selection method for userspace */ |
627 | VERIFY0(fletcher_4_impl_set("cycle")); | |
628 | return; | |
629 | #endif | |
630 | /* Benchmark all supported implementations */ | |
631 | databuf = vmem_alloc(data_size, KM_SLEEP); | |
632 | for (i = 0; i < data_size / sizeof (uint64_t); i++) | |
633 | ((uint64_t *)databuf)[i] = (uintptr_t)(databuf+i); /* warm-up */ | |
634 | ||
635 | fletcher_4_benchmark_impl(B_FALSE, databuf, data_size); | |
636 | fletcher_4_benchmark_impl(B_TRUE, databuf, data_size); | |
637 | ||
638 | vmem_free(databuf, data_size); | |
639 | ||
640 | /* install kstats for all implementations */ | |
641 | fletcher_4_kstat = kstat_create("zfs", 0, "fletcher_4_bench", "misc", | |
642 | KSTAT_TYPE_RAW, 0, KSTAT_FLAG_VIRTUAL); | |
1eeb4562 | 643 | if (fletcher_4_kstat != NULL) { |
fc897b24 GN |
644 | fletcher_4_kstat->ks_data = NULL; |
645 | fletcher_4_kstat->ks_ndata = UINT32_MAX; | |
646 | kstat_set_raw_ops(fletcher_4_kstat, | |
647 | fletcher_4_kstat_headers, | |
648 | fletcher_4_kstat_data, | |
649 | fletcher_4_kstat_addr); | |
1eeb4562 JX |
650 | kstat_install(fletcher_4_kstat); |
651 | } | |
fc897b24 GN |
652 | |
653 | /* Finish initialization */ | |
654 | fletcher_4_initialized = B_TRUE; | |
1eeb4562 JX |
655 | } |
656 | ||
657 | void | |
658 | fletcher_4_fini(void) | |
659 | { | |
1eeb4562 JX |
660 | if (fletcher_4_kstat != NULL) { |
661 | kstat_delete(fletcher_4_kstat); | |
662 | fletcher_4_kstat = NULL; | |
663 | } | |
34dc7c2f | 664 | } |
c28b2279 BB |
665 | |
666 | #if defined(_KERNEL) && defined(HAVE_SPL) | |
9cc1844a | 667 | #include <linux/mod_compat.h> |
1eeb4562 JX |
668 | |
669 | static int | |
9cc1844a | 670 | fletcher_4_param_get(char *buffer, zfs_kernel_param_t *unused) |
1eeb4562 | 671 | { |
fc897b24 GN |
672 | const uint32_t impl = IMPL_READ(fletcher_4_impl_chosen); |
673 | char *fmt; | |
1eeb4562 JX |
674 | int i, cnt = 0; |
675 | ||
fc897b24 GN |
676 | /* list fastest */ |
677 | fmt = (impl == IMPL_FASTEST) ? "[%s] " : "%s "; | |
678 | cnt += sprintf(buffer + cnt, fmt, "fastest"); | |
1eeb4562 | 679 | |
fc897b24 GN |
680 | /* list all supported implementations */ |
681 | for (i = 0; i < fletcher_4_supp_impls_cnt; i++) { | |
682 | fmt = (i == impl) ? "[%s] " : "%s "; | |
683 | cnt += sprintf(buffer + cnt, fmt, | |
684 | fletcher_4_supp_impls[i]->name); | |
1eeb4562 JX |
685 | } |
686 | ||
687 | return (cnt); | |
688 | } | |
689 | ||
690 | static int | |
9cc1844a | 691 | fletcher_4_param_set(const char *val, zfs_kernel_param_t *unused) |
1eeb4562 JX |
692 | { |
693 | return (fletcher_4_impl_set(val)); | |
694 | } | |
695 | ||
696 | /* | |
697 | * Choose a fletcher 4 implementation in ZFS. | |
fc897b24 | 698 | * Users can choose "cycle" to exercise all implementations, but this is |
1eeb4562 JX |
699 | * for testing purpose therefore it can only be set in user space. |
700 | */ | |
701 | module_param_call(zfs_fletcher_4_impl, | |
702 | fletcher_4_param_set, fletcher_4_param_get, NULL, 0644); | |
fc897b24 | 703 | MODULE_PARM_DESC(zfs_fletcher_4_impl, "Select fletcher 4 implementation."); |
1eeb4562 JX |
704 | |
705 | EXPORT_SYMBOL(fletcher_4_init); | |
706 | EXPORT_SYMBOL(fletcher_4_fini); | |
c28b2279 BB |
707 | EXPORT_SYMBOL(fletcher_2_native); |
708 | EXPORT_SYMBOL(fletcher_2_byteswap); | |
709 | EXPORT_SYMBOL(fletcher_4_native); | |
fc897b24 | 710 | EXPORT_SYMBOL(fletcher_4_native_varsize); |
c28b2279 BB |
711 | EXPORT_SYMBOL(fletcher_4_byteswap); |
712 | EXPORT_SYMBOL(fletcher_4_incremental_native); | |
713 | EXPORT_SYMBOL(fletcher_4_incremental_byteswap); | |
714 | #endif |