]>
Commit | Line | Data |
---|---|---|
70b258fc GN |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
1d3ba0bf | 9 | * or https://opensource.org/licenses/CDDL-1.0. |
70b258fc GN |
10 | * See the License for the specific language governing permissions |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
22 | * Copyright (C) 2016 Gvozden Nešković. All rights reserved. | |
23 | */ | |
24 | ||
25 | #if defined(__x86_64) && defined(HAVE_AVX512F) | |
26 | ||
70b258fc | 27 | #include <sys/byteorder.h> |
c28a6773 | 28 | #include <sys/frame.h> |
70b258fc | 29 | #include <sys/spa_checksum.h> |
d465fc58 | 30 | #include <sys/string.h> |
006e9a40 | 31 | #include <sys/simd.h> |
70b258fc GN |
32 | #include <zfs_fletcher.h> |
33 | ||
/*
 * On Linux builds, every __asm(...) statement in this file expands to
 * __asm__ __volatile__(...).
 * NOTE(review): presumably the volatile qualifier is wanted so the
 * register-only asm statements below are never discarded by the
 * compiler -- confirm against the GCC extended-asm documentation.
 */
#ifdef __linux__
#define __asm __asm__ __volatile__
#endif
70b258fc | 37 | |
5bf703b8 GN |
38 | static void |
39 | fletcher_4_avx512f_init(fletcher_4_ctx_t *ctx) | |
40 | { | |
861166b0 | 41 | memset(ctx->avx512, 0, 4 * sizeof (zfs_fletcher_avx512_t)); |
5bf703b8 | 42 | } |
70b258fc GN |
43 | |
44 | static void | |
5bf703b8 | 45 | fletcher_4_avx512f_fini(fletcher_4_ctx_t *ctx, zio_cksum_t *zcp) |
70b258fc | 46 | { |
5bf703b8 GN |
47 | static const uint64_t |
48 | CcA[] = { 0, 0, 1, 3, 6, 10, 15, 21 }, | |
49 | CcB[] = { 28, 36, 44, 52, 60, 68, 76, 84 }, | |
50 | DcA[] = { 0, 0, 0, 1, 4, 10, 20, 35 }, | |
51 | DcB[] = { 56, 84, 120, 164, 216, 276, 344, 420 }, | |
52 | DcC[] = { 448, 512, 576, 640, 704, 768, 832, 896 }; | |
53 | ||
54 | uint64_t A, B, C, D; | |
55 | uint64_t i; | |
70b258fc | 56 | |
5bf703b8 GN |
57 | A = ctx->avx512[0].v[0]; |
58 | B = 8 * ctx->avx512[1].v[0]; | |
59 | C = 64 * ctx->avx512[2].v[0] - CcB[0] * ctx->avx512[1].v[0]; | |
60 | D = 512 * ctx->avx512[3].v[0] - DcC[0] * ctx->avx512[2].v[0] + | |
61 | DcB[0] * ctx->avx512[1].v[0]; | |
62 | ||
63 | for (i = 1; i < 8; i++) { | |
64 | A += ctx->avx512[0].v[i]; | |
65 | B += 8 * ctx->avx512[1].v[i] - i * ctx->avx512[0].v[i]; | |
66 | C += 64 * ctx->avx512[2].v[i] - CcB[i] * ctx->avx512[1].v[i] + | |
67 | CcA[i] * ctx->avx512[0].v[i]; | |
68 | D += 512 * ctx->avx512[3].v[i] - DcC[i] * ctx->avx512[2].v[i] + | |
69 | DcB[i] * ctx->avx512[1].v[i] - DcA[i] * ctx->avx512[0].v[i]; | |
70 | } | |
71 | ||
72 | ZIO_SET_CHECKSUM(zcp, A, B, C, D); | |
73 | } | |
74 | ||
/*
 * Load the four accumulators stored in (ctx)->avx512[0..3] into registers
 * zmm0-zmm3 using vmovdqu64 (unaligned vector move).
 *
 * The macro expands to a braced block, so call sites compile with or
 * without a trailing semicolon.
 */
#define FLETCHER_4_AVX512_RESTORE_CTX(ctx)				\
{									\
	__asm("vmovdqu64 %0, %%zmm0" :: "m" ((ctx)->avx512[0]));	\
	__asm("vmovdqu64 %0, %%zmm1" :: "m" ((ctx)->avx512[1]));	\
	__asm("vmovdqu64 %0, %%zmm2" :: "m" ((ctx)->avx512[2]));	\
	__asm("vmovdqu64 %0, %%zmm3" :: "m" ((ctx)->avx512[3]));	\
}
82 | ||
/*
 * Store registers zmm0-zmm3 back into (ctx)->avx512[0..3] using vmovdqu64
 * (unaligned vector move).  This is the inverse of
 * FLETCHER_4_AVX512_RESTORE_CTX.
 *
 * The macro expands to a braced block, so call sites compile with or
 * without a trailing semicolon.
 */
#define FLETCHER_4_AVX512_SAVE_CTX(ctx)					\
{									\
	__asm("vmovdqu64 %%zmm0, %0" : "=m" ((ctx)->avx512[0]));	\
	__asm("vmovdqu64 %%zmm1, %0" : "=m" ((ctx)->avx512[1]));	\
	__asm("vmovdqu64 %%zmm2, %0" : "=m" ((ctx)->avx512[2]));	\
	__asm("vmovdqu64 %%zmm3, %0" : "=m" ((ctx)->avx512[3]));	\
}
90 | ||
/*
 * Accumulate the fletcher-4 running sums over buf without byte swapping.
 *
 * ctx  - checksum state; its four accumulators are loaded into zmm0-zmm3
 *        on entry and written back on exit.
 * buf  - input data, read as 32-bit words, eight words (32 bytes) per
 *        loop iteration.
 * size - length of buf in bytes.
 *
 * NOTE(review): the do/while loop processes one 32-byte block before the
 * bounds test, so this presumably relies on callers passing a non-zero
 * size that is a multiple of 32 bytes -- confirm against the callers.
 */
static void
fletcher_4_avx512f_native(fletcher_4_ctx_t *ctx, const void *buf, uint64_t size)
{
	const uint32_t *ip = buf;
	const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);

	FLETCHER_4_AVX512_RESTORE_CTX(ctx);

	do {
		/* zmm4 = eight 32-bit words, zero-extended to 64-bit lanes. */
		__asm("vpmovzxdq %0, %%zmm4"::"m" (*ip));
		/*
		 * Cascaded sums; order matters because each add consumes the
		 * result of the previous one:
		 * zmm0 += zmm4, zmm1 += zmm0, zmm2 += zmm1, zmm3 += zmm2.
		 */
		__asm("vpaddq %zmm4, %zmm0, %zmm0");
		__asm("vpaddq %zmm0, %zmm1, %zmm1");
		__asm("vpaddq %zmm1, %zmm2, %zmm2");
		__asm("vpaddq %zmm2, %zmm3, %zmm3");
	} while ((ip += 8) < ipend);

	FLETCHER_4_AVX512_SAVE_CTX(ctx);
}
STACK_FRAME_NON_STANDARD(fletcher_4_avx512f_native);
70b258fc GN |
110 | |
/*
 * Accumulate the fletcher-4 running sums over buf, reversing the byte
 * order of every 32-bit input word first.  Uses only shift/and/or vector
 * operations for the swap.
 *
 * ctx  - checksum state; its four accumulators are loaded into zmm0-zmm3
 *        on entry and written back on exit.
 * buf  - input data, read as 32-bit words, eight words (32 bytes) per
 *        loop iteration.
 * size - length of buf in bytes.
 *
 * NOTE(review): the do/while loop processes one 32-byte block before the
 * bounds test, so this presumably relies on callers passing a non-zero
 * size that is a multiple of 32 bytes -- confirm against the callers.
 */
static void
fletcher_4_avx512f_byteswap(fletcher_4_ctx_t *ctx, const void *buf,
    uint64_t size)
{
	static const uint64_t byteswap_mask = 0xFFULL;
	const uint32_t *ip = buf;
	const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);

	FLETCHER_4_AVX512_RESTORE_CTX(ctx);

	/*
	 * Build one single-byte mask per byte position in every 64-bit lane:
	 * zmm8 = 0xFF, zmm9 = 0xFF00, zmm10 = 0xFF0000, zmm11 = 0xFF000000.
	 */
	__asm("vpbroadcastq %0, %%zmm8" :: "r" (byteswap_mask));
	__asm("vpsllq $8, %zmm8, %zmm9");
	__asm("vpsllq $16, %zmm8, %zmm10");
	__asm("vpsllq $24, %zmm8, %zmm11");

	do {
		/* zmm5 = eight 32-bit words, zero-extended to 64-bit lanes. */
		__asm("vpmovzxdq %0, %%zmm5"::"m" (*ip));

		/*
		 * Byte-reverse the low 32 bits of each lane into zmm4:
		 *   ((x >> 24) & 0xFF) | ((x >> 8) & 0xFF00) |
		 *   ((x << 8) & 0xFF0000) | ((x << 24) & 0xFF000000)
		 * zmm6/zmm7 are scratch; zmm5 is consumed by the last shift.
		 */
		__asm("vpsrlq $24, %zmm5, %zmm6");
		__asm("vpandd %zmm8, %zmm6, %zmm6");
		__asm("vpsrlq $8, %zmm5, %zmm7");
		__asm("vpandd %zmm9, %zmm7, %zmm7");
		__asm("vpord %zmm6, %zmm7, %zmm4");
		__asm("vpsllq $8, %zmm5, %zmm6");
		__asm("vpandd %zmm10, %zmm6, %zmm6");
		__asm("vpord %zmm6, %zmm4, %zmm4");
		__asm("vpsllq $24, %zmm5, %zmm5");
		__asm("vpandd %zmm11, %zmm5, %zmm5");
		__asm("vpord %zmm5, %zmm4, %zmm4");

		/*
		 * Cascaded sums; order matters because each add consumes the
		 * result of the previous one:
		 * zmm0 += zmm4, zmm1 += zmm0, zmm2 += zmm1, zmm3 += zmm2.
		 */
		__asm("vpaddq %zmm4, %zmm0, %zmm0");
		__asm("vpaddq %zmm0, %zmm1, %zmm1");
		__asm("vpaddq %zmm1, %zmm2, %zmm2");
		__asm("vpaddq %zmm2, %zmm3, %zmm3");
	} while ((ip += 8) < ipend);

	/* The macro expands to a braced block; no semicolon is required. */
	FLETCHER_4_AVX512_SAVE_CTX(ctx)
}
STACK_FRAME_NON_STANDARD(fletcher_4_avx512f_byteswap);
70b258fc GN |
150 | |
151 | static boolean_t | |
152 | fletcher_4_avx512f_valid(void) | |
153 | { | |
e5db3134 | 154 | return (kfpu_allowed() && zfs_avx512f_available()); |
70b258fc GN |
155 | } |
156 | ||
157 | const fletcher_4_ops_t fletcher_4_avx512f_ops = { | |
fc897b24 GN |
158 | .init_native = fletcher_4_avx512f_init, |
159 | .fini_native = fletcher_4_avx512f_fini, | |
160 | .compute_native = fletcher_4_avx512f_native, | |
161 | .init_byteswap = fletcher_4_avx512f_init, | |
162 | .fini_byteswap = fletcher_4_avx512f_fini, | |
70b258fc GN |
163 | .compute_byteswap = fletcher_4_avx512f_byteswap, |
164 | .valid = fletcher_4_avx512f_valid, | |
78289b84 | 165 | .uses_fpu = B_TRUE, |
70b258fc GN |
166 | .name = "avx512f" |
167 | }; | |
168 | ||
0b2a6423 RD |
169 | #if defined(HAVE_AVX512BW) |
/*
 * Accumulate the fletcher-4 running sums over buf, reversing the byte
 * order of every 32-bit input word with a single vpshufb byte shuffle
 * per iteration.
 *
 * ctx  - checksum state; its four accumulators are loaded into zmm0-zmm3
 *        on entry and written back on exit.
 * buf  - input data, read as 32-bit words, eight words (32 bytes) per
 *        loop iteration.
 * size - length of buf in bytes.
 *
 * NOTE(review): the do/while loop processes one 32-byte block before the
 * bounds test, so this presumably relies on callers passing a non-zero
 * size that is a multiple of 32 bytes -- confirm against the callers.
 */
static void
fletcher_4_avx512bw_byteswap(fletcher_4_ctx_t *ctx, const void *buf,
    uint64_t size)
{
	/*
	 * vpshufb control vector, repeated for each 16-byte group.  For the
	 * two 64-bit elements of a group, the low four result bytes select
	 * source bytes 3,2,1,0 and 11,10,9,8 respectively (i.e. the low
	 * 32 bits of each element, byte-reversed).  The 0xFF selectors have
	 * the high bit set, which makes vpshufb write zero to the upper four
	 * bytes of each element.
	 */
	static const zfs_fletcher_avx512_t mask = {
		.v = { 0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
		0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
		0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B,
		0xFFFFFFFF00010203, 0xFFFFFFFF08090A0B }
	};
	const uint32_t *ip = buf;
	const uint32_t *ipend = (uint32_t *)((uint8_t *)ip + size);

	FLETCHER_4_AVX512_RESTORE_CTX(ctx);

	/* Keep the shuffle control in zmm5 for the whole loop. */
	__asm("vmovdqu64 %0, %%zmm5" :: "m" (mask));

	do {
		/* zmm4 = eight 32-bit words, zero-extended to 64-bit lanes. */
		__asm("vpmovzxdq %0, %%zmm4"::"m" (*ip));

		/* Byte-reverse the low 32 bits of every 64-bit lane. */
		__asm("vpshufb %zmm5, %zmm4, %zmm4");

		/*
		 * Cascaded sums; order matters because each add consumes the
		 * result of the previous one:
		 * zmm0 += zmm4, zmm1 += zmm0, zmm2 += zmm1, zmm3 += zmm2.
		 */
		__asm("vpaddq %zmm4, %zmm0, %zmm0");
		__asm("vpaddq %zmm0, %zmm1, %zmm1");
		__asm("vpaddq %zmm1, %zmm2, %zmm2");
		__asm("vpaddq %zmm2, %zmm3, %zmm3");
	} while ((ip += 8) < ipend);

	/* The macro expands to a braced block; no semicolon is required. */
	FLETCHER_4_AVX512_SAVE_CTX(ctx)
}
STACK_FRAME_NON_STANDARD(fletcher_4_avx512bw_byteswap);
201 | ||
03754655 RD |
202 | static boolean_t |
203 | fletcher_4_avx512bw_valid(void) | |
204 | { | |
205 | return (fletcher_4_avx512f_valid() && zfs_avx512bw_available()); | |
206 | } | |
207 | ||
0b2a6423 RD |
208 | const fletcher_4_ops_t fletcher_4_avx512bw_ops = { |
209 | .init_native = fletcher_4_avx512f_init, | |
210 | .fini_native = fletcher_4_avx512f_fini, | |
211 | .compute_native = fletcher_4_avx512f_native, | |
212 | .init_byteswap = fletcher_4_avx512f_init, | |
213 | .fini_byteswap = fletcher_4_avx512f_fini, | |
214 | .compute_byteswap = fletcher_4_avx512bw_byteswap, | |
03754655 | 215 | .valid = fletcher_4_avx512bw_valid, |
78289b84 | 216 | .uses_fpu = B_TRUE, |
0b2a6423 RD |
217 | .name = "avx512bw" |
218 | }; | |
219 | #endif | |
220 | ||
70b258fc | 221 | #endif /* defined(__x86_64) && defined(HAVE_AVX512F) */ |