]> git.proxmox.com Git - mirror_zfs.git/blame - module/zfs/vdev_raidz_math_aarch64_neon_common.h
Fletcher4: Incremental updates and ctx calculation
[mirror_zfs.git] / module / zfs / vdev_raidz_math_aarch64_neon_common.h
CommitLineData
62a65a65
RD
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21/*
22 * Copyright (C) 2016 Romain Dolbeau. All rights reserved.
23 */
24
#include <sys/types.h>
#include <linux/simd_aarch64.h>

/*
 * Every inline asm statement in this file must be volatile so the compiler
 * does not reorder, merge or elide the NEON instruction sequences; this
 * shorthand bakes that in.  (Note it deliberately shadows the compiler's
 * reserved `__asm` keyword spelling.)
 */
#define __asm __asm__ __volatile__
29
/*
 * REG_CNT(r...) expands to the number of register arguments (1..8):
 * the arguments shift the trailing 8,7,...,1 sequence left so that
 * _REG_CNT's ninth parameter N lands on the actual count.
 */
#define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N
#define REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1)
32
/*
 * VRn_ picks the (n+1)-th argument and stringizes it into the symbolic
 * asm operand name "%[wREG]"; the leading underscore parameters skip
 * the first n arguments.
 */
#define VR0_(REG, ...) "%[w"#REG"]"
#define VR1_(_1, REG, ...) "%[w"#REG"]"
#define VR2_(_1, _2, REG, ...) "%[w"#REG"]"
#define VR3_(_1, _2, _3, REG, ...) "%[w"#REG"]"
#define VR4_(_1, _2, _3, _4, REG, ...) "%[w"#REG"]"
#define VR5_(_1, _2, _3, _4, _5, REG, ...) "%[w"#REG"]"
#define VR6_(_1, _2, _3, _4, _5, _6, REG, ...) "%[w"#REG"]"
#define VR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "%[w"#REG"]"
41
/*
 * Here we need registers not used otherwise.
 * They will be used in unused ASM for the case
 * with more registers than required... but GCC
 * will still need to make sure the constraints
 * are correct, and duplicate constraints are illegal
 * ... and we use the "register" number as a name
 */

/*
 * VRn(r...): asm operand name of the n-th register in r.  When r has
 * fewer than n+1 entries the appended dummies 30-36 are selected instead,
 * keeping every expansion a valid, non-duplicate constraint.
 */
#define VR0(r...) VR0_(r)
#define VR1(r...) VR1_(r)
#define VR2(r...) VR2_(r, 36)
#define VR3(r...) VR3_(r, 36, 35)
#define VR4(r...) VR4_(r, 36, 35, 34, 33)
#define VR5(r...) VR5_(r, 36, 35, 34, 33, 32)
#define VR6(r...) VR6_(r, 36, 35, 34, 33, 32, 31)
#define VR7(r...) VR7_(r, 36, 35, 34, 33, 32, 31, 30)

/* Operand name for one explicitly numbered register. */
#define VR(X) "%[w"#X"]"
61
/*
 * RVRn: read (input) constraint for the n-th register in r, expanding to
 * [wREG] "w" (wREG) — the "w" constraint places the variable in a NEON
 * register.  Same dummy-padding scheme as VRn.
 */
#define RVR0_(REG, ...) [w##REG] "w" (w##REG)
#define RVR1_(_1, REG, ...) [w##REG] "w" (w##REG)
#define RVR2_(_1, _2, REG, ...) [w##REG] "w" (w##REG)
#define RVR3_(_1, _2, _3, REG, ...) [w##REG] "w" (w##REG)
#define RVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "w" (w##REG)
#define RVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "w" (w##REG)
#define RVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "w" (w##REG)
#define RVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "w" (w##REG)

#define RVR0(r...) RVR0_(r)
#define RVR1(r...) RVR1_(r)
#define RVR2(r...) RVR2_(r, 36)
#define RVR3(r...) RVR3_(r, 36, 35)
#define RVR4(r...) RVR4_(r, 36, 35, 34, 33)
#define RVR5(r...) RVR5_(r, 36, 35, 34, 33, 32)
#define RVR6(r...) RVR6_(r, 36, 35, 34, 33, 32, 31)
#define RVR7(r...) RVR7_(r, 36, 35, 34, 33, 32, 31, 30)

/* Input constraint for one explicitly numbered register. */
#define RVR(X) [w##X] "w" (w##X)
81
/*
 * WVRn: write-only (output) constraint for the n-th register in r,
 * expanding to [wREG] "=w" (wREG).
 */
#define WVR0_(REG, ...) [w##REG] "=w" (w##REG)
#define WVR1_(_1, REG, ...) [w##REG] "=w" (w##REG)
#define WVR2_(_1, _2, REG, ...) [w##REG] "=w" (w##REG)
#define WVR3_(_1, _2, _3, REG, ...) [w##REG] "=w" (w##REG)
#define WVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "=w" (w##REG)
#define WVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "=w" (w##REG)
#define WVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "=w" (w##REG)
#define WVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "=w" (w##REG)

#define WVR0(r...) WVR0_(r)
#define WVR1(r...) WVR1_(r)
#define WVR2(r...) WVR2_(r, 36)
#define WVR3(r...) WVR3_(r, 36, 35)
#define WVR4(r...) WVR4_(r, 36, 35, 34, 33)
#define WVR5(r...) WVR5_(r, 36, 35, 34, 33, 32)
#define WVR6(r...) WVR6_(r, 36, 35, 34, 33, 32, 31)
#define WVR7(r...) WVR7_(r, 36, 35, 34, 33, 32, 31, 30)

/* Output constraint for one explicitly numbered register. */
#define WVR(X) [w##X] "=w" (w##X)
101
/*
 * UVRn: update (read-write) constraint for the n-th register in r,
 * expanding to [wREG] "+&w" (wREG) — "+" for read-write, "&" for
 * early-clobber so it cannot share a register with any input operand.
 */
#define UVR0_(REG, ...) [w##REG] "+&w" (w##REG)
#define UVR1_(_1, REG, ...) [w##REG] "+&w" (w##REG)
#define UVR2_(_1, _2, REG, ...) [w##REG] "+&w" (w##REG)
#define UVR3_(_1, _2, _3, REG, ...) [w##REG] "+&w" (w##REG)
#define UVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "+&w" (w##REG)
#define UVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "+&w" (w##REG)
#define UVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "+&w" (w##REG)
#define UVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "+&w" (w##REG)

#define UVR0(r...) UVR0_(r)
#define UVR1(r...) UVR1_(r)
#define UVR2(r...) UVR2_(r, 36)
#define UVR3(r...) UVR3_(r, 36, 35)
#define UVR4(r...) UVR4_(r, 36, 35, 34, 33)
#define UVR5(r...) UVR5_(r, 36, 35, 34, 33, 32)
#define UVR6(r...) UVR6_(r, 36, 35, 34, 33, 32, 31)
#define UVR7(r...) UVR7_(r, 36, 35, 34, 33, 32, 31, 30)

/* Read-write constraint for one explicitly numbered register. */
#define UVR(X) [w##X] "+&w" (w##X)
121
/* Select the first (R_01) or second (R_23) pair of a 4-register list. */
#define R_01(REG1, REG2, ...) REG1, REG2
#define _R_23(_0, _1, REG2, REG3, ...) REG2, REG3
#define R_23(REG...) _R_23(REG, 1, 2, 3)

/* Reached only when a macro is invoked with an unsupported register count. */
#define ASM_BUG() ASSERT(0)
127
128#define OFFSET(ptr, val) (((unsigned char *)ptr)+val)
129
/* GF(2^8) multiply lookup tables; defined in the generic raidz math code. */
extern const uint8_t gf_clmul_mod_lt[4*256][16];

/* One 128-bit NEON lane — the element size all macros below operate on. */
#define ELEM_SIZE 16

/* A single 16-byte, 16-byte-aligned vector element. */
typedef struct v {
	uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE)));
} v_t;
137
/*
 * Streaming-store prefetch (prfm pstl1strm): hint that ptr+offset will be
 * written once and should not pollute the cache.
 */
#define PREFETCHNTA(ptr, offset) \
{ \
	__asm( \
	    "prfm pstl1strm, %[MEM]\n" \
	    : : [MEM] "Q" (*(ptr + offset))); \
}

/* Load prefetch (prfm pldl1keep): pull ptr+offset into L1 and keep it. */
#define PREFETCH(ptr, offset) \
{ \
	__asm( \
	    "prfm pldl1keep, %[MEM]\n" \
	    : : [MEM] "Q" (*(ptr + offset))); \
}
151
/*
 * XOR_ACC(src, r...): XOR the 16-byte vectors at src[0 .. N*16) into the
 * N accumulator registers r (N = 2, 4 or 8).  Memory operands are staged
 * through v18-v21 (declared clobbered) so the accumulators stay resident.
 */
#define XOR_ACC(src, r...) \
{ \
	switch (REG_CNT(r)) { \
	case 8: \
		__asm( \
		    "ld1 { v21.4s },%[SRC0]\n" \
		    "ld1 { v20.4s },%[SRC1]\n" \
		    "ld1 { v19.4s },%[SRC2]\n" \
		    "ld1 { v18.4s },%[SRC3]\n" \
		    "eor " VR0(r) ".16b," VR0(r) ".16b,v21.16b\n" \
		    "eor " VR1(r) ".16b," VR1(r) ".16b,v20.16b\n" \
		    "eor " VR2(r) ".16b," VR2(r) ".16b,v19.16b\n" \
		    "eor " VR3(r) ".16b," VR3(r) ".16b,v18.16b\n" \
		    "ld1 { v21.4s },%[SRC4]\n" \
		    "ld1 { v20.4s },%[SRC5]\n" \
		    "ld1 { v19.4s },%[SRC6]\n" \
		    "ld1 { v18.4s },%[SRC7]\n" \
		    "eor " VR4(r) ".16b," VR4(r) ".16b,v21.16b\n" \
		    "eor " VR5(r) ".16b," VR5(r) ".16b,v20.16b\n" \
		    "eor " VR6(r) ".16b," VR6(r) ".16b,v19.16b\n" \
		    "eor " VR7(r) ".16b," VR7(r) ".16b,v18.16b\n" \
		    : UVR0(r), UVR1(r), UVR2(r), UVR3(r), \
		    UVR4(r), UVR5(r), UVR6(r), UVR7(r) \
		    : [SRC0] "Q" (*(OFFSET(src, 0))), \
		    [SRC1] "Q" (*(OFFSET(src, 16))), \
		    [SRC2] "Q" (*(OFFSET(src, 32))), \
		    [SRC3] "Q" (*(OFFSET(src, 48))), \
		    [SRC4] "Q" (*(OFFSET(src, 64))), \
		    [SRC5] "Q" (*(OFFSET(src, 80))), \
		    [SRC6] "Q" (*(OFFSET(src, 96))), \
		    [SRC7] "Q" (*(OFFSET(src, 112))) \
		    : "v18", "v19", "v20", "v21"); \
		break; \
	case 4: \
		__asm( \
		    "ld1 { v21.4s },%[SRC0]\n" \
		    "ld1 { v20.4s },%[SRC1]\n" \
		    "ld1 { v19.4s },%[SRC2]\n" \
		    "ld1 { v18.4s },%[SRC3]\n" \
		    "eor " VR0(r) ".16b," VR0(r) ".16b,v21.16b\n" \
		    "eor " VR1(r) ".16b," VR1(r) ".16b,v20.16b\n" \
		    "eor " VR2(r) ".16b," VR2(r) ".16b,v19.16b\n" \
		    "eor " VR3(r) ".16b," VR3(r) ".16b,v18.16b\n" \
		    : UVR0(r), UVR1(r), UVR2(r), UVR3(r) \
		    : [SRC0] "Q" (*(OFFSET(src, 0))), \
		    [SRC1] "Q" (*(OFFSET(src, 16))), \
		    [SRC2] "Q" (*(OFFSET(src, 32))), \
		    [SRC3] "Q" (*(OFFSET(src, 48))) \
		    : "v18", "v19", "v20", "v21"); \
		break; \
	case 2: \
		__asm( \
		    "ld1 { v21.4s },%[SRC0]\n" \
		    "ld1 { v20.4s },%[SRC1]\n" \
		    "eor " VR0(r) ".16b," VR0(r) ".16b,v21.16b\n" \
		    "eor " VR1(r) ".16b," VR1(r) ".16b,v20.16b\n" \
		    : UVR0(r), UVR1(r) \
		    : [SRC0] "Q" (*(OFFSET(src, 0))), \
		    [SRC1] "Q" (*(OFFSET(src, 16))) \
		    : "v20", "v21"); \
		break; \
	default: \
		ASM_BUG(); \
	} \
}
217
/*
 * XOR(r...): register-to-register XOR.  r is split in half — the first
 * half are sources, the second half are updated in place
 * (e.g. with 8 registers: r4 ^= r0, r5 ^= r1, r6 ^= r2, r7 ^= r3).
 */
#define XOR(r...) \
{ \
	switch (REG_CNT(r)) { \
	case 8: \
		__asm( \
		    "eor " VR4(r) ".16b," VR4(r) ".16b," VR0(r) ".16b\n" \
		    "eor " VR5(r) ".16b," VR5(r) ".16b," VR1(r) ".16b\n" \
		    "eor " VR6(r) ".16b," VR6(r) ".16b," VR2(r) ".16b\n" \
		    "eor " VR7(r) ".16b," VR7(r) ".16b," VR3(r) ".16b\n" \
		    : UVR4(r), UVR5(r), UVR6(r), UVR7(r) \
		    : RVR0(r), RVR1(r), RVR2(r), RVR3(r)); \
		break; \
	case 4: \
		__asm( \
		    "eor " VR2(r) ".16b," VR2(r) ".16b," VR0(r) ".16b\n" \
		    "eor " VR3(r) ".16b," VR3(r) ".16b," VR1(r) ".16b\n" \
		    : UVR2(r), UVR3(r) \
		    : RVR0(r), RVR1(r)); \
		break; \
	default: \
		ASM_BUG(); \
	} \
}
241
/* ZERO(r...): clear 2 or 4 registers by XORing each with itself. */
#define ZERO(r...) \
{ \
	switch (REG_CNT(r)) { \
	case 4: \
		__asm( \
		    "eor " VR0(r) ".16b," VR0(r) ".16b," VR0(r) ".16b\n" \
		    "eor " VR1(r) ".16b," VR1(r) ".16b," VR1(r) ".16b\n" \
		    "eor " VR2(r) ".16b," VR2(r) ".16b," VR2(r) ".16b\n" \
		    "eor " VR3(r) ".16b," VR3(r) ".16b," VR3(r) ".16b\n" \
		    : WVR0(r), WVR1(r), WVR2(r), WVR3(r)); \
		break; \
	case 2: \
		__asm( \
		    "eor " VR0(r) ".16b," VR0(r) ".16b," VR0(r) ".16b\n" \
		    "eor " VR1(r) ".16b," VR1(r) ".16b," VR1(r) ".16b\n" \
		    : WVR0(r), WVR1(r)); \
		break; \
	default: \
		ASM_BUG(); \
	} \
}
263
/*
 * COPY(r...): copy the first half of the register list into the second
 * half (e.g. with 8 registers: r4 = r0, r5 = r1, r6 = r2, r7 = r3).
 */
#define COPY(r...) \
{ \
	switch (REG_CNT(r)) { \
	case 8: \
		__asm( \
		    "mov " VR4(r) ".16b," VR0(r) ".16b\n" \
		    "mov " VR5(r) ".16b," VR1(r) ".16b\n" \
		    "mov " VR6(r) ".16b," VR2(r) ".16b\n" \
		    "mov " VR7(r) ".16b," VR3(r) ".16b\n" \
		    : WVR4(r), WVR5(r), WVR6(r), WVR7(r) \
		    : RVR0(r), RVR1(r), RVR2(r), RVR3(r)); \
		break; \
	case 4: \
		__asm( \
		    "mov " VR2(r) ".16b," VR0(r) ".16b\n" \
		    "mov " VR3(r) ".16b," VR1(r) ".16b\n" \
		    : WVR2(r), WVR3(r) \
		    : RVR0(r), RVR1(r)); \
		break; \
	default: \
		ASM_BUG(); \
	} \
}
287
/*
 * LOAD(src, r...): load N consecutive 16-byte vectors from src into the
 * N registers r (N = 2, 4 or 8).
 */
#define LOAD(src, r...) \
{ \
	switch (REG_CNT(r)) { \
	case 8: \
		__asm( \
		    "ld1 { " VR0(r) ".4s },%[SRC0]\n" \
		    "ld1 { " VR1(r) ".4s },%[SRC1]\n" \
		    "ld1 { " VR2(r) ".4s },%[SRC2]\n" \
		    "ld1 { " VR3(r) ".4s },%[SRC3]\n" \
		    "ld1 { " VR4(r) ".4s },%[SRC4]\n" \
		    "ld1 { " VR5(r) ".4s },%[SRC5]\n" \
		    "ld1 { " VR6(r) ".4s },%[SRC6]\n" \
		    "ld1 { " VR7(r) ".4s },%[SRC7]\n" \
		    : WVR0(r), WVR1(r), WVR2(r), WVR3(r), \
		    WVR4(r), WVR5(r), WVR6(r), WVR7(r) \
		    : [SRC0] "Q" (*(OFFSET(src, 0))), \
		    [SRC1] "Q" (*(OFFSET(src, 16))), \
		    [SRC2] "Q" (*(OFFSET(src, 32))), \
		    [SRC3] "Q" (*(OFFSET(src, 48))), \
		    [SRC4] "Q" (*(OFFSET(src, 64))), \
		    [SRC5] "Q" (*(OFFSET(src, 80))), \
		    [SRC6] "Q" (*(OFFSET(src, 96))), \
		    [SRC7] "Q" (*(OFFSET(src, 112)))); \
		break; \
	case 4: \
		__asm( \
		    "ld1 { " VR0(r) ".4s },%[SRC0]\n" \
		    "ld1 { " VR1(r) ".4s },%[SRC1]\n" \
		    "ld1 { " VR2(r) ".4s },%[SRC2]\n" \
		    "ld1 { " VR3(r) ".4s },%[SRC3]\n" \
		    : WVR0(r), WVR1(r), WVR2(r), WVR3(r) \
		    : [SRC0] "Q" (*(OFFSET(src, 0))), \
		    [SRC1] "Q" (*(OFFSET(src, 16))), \
		    [SRC2] "Q" (*(OFFSET(src, 32))), \
		    [SRC3] "Q" (*(OFFSET(src, 48)))); \
		break; \
	case 2: \
		__asm( \
		    "ld1 { " VR0(r) ".4s },%[SRC0]\n" \
		    "ld1 { " VR1(r) ".4s },%[SRC1]\n" \
		    : WVR0(r), WVR1(r) \
		    : [SRC0] "Q" (*(OFFSET(src, 0))), \
		    [SRC1] "Q" (*(OFFSET(src, 16)))); \
		break; \
	default: \
		ASM_BUG(); \
	} \
}
336
/*
 * STORE(dst, r...): store the N registers r to N consecutive 16-byte
 * slots at dst (N = 2, 4 or 8).
 */
#define STORE(dst, r...) \
{ \
	switch (REG_CNT(r)) { \
	case 8: \
		__asm( \
		    "st1 { " VR0(r) ".4s },%[DST0]\n" \
		    "st1 { " VR1(r) ".4s },%[DST1]\n" \
		    "st1 { " VR2(r) ".4s },%[DST2]\n" \
		    "st1 { " VR3(r) ".4s },%[DST3]\n" \
		    "st1 { " VR4(r) ".4s },%[DST4]\n" \
		    "st1 { " VR5(r) ".4s },%[DST5]\n" \
		    "st1 { " VR6(r) ".4s },%[DST6]\n" \
		    "st1 { " VR7(r) ".4s },%[DST7]\n" \
		    : [DST0] "=Q" (*(OFFSET(dst, 0))), \
		    [DST1] "=Q" (*(OFFSET(dst, 16))), \
		    [DST2] "=Q" (*(OFFSET(dst, 32))), \
		    [DST3] "=Q" (*(OFFSET(dst, 48))), \
		    [DST4] "=Q" (*(OFFSET(dst, 64))), \
		    [DST5] "=Q" (*(OFFSET(dst, 80))), \
		    [DST6] "=Q" (*(OFFSET(dst, 96))), \
		    [DST7] "=Q" (*(OFFSET(dst, 112))) \
		    : RVR0(r), RVR1(r), RVR2(r), RVR3(r), \
		    RVR4(r), RVR5(r), RVR6(r), RVR7(r)); \
		break; \
	case 4: \
		__asm( \
		    "st1 { " VR0(r) ".4s },%[DST0]\n" \
		    "st1 { " VR1(r) ".4s },%[DST1]\n" \
		    "st1 { " VR2(r) ".4s },%[DST2]\n" \
		    "st1 { " VR3(r) ".4s },%[DST3]\n" \
		    : [DST0] "=Q" (*(OFFSET(dst, 0))), \
		    [DST1] "=Q" (*(OFFSET(dst, 16))), \
		    [DST2] "=Q" (*(OFFSET(dst, 32))), \
		    [DST3] "=Q" (*(OFFSET(dst, 48))) \
		    : RVR0(r), RVR1(r), RVR2(r), RVR3(r)); \
		break; \
	case 2: \
		__asm( \
		    "st1 { " VR0(r) ".4s },%[DST0]\n" \
		    "st1 { " VR1(r) ".4s },%[DST1]\n" \
		    : [DST0] "=Q" (*(OFFSET(dst, 0))), \
		    [DST1] "=Q" (*(OFFSET(dst, 16))) \
		    : RVR0(r), RVR1(r)); \
		break; \
	default: \
		ASM_BUG(); \
	} \
}
385
/*
 * Unfortunately cannot use the macro, because GCC
 * will try to use the macro name and not value
 * later on...
 * Kept as a reference to what a numbered variable is
 */
#define _00 "v17"
#define _1d "v16"
#define _temp0 "v19"
#define _temp1 "v18"

/*
 * Prepare the constants MUL2 relies on: v17 (w17) cleared to all-zero and
 * v16 (w16) filled with the GF(2^8) reduction byte 0x1d.
 */
#define MUL2_SETUP() \
{ \
	__asm( \
	    "eor " VR(17) ".16b," VR(17) ".16b," VR(17) ".16b\n" \
	    "movi " VR(16) ".16b,#0x1d\n" \
	    : WVR(16), WVR(17)); \
}
404
/*
 * MUL2(r...): multiply every byte of each register in r (2 or 4 of them)
 * by 2 in GF(2^8).  cmgt against the zero register (v17) flags bytes with
 * the high bit set (they compare "negative"); those lanes are masked with
 * 0x1d (v16) and XORed into the left-shifted value to apply the modular
 * reduction.  MUL2_SETUP() must have run first.  Clobbers v18-v21 (v18/v19
 * only in the 2-register case).
 */
#define MUL2(r...) \
{ \
	switch (REG_CNT(r)) { \
	case 4: \
		__asm( \
		    "cmgt v19.16b," VR(17) ".16b," VR0(r) ".16b\n" \
		    "cmgt v18.16b," VR(17) ".16b," VR1(r) ".16b\n" \
		    "cmgt v21.16b," VR(17) ".16b," VR2(r) ".16b\n" \
		    "cmgt v20.16b," VR(17) ".16b," VR3(r) ".16b\n" \
		    "and v19.16b,v19.16b," VR(16) ".16b\n" \
		    "and v18.16b,v18.16b," VR(16) ".16b\n" \
		    "and v21.16b,v21.16b," VR(16) ".16b\n" \
		    "and v20.16b,v20.16b," VR(16) ".16b\n" \
		    "shl " VR0(r) ".16b," VR0(r) ".16b,#1\n" \
		    "shl " VR1(r) ".16b," VR1(r) ".16b,#1\n" \
		    "shl " VR2(r) ".16b," VR2(r) ".16b,#1\n" \
		    "shl " VR3(r) ".16b," VR3(r) ".16b,#1\n" \
		    "eor " VR0(r) ".16b,v19.16b," VR0(r) ".16b\n" \
		    "eor " VR1(r) ".16b,v18.16b," VR1(r) ".16b\n" \
		    "eor " VR2(r) ".16b,v21.16b," VR2(r) ".16b\n" \
		    "eor " VR3(r) ".16b,v20.16b," VR3(r) ".16b\n" \
		    : UVR0(r), UVR1(r), UVR2(r), UVR3(r) \
		    : RVR(17), RVR(16) \
		    : "v18", "v19", "v20", "v21"); \
		break; \
	case 2: \
		__asm( \
		    "cmgt v19.16b," VR(17) ".16b," VR0(r) ".16b\n" \
		    "cmgt v18.16b," VR(17) ".16b," VR1(r) ".16b\n" \
		    "and v19.16b,v19.16b," VR(16) ".16b\n" \
		    "and v18.16b,v18.16b," VR(16) ".16b\n" \
		    "shl " VR0(r) ".16b," VR0(r) ".16b,#1\n" \
		    "shl " VR1(r) ".16b," VR1(r) ".16b,#1\n" \
		    "eor " VR0(r) ".16b,v19.16b," VR0(r) ".16b\n" \
		    "eor " VR1(r) ".16b,v18.16b," VR1(r) ".16b\n" \
		    : UVR0(r), UVR1(r) \
		    : RVR(17), RVR(16) \
		    : "v18", "v19"); \
		break; \
	default: \
		ASM_BUG(); \
	} \
}
448
/* MUL4(r...): multiply by 4 in GF(2^8), i.e. MUL2 applied twice. */
#define MUL4(r...) \
{ \
	MUL2(r); \
	MUL2(r); \
}
454
/*
 * Unfortunately cannot use the macro, because GCC
 * will try to use the macro name and not value
 * later on...
 * Kept as a reference to what a register is
 * (here we're using actual registers for the
 * clobbered ones)
 */
#define _0f "v15"
#define _a_save "v14"
#define _b_save "v13"
#define _lt_mod_a "v12"
#define _lt_clmul_a "v11"
#define _lt_mod_b "v10"
#define _lt_clmul_b "v15"
470
/*
 * _MULx2(c, r...): multiply two registers by the constant c in GF(2^8)
 * using nibble-wise tbl lookups into gf_clmul_mod_lt[4*c .. 4*c+3].
 * Each byte is split into low (masked with 0x0f) and high (shifted right 4)
 * nibbles; each nibble indexes a 16-entry table and the four partial
 * products are XORed together.  Clobbers v10-v15.
 */
#define _MULx2(c, r...) \
{ \
	switch (REG_CNT(r)) { \
	case 2: \
		__asm( \
		    /* lts for upper part */ \
		    "movi v15.16b,#0x0f\n" \
		    "ld1 { v10.4s },%[lt0]\n" \
		    "ld1 { v11.4s },%[lt1]\n" \
		    /* upper part */ \
		    "and v14.16b," VR0(r) ".16b,v15.16b\n" \
		    "and v13.16b," VR1(r) ".16b,v15.16b\n" \
		    "sshr " VR0(r) ".8h," VR0(r) ".8h,#4\n" \
		    "sshr " VR1(r) ".8h," VR1(r) ".8h,#4\n" \
		    "and " VR0(r) ".16b," VR0(r) ".16b,v15.16b\n" \
		    "and " VR1(r) ".16b," VR1(r) ".16b,v15.16b\n" \
		    \
		    "tbl v12.16b,{v10.16b}," VR0(r) ".16b\n" \
		    "tbl v10.16b,{v10.16b}," VR1(r) ".16b\n" \
		    "tbl v15.16b,{v11.16b}," VR0(r) ".16b\n" \
		    "tbl v11.16b,{v11.16b}," VR1(r) ".16b\n" \
		    \
		    "eor " VR0(r) ".16b,v15.16b,v12.16b\n" \
		    "eor " VR1(r) ".16b,v11.16b,v10.16b\n" \
		    /* lts for lower part */ \
		    "ld1 { v10.4s },%[lt2]\n" \
		    "ld1 { v15.4s },%[lt3]\n" \
		    /* lower part */ \
		    "tbl v12.16b,{v10.16b},v14.16b\n" \
		    "tbl v10.16b,{v10.16b},v13.16b\n" \
		    "tbl v11.16b,{v15.16b},v14.16b\n" \
		    "tbl v15.16b,{v15.16b},v13.16b\n" \
		    \
		    "eor " VR0(r) ".16b," VR0(r) ".16b,v12.16b\n" \
		    "eor " VR1(r) ".16b," VR1(r) ".16b,v10.16b\n" \
		    "eor " VR0(r) ".16b," VR0(r) ".16b,v11.16b\n" \
		    "eor " VR1(r) ".16b," VR1(r) ".16b,v15.16b\n" \
		    : UVR0(r), UVR1(r) \
		    : [lt0] "Q" ((gf_clmul_mod_lt[4*(c)+0][0])), \
		    [lt1] "Q" ((gf_clmul_mod_lt[4*(c)+1][0])), \
		    [lt2] "Q" ((gf_clmul_mod_lt[4*(c)+2][0])), \
		    [lt3] "Q" ((gf_clmul_mod_lt[4*(c)+3][0])) \
		    : "v10", "v11", "v12", "v13", "v14", "v15"); \
		break; \
	default: \
		ASM_BUG(); \
	} \
}
519
/*
 * MUL(c, r...): multiply 2 or 4 registers by the constant c in GF(2^8),
 * dispatching to _MULx2 one register pair at a time.
 */
#define MUL(c, r...) \
{ \
	switch (REG_CNT(r)) { \
	case 4: \
		_MULx2(c, R_23(r)); \
		_MULx2(c, R_01(r)); \
		break; \
	case 2: \
		_MULx2(c, R_01(r)); \
		break; \
	default: \
		ASM_BUG(); \
	} \
}
534
/* Bracket every raidz parity routine: acquire/release FPU/NEON context. */
#define raidz_math_begin() kfpu_begin()
#define raidz_math_end() kfpu_end()
537
/* Overkill... */
#if defined(_KERNEL)
/*
 * Kernel build: declare the w0..w38 vector variables as GCC register asm
 * variables pinned to specific NEON registers, so the symbolic operand
 * names used above map onto known hardware registers.  Note that w32-w38
 * are all aliased to v31: they only exist to satisfy the dummy-padded
 * constraints of the VRn/RVRn/WVRn/UVRn macros and are never live.
 */
#define GEN_X_DEFINE_0_3() \
register unsigned char w0 asm("v0") __attribute__((vector_size(16))); \
register unsigned char w1 asm("v1") __attribute__((vector_size(16))); \
register unsigned char w2 asm("v2") __attribute__((vector_size(16))); \
register unsigned char w3 asm("v3") __attribute__((vector_size(16)));
#define GEN_X_DEFINE_4_5() \
register unsigned char w4 asm("v4") __attribute__((vector_size(16))); \
register unsigned char w5 asm("v5") __attribute__((vector_size(16)));
#define GEN_X_DEFINE_6_7() \
register unsigned char w6 asm("v6") __attribute__((vector_size(16))); \
register unsigned char w7 asm("v7") __attribute__((vector_size(16)));
#define GEN_X_DEFINE_8_9() \
register unsigned char w8 asm("v8") __attribute__((vector_size(16))); \
register unsigned char w9 asm("v9") __attribute__((vector_size(16)));
#define GEN_X_DEFINE_10_11() \
register unsigned char w10 asm("v10") __attribute__((vector_size(16))); \
register unsigned char w11 asm("v11") __attribute__((vector_size(16)));
#define GEN_X_DEFINE_12_15() \
register unsigned char w12 asm("v12") __attribute__((vector_size(16))); \
register unsigned char w13 asm("v13") __attribute__((vector_size(16))); \
register unsigned char w14 asm("v14") __attribute__((vector_size(16))); \
register unsigned char w15 asm("v15") __attribute__((vector_size(16)));
#define GEN_X_DEFINE_16() \
register unsigned char w16 asm("v16") __attribute__((vector_size(16)));
#define GEN_X_DEFINE_17() \
register unsigned char w17 asm("v17") __attribute__((vector_size(16)));
#define GEN_X_DEFINE_18_21() \
register unsigned char w18 asm("v18") __attribute__((vector_size(16))); \
register unsigned char w19 asm("v19") __attribute__((vector_size(16))); \
register unsigned char w20 asm("v20") __attribute__((vector_size(16))); \
register unsigned char w21 asm("v21") __attribute__((vector_size(16)));
#define GEN_X_DEFINE_22_23() \
register unsigned char w22 asm("v22") __attribute__((vector_size(16))); \
register unsigned char w23 asm("v23") __attribute__((vector_size(16)));
#define GEN_X_DEFINE_24_27() \
register unsigned char w24 asm("v24") __attribute__((vector_size(16))); \
register unsigned char w25 asm("v25") __attribute__((vector_size(16))); \
register unsigned char w26 asm("v26") __attribute__((vector_size(16))); \
register unsigned char w27 asm("v27") __attribute__((vector_size(16)));
#define GEN_X_DEFINE_28_30() \
register unsigned char w28 asm("v28") __attribute__((vector_size(16))); \
register unsigned char w29 asm("v29") __attribute__((vector_size(16))); \
register unsigned char w30 asm("v30") __attribute__((vector_size(16)));
#define GEN_X_DEFINE_31() \
register unsigned char w31 asm("v31") __attribute__((vector_size(16)));
/* w32-w38 below deliberately share v31: dummy constraint fillers only. */
#define GEN_X_DEFINE_32() \
register unsigned char w32 asm("v31") __attribute__((vector_size(16)));
#define GEN_X_DEFINE_33_36() \
register unsigned char w33 asm("v31") __attribute__((vector_size(16))); \
register unsigned char w34 asm("v31") __attribute__((vector_size(16))); \
register unsigned char w35 asm("v31") __attribute__((vector_size(16))); \
register unsigned char w36 asm("v31") __attribute__((vector_size(16)));
#define GEN_X_DEFINE_37_38() \
register unsigned char w37 asm("v31") __attribute__((vector_size(16))); \
register unsigned char w38 asm("v31") __attribute__((vector_size(16)));
#define GEN_X_DEFINE_ALL() \
	GEN_X_DEFINE_0_3() \
	GEN_X_DEFINE_4_5() \
	GEN_X_DEFINE_6_7() \
	GEN_X_DEFINE_8_9() \
	GEN_X_DEFINE_10_11() \
	GEN_X_DEFINE_12_15() \
	GEN_X_DEFINE_16() \
	GEN_X_DEFINE_17() \
	GEN_X_DEFINE_18_21() \
	GEN_X_DEFINE_22_23() \
	GEN_X_DEFINE_24_27() \
	GEN_X_DEFINE_28_30() \
	GEN_X_DEFINE_31() \
	GEN_X_DEFINE_32() \
	GEN_X_DEFINE_33_36() \
	GEN_X_DEFINE_37_38()
#else
/*
 * Userland build: plain (non register-pinned) vector variables; the
 * compiler is free to allocate registers itself.
 */
#define GEN_X_DEFINE_0_3() \
	unsigned char w0 __attribute__((vector_size(16))); \
	unsigned char w1 __attribute__((vector_size(16))); \
	unsigned char w2 __attribute__((vector_size(16))); \
	unsigned char w3 __attribute__((vector_size(16)));
#define GEN_X_DEFINE_4_5() \
	unsigned char w4 __attribute__((vector_size(16))); \
	unsigned char w5 __attribute__((vector_size(16)));
#define GEN_X_DEFINE_6_7() \
	unsigned char w6 __attribute__((vector_size(16))); \
	unsigned char w7 __attribute__((vector_size(16)));
#define GEN_X_DEFINE_8_9() \
	unsigned char w8 __attribute__((vector_size(16))); \
	unsigned char w9 __attribute__((vector_size(16)));
#define GEN_X_DEFINE_10_11() \
	unsigned char w10 __attribute__((vector_size(16))); \
	unsigned char w11 __attribute__((vector_size(16)));
#define GEN_X_DEFINE_12_15() \
	unsigned char w12 __attribute__((vector_size(16))); \
	unsigned char w13 __attribute__((vector_size(16))); \
	unsigned char w14 __attribute__((vector_size(16))); \
	unsigned char w15 __attribute__((vector_size(16)));
#define GEN_X_DEFINE_16() \
	unsigned char w16 __attribute__((vector_size(16)));
#define GEN_X_DEFINE_17() \
	unsigned char w17 __attribute__((vector_size(16)));
#define GEN_X_DEFINE_18_21() \
	unsigned char w18 __attribute__((vector_size(16))); \
	unsigned char w19 __attribute__((vector_size(16))); \
	unsigned char w20 __attribute__((vector_size(16))); \
	unsigned char w21 __attribute__((vector_size(16)));
#define GEN_X_DEFINE_22_23() \
	unsigned char w22 __attribute__((vector_size(16))); \
	unsigned char w23 __attribute__((vector_size(16)));
#define GEN_X_DEFINE_24_27() \
	unsigned char w24 __attribute__((vector_size(16))); \
	unsigned char w25 __attribute__((vector_size(16))); \
	unsigned char w26 __attribute__((vector_size(16))); \
	unsigned char w27 __attribute__((vector_size(16)));
#define GEN_X_DEFINE_28_30() \
	unsigned char w28 __attribute__((vector_size(16))); \
	unsigned char w29 __attribute__((vector_size(16))); \
	unsigned char w30 __attribute__((vector_size(16)));
#define GEN_X_DEFINE_31() \
	unsigned char w31 __attribute__((vector_size(16)));
#define GEN_X_DEFINE_32() \
	unsigned char w32 __attribute__((vector_size(16)));
#define GEN_X_DEFINE_33_36() \
	unsigned char w33 __attribute__((vector_size(16))); \
	unsigned char w34 __attribute__((vector_size(16))); \
	unsigned char w35 __attribute__((vector_size(16))); \
	unsigned char w36 __attribute__((vector_size(16)));
#define GEN_X_DEFINE_37_38() \
	unsigned char w37 __attribute__((vector_size(16))); \
	unsigned char w38 __attribute__((vector_size(16)));
#define GEN_X_DEFINE_ALL() \
	GEN_X_DEFINE_0_3() \
	GEN_X_DEFINE_4_5() \
	GEN_X_DEFINE_6_7() \
	GEN_X_DEFINE_8_9() \
	GEN_X_DEFINE_10_11() \
	GEN_X_DEFINE_12_15() \
	GEN_X_DEFINE_16() \
	GEN_X_DEFINE_17() \
	GEN_X_DEFINE_18_21() \
	GEN_X_DEFINE_22_23() \
	GEN_X_DEFINE_24_27() \
	GEN_X_DEFINE_28_30() \
	GEN_X_DEFINE_31() \
	GEN_X_DEFINE_32() \
	GEN_X_DEFINE_33_36() \
	GEN_X_DEFINE_37_38()
#endif