]>
Commit | Line | Data |
---|---|---|
62a65a65 RD |
1 | /* |
2 | * CDDL HEADER START | |
3 | * | |
4 | * The contents of this file are subject to the terms of the | |
5 | * Common Development and Distribution License (the "License"). | |
6 | * You may not use this file except in compliance with the License. | |
7 | * | |
8 | * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE | |
9 | * or http://www.opensolaris.org/os/licensing. | |
10 | * See the License for the specific language governing permissions | |
11 | * and limitations under the License. | |
12 | * | |
13 | * When distributing Covered Code, include this CDDL HEADER in each | |
14 | * file and include the License file at usr/src/OPENSOLARIS.LICENSE. | |
15 | * If applicable, add the following below this CDDL HEADER, with the | |
16 | * fields enclosed by brackets "[]" replaced with your own identifying | |
17 | * information: Portions Copyright [yyyy] [name of copyright owner] | |
18 | * | |
19 | * CDDL HEADER END | |
20 | */ | |
21 | /* | |
22 | * Copyright (C) 2016 Romain Dolbeau. All rights reserved. | |
23 | */ | |
24 | ||
25 | #include <sys/types.h> | |
26 | #include <linux/simd_aarch64.h> | |
27 | ||
/* Every inline-assembly statement written as __asm(...) in this file is a volatile asm. */
28 | #define __asm __asm__ __volatile__ | |
29 | ||
/*
 * REG_CNT(r...) evaluates to the number of arguments it was given (1 to 8).
 * The argument list is followed by the descending literals 8..1 and
 * _REG_CNT yields whatever ends up in its ninth position.
 */
30 | #define _REG_CNT(_0, _1, _2, _3, _4, _5, _6, _7, N, ...) N | |
31 | #define REG_CNT(r...) _REG_CNT(r, 8, 7, 6, 5, 4, 3, 2, 1) | |
32 | ||
/*
 * VRn_(...) selects its (n+1)-th argument REG and stringifies it into the
 * named asm operand reference "%[wREG]" for use inside an asm template.
 */
33 | #define VR0_(REG, ...) "%[w"#REG"]" | |
34 | #define VR1_(_1, REG, ...) "%[w"#REG"]" | |
35 | #define VR2_(_1, _2, REG, ...) "%[w"#REG"]" | |
36 | #define VR3_(_1, _2, _3, REG, ...) "%[w"#REG"]" | |
37 | #define VR4_(_1, _2, _3, _4, REG, ...) "%[w"#REG"]" | |
38 | #define VR5_(_1, _2, _3, _4, _5, REG, ...) "%[w"#REG"]" | |
39 | #define VR6_(_1, _2, _3, _4, _5, _6, REG, ...) "%[w"#REG"]" | |
40 | #define VR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) "%[w"#REG"]" | |
41 | ||
42 | /* | |
43 | * Here we need registers not used otherwise. | |
44 | * They will be used in unused ASM for the case | |
45 | * with more registers than required... but GCC | |
46 | * will still need to make sure the constraints | |
47 | * are correct, and duplicate constraints are illegal | |
48 | * ... and we use the "register" number as a name | |
49 | */ | |
50 | ||
/*
 * VRn(r...) yields the asm-template reference for position n of the
 * register list r.  From VR2 upward the list is padded with the
 * placeholder names 36, 35, ... so that position n still names something
 * when r holds fewer registers than the asm being expanded refers to.
 * VR(X) references the single register X directly.
 */
51 | #define VR0(r...) VR0_(r) | |
52 | #define VR1(r...) VR1_(r) | |
53 | #define VR2(r...) VR2_(r, 36) | |
54 | #define VR3(r...) VR3_(r, 36, 35) | |
55 | #define VR4(r...) VR4_(r, 36, 35, 34, 33) | |
56 | #define VR5(r...) VR5_(r, 36, 35, 34, 33, 32) | |
57 | #define VR6(r...) VR6_(r, 36, 35, 34, 33, 32, 31) | |
58 | #define VR7(r...) VR7_(r, 36, 35, 34, 33, 32, 31, 30) | |
59 | ||
60 | #define VR(X) "%[w"#X"]" | |
61 | ||
/*
 * Operands with the plain "w" constraint (no '=' or '+' modifier); every
 * asm in this file lists them in the input section.
 *
 * RVRn_(...) selects its (n+1)-th argument REG and produces the named asm
 * operand [wREG] bound to the C variable wREG.  RVRn(r...) applies that to
 * position n of the register list r, padded with the same placeholder
 * names (36, 35, ...) as the VRn macros.  RVR(X) names register X directly.
 */
62 | #define RVR0_(REG, ...) [w##REG] "w" (w##REG) | |
63 | #define RVR1_(_1, REG, ...) [w##REG] "w" (w##REG) | |
64 | #define RVR2_(_1, _2, REG, ...) [w##REG] "w" (w##REG) | |
65 | #define RVR3_(_1, _2, _3, REG, ...) [w##REG] "w" (w##REG) | |
66 | #define RVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "w" (w##REG) | |
67 | #define RVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "w" (w##REG) | |
68 | #define RVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "w" (w##REG) | |
69 | #define RVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "w" (w##REG) | |
70 | ||
71 | #define RVR0(r...) RVR0_(r) | |
72 | #define RVR1(r...) RVR1_(r) | |
73 | #define RVR2(r...) RVR2_(r, 36) | |
74 | #define RVR3(r...) RVR3_(r, 36, 35) | |
75 | #define RVR4(r...) RVR4_(r, 36, 35, 34, 33) | |
76 | #define RVR5(r...) RVR5_(r, 36, 35, 34, 33, 32) | |
77 | #define RVR6(r...) RVR6_(r, 36, 35, 34, 33, 32, 31) | |
78 | #define RVR7(r...) RVR7_(r, 36, 35, 34, 33, 32, 31, 30) | |
79 | ||
80 | #define RVR(X) [w##X] "w" (w##X) | |
81 | ||
/*
 * Write-only output operands.
 *
 * WVRn_(...) selects its (n+1)-th argument REG and produces the named asm
 * output operand [wREG] bound to the C variable wREG.  WVRn(r...) applies
 * that to position n of the register list r, padded with the placeholder
 * names 36, 35, ... so the position exists even when r holds fewer
 * registers.  WVR(X) names register X directly.
 *
 * The operands are earlyclobber ("=&w"): some users (COPY, for example)
 * write their first output before all of their inputs have been read, so
 * the compiler must not place an output in a register that also carries
 * an input of the same asm statement.
 */
#define WVR0_(REG, ...) [w##REG] "=&w" (w##REG)
#define WVR1_(_1, REG, ...) [w##REG] "=&w" (w##REG)
#define WVR2_(_1, _2, REG, ...) [w##REG] "=&w" (w##REG)
#define WVR3_(_1, _2, _3, REG, ...) [w##REG] "=&w" (w##REG)
#define WVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "=&w" (w##REG)
#define WVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "=&w" (w##REG)
#define WVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "=&w" (w##REG)
#define WVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "=&w" (w##REG)

#define WVR0(r...) WVR0_(r)
#define WVR1(r...) WVR1_(r)
#define WVR2(r...) WVR2_(r, 36)
#define WVR3(r...) WVR3_(r, 36, 35)
#define WVR4(r...) WVR4_(r, 36, 35, 34, 33)
#define WVR5(r...) WVR5_(r, 36, 35, 34, 33, 32)
#define WVR6(r...) WVR6_(r, 36, 35, 34, 33, 32, 31)
#define WVR7(r...) WVR7_(r, 36, 35, 34, 33, 32, 31, 30)

#define WVR(X) [w##X] "=&w" (w##X)
101 | ||
/*
 * Read-write, earlyclobber ("+&w") operands.
 *
 * UVRn_(...) selects its (n+1)-th argument REG and produces the named asm
 * operand [wREG] bound to the C variable wREG.  UVRn(r...) applies that to
 * position n of the register list r, padded with the same placeholder
 * names (36, 35, ...) as the VRn macros.  UVR(X) names register X directly.
 */
102 | #define UVR0_(REG, ...) [w##REG] "+&w" (w##REG) | |
103 | #define UVR1_(_1, REG, ...) [w##REG] "+&w" (w##REG) | |
104 | #define UVR2_(_1, _2, REG, ...) [w##REG] "+&w" (w##REG) | |
105 | #define UVR3_(_1, _2, _3, REG, ...) [w##REG] "+&w" (w##REG) | |
106 | #define UVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "+&w" (w##REG) | |
107 | #define UVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "+&w" (w##REG) | |
108 | #define UVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "+&w" (w##REG) | |
109 | #define UVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "+&w" (w##REG) | |
110 | ||
111 | #define UVR0(r...) UVR0_(r) | |
112 | #define UVR1(r...) UVR1_(r) | |
113 | #define UVR2(r...) UVR2_(r, 36) | |
114 | #define UVR3(r...) UVR3_(r, 36, 35) | |
115 | #define UVR4(r...) UVR4_(r, 36, 35, 34, 33) | |
116 | #define UVR5(r...) UVR5_(r, 36, 35, 34, 33, 32) | |
117 | #define UVR6(r...) UVR6_(r, 36, 35, 34, 33, 32, 31) | |
118 | #define UVR7(r...) UVR7_(r, 36, 35, 34, 33, 32, 31, 30) | |
119 | ||
120 | #define UVR(X) [w##X] "+&w" (w##X) | |
121 | ||
/*
 * Sub-list selectors: R_01 yields the first two arguments, R_23 the third
 * and fourth.  R_23 appends the fillers 1, 2, 3 before forwarding to
 * _R_23 so that positions three and four always exist.
 */
122 | #define R_01(REG1, REG2, ...) REG1, REG2 | |
123 | #define _R_23(_0, _1, REG2, REG3, ...) REG2, REG3 | |
124 | #define R_23(REG...) _R_23(REG, 1, 2, 3) | |
125 | ||
/*
 * Used as the default: branch of the register-count switches below, i.e.
 * for a register count a macro does not implement.  ASSERT is not defined
 * in this file.
 */
126 | #define ASM_BUG() ASSERT(0) | |
127 | ||
/*
 * OFFSET(ptr, val): pointer to the byte located val bytes past ptr, typed
 * as unsigned char *.  Both arguments are parenthesized so that compound
 * expressions (e.g. "buf + i" or "a | b") are evaluated as a whole before
 * the cast and the addition are applied.
 */
#define OFFSET(ptr, val) (((unsigned char *)(ptr)) + (val))
129 | ||
/*
 * Lookup table defined elsewhere: 4*256 rows of 16 bytes.  _MULx2(c, ...)
 * in this file reads the four consecutive rows 4*c+0 .. 4*c+3.
 */
130 | extern const uint8_t gf_clmul_mod_lt[4*256][16]; | |
131 | ||
/*
 * ELEM_SIZE is the size in bytes of one element; v_t is a block of
 * ELEM_SIZE bytes aligned to ELEM_SIZE.
 */
132 | #define ELEM_SIZE 16 | |
133 | ||
134 | typedef struct v { | |
135 | uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE))); | |
136 | } v_t; | |
137 | ||
/*
 * PREFETCHNTA(ptr, offset): emit a "prfm pstl1strm" prefetch hint for the
 * object at ptr[offset] (offset is in units of *ptr, not bytes).
 * Both arguments are parenthesized so that compound expressions are
 * evaluated as a whole before the pointer addition.
 */
#define PREFETCHNTA(ptr, offset) \
{ \
	__asm( \
	"prfm pstl1strm, %[MEM]\n" \
	: : [MEM] "Q" (*((ptr) + (offset)))); \
}
144 | ||
/*
 * PREFETCH(ptr, offset): emit a "prfm pldl1keep" prefetch hint for the
 * object at ptr[offset] (offset is in units of *ptr, not bytes).
 * Both arguments are parenthesized so that compound expressions are
 * evaluated as a whole before the pointer addition.
 */
#define PREFETCH(ptr, offset) \
{ \
	__asm( \
	"prfm pldl1keep, %[MEM]\n" \
	: : [MEM] "Q" (*((ptr) + (offset)))); \
}
151 | ||
/*
 * XOR_ACC(src, r...): XOR memory into registers.
 * For 2, 4 or 8 registers, register i is XORed with the 16 bytes at
 * src + 16*i.  The data is first loaded into the scratch registers
 * v18-v21 (v20/v21 only in the 2-register case), which are declared as
 * clobbers; the 8-register case processes two groups of four.
 * Any other register count reaches ASM_BUG().
 */
152 | #define XOR_ACC(src, r...) \ | |
153 | { \ | |
154 | switch (REG_CNT(r)) { \ | |
155 | case 8: \ | |
156 | __asm( \ | |
157 | "ld1 { v21.4s },%[SRC0]\n" \ | |
158 | "ld1 { v20.4s },%[SRC1]\n" \ | |
159 | "ld1 { v19.4s },%[SRC2]\n" \ | |
160 | "ld1 { v18.4s },%[SRC3]\n" \ | |
161 | "eor " VR0(r) ".16b," VR0(r) ".16b,v21.16b\n" \ | |
162 | "eor " VR1(r) ".16b," VR1(r) ".16b,v20.16b\n" \ | |
163 | "eor " VR2(r) ".16b," VR2(r) ".16b,v19.16b\n" \ | |
164 | "eor " VR3(r) ".16b," VR3(r) ".16b,v18.16b\n" \ | |
165 | "ld1 { v21.4s },%[SRC4]\n" \ | |
166 | "ld1 { v20.4s },%[SRC5]\n" \ | |
167 | "ld1 { v19.4s },%[SRC6]\n" \ | |
168 | "ld1 { v18.4s },%[SRC7]\n" \ | |
169 | "eor " VR4(r) ".16b," VR4(r) ".16b,v21.16b\n" \ | |
170 | "eor " VR5(r) ".16b," VR5(r) ".16b,v20.16b\n" \ | |
171 | "eor " VR6(r) ".16b," VR6(r) ".16b,v19.16b\n" \ | |
172 | "eor " VR7(r) ".16b," VR7(r) ".16b,v18.16b\n" \ | |
173 | : UVR0(r), UVR1(r), UVR2(r), UVR3(r), \ | |
174 | UVR4(r), UVR5(r), UVR6(r), UVR7(r) \ | |
175 | : [SRC0] "Q" (*(OFFSET(src, 0))), \ | |
176 | [SRC1] "Q" (*(OFFSET(src, 16))), \ | |
177 | [SRC2] "Q" (*(OFFSET(src, 32))), \ | |
178 | [SRC3] "Q" (*(OFFSET(src, 48))), \ | |
179 | [SRC4] "Q" (*(OFFSET(src, 64))), \ | |
180 | [SRC5] "Q" (*(OFFSET(src, 80))), \ | |
181 | [SRC6] "Q" (*(OFFSET(src, 96))), \ | |
182 | [SRC7] "Q" (*(OFFSET(src, 112))) \ | |
183 | : "v18", "v19", "v20", "v21"); \ | |
184 | break; \ | |
185 | case 4: \ | |
186 | __asm( \ | |
187 | "ld1 { v21.4s },%[SRC0]\n" \ | |
188 | "ld1 { v20.4s },%[SRC1]\n" \ | |
189 | "ld1 { v19.4s },%[SRC2]\n" \ | |
190 | "ld1 { v18.4s },%[SRC3]\n" \ | |
191 | "eor " VR0(r) ".16b," VR0(r) ".16b,v21.16b\n" \ | |
192 | "eor " VR1(r) ".16b," VR1(r) ".16b,v20.16b\n" \ | |
193 | "eor " VR2(r) ".16b," VR2(r) ".16b,v19.16b\n" \ | |
194 | "eor " VR3(r) ".16b," VR3(r) ".16b,v18.16b\n" \ | |
195 | : UVR0(r), UVR1(r), UVR2(r), UVR3(r) \ | |
196 | : [SRC0] "Q" (*(OFFSET(src, 0))), \ | |
197 | [SRC1] "Q" (*(OFFSET(src, 16))), \ | |
198 | [SRC2] "Q" (*(OFFSET(src, 32))), \ | |
199 | [SRC3] "Q" (*(OFFSET(src, 48))) \ | |
200 | : "v18", "v19", "v20", "v21"); \ | |
201 | break; \ | |
202 | case 2: \ | |
203 | __asm( \ | |
204 | "ld1 { v21.4s },%[SRC0]\n" \ | |
205 | "ld1 { v20.4s },%[SRC1]\n" \ | |
206 | "eor " VR0(r) ".16b," VR0(r) ".16b,v21.16b\n" \ | |
207 | "eor " VR1(r) ".16b," VR1(r) ".16b,v20.16b\n" \ | |
208 | : UVR0(r), UVR1(r) \ | |
209 | : [SRC0] "Q" (*(OFFSET(src, 0))), \ | |
210 | [SRC1] "Q" (*(OFFSET(src, 16))) \ | |
211 | : "v20", "v21"); \ | |
212 | break; \ | |
213 | default: \ | |
214 | ASM_BUG(); \ | |
215 | } \ | |
216 | } | |
217 | ||
/*
 * XOR(r...): XOR the first half of the register list into the second half.
 * With 8 registers, registers 4-7 are XORed with registers 0-3; with 4
 * registers, registers 2-3 are XORed with registers 0-1.  The first half
 * is only read.  Any other register count reaches ASM_BUG().
 */
218 | #define XOR(r...) \ | |
219 | { \ | |
220 | switch (REG_CNT(r)) { \ | |
221 | case 8: \ | |
222 | __asm( \ | |
223 | "eor " VR4(r) ".16b," VR4(r) ".16b," VR0(r) ".16b\n" \ | |
224 | "eor " VR5(r) ".16b," VR5(r) ".16b," VR1(r) ".16b\n" \ | |
225 | "eor " VR6(r) ".16b," VR6(r) ".16b," VR2(r) ".16b\n" \ | |
226 | "eor " VR7(r) ".16b," VR7(r) ".16b," VR3(r) ".16b\n" \ | |
227 | : UVR4(r), UVR5(r), UVR6(r), UVR7(r) \ | |
228 | : RVR0(r), RVR1(r), RVR2(r), RVR3(r)); \ | |
229 | break; \ | |
230 | case 4: \ | |
231 | __asm( \ | |
232 | "eor " VR2(r) ".16b," VR2(r) ".16b," VR0(r) ".16b\n" \ | |
233 | "eor " VR3(r) ".16b," VR3(r) ".16b," VR1(r) ".16b\n" \ | |
234 | : UVR2(r), UVR3(r) \ | |
235 | : RVR0(r), RVR1(r)); \ | |
236 | break; \ | |
237 | default: \ | |
238 | ASM_BUG(); \ | |
239 | } \ | |
240 | } | |
241 | ||
/*
 * ZERO(r...): clear 2 or 4 registers by XORing each one with itself.
 * Any other register count reaches ASM_BUG().
 */
242 | #define ZERO(r...) \ | |
243 | { \ | |
244 | switch (REG_CNT(r)) { \ | |
245 | case 4: \ | |
246 | __asm( \ | |
247 | "eor " VR0(r) ".16b," VR0(r) ".16b," VR0(r) ".16b\n" \ | |
248 | "eor " VR1(r) ".16b," VR1(r) ".16b," VR1(r) ".16b\n" \ | |
249 | "eor " VR2(r) ".16b," VR2(r) ".16b," VR2(r) ".16b\n" \ | |
250 | "eor " VR3(r) ".16b," VR3(r) ".16b," VR3(r) ".16b\n" \ | |
251 | : WVR0(r), WVR1(r), WVR2(r), WVR3(r)); \ | |
252 | break; \ | |
253 | case 2: \ | |
254 | __asm( \ | |
255 | "eor " VR0(r) ".16b," VR0(r) ".16b," VR0(r) ".16b\n" \ | |
256 | "eor " VR1(r) ".16b," VR1(r) ".16b," VR1(r) ".16b\n" \ | |
257 | : WVR0(r), WVR1(r)); \ | |
258 | break; \ | |
259 | default: \ | |
260 | ASM_BUG(); \ | |
261 | } \ | |
262 | } | |
263 | ||
/*
 * COPY(r...): copy the first half of the register list into the second
 * half.  With 8 registers, registers 0-3 are moved to registers 4-7; with
 * 4 registers, registers 0-1 are moved to registers 2-3.
 * Any other register count reaches ASM_BUG().
 */
264 | #define COPY(r...) \ | |
265 | { \ | |
266 | switch (REG_CNT(r)) { \ | |
267 | case 8: \ | |
268 | __asm( \ | |
269 | "mov " VR4(r) ".16b," VR0(r) ".16b\n" \ | |
270 | "mov " VR5(r) ".16b," VR1(r) ".16b\n" \ | |
271 | "mov " VR6(r) ".16b," VR2(r) ".16b\n" \ | |
272 | "mov " VR7(r) ".16b," VR3(r) ".16b\n" \ | |
273 | : WVR4(r), WVR5(r), WVR6(r), WVR7(r) \ | |
274 | : RVR0(r), RVR1(r), RVR2(r), RVR3(r)); \ | |
275 | break; \ | |
276 | case 4: \ | |
277 | __asm( \ | |
278 | "mov " VR2(r) ".16b," VR0(r) ".16b\n" \ | |
279 | "mov " VR3(r) ".16b," VR1(r) ".16b\n" \ | |
280 | : WVR2(r), WVR3(r) \ | |
281 | : RVR0(r), RVR1(r)); \ | |
282 | break; \ | |
283 | default: \ | |
284 | ASM_BUG(); \ | |
285 | } \ | |
286 | } | |
287 | ||
/*
 * LOAD(src, r...): load 2, 4 or 8 registers from memory.
 * Register i receives the 16 bytes at src + 16*i.
 * Any other register count reaches ASM_BUG().
 */
288 | #define LOAD(src, r...) \ | |
289 | { \ | |
290 | switch (REG_CNT(r)) { \ | |
291 | case 8: \ | |
292 | __asm( \ | |
293 | "ld1 { " VR0(r) ".4s },%[SRC0]\n" \ | |
294 | "ld1 { " VR1(r) ".4s },%[SRC1]\n" \ | |
295 | "ld1 { " VR2(r) ".4s },%[SRC2]\n" \ | |
296 | "ld1 { " VR3(r) ".4s },%[SRC3]\n" \ | |
297 | "ld1 { " VR4(r) ".4s },%[SRC4]\n" \ | |
298 | "ld1 { " VR5(r) ".4s },%[SRC5]\n" \ | |
299 | "ld1 { " VR6(r) ".4s },%[SRC6]\n" \ | |
300 | "ld1 { " VR7(r) ".4s },%[SRC7]\n" \ | |
301 | : WVR0(r), WVR1(r), WVR2(r), WVR3(r), \ | |
302 | WVR4(r), WVR5(r), WVR6(r), WVR7(r) \ | |
303 | : [SRC0] "Q" (*(OFFSET(src, 0))), \ | |
304 | [SRC1] "Q" (*(OFFSET(src, 16))), \ | |
305 | [SRC2] "Q" (*(OFFSET(src, 32))), \ | |
306 | [SRC3] "Q" (*(OFFSET(src, 48))), \ | |
307 | [SRC4] "Q" (*(OFFSET(src, 64))), \ | |
308 | [SRC5] "Q" (*(OFFSET(src, 80))), \ | |
309 | [SRC6] "Q" (*(OFFSET(src, 96))), \ | |
310 | [SRC7] "Q" (*(OFFSET(src, 112)))); \ | |
311 | break; \ | |
312 | case 4: \ | |
313 | __asm( \ | |
314 | "ld1 { " VR0(r) ".4s },%[SRC0]\n" \ | |
315 | "ld1 { " VR1(r) ".4s },%[SRC1]\n" \ | |
316 | "ld1 { " VR2(r) ".4s },%[SRC2]\n" \ | |
317 | "ld1 { " VR3(r) ".4s },%[SRC3]\n" \ | |
318 | : WVR0(r), WVR1(r), WVR2(r), WVR3(r) \ | |
319 | : [SRC0] "Q" (*(OFFSET(src, 0))), \ | |
320 | [SRC1] "Q" (*(OFFSET(src, 16))), \ | |
321 | [SRC2] "Q" (*(OFFSET(src, 32))), \ | |
322 | [SRC3] "Q" (*(OFFSET(src, 48)))); \ | |
323 | break; \ | |
324 | case 2: \ | |
325 | __asm( \ | |
326 | "ld1 { " VR0(r) ".4s },%[SRC0]\n" \ | |
327 | "ld1 { " VR1(r) ".4s },%[SRC1]\n" \ | |
328 | : WVR0(r), WVR1(r) \ | |
329 | : [SRC0] "Q" (*(OFFSET(src, 0))), \ | |
330 | [SRC1] "Q" (*(OFFSET(src, 16)))); \ | |
331 | break; \ | |
332 | default: \ | |
333 | ASM_BUG(); \ | |
334 | } \ | |
335 | } | |
336 | ||
/*
 * STORE(dst, r...): store 2, 4 or 8 registers to memory.
 * Register i is written to the 16 bytes at dst + 16*i; the destinations
 * are declared as "=Q" memory outputs and the registers are only read.
 * Any other register count reaches ASM_BUG().
 */
337 | #define STORE(dst, r...) \ | |
338 | { \ | |
339 | switch (REG_CNT(r)) { \ | |
340 | case 8: \ | |
341 | __asm( \ | |
342 | "st1 { " VR0(r) ".4s },%[DST0]\n" \ | |
343 | "st1 { " VR1(r) ".4s },%[DST1]\n" \ | |
344 | "st1 { " VR2(r) ".4s },%[DST2]\n" \ | |
345 | "st1 { " VR3(r) ".4s },%[DST3]\n" \ | |
346 | "st1 { " VR4(r) ".4s },%[DST4]\n" \ | |
347 | "st1 { " VR5(r) ".4s },%[DST5]\n" \ | |
348 | "st1 { " VR6(r) ".4s },%[DST6]\n" \ | |
349 | "st1 { " VR7(r) ".4s },%[DST7]\n" \ | |
350 | : [DST0] "=Q" (*(OFFSET(dst, 0))), \ | |
351 | [DST1] "=Q" (*(OFFSET(dst, 16))), \ | |
352 | [DST2] "=Q" (*(OFFSET(dst, 32))), \ | |
353 | [DST3] "=Q" (*(OFFSET(dst, 48))), \ | |
354 | [DST4] "=Q" (*(OFFSET(dst, 64))), \ | |
355 | [DST5] "=Q" (*(OFFSET(dst, 80))), \ | |
356 | [DST6] "=Q" (*(OFFSET(dst, 96))), \ | |
357 | [DST7] "=Q" (*(OFFSET(dst, 112))) \ | |
358 | : RVR0(r), RVR1(r), RVR2(r), RVR3(r), \ | |
359 | RVR4(r), RVR5(r), RVR6(r), RVR7(r)); \ | |
360 | break; \ | |
361 | case 4: \ | |
362 | __asm( \ | |
363 | "st1 { " VR0(r) ".4s },%[DST0]\n" \ | |
364 | "st1 { " VR1(r) ".4s },%[DST1]\n" \ | |
365 | "st1 { " VR2(r) ".4s },%[DST2]\n" \ | |
366 | "st1 { " VR3(r) ".4s },%[DST3]\n" \ | |
367 | : [DST0] "=Q" (*(OFFSET(dst, 0))), \ | |
368 | [DST1] "=Q" (*(OFFSET(dst, 16))), \ | |
369 | [DST2] "=Q" (*(OFFSET(dst, 32))), \ | |
370 | [DST3] "=Q" (*(OFFSET(dst, 48))) \ | |
371 | : RVR0(r), RVR1(r), RVR2(r), RVR3(r)); \ | |
372 | break; \ | |
373 | case 2: \ | |
374 | __asm( \ | |
375 | "st1 { " VR0(r) ".4s },%[DST0]\n" \ | |
376 | "st1 { " VR1(r) ".4s },%[DST1]\n" \ | |
377 | : [DST0] "=Q" (*(OFFSET(dst, 0))), \ | |
378 | [DST1] "=Q" (*(OFFSET(dst, 16))) \ | |
379 | : RVR0(r), RVR1(r)); \ | |
380 | break; \ | |
381 | default: \ | |
382 | ASM_BUG(); \ | |
383 | } \ | |
384 | } | |
385 | ||
386 | /* | |
387 | * Unfortunately cannot use the macro, because GCC | |
388 | * will try to use the macro name and not value | |
389 | * later on... | |
390 | * Kept as a reference to what a numbered variable is | |
391 | */ | |
/*
 * Reference names only; nothing in the code visible here expands them.
 * MUL2_SETUP() clears w17 and fills w16 with 0x1d bytes (in the _KERNEL
 * build w17/w16 are tied to v17/v16), and MUL2() uses v19/v18 as scratch.
 */
392 | #define _00 "v17" | |
393 | #define _1d "v16" | |
394 | #define _temp0 "v19" | |
395 | #define _temp1 "v18" | |
396 | ||
/*
 * MUL2_SETUP(): initialize the two constants read by MUL2().
 * w17 is cleared (XORed with itself) and every byte of w16 is set to 0x1d.
 */
397 | #define MUL2_SETUP() \ | |
398 | { \ | |
399 | __asm( \ | |
400 | "eor " VR(17) ".16b," VR(17) ".16b," VR(17) ".16b\n" \ | |
401 | "movi " VR(16) ".16b,#0x1d\n" \ | |
402 | : WVR(16), WVR(17)); \ | |
403 | } | |
404 | ||
/*
 * MUL2(r...): applied to 2 or 4 registers; any other count reaches
 * ASM_BUG().  For each register the asm
 *   1. compares w17 against the register byte-wise with cmgt, leaving the
 *      resulting mask in a scratch register (v18-v21),
 *   2. ANDs that mask with w16,
 *   3. shifts every byte of the register left by one bit,
 *   4. XORs the masked value into the shifted register.
 * w17 and w16 are only read here; MUL2_SETUP() in this file sets them to
 * all-zero and to 0x1d in every byte respectively.
 * NOTE(review): this looks like a byte-wise doubling in GF(2^8) with
 * reduction constant 0x1d - confirm against the callers.
 */
405 | #define MUL2(r...) \ | |
406 | { \ | |
407 | switch (REG_CNT(r)) { \ | |
408 | case 4: \ | |
409 | __asm( \ | |
410 | "cmgt v19.16b," VR(17) ".16b," VR0(r) ".16b\n" \ | |
411 | "cmgt v18.16b," VR(17) ".16b," VR1(r) ".16b\n" \ | |
412 | "cmgt v21.16b," VR(17) ".16b," VR2(r) ".16b\n" \ | |
413 | "cmgt v20.16b," VR(17) ".16b," VR3(r) ".16b\n" \ | |
414 | "and v19.16b,v19.16b," VR(16) ".16b\n" \ | |
415 | "and v18.16b,v18.16b," VR(16) ".16b\n" \ | |
416 | "and v21.16b,v21.16b," VR(16) ".16b\n" \ | |
417 | "and v20.16b,v20.16b," VR(16) ".16b\n" \ | |
418 | "shl " VR0(r) ".16b," VR0(r) ".16b,#1\n" \ | |
419 | "shl " VR1(r) ".16b," VR1(r) ".16b,#1\n" \ | |
420 | "shl " VR2(r) ".16b," VR2(r) ".16b,#1\n" \ | |
421 | "shl " VR3(r) ".16b," VR3(r) ".16b,#1\n" \ | |
422 | "eor " VR0(r) ".16b,v19.16b," VR0(r) ".16b\n" \ | |
423 | "eor " VR1(r) ".16b,v18.16b," VR1(r) ".16b\n" \ | |
424 | "eor " VR2(r) ".16b,v21.16b," VR2(r) ".16b\n" \ | |
425 | "eor " VR3(r) ".16b,v20.16b," VR3(r) ".16b\n" \ | |
426 | : UVR0(r), UVR1(r), UVR2(r), UVR3(r) \ | |
427 | : RVR(17), RVR(16) \ | |
428 | : "v18", "v19", "v20", "v21"); \ | |
429 | break; \ | |
430 | case 2: \ | |
431 | __asm( \ | |
432 | "cmgt v19.16b," VR(17) ".16b," VR0(r) ".16b\n" \ | |
433 | "cmgt v18.16b," VR(17) ".16b," VR1(r) ".16b\n" \ | |
434 | "and v19.16b,v19.16b," VR(16) ".16b\n" \ | |
435 | "and v18.16b,v18.16b," VR(16) ".16b\n" \ | |
436 | "shl " VR0(r) ".16b," VR0(r) ".16b,#1\n" \ | |
437 | "shl " VR1(r) ".16b," VR1(r) ".16b,#1\n" \ | |
438 | "eor " VR0(r) ".16b,v19.16b," VR0(r) ".16b\n" \ | |
439 | "eor " VR1(r) ".16b,v18.16b," VR1(r) ".16b\n" \ | |
440 | : UVR0(r), UVR1(r) \ | |
441 | : RVR(17), RVR(16) \ | |
442 | : "v18", "v19"); \ | |
443 | break; \ | |
444 | default: \ | |
445 | ASM_BUG(); \ | |
446 | } \ | |
447 | } | |
448 | ||
/* MUL4(r...): apply MUL2() to the same register list twice in a row. */
449 | #define MUL4(r...) \ | |
450 | { \ | |
451 | MUL2(r); \ | |
452 | MUL2(r); \ | |
453 | } | |
454 | ||
455 | /* | |
456 | * Unfortunately cannot use the macro, because GCC | |
457 | * will try to use the macro name and not value | |
458 | * later on... | |
459 | * Kept as a reference to what a register is | |
460 | * (here we're using actual registers for the | |
461 | * clobbered ones) | |
462 | */ | |
/*
 * Reference names for the hard-coded scratch registers of _MULx2();
 * nothing in the code visible here expands them.  Note that _0f and
 * _lt_clmul_b both name v15.
 */
463 | #define _0f "v15" | |
464 | #define _a_save "v14" | |
465 | #define _b_save "v13" | |
466 | #define _lt_mod_a "v12" | |
467 | #define _lt_clmul_a "v11" | |
468 | #define _lt_mod_b "v10" | |
469 | #define _lt_clmul_b "v15" | |
470 | ||
/*
 * _MULx2(c, r...): table-driven transform of exactly two registers; any
 * other register count reaches ASM_BUG().  The four 16-byte rows
 * gf_clmul_mod_lt[4*c+0 .. 4*c+3] are used as tbl lookup tables:
 *   - the low nibble of every byte is saved in v14/v13 (AND with 0x0f);
 *   - each register is shifted right by 4 and masked with 0x0f, and the
 *     result indexes rows 4*c+0 and 4*c+1, whose lookups are XORed
 *     together into the register;
 *   - the saved low nibbles index rows 4*c+2 and 4*c+3, and both lookups
 *     are XORed into the register as well.
 * v10-v15 are used as scratch and declared as clobbers; v15 first holds
 * the 0x0f mask and is later reused for lookup data.
 */
471 | #define _MULx2(c, r...) \ | |
472 | { \ | |
473 | switch (REG_CNT(r)) { \ | |
474 | case 2: \ | |
475 | __asm( \ | |
476 | /* lts for upper part */ \ | |
477 | "movi v15.16b,#0x0f\n" \ | |
478 | "ld1 { v10.4s },%[lt0]\n" \ | |
479 | "ld1 { v11.4s },%[lt1]\n" \ | |
480 | /* upper part */ \ | |
481 | "and v14.16b," VR0(r) ".16b,v15.16b\n" \ | |
482 | "and v13.16b," VR1(r) ".16b,v15.16b\n" \ | |
483 | "sshr " VR0(r) ".8h," VR0(r) ".8h,#4\n" \ | |
484 | "sshr " VR1(r) ".8h," VR1(r) ".8h,#4\n" \ | |
485 | "and " VR0(r) ".16b," VR0(r) ".16b,v15.16b\n" \ | |
486 | "and " VR1(r) ".16b," VR1(r) ".16b,v15.16b\n" \ | |
487 | \ | |
488 | "tbl v12.16b,{v10.16b}," VR0(r) ".16b\n" \ | |
489 | "tbl v10.16b,{v10.16b}," VR1(r) ".16b\n" \ | |
490 | "tbl v15.16b,{v11.16b}," VR0(r) ".16b\n" \ | |
491 | "tbl v11.16b,{v11.16b}," VR1(r) ".16b\n" \ | |
492 | \ | |
493 | "eor " VR0(r) ".16b,v15.16b,v12.16b\n" \ | |
494 | "eor " VR1(r) ".16b,v11.16b,v10.16b\n" \ | |
495 | /* lts for lower part */ \ | |
496 | "ld1 { v10.4s },%[lt2]\n" \ | |
497 | "ld1 { v15.4s },%[lt3]\n" \ | |
498 | /* lower part */ \ | |
499 | "tbl v12.16b,{v10.16b},v14.16b\n" \ | |
500 | "tbl v10.16b,{v10.16b},v13.16b\n" \ | |
501 | "tbl v11.16b,{v15.16b},v14.16b\n" \ | |
502 | "tbl v15.16b,{v15.16b},v13.16b\n" \ | |
503 | \ | |
504 | "eor " VR0(r) ".16b," VR0(r) ".16b,v12.16b\n" \ | |
505 | "eor " VR1(r) ".16b," VR1(r) ".16b,v10.16b\n" \ | |
506 | "eor " VR0(r) ".16b," VR0(r) ".16b,v11.16b\n" \ | |
507 | "eor " VR1(r) ".16b," VR1(r) ".16b,v15.16b\n" \ | |
508 | : UVR0(r), UVR1(r) \ | |
509 | : [lt0] "Q" ((gf_clmul_mod_lt[4*(c)+0][0])), \ | |
510 | [lt1] "Q" ((gf_clmul_mod_lt[4*(c)+1][0])), \ | |
511 | [lt2] "Q" ((gf_clmul_mod_lt[4*(c)+2][0])), \ | |
512 | [lt3] "Q" ((gf_clmul_mod_lt[4*(c)+3][0])) \ | |
513 | : "v10", "v11", "v12", "v13", "v14", "v15"); \ | |
514 | break; \ | |
515 | default: \ | |
516 | ASM_BUG(); \ | |
517 | } \ | |
518 | } | |
519 | ||
/*
 * MUL(c, r...): apply _MULx2(c, ...) to 2 or 4 registers.
 * With 4 registers the pair (2, 3) is processed first, then the pair
 * (0, 1); with 2 registers only the pair (0, 1) is processed.
 * Any other register count reaches ASM_BUG().
 */
520 | #define MUL(c, r...) \ | |
521 | { \ | |
522 | switch (REG_CNT(r)) { \ | |
523 | case 4: \ | |
524 | _MULx2(c, R_23(r)); \ | |
525 | _MULx2(c, R_01(r)); \ | |
526 | break; \ | |
527 | case 2: \ | |
528 | _MULx2(c, R_01(r)); \ | |
529 | break; \ | |
530 | default: \ | |
531 | ASM_BUG(); \ | |
532 | } \ | |
533 | } | |
534 | ||
/*
 * Bracketing hooks: raidz_math_begin()/raidz_math_end() expand to
 * kfpu_begin()/kfpu_end(), which are not defined in this file.
 */
535 | #define raidz_math_begin() kfpu_begin() | |
536 | #define raidz_math_end() kfpu_end() | |
537 | ||
538 | /* Overkill... */ | |
/*
 * Declarations of the 16-byte vector variables w0..w38 that the operand
 * macros above refer to by name; GEN_X_DEFINE_ALL() declares all of them.
 *
 * When _KERNEL is defined, each wN is a register variable tied to a
 * specific vector register: w0..w31 are tied to v0..v31, and the
 * placeholder names w32..w38 are all tied to v31.
 */
539 | #if defined(_KERNEL) | |
540 | #define GEN_X_DEFINE_0_3() \ | |
541 | register unsigned char w0 asm("v0") __attribute__((vector_size(16))); \ | |
542 | register unsigned char w1 asm("v1") __attribute__((vector_size(16))); \ | |
543 | register unsigned char w2 asm("v2") __attribute__((vector_size(16))); \ | |
544 | register unsigned char w3 asm("v3") __attribute__((vector_size(16))); | |
545 | #define GEN_X_DEFINE_4_5() \ | |
546 | register unsigned char w4 asm("v4") __attribute__((vector_size(16))); \ | |
547 | register unsigned char w5 asm("v5") __attribute__((vector_size(16))); | |
548 | #define GEN_X_DEFINE_6_7() \ | |
549 | register unsigned char w6 asm("v6") __attribute__((vector_size(16))); \ | |
550 | register unsigned char w7 asm("v7") __attribute__((vector_size(16))); | |
551 | #define GEN_X_DEFINE_8_9() \ | |
552 | register unsigned char w8 asm("v8") __attribute__((vector_size(16))); \ | |
553 | register unsigned char w9 asm("v9") __attribute__((vector_size(16))); | |
554 | #define GEN_X_DEFINE_10_11() \ | |
555 | register unsigned char w10 asm("v10") __attribute__((vector_size(16))); \ | |
556 | register unsigned char w11 asm("v11") __attribute__((vector_size(16))); | |
557 | #define GEN_X_DEFINE_12_15() \ | |
558 | register unsigned char w12 asm("v12") __attribute__((vector_size(16))); \ | |
559 | register unsigned char w13 asm("v13") __attribute__((vector_size(16))); \ | |
560 | register unsigned char w14 asm("v14") __attribute__((vector_size(16))); \ | |
561 | register unsigned char w15 asm("v15") __attribute__((vector_size(16))); | |
562 | #define GEN_X_DEFINE_16() \ | |
563 | register unsigned char w16 asm("v16") __attribute__((vector_size(16))); | |
564 | #define GEN_X_DEFINE_17() \ | |
565 | register unsigned char w17 asm("v17") __attribute__((vector_size(16))); | |
566 | #define GEN_X_DEFINE_18_21() \ | |
567 | register unsigned char w18 asm("v18") __attribute__((vector_size(16))); \ | |
568 | register unsigned char w19 asm("v19") __attribute__((vector_size(16))); \ | |
569 | register unsigned char w20 asm("v20") __attribute__((vector_size(16))); \ | |
570 | register unsigned char w21 asm("v21") __attribute__((vector_size(16))); | |
571 | #define GEN_X_DEFINE_22_23() \ | |
572 | register unsigned char w22 asm("v22") __attribute__((vector_size(16))); \ | |
573 | register unsigned char w23 asm("v23") __attribute__((vector_size(16))); | |
574 | #define GEN_X_DEFINE_24_27() \ | |
575 | register unsigned char w24 asm("v24") __attribute__((vector_size(16))); \ | |
576 | register unsigned char w25 asm("v25") __attribute__((vector_size(16))); \ | |
577 | register unsigned char w26 asm("v26") __attribute__((vector_size(16))); \ | |
578 | register unsigned char w27 asm("v27") __attribute__((vector_size(16))); | |
579 | #define GEN_X_DEFINE_28_30() \ | |
580 | register unsigned char w28 asm("v28") __attribute__((vector_size(16))); \ | |
581 | register unsigned char w29 asm("v29") __attribute__((vector_size(16))); \ | |
582 | register unsigned char w30 asm("v30") __attribute__((vector_size(16))); | |
583 | #define GEN_X_DEFINE_31() \ | |
584 | register unsigned char w31 asm("v31") __attribute__((vector_size(16))); | |
585 | #define GEN_X_DEFINE_32() \ | |
586 | register unsigned char w32 asm("v31") __attribute__((vector_size(16))); | |
587 | #define GEN_X_DEFINE_33_36() \ | |
588 | register unsigned char w33 asm("v31") __attribute__((vector_size(16))); \ | |
589 | register unsigned char w34 asm("v31") __attribute__((vector_size(16))); \ | |
590 | register unsigned char w35 asm("v31") __attribute__((vector_size(16))); \ | |
591 | register unsigned char w36 asm("v31") __attribute__((vector_size(16))); | |
592 | #define GEN_X_DEFINE_37_38() \ | |
593 | register unsigned char w37 asm("v31") __attribute__((vector_size(16))); \ | |
594 | register unsigned char w38 asm("v31") __attribute__((vector_size(16))); | |
595 | #define GEN_X_DEFINE_ALL() \ | |
596 | GEN_X_DEFINE_0_3() \ | |
597 | GEN_X_DEFINE_4_5() \ | |
598 | GEN_X_DEFINE_6_7() \ | |
599 | GEN_X_DEFINE_8_9() \ | |
600 | GEN_X_DEFINE_10_11() \ | |
601 | GEN_X_DEFINE_12_15() \ | |
602 | GEN_X_DEFINE_16() \ | |
603 | GEN_X_DEFINE_17() \ | |
604 | GEN_X_DEFINE_18_21() \ | |
605 | GEN_X_DEFINE_22_23() \ | |
606 | GEN_X_DEFINE_24_27() \ | |
607 | GEN_X_DEFINE_28_30() \ | |
608 | GEN_X_DEFINE_31() \ | |
609 | GEN_X_DEFINE_32() \ | |
610 | GEN_X_DEFINE_33_36() \ | |
611 | GEN_X_DEFINE_37_38() | |
/*
 * Without _KERNEL the same names are declared as ordinary 16-byte vector
 * variables with no register binding.
 */
612 | #else | |
613 | #define GEN_X_DEFINE_0_3() \ | |
614 | unsigned char w0 __attribute__((vector_size(16))); \ | |
615 | unsigned char w1 __attribute__((vector_size(16))); \ | |
616 | unsigned char w2 __attribute__((vector_size(16))); \ | |
617 | unsigned char w3 __attribute__((vector_size(16))); | |
618 | #define GEN_X_DEFINE_4_5() \ | |
619 | unsigned char w4 __attribute__((vector_size(16))); \ | |
620 | unsigned char w5 __attribute__((vector_size(16))); | |
621 | #define GEN_X_DEFINE_6_7() \ | |
622 | unsigned char w6 __attribute__((vector_size(16))); \ | |
623 | unsigned char w7 __attribute__((vector_size(16))); | |
624 | #define GEN_X_DEFINE_8_9() \ | |
625 | unsigned char w8 __attribute__((vector_size(16))); \ | |
626 | unsigned char w9 __attribute__((vector_size(16))); | |
627 | #define GEN_X_DEFINE_10_11() \ | |
628 | unsigned char w10 __attribute__((vector_size(16))); \ | |
629 | unsigned char w11 __attribute__((vector_size(16))); | |
630 | #define GEN_X_DEFINE_12_15() \ | |
631 | unsigned char w12 __attribute__((vector_size(16))); \ | |
632 | unsigned char w13 __attribute__((vector_size(16))); \ | |
633 | unsigned char w14 __attribute__((vector_size(16))); \ | |
634 | unsigned char w15 __attribute__((vector_size(16))); | |
635 | #define GEN_X_DEFINE_16() \ | |
636 | unsigned char w16 __attribute__((vector_size(16))); | |
637 | #define GEN_X_DEFINE_17() \ | |
638 | unsigned char w17 __attribute__((vector_size(16))); | |
639 | #define GEN_X_DEFINE_18_21() \ | |
640 | unsigned char w18 __attribute__((vector_size(16))); \ | |
641 | unsigned char w19 __attribute__((vector_size(16))); \ | |
642 | unsigned char w20 __attribute__((vector_size(16))); \ | |
643 | unsigned char w21 __attribute__((vector_size(16))); | |
644 | #define GEN_X_DEFINE_22_23() \ | |
645 | unsigned char w22 __attribute__((vector_size(16))); \ | |
646 | unsigned char w23 __attribute__((vector_size(16))); | |
647 | #define GEN_X_DEFINE_24_27() \ | |
648 | unsigned char w24 __attribute__((vector_size(16))); \ | |
649 | unsigned char w25 __attribute__((vector_size(16))); \ | |
650 | unsigned char w26 __attribute__((vector_size(16))); \ | |
651 | unsigned char w27 __attribute__((vector_size(16))); | |
652 | #define GEN_X_DEFINE_28_30() \ | |
653 | unsigned char w28 __attribute__((vector_size(16))); \ | |
654 | unsigned char w29 __attribute__((vector_size(16))); \ | |
655 | unsigned char w30 __attribute__((vector_size(16))); | |
656 | #define GEN_X_DEFINE_31() \ | |
657 | unsigned char w31 __attribute__((vector_size(16))); | |
658 | #define GEN_X_DEFINE_32() \ | |
659 | unsigned char w32 __attribute__((vector_size(16))); | |
660 | #define GEN_X_DEFINE_33_36() \ | |
661 | unsigned char w33 __attribute__((vector_size(16))); \ | |
662 | unsigned char w34 __attribute__((vector_size(16))); \ | |
663 | unsigned char w35 __attribute__((vector_size(16))); \ | |
664 | unsigned char w36 __attribute__((vector_size(16))); | |
665 | #define GEN_X_DEFINE_37_38() \ | |
666 | unsigned char w37 __attribute__((vector_size(16))); \ | |
667 | unsigned char w38 __attribute__((vector_size(16))); | |
668 | #define GEN_X_DEFINE_ALL() \ | |
669 | GEN_X_DEFINE_0_3() \ | |
670 | GEN_X_DEFINE_4_5() \ | |
671 | GEN_X_DEFINE_6_7() \ | |
672 | GEN_X_DEFINE_8_9() \ | |
673 | GEN_X_DEFINE_10_11() \ | |
674 | GEN_X_DEFINE_12_15() \ | |
675 | GEN_X_DEFINE_16() \ | |
676 | GEN_X_DEFINE_17() \ | |
677 | GEN_X_DEFINE_18_21() \ | |
678 | GEN_X_DEFINE_22_23() \ | |
679 | GEN_X_DEFINE_24_27() \ | |
680 | GEN_X_DEFINE_28_30() \ | |
681 | GEN_X_DEFINE_31() \ | |
682 | GEN_X_DEFINE_32() \ | |
683 | GEN_X_DEFINE_33_36() \ | |
684 | GEN_X_DEFINE_37_38() | |
685 | #endif |