]>
Commit | Line | Data |
---|---|---|
9e18d7a6 RH |
1 | /* |
2 | * ARM SVE Operations | |
3 | * | |
4 | * Copyright (c) 2018 Linaro, Ltd. | |
5 | * | |
6 | * This library is free software; you can redistribute it and/or | |
7 | * modify it under the terms of the GNU Lesser General Public | |
8 | * License as published by the Free Software Foundation; either | |
9 | * version 2 of the License, or (at your option) any later version. | |
10 | * | |
11 | * This library is distributed in the hope that it will be useful, | |
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU | |
14 | * Lesser General Public License for more details. | |
15 | * | |
16 | * You should have received a copy of the GNU Lesser General Public | |
17 | * License along with this library; if not, see <http://www.gnu.org/licenses/>. | |
18 | */ | |
19 | ||
20 | #include "qemu/osdep.h" | |
21 | #include "cpu.h" | |
22 | #include "exec/exec-all.h" | |
23 | #include "exec/cpu_ldst.h" | |
24 | #include "exec/helper-proto.h" | |
25 | #include "tcg/tcg-gvec-desc.h" | |
26 | ||
27 | ||
f97cfd59 RH |
/* Note that vector data is stored in host-endian 64-bit chunks,
 * so addressing units smaller than that needs a host-endian fixup.
 *
 * H1/H1_2/H1_4 adjust a byte offset for 1-, 2- and 4-byte accesses;
 * H2 and H4 adjust 16- and 32-bit element indices.  On big-endian
 * hosts the XOR reflects the position within the 64-bit chunk; on
 * little-endian hosts the layout already matches and the fixups are
 * the identity.
 */
#ifdef HOST_WORDS_BIGENDIAN
#define H1(x)   ((x) ^ 7)
#define H1_2(x) ((x) ^ 6)
#define H1_4(x) ((x) ^ 4)
#define H2(x)   ((x) ^ 3)
#define H4(x)   ((x) ^ 1)
#else
#define H1(x)   (x)
#define H1_2(x) (x)
#define H1_4(x) (x)
#define H2(x)   (x)
#define H4(x)   (x)
#endif
43 | ||
9e18d7a6 RH |
44 | /* Return a value for NZCV as per the ARM PredTest pseudofunction. |
45 | * | |
46 | * The return value has bit 31 set if N is set, bit 1 set if Z is clear, | |
47 | * and bit 0 set if C is set. Compare the definitions of these variables | |
48 | * within CPUARMState. | |
49 | */ | |
50 | ||
51 | /* For no G bits set, NZCV = C. */ | |
52 | #define PREDTEST_INIT 1 | |
53 | ||
54 | /* This is an iterative function, called for each Pd and Pg word | |
55 | * moving forward. | |
56 | */ | |
57 | static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags) | |
58 | { | |
59 | if (likely(g)) { | |
60 | /* Compute N from first D & G. | |
61 | Use bit 2 to signal first G bit seen. */ | |
62 | if (!(flags & 4)) { | |
63 | flags |= ((d & (g & -g)) != 0) << 31; | |
64 | flags |= 4; | |
65 | } | |
66 | ||
67 | /* Accumulate Z from each D & G. */ | |
68 | flags |= ((d & g) != 0) << 1; | |
69 | ||
70 | /* Compute C from last !(D & G). Replace previous. */ | |
71 | flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0); | |
72 | } | |
73 | return flags; | |
74 | } | |
75 | ||
76 | /* The same for a single word predicate. */ | |
77 | uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g) | |
78 | { | |
79 | return iter_predtest_fwd(d, g, PREDTEST_INIT); | |
80 | } | |
81 | ||
82 | /* The same for a multi-word predicate. */ | |
83 | uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words) | |
84 | { | |
85 | uint32_t flags = PREDTEST_INIT; | |
86 | uint64_t *d = vd, *g = vg; | |
87 | uintptr_t i = 0; | |
88 | ||
89 | do { | |
90 | flags = iter_predtest_fwd(d[i], g[i], flags); | |
91 | } while (++i < words); | |
92 | ||
93 | return flags; | |
94 | } | |
516e246a | 95 | |
ccd841c3 RH |
96 | /* Expand active predicate bits to bytes, for byte elements. |
97 | * for (i = 0; i < 256; ++i) { | |
98 | * unsigned long m = 0; | |
99 | * for (j = 0; j < 8; j++) { | |
100 | * if ((i >> j) & 1) { | |
101 | * m |= 0xfful << (j << 3); | |
102 | * } | |
103 | * } | |
104 | * printf("0x%016lx,\n", m); | |
105 | * } | |
106 | */ | |
107 | static inline uint64_t expand_pred_b(uint8_t byte) | |
108 | { | |
109 | static const uint64_t word[256] = { | |
110 | 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00, | |
111 | 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff, | |
112 | 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000, | |
113 | 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff, | |
114 | 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00, | |
115 | 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff, | |
116 | 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000, | |
117 | 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff, | |
118 | 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00, | |
119 | 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff, | |
120 | 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000, | |
121 | 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff, | |
122 | 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00, | |
123 | 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff, | |
124 | 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000, | |
125 | 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff, | |
126 | 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00, | |
127 | 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff, | |
128 | 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000, | |
129 | 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff, | |
130 | 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00, | |
131 | 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff, | |
132 | 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000, | |
133 | 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff, | |
134 | 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00, | |
135 | 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff, | |
136 | 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000, | |
137 | 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff, | |
138 | 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00, | |
139 | 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff, | |
140 | 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000, | |
141 | 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff, | |
142 | 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00, | |
143 | 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff, | |
144 | 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000, | |
145 | 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff, | |
146 | 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00, | |
147 | 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff, | |
148 | 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000, | |
149 | 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff, | |
150 | 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00, | |
151 | 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff, | |
152 | 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000, | |
153 | 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff, | |
154 | 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00, | |
155 | 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff, | |
156 | 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000, | |
157 | 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff, | |
158 | 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00, | |
159 | 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff, | |
160 | 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000, | |
161 | 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff, | |
162 | 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00, | |
163 | 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff, | |
164 | 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000, | |
165 | 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff, | |
166 | 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00, | |
167 | 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff, | |
168 | 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000, | |
169 | 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff, | |
170 | 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00, | |
171 | 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff, | |
172 | 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000, | |
173 | 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff, | |
174 | 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00, | |
175 | 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff, | |
176 | 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000, | |
177 | 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff, | |
178 | 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00, | |
179 | 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff, | |
180 | 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000, | |
181 | 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff, | |
182 | 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00, | |
183 | 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff, | |
184 | 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000, | |
185 | 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff, | |
186 | 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00, | |
187 | 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff, | |
188 | 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000, | |
189 | 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff, | |
190 | 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00, | |
191 | 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff, | |
192 | 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000, | |
193 | 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff, | |
194 | 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00, | |
195 | 0xffffffffffffffff, | |
196 | }; | |
197 | return word[byte]; | |
198 | } | |
199 | ||
200 | /* Similarly for half-word elements. | |
201 | * for (i = 0; i < 256; ++i) { | |
202 | * unsigned long m = 0; | |
203 | * if (i & 0xaa) { | |
204 | * continue; | |
205 | * } | |
206 | * for (j = 0; j < 8; j += 2) { | |
207 | * if ((i >> j) & 1) { | |
208 | * m |= 0xfffful << (j << 3); | |
209 | * } | |
210 | * } | |
211 | * printf("[0x%x] = 0x%016lx,\n", i, m); | |
212 | * } | |
213 | */ | |
214 | static inline uint64_t expand_pred_h(uint8_t byte) | |
215 | { | |
216 | static const uint64_t word[] = { | |
217 | [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000, | |
218 | [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000, | |
219 | [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000, | |
220 | [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000, | |
221 | [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000, | |
222 | [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000, | |
223 | [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000, | |
224 | [0x55] = 0xffffffffffffffff, | |
225 | }; | |
226 | return word[byte & 0x55]; | |
227 | } | |
228 | ||
229 | /* Similarly for single word elements. */ | |
230 | static inline uint64_t expand_pred_s(uint8_t byte) | |
231 | { | |
232 | static const uint64_t word[] = { | |
233 | [0x01] = 0x00000000ffffffffull, | |
234 | [0x10] = 0xffffffff00000000ull, | |
235 | [0x11] = 0xffffffffffffffffull, | |
236 | }; | |
237 | return word[byte & 0x11]; | |
238 | } | |
239 | ||
516e246a RH |
/* Expand a predicate-vs-predicate logical operation over the whole
 * predicate register.  Predicate bits are processed in 64-bit chunks;
 * simd_oprsz(desc) is the operation size in bytes.  FUNC receives the
 * N and M operand words plus the governing predicate word G.
 */
#define LOGICAL_PPPP(NAME, FUNC)                                          \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                         \
    uintptr_t opr_sz = simd_oprsz(desc);                                  \
    uint64_t *d = vd, *n = vn, *m = vm, *g = vg;                          \
    uintptr_t i;                                                          \
    for (i = 0; i < opr_sz / 8; ++i) {                                    \
        d[i] = FUNC(n[i], m[i], g[i]);                                    \
    }                                                                     \
}

/* Per-word predicate operations.  G masks the result to the active
 * elements, except SEL, which selects between N and M under G.  */
#define DO_AND(N, M, G)  (((N) & (M)) & (G))
#define DO_BIC(N, M, G)  (((N) & ~(M)) & (G))
#define DO_EOR(N, M, G)  (((N) ^ (M)) & (G))
#define DO_ORR(N, M, G)  (((N) | (M)) & (G))
#define DO_ORN(N, M, G)  (((N) | ~(M)) & (G))
#define DO_NOR(N, M, G)  (~((N) | (M)) & (G))
#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
#define DO_SEL(N, M, G)  (((N) & (G)) | ((M) & ~(G)))

LOGICAL_PPPP(sve_and_pppp, DO_AND)
LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
LOGICAL_PPPP(sve_nand_pppp, DO_NAND)

/* These helper macro names are reused later with different arities;
   drop them now that the helpers are instantiated.  */
#undef DO_AND
#undef DO_BIC
#undef DO_EOR
#undef DO_ORR
#undef DO_ORN
#undef DO_NOR
#undef DO_NAND
#undef DO_SEL
#undef LOGICAL_PPPP
028e2a7b | 278 | |
f97cfd59 RH |
/* Fully general three-operand expander, controlled by a predicate.
 * This is complicated by the host-endian storage of the register file.
 *
 * One predicate byte governs 8 bytes of vector data, so 16 predicate
 * bits are loaded per 16-byte chunk; the low bit of PG governs the
 * current element and PG is shifted down by the element size after
 * each element.
 */
/* ??? I don't expect the compiler could ever vectorize this itself.
 * With some tables we can convert bit masks to byte masks, and with
 * extra care wrt byte/word ordering we could use gcc generic vectors
 * and do 16 bytes at a time.
 */
#define DO_ZPZZ(NAME, TYPE, H, OP)                                        \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    for (i = 0; i < opr_sz; ) {                                           \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                   \
        do {                                                              \
            if (pg & 1) {                                                 \
                TYPE nn = *(TYPE *)(vn + H(i));                           \
                TYPE mm = *(TYPE *)(vm + H(i));                           \
                *(TYPE *)(vd + H(i)) = OP(nn, mm);                        \
            }                                                             \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                       \
        } while (i & 15);                                                 \
    }                                                                     \
}

/* Similarly, specialized for 64-bit operands: no host-endian fixup is
 * needed, and the low bit of each predicate byte governs one element.
 */
#define DO_ZPZZ_D(NAME, TYPE, OP)                                         \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                            \
    TYPE *d = vd, *n = vn, *m = vm;                                       \
    uint8_t *pg = vg;                                                     \
    for (i = 0; i < opr_sz; i += 1) {                                     \
        if (pg[H1(i)] & 1) {                                              \
            TYPE nn = n[i], mm = m[i];                                    \
            d[i] = OP(nn, mm);                                            \
        }                                                                 \
    }                                                                     \
}

/* Element-wise operations used with the expanders above.  */
#define DO_AND(N, M)  (N & M)
#define DO_EOR(N, M)  (N ^ M)
#define DO_ORR(N, M)  (N | M)
#define DO_BIC(N, M)  (N & ~M)
#define DO_ADD(N, M)  (N + M)
#define DO_SUB(N, M)  (N - M)
#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
#define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
#define DO_MUL(N, M)  (N * M)
/* Division by zero yields 0 (explicit guard).
   NOTE(review): for signed element types, INT_MIN / -1 overflows,
   which is undefined behavior in C -- confirm the intended wrap
   result is guaranteed by the build configuration.  */
#define DO_DIV(N, M)  (M ? N / M : 0)
330 | ||
/* Predicated integer logical and arithmetic operations for all four
   element sizes.  The H argument must match the element size: H1 for
   bytes, H1_2 for half-words, H1_4 for words.  */
DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)

DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)

DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)

DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)

DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)

/* Signed min/max/abs-difference use signed element types so that the
   comparisons in DO_MAX/DO_MIN/DO_ABD are signed.  */
DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)

DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)

DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
390 | ||
391 | /* Because the computation type is at least twice as large as required, | |
392 | these work for both signed and unsigned source types. */ | |
393 | static inline uint8_t do_mulh_b(int32_t n, int32_t m) | |
394 | { | |
395 | return (n * m) >> 8; | |
396 | } | |
397 | ||
398 | static inline uint16_t do_mulh_h(int32_t n, int32_t m) | |
399 | { | |
400 | return (n * m) >> 16; | |
401 | } | |
402 | ||
403 | static inline uint32_t do_mulh_s(int64_t n, int64_t m) | |
404 | { | |
405 | return (n * m) >> 32; | |
406 | } | |
407 | ||
/* 64-bit signed multiply-high: muls64 produces the full 128-bit
   signed product in lo/hi; return the upper half.  */
static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    muls64(&lo, &hi, n, m);
    return hi;
}

/* 64-bit unsigned multiply-high, via the unsigned 64x64->128
   primitive.  */
static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    mulu64(&lo, &hi, n, m);
    return hi;
}
421 | ||
/* Predicated multiply (low part) and multiply-high.  */
DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)

/* The do_mulh_* helpers widen internally, so the same helper serves
   both signed and unsigned forms; only the element type differs.  */
DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)

DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)

/* Division is only instantiated for 32- and 64-bit elements.  */
DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_DIV)
DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_DIV)

DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_DIV)
DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_DIV)
442 | ||
27721dbb RH |
443 | /* Note that all bits of the shift are significant |
444 | and not modulo the element size. */ | |
445 | #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1)) | |
446 | #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0) | |
447 | #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0) | |
448 | ||
449 | DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR) | |
450 | DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR) | |
451 | DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL) | |
452 | ||
453 | DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR) | |
454 | DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR) | |
455 | DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL) | |
456 | ||
457 | DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR) | |
458 | DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR) | |
459 | DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL) | |
460 | ||
461 | DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR) | |
462 | DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR) | |
463 | DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL) | |
464 | ||
f97cfd59 RH |
#undef DO_ZPZZ
#undef DO_ZPZZ_D

/* Three-operand expander, controlled by a predicate, in which the
 * third operand is "wide".  That is, for D = N op M, the same 64-bit
 * value of M is used with all of the narrower values of N.
 * The inner loop therefore covers one 64-bit chunk (i & 7) and reloads
 * MM for each chunk; one predicate byte governs the chunk.
 */
#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP)                                 \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    for (i = 0; i < opr_sz; ) {                                           \
        uint8_t pg = *(uint8_t *)(vg + H1(i >> 3));                       \
        TYPEW mm = *(TYPEW *)(vm + i);                                    \
        do {                                                              \
            if (pg & 1) {                                                 \
                TYPE nn = *(TYPE *)(vn + H(i));                           \
                *(TYPE *)(vd + H(i)) = OP(nn, mm);                        \
            }                                                             \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                       \
        } while (i & 7);                                                  \
    }                                                                     \
}

/* Predicated wide shifts for 8-, 16- and 32-bit elements.  */
DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZPZW
502 | ||
afac6d04 RH |
/* Fully general two-operand expander, controlled by a predicate.
 * As with DO_ZPZZ, 16 predicate bits are loaded per 16-byte chunk
 * and consumed one element at a time.
 */
#define DO_ZPZ(NAME, TYPE, H, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            if (pg & 1) {                                       \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn);                  \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 15);                                       \
    }                                                           \
}

/* Similarly, specialized for 64-bit operands: no host-endian fixup,
 * one predicate byte per element.
 */
#define DO_ZPZ_D(NAME, TYPE, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *n = vn;                                      \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 1) {                           \
        if (pg[H1(i)] & 1) {                                    \
            TYPE nn = n[i];                                     \
            d[i] = OP(nn);                                      \
        }                                                       \
    }                                                           \
}
535 | ||
/* Count leading sign bits: clrsb32 operates on 32 bits, so subtract
   the 24 (resp. 16) bits of widening for the narrow forms.  */
#define DO_CLS_B(N)   (clrsb32(N) - 24)
#define DO_CLS_H(N)   (clrsb32(N) - 16)

DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)

/* Count leading zeros, with the same widening adjustment.  */
#define DO_CLZ_B(N)   (clz32(N) - 24)
#define DO_CLZ_H(N)   (clz32(N) - 16)

DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
DO_ZPZ_D(sve_clz_d, uint64_t, clz64)

/* Population count per element.  */
DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)

/* Logical not per element: 1 if the element is zero, else 0.  */
#define DO_CNOT(N)    (N == 0)

DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)

/* Float absolute value: clear the top (sign) bit of the element.
   The mask (all-ones shifted right one) keeps everything else.  */
#define DO_FABS(N)    (N & ((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)

/* Float negate: toggle the top (sign) bit of the element.  */
#define DO_FNEG(N)    (N ^ ~((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)

/* Bitwise invert per element.  */
#define DO_NOT(N)    (~N)

DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)

/* Sign- and zero-extension from a narrower sub-element: the cast
   truncates to the narrow width and the assignment back through the
   unsigned element type re-widens with the desired extension.  */
#define DO_SXTB(N)    ((int8_t)N)
#define DO_SXTH(N)    ((int16_t)N)
#define DO_SXTS(N)    ((int32_t)N)
#define DO_UXTB(N)    ((uint8_t)N)
#define DO_UXTH(N)    ((uint16_t)N)
#define DO_UXTS(N)    ((uint32_t)N)

DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)

DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)

/* NOTE(review): for the 32- and 64-bit forms, negating the minimum
   signed value overflows, which is undefined behavior in C; the
   intended result is wraparound -- confirm the build configuration
   guarantees this.  */
#define DO_ABS(N)    (N < 0 ? -N : N)

DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)

/* Negate; the unsigned element types make the wrap well-defined.  */
#define DO_NEG(N)    (-N)

DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
617 | ||
d9d78dcc RH |
/* Three-operand expander, unpredicated, in which the third operand is
 * "wide": every element is processed, and the same 64-bit M value is
 * applied to all of the narrower N elements within its 64-bit chunk.
 */
#define DO_ZZW(NAME, TYPE, TYPEW, H, OP)                       \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                              \
    intptr_t i, opr_sz = simd_oprsz(desc);                     \
    for (i = 0; i < opr_sz; ) {                                \
        TYPEW mm = *(TYPEW *)(vm + i);                         \
        do {                                                   \
            TYPE nn = *(TYPE *)(vn + H(i));                    \
            *(TYPE *)(vd + H(i)) = OP(nn, mm);                 \
            i += sizeof(TYPE);                                 \
        } while (i & 7);                                       \
    }                                                          \
}

/* Unpredicated wide shifts for 8-, 16- and 32-bit elements.  */
DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZZW

/* The two-operand helper macros are local to the DO_ZPZ section.  */
#undef DO_CLS_B
#undef DO_CLS_H
#undef DO_CLZ_B
#undef DO_CLZ_H
#undef DO_CNOT
#undef DO_FABS
#undef DO_FNEG
#undef DO_ABS
#undef DO_NEG
#undef DO_ZPZ
#undef DO_ZPZ_D
659 | ||
047cec97 RH |
/* Two-operand reduction expander, controlled by a predicate.
 * The difference between TYPERED and TYPERET has to do with
 * sign-extension.  E.g. for SMAX, TYPERED must be signed,
 * but TYPERET must be unsigned so that e.g. a 32-bit value
 * is not sign-extended to the ABI uint64_t return type.
 *
 * INIT must be the identity for OP, since inactive elements leave
 * the accumulator untouched.
 */
/* ??? If we were to vectorize this by hand the reduction ordering
 * would change.  For integer operands, this is perfectly fine.
 */
#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)     \
{                                                            \
    intptr_t i, opr_sz = simd_oprsz(desc);                   \
    TYPERED ret = INIT;                                      \
    for (i = 0; i < opr_sz; ) {                              \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));      \
        do {                                                 \
            if (pg & 1) {                                    \
                TYPEELT nn = *(TYPEELT *)(vn + H(i));        \
                ret = OP(ret, nn);                           \
            }                                                \
            i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT);    \
        } while (i & 15);                                    \
    }                                                        \
    return (TYPERET)ret;                                     \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP)           \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
{                                                        \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;           \
    TYPEE *n = vn;                                       \
    uint8_t *pg = vg;                                    \
    TYPER ret = INIT;                                    \
    for (i = 0; i < opr_sz; i += 1) {                    \
        if (pg[H1(i)] & 1) {                             \
            TYPEE nn = n[i];                             \
            ret = OP(ret, nn);                           \
        }                                                \
    }                                                    \
    return ret;                                          \
}
702 | ||
/* Bitwise reductions; 0 is the identity for ORR/EOR.  */
DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)

DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)

/* AND starts from all-ones so that inactive elements are identity.  */
DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)

/* Sum reductions accumulate in 64 bits regardless of element size;
   the signed element type sign-extends into the accumulator.  Only
   the unsigned form is instantiated for 64-bit elements.  */
DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)

DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)

/* Min/max reductions start from the identity of each operation
   (type minimum for max, type maximum for min).  */
DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)

DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)

DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)

DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)

#undef DO_VPZ
#undef DO_VPZ_D

/* The element-wise operation macros are no longer needed.  */
#undef DO_AND
#undef DO_ORR
#undef DO_EOR
#undef DO_BIC
#undef DO_ADD
#undef DO_SUB
#undef DO_MAX
#undef DO_MIN
#undef DO_ABD
#undef DO_MUL
#undef DO_DIV
#undef DO_ASR
#undef DO_LSR
#undef DO_LSL
f97cfd59 | 764 | |
028e2a7b RH |
765 | /* Similar to the ARM LastActiveElement pseudocode function, except the |
766 | result is multiplied by the element size. This includes the not found | |
767 | indication; e.g. not found for esz=3 is -8. */ | |
768 | static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz) | |
769 | { | |
770 | uint64_t mask = pred_esz_masks[esz]; | |
771 | intptr_t i = words; | |
772 | ||
773 | do { | |
774 | uint64_t this_g = g[--i] & mask; | |
775 | if (this_g) { | |
776 | return i * 64 + (63 - clz64(this_g)); | |
777 | } | |
778 | } while (i > 0); | |
779 | return (intptr_t)-1 << esz; | |
780 | } | |
781 | ||
782 | uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t words) | |
783 | { | |
784 | uint32_t flags = PREDTEST_INIT; | |
785 | uint64_t *d = vd, *g = vg; | |
786 | intptr_t i = 0; | |
787 | ||
788 | do { | |
789 | uint64_t this_d = d[i]; | |
790 | uint64_t this_g = g[i]; | |
791 | ||
792 | if (this_g) { | |
793 | if (!(flags & 4)) { | |
794 | /* Set in D the first bit of G. */ | |
795 | this_d |= this_g & -this_g; | |
796 | d[i] = this_d; | |
797 | } | |
798 | flags = iter_predtest_fwd(this_d, this_g, flags); | |
799 | } | |
800 | } while (++i < words); | |
801 | ||
802 | return flags; | |
803 | } | |
804 | ||
/* Advance to the next active element past the last active element of D,
 * per the ARM PNEXT instruction, returning the PredTest flags.
 */
uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
{
    /* Unpack the predicate descriptor: size in 64-bit words, and the
       element size as log2 bytes.  */
    intptr_t words = extract32(pred_desc, 0, SIMD_OPRSZ_BITS);
    intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg, esz_mask;
    intptr_t i, next;

    /* Begin the search one element past the last active element of D.  */
    next = last_active_element(vd, words, esz) + (1 << esz);
    esz_mask = pred_esz_masks[esz];

    /* Similar to the pseudocode for pnext, but scaled by ESZ
       so that we find the correct bit.  */
    if (next < words * 64) {
        uint64_t mask = -1;

        /* Ignore bits below NEXT within its containing word.  */
        if (next & 63) {
            mask = ~((1ull << (next & 63)) - 1);
            next &= -64;
        }
        /* Scan forward, word by word, for an active bit in G valid
           for this element size.  */
        do {
            uint64_t this_g = g[next / 64] & esz_mask & mask;
            if (this_g != 0) {
                next = (next & -64) + ctz64(this_g);
                break;
            }
            next += 64;
            mask = -1;
        } while (next < words * 64);
    }

    /* Write out D with only bit NEXT set.  If the scan ran off the end,
       NEXT/64 == WORDS and D becomes all zeros.  Accumulate the PredTest
       flags as we go.  */
    i = 0;
    do {
        uint64_t this_d = 0;
        if (i == next / 64) {
            this_d = 1ull << (next & 63);
        }
        d[i] = this_d;
        flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
    } while (++i < words);

    return flags;
}
ccd841c3 RH |
848 | |
849 | /* Store zero into every active element of Zd. We will use this for two | |
850 | * and three-operand predicated instructions for which logic dictates a | |
851 | * zero result. In particular, logical shift by element size, which is | |
852 | * otherwise undefined on the host. | |
853 | * | |
854 | * For element sizes smaller than uint64_t, we use tables to expand | |
855 | * the N bits of the controlling predicate to a byte mask, and clear | |
856 | * those bytes. | |
857 | */ | |
858 | void HELPER(sve_clr_b)(void *vd, void *vg, uint32_t desc) | |
859 | { | |
860 | intptr_t i, opr_sz = simd_oprsz(desc) / 8; | |
861 | uint64_t *d = vd; | |
862 | uint8_t *pg = vg; | |
863 | for (i = 0; i < opr_sz; i += 1) { | |
864 | d[i] &= ~expand_pred_b(pg[H1(i)]); | |
865 | } | |
866 | } | |
867 | ||
868 | void HELPER(sve_clr_h)(void *vd, void *vg, uint32_t desc) | |
869 | { | |
870 | intptr_t i, opr_sz = simd_oprsz(desc) / 8; | |
871 | uint64_t *d = vd; | |
872 | uint8_t *pg = vg; | |
873 | for (i = 0; i < opr_sz; i += 1) { | |
874 | d[i] &= ~expand_pred_h(pg[H1(i)]); | |
875 | } | |
876 | } | |
877 | ||
878 | void HELPER(sve_clr_s)(void *vd, void *vg, uint32_t desc) | |
879 | { | |
880 | intptr_t i, opr_sz = simd_oprsz(desc) / 8; | |
881 | uint64_t *d = vd; | |
882 | uint8_t *pg = vg; | |
883 | for (i = 0; i < opr_sz; i += 1) { | |
884 | d[i] &= ~expand_pred_s(pg[H1(i)]); | |
885 | } | |
886 | } | |
887 | ||
888 | void HELPER(sve_clr_d)(void *vd, void *vg, uint32_t desc) | |
889 | { | |
890 | intptr_t i, opr_sz = simd_oprsz(desc) / 8; | |
891 | uint64_t *d = vd; | |
892 | uint8_t *pg = vg; | |
893 | for (i = 0; i < opr_sz; i += 1) { | |
894 | if (pg[H1(i)] & 1) { | |
895 | d[i] = 0; | |
896 | } | |
897 | } | |
898 | } | |
899 | ||
/* Three-operand expander, immediate operand, controlled by a predicate.
 *
 * The immediate is passed via the simd_data field of DESC rather than as
 * a separate argument.  The predicate is consumed 16 bits at a time, one
 * predicate bit per byte of vector data; H supplies the host-endian
 * address adjustment for sub-64-bit element types.
 */
#define DO_ZPZI(NAME, TYPE, H, OP)                              \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    TYPE imm = simd_data(desc);                                 \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            if (pg & 1) {                                       \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn, imm);             \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 15);                                       \
    }                                                           \
}
918 | ||
/* Similarly, specialized for 64-bit operands.  Only the low bit of each
 * predicate byte governs an element, and the data needs no host-endian
 * address adjustment.
 */
#define DO_ZPZI_D(NAME, TYPE, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *n = vn;                                      \
    TYPE imm = simd_data(desc);                                 \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 1) {                           \
        if (pg[H1(i)] & 1) {                                    \
            TYPE nn = n[i];                                     \
            d[i] = OP(nn, imm);                                 \
        }                                                       \
    }                                                           \
}
934 | ||
/* Shift by immediate.  Arguments are fully parenthesized so that
   operator precedence cannot leak between the macro expansion and
   its arguments.  */
#define DO_SHR(N, M)  ((N) >> (M))
#define DO_SHL(N, M)  ((N) << (M))

/* Arithmetic shift right for division.  This rounds negative numbers
   toward zero as per signed division.  Therefore before shifting,
   when N is negative, add 2**M-1.  */
#define DO_ASRD(N, M) (((N) + ((N) < 0 ? ((__typeof(N))1 << (M)) - 1 : 0)) >> (M))
942 | ||
/* Shift-by-immediate expansions per element size.  ASR and LSR share
   DO_SHR; the signedness of TYPE selects arithmetic vs logical.  */
DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)

DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)

DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)

DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)

#undef DO_SHR
#undef DO_SHL
#undef DO_ASRD
#undef DO_ZPZI
#undef DO_ZPZI_D
96a36e4a RH |
968 | |
/* Fully general four-operand expander, controlled by a predicate.
 *
 * Computes Zd = OP(Za, Zn, Zm) per active element.  The predicate is
 * consumed 16 bits at a time, one bit per byte of vector data; H is
 * the host-endian address adjustment for sub-64-bit element types.
 */
#define DO_ZPZZZ(NAME, TYPE, H, OP)                             \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,       \
                  void *vg, uint32_t desc)                      \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            if (pg & 1) {                                       \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                TYPE mm = *(TYPE *)(vm + H(i));                 \
                TYPE aa = *(TYPE *)(va + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(aa, nn, mm);          \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 15);                                       \
    }                                                           \
}
989 | ||
/* Similarly, specialized for 64-bit operands.  Only the low bit of
 * each predicate byte governs an element.
 */
#define DO_ZPZZZ_D(NAME, TYPE, OP)                              \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,       \
                  void *vg, uint32_t desc)                      \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *a = va, *n = vn, *m = vm;                    \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 1) {                           \
        if (pg[H1(i)] & 1) {                                    \
            TYPE aa = a[i], nn = n[i], mm = m[i];               \
            d[i] = OP(aa, nn, mm);                              \
        }                                                       \
    }                                                           \
}
1005 | ||
/* Multiply-accumulate / multiply-subtract.  Arguments are fully
   parenthesized so operator precedence cannot leak between the
   expansion and its arguments.  */
#define DO_MLA(A, N, M) ((A) + (N) * (M))
#define DO_MLS(A, N, M) ((A) - (N) * (M))
1008 | ||
/* Predicated multiply-add / multiply-subtract, per element size.  */
DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)

DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)

DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)

DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)

#undef DO_MLA
#undef DO_MLS
#undef DO_ZPZZZ
#undef DO_ZPZZZ_D
9a56c9c3 RH |
1025 | |
1026 | void HELPER(sve_index_b)(void *vd, uint32_t start, | |
1027 | uint32_t incr, uint32_t desc) | |
1028 | { | |
1029 | intptr_t i, opr_sz = simd_oprsz(desc); | |
1030 | uint8_t *d = vd; | |
1031 | for (i = 0; i < opr_sz; i += 1) { | |
1032 | d[H1(i)] = start + i * incr; | |
1033 | } | |
1034 | } | |
1035 | ||
1036 | void HELPER(sve_index_h)(void *vd, uint32_t start, | |
1037 | uint32_t incr, uint32_t desc) | |
1038 | { | |
1039 | intptr_t i, opr_sz = simd_oprsz(desc) / 2; | |
1040 | uint16_t *d = vd; | |
1041 | for (i = 0; i < opr_sz; i += 1) { | |
1042 | d[H2(i)] = start + i * incr; | |
1043 | } | |
1044 | } | |
1045 | ||
1046 | void HELPER(sve_index_s)(void *vd, uint32_t start, | |
1047 | uint32_t incr, uint32_t desc) | |
1048 | { | |
1049 | intptr_t i, opr_sz = simd_oprsz(desc) / 4; | |
1050 | uint32_t *d = vd; | |
1051 | for (i = 0; i < opr_sz; i += 1) { | |
1052 | d[H4(i)] = start + i * incr; | |
1053 | } | |
1054 | } | |
1055 | ||
1056 | void HELPER(sve_index_d)(void *vd, uint64_t start, | |
1057 | uint64_t incr, uint32_t desc) | |
1058 | { | |
1059 | intptr_t i, opr_sz = simd_oprsz(desc) / 8; | |
1060 | uint64_t *d = vd; | |
1061 | for (i = 0; i < opr_sz; i += 1) { | |
1062 | d[i] = start + i * incr; | |
1063 | } | |
1064 | } |