1 /*
2 * ARM SVE Operations
3 *
4 * Copyright (c) 2018 Linaro, Ltd.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "exec/exec-all.h"
23 #include "exec/cpu_ldst.h"
24 #include "exec/helper-proto.h"
25 #include "tcg/tcg-gvec-desc.h"
26 #include "fpu/softfloat.h"
27
28
29 /* Note that vector data is stored in host-endian 64-bit chunks,
30 so addressing units smaller than that need a host-endian fixup. */
31 #ifdef HOST_WORDS_BIGENDIAN
32 #define H1(x) ((x) ^ 7)
33 #define H1_2(x) ((x) ^ 6)
34 #define H1_4(x) ((x) ^ 4)
35 #define H2(x) ((x) ^ 3)
36 #define H4(x) ((x) ^ 1)
37 #else
38 #define H1(x) (x)
39 #define H1_2(x) (x)
40 #define H1_4(x) (x)
41 #define H2(x) (x)
42 #define H4(x) (x)
43 #endif
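
/* For illustration: on a big-endian host, byte element 0 of a 64-bit chunk
 * lives at host byte offset H1(0) == 7, 16-bit element 0 at index H2(0) == 3,
 * and 32-bit element 0 at index H4(0) == 1; on little-endian hosts every
 * H macro is the identity mapping.
 */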
44
45 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
46 *
47 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
48 * and bit 0 set if C is set. Compare the definitions of these variables
49 * within CPUARMState.
50 */
51
52 /* For no G bits set, NZCV = C. */
53 #define PREDTEST_INIT 1
54
55 /* This is an iterative function, called for each Pd and Pg word
56 * moving forward.
57 */
58 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
59 {
60 if (likely(g)) {
61 /* Compute N from first D & G.
62 Use bit 2 to signal first G bit seen. */
63 if (!(flags & 4)) {
64 flags |= ((d & (g & -g)) != 0) << 31;
65 flags |= 4;
66 }
67
68 /* Accumulate Z from each D & G. */
69 flags |= ((d & g) != 0) << 1;
70
71 /* Compute C from last !(D & G). Replace previous. */
72 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
73 }
74 return flags;
75 }
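
/* Worked example: with d == 0x1, g == 0xf and flags starting at PREDTEST_INIT,
 * the first active bit of d is set (N), at least one active bit of d is set
 * (Z clear), and the last active bit of d is clear (C set), so the function
 * returns 0x80000007 -- bits 31, 1 and 0, plus the internal bit-2 marker.
 */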
76
77 /* This is an iterative function, called for each Pd and Pg word
78 * moving backward.
79 */
80 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
81 {
82 if (likely(g)) {
83 /* Compute C from first (i.e last) !(D & G).
84 Use bit 2 to signal first G bit seen. */
85 if (!(flags & 4)) {
86 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
87 flags |= (d & pow2floor(g)) == 0;
88 }
89
90 /* Accumulate Z from each D & G. */
91 flags |= ((d & g) != 0) << 1;
92
93 /* Compute N from last (i.e first) D & G. Replace previous. */
94 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
95 }
96 return flags;
97 }
98
99 /* The same for a single word predicate. */
100 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
101 {
102 return iter_predtest_fwd(d, g, PREDTEST_INIT);
103 }
104
105 /* The same for a multi-word predicate. */
106 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
107 {
108 uint32_t flags = PREDTEST_INIT;
109 uint64_t *d = vd, *g = vg;
110 uintptr_t i = 0;
111
112 do {
113 flags = iter_predtest_fwd(d[i], g[i], flags);
114 } while (++i < words);
115
116 return flags;
117 }
118
119 /* Expand active predicate bits to bytes, for byte elements.
120 * for (i = 0; i < 256; ++i) {
121 * unsigned long m = 0;
122 * for (j = 0; j < 8; j++) {
123 * if ((i >> j) & 1) {
124 * m |= 0xfful << (j << 3);
125 * }
126 * }
127 * printf("0x%016lx,\n", m);
128 * }
129 */
130 static inline uint64_t expand_pred_b(uint8_t byte)
131 {
132 static const uint64_t word[256] = {
133 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
134 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
135 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
136 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
137 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
138 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
139 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
140 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
141 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
142 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
143 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
144 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
145 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
146 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
147 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
148 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
149 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
150 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
151 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
152 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
153 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
154 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
155 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
156 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
157 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
158 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
159 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
160 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
161 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
162 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
163 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
164 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
165 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
166 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
167 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
168 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
169 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
170 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
171 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
172 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
173 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
174 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
175 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
176 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
177 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
178 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
179 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
180 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
181 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
182 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
183 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
184 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
185 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
186 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
187 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
188 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
189 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
190 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
191 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
192 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
193 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
194 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
195 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
196 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
197 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
198 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
199 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
200 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
201 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
202 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
203 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
204 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
205 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
206 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
207 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
208 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
209 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
210 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
211 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
212 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
213 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
214 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
215 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
216 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
217 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
218 0xffffffffffffffff,
219 };
220 return word[byte];
221 }
222
223 /* Similarly for half-word elements.
224 * for (i = 0; i < 256; ++i) {
225 * unsigned long m = 0;
226 * if (i & 0xaa) {
227 * continue;
228 * }
229 * for (j = 0; j < 8; j += 2) {
230 * if ((i >> j) & 1) {
231 * m |= 0xfffful << (j << 3);
232 * }
233 * }
234 * printf("[0x%x] = 0x%016lx,\n", i, m);
235 * }
236 */
237 static inline uint64_t expand_pred_h(uint8_t byte)
238 {
239 static const uint64_t word[] = {
240 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
241 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
242 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
243 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
244 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
245 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
246 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
247 [0x55] = 0xffffffffffffffff,
248 };
249 return word[byte & 0x55];
250 }
251
252 /* Similarly for single word elements. */
253 static inline uint64_t expand_pred_s(uint8_t byte)
254 {
255 static const uint64_t word[] = {
256 [0x01] = 0x00000000ffffffffull,
257 [0x10] = 0xffffffff00000000ull,
258 [0x11] = 0xffffffffffffffffull,
259 };
260 return word[byte & 0x11];
261 }
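
/* For example, expand_pred_b(0x05) == 0x0000000000ff00ff,
 * expand_pred_h(0x05) == 0x00000000ffffffff and
 * expand_pred_s(0x11) == 0xffffffffffffffff: each active predicate bit
 * becomes a solid mask covering the bytes of its element.
 */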
262
263 /* Swap 16-bit words within a 32-bit word. */
264 static inline uint32_t hswap32(uint32_t h)
265 {
266 return rol32(h, 16);
267 }
268
269 /* Swap 16-bit words within a 64-bit word. */
270 static inline uint64_t hswap64(uint64_t h)
271 {
272 uint64_t m = 0x0000ffff0000ffffull;
273 h = rol64(h, 32);
274 return ((h & m) << 16) | ((h >> 16) & m);
275 }
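
/* E.g. hswap64(0x0011223344556677) == 0x6677445522330011. */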
276
277 /* Swap 32-bit words within a 64-bit word. */
278 static inline uint64_t wswap64(uint64_t h)
279 {
280 return rol64(h, 32);
281 }
282
283 #define LOGICAL_PPPP(NAME, FUNC) \
284 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
285 { \
286 uintptr_t opr_sz = simd_oprsz(desc); \
287 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
288 uintptr_t i; \
289 for (i = 0; i < opr_sz / 8; ++i) { \
290 d[i] = FUNC(n[i], m[i], g[i]); \
291 } \
292 }
293
294 #define DO_AND(N, M, G) (((N) & (M)) & (G))
295 #define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
296 #define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
297 #define DO_ORR(N, M, G) (((N) | (M)) & (G))
298 #define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
299 #define DO_NOR(N, M, G) (~((N) | (M)) & (G))
300 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
301 #define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
302
303 LOGICAL_PPPP(sve_and_pppp, DO_AND)
304 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
305 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
306 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
307 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
308 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
309 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
310 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
311
312 #undef DO_AND
313 #undef DO_BIC
314 #undef DO_EOR
315 #undef DO_ORR
316 #undef DO_ORN
317 #undef DO_NOR
318 #undef DO_NAND
319 #undef DO_SEL
320 #undef LOGICAL_PPPP
321
322 /* Fully general three-operand expander, controlled by a predicate.
323 * This is complicated by the host-endian storage of the register file.
324 */
325 /* ??? I don't expect the compiler could ever vectorize this itself.
326 * With some tables we can convert bit masks to byte masks, and with
327 * extra care wrt byte/word ordering we could use gcc generic vectors
328 * and do 16 bytes at a time.
329 */
330 #define DO_ZPZZ(NAME, TYPE, H, OP) \
331 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
332 { \
333 intptr_t i, opr_sz = simd_oprsz(desc); \
334 for (i = 0; i < opr_sz; ) { \
335 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
336 do { \
337 if (pg & 1) { \
338 TYPE nn = *(TYPE *)(vn + H(i)); \
339 TYPE mm = *(TYPE *)(vm + H(i)); \
340 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
341 } \
342 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
343 } while (i & 15); \
344 } \
345 }
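
/* In the expander above, one predicate bit governs one byte of vector data:
 * the outer loop fetches 16 predicate bits (one 16-byte granule) at a time,
 * and "pg >>= sizeof(TYPE)" steps to the bit controlling the next element.
 */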
346
347 /* Similarly, specialized for 64-bit operands. */
348 #define DO_ZPZZ_D(NAME, TYPE, OP) \
349 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
350 { \
351 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
352 TYPE *d = vd, *n = vn, *m = vm; \
353 uint8_t *pg = vg; \
354 for (i = 0; i < opr_sz; i += 1) { \
355 if (pg[H1(i)] & 1) { \
356 TYPE nn = n[i], mm = m[i]; \
357 d[i] = OP(nn, mm); \
358 } \
359 } \
360 }
361
362 #define DO_AND(N, M) (N & M)
363 #define DO_EOR(N, M) (N ^ M)
364 #define DO_ORR(N, M) (N | M)
365 #define DO_BIC(N, M) (N & ~M)
366 #define DO_ADD(N, M) (N + M)
367 #define DO_SUB(N, M) (N - M)
368 #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
369 #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
370 #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
371 #define DO_MUL(N, M) (N * M)
372
373
374 /*
375 * We must avoid the C undefined behaviour cases: division by
376 * zero and signed division of INT_MIN by -1. Both of these
377 * have architecturally defined required results for Arm.
378 * We special case all signed divisions by -1 to avoid having
379 * to deduce the minimum integer for the type involved.
380 */
381 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
382 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
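/* Thus DO_SDIV(n, 0) and DO_UDIV(n, 0) yield 0, and dividing the minimum
 * signed integer by -1 yields that same minimum back (the negation wraps),
 * which are the results the architecture requires.
 */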
383
384 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
385 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
386 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
387 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
388
389 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
390 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
391 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
392 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
393
394 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
395 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
396 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
397 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
398
399 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
400 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
401 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
402 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
403
404 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
405 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
406 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
407 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
408
409 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
410 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
411 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
412 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
413
414 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
415 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
416 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
417 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
418
419 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
420 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
421 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
422 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
423
424 DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
425 DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
426 DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
427 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
428
429 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
430 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
431 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
432 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
433
434 DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
435 DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
436 DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
437 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
438
439 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
440 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
441 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
442 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
443
444 /* Because the computation type is at least twice as large as required,
445 these work for both signed and unsigned source types. */
446 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
447 {
448 return (n * m) >> 8;
449 }
450
451 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
452 {
453 return (n * m) >> 16;
454 }
455
456 static inline uint32_t do_mulh_s(int64_t n, int64_t m)
457 {
458 return (n * m) >> 32;
459 }
460
461 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
462 {
463 uint64_t lo, hi;
464 muls64(&lo, &hi, n, m);
465 return hi;
466 }
467
468 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
469 {
470 uint64_t lo, hi;
471 mulu64(&lo, &hi, n, m);
472 return hi;
473 }
474
475 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
476 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
477 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
478 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
479
480 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
481 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
482 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
483 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
484
485 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
486 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
487 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
488 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
489
490 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
491 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
492
493 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
494 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
495
496 /* Note that all bits of the shift are significant
497 and not modulo the element size. */
498 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
499 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
500 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
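
/* E.g. DO_LSR of a uint64_t by 64 or more gives 0, and DO_ASR clamps the
 * shift count to 63 so the sign bit is replicated; a plain C shift by the
 * full 64-bit width would be undefined behaviour.
 */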
501
502 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
503 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
504 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)
505
506 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
507 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
508 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)
509
510 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
511 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
512 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
513
514 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
515 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
516 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
517
518 #undef DO_ZPZZ
519 #undef DO_ZPZZ_D
520
521 /* Three-operand expander, controlled by a predicate, in which the
522 * third operand is "wide". That is, for D = N op M, the same 64-bit
523 * value of M is used with all of the narrower values of N.
524 */
525 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
526 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
527 { \
528 intptr_t i, opr_sz = simd_oprsz(desc); \
529 for (i = 0; i < opr_sz; ) { \
530 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
531 TYPEW mm = *(TYPEW *)(vm + i); \
532 do { \
533 if (pg & 1) { \
534 TYPE nn = *(TYPE *)(vn + H(i)); \
535 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
536 } \
537 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
538 } while (i & 7); \
539 } \
540 }
541
542 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
543 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
544 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
545
546 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
547 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
548 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
549
550 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
551 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
552 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
553
554 #undef DO_ZPZW
555
556 /* Fully general two-operand expander, controlled by a predicate.
557 */
558 #define DO_ZPZ(NAME, TYPE, H, OP) \
559 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
560 { \
561 intptr_t i, opr_sz = simd_oprsz(desc); \
562 for (i = 0; i < opr_sz; ) { \
563 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
564 do { \
565 if (pg & 1) { \
566 TYPE nn = *(TYPE *)(vn + H(i)); \
567 *(TYPE *)(vd + H(i)) = OP(nn); \
568 } \
569 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
570 } while (i & 15); \
571 } \
572 }
573
574 /* Similarly, specialized for 64-bit operands. */
575 #define DO_ZPZ_D(NAME, TYPE, OP) \
576 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
577 { \
578 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
579 TYPE *d = vd, *n = vn; \
580 uint8_t *pg = vg; \
581 for (i = 0; i < opr_sz; i += 1) { \
582 if (pg[H1(i)] & 1) { \
583 TYPE nn = n[i]; \
584 d[i] = OP(nn); \
585 } \
586 } \
587 }
588
589 #define DO_CLS_B(N) (clrsb32(N) - 24)
590 #define DO_CLS_H(N) (clrsb32(N) - 16)
591
592 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
593 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
594 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
595 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
596
597 #define DO_CLZ_B(N) (clz32(N) - 24)
598 #define DO_CLZ_H(N) (clz32(N) - 16)
599
600 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
601 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
602 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
603 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
604
605 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
606 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
607 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
608 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
609
610 #define DO_CNOT(N) (N == 0)
611
612 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
613 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
614 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
615 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
616
617 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
618
619 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
620 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
621 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
622
623 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
624
625 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
626 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
627 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
628
629 #define DO_NOT(N) (~N)
630
631 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
632 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
633 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
634 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
635
636 #define DO_SXTB(N) ((int8_t)N)
637 #define DO_SXTH(N) ((int16_t)N)
638 #define DO_SXTS(N) ((int32_t)N)
639 #define DO_UXTB(N) ((uint8_t)N)
640 #define DO_UXTH(N) ((uint16_t)N)
641 #define DO_UXTS(N) ((uint32_t)N)
642
643 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
644 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
645 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
646 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
647 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
648 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
649
650 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
651 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
652 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
653 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
654 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
655 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
656
657 #define DO_ABS(N) (N < 0 ? -N : N)
658
659 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
660 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
661 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
662 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
663
664 #define DO_NEG(N) (-N)
665
666 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
667 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
668 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
669 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
670
671 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
672 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
673 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
674
675 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
676 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
677
678 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
679
680 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
681 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
682 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
683 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
684
685 /* Three-operand expander, unpredicated, in which the third operand is "wide".
686 */
687 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
688 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
689 { \
690 intptr_t i, opr_sz = simd_oprsz(desc); \
691 for (i = 0; i < opr_sz; ) { \
692 TYPEW mm = *(TYPEW *)(vm + i); \
693 do { \
694 TYPE nn = *(TYPE *)(vn + H(i)); \
695 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
696 i += sizeof(TYPE); \
697 } while (i & 7); \
698 } \
699 }
700
701 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
702 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
703 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
704
705 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
706 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
707 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
708
709 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
710 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
711 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
712
713 #undef DO_ZZW
714
715 #undef DO_CLS_B
716 #undef DO_CLS_H
717 #undef DO_CLZ_B
718 #undef DO_CLZ_H
719 #undef DO_CNOT
720 #undef DO_FABS
721 #undef DO_FNEG
722 #undef DO_ABS
723 #undef DO_NEG
724 #undef DO_ZPZ
725 #undef DO_ZPZ_D
726
727 /* Two-operand reduction expander, controlled by a predicate.
728 * The difference between TYPERED and TYPERET has to do with
729 * sign-extension. E.g. for SMAX, TYPERED must be signed,
730 * but TYPERET must be unsigned so that e.g. a 32-bit value
731 * is not sign-extended to the ABI uint64_t return type.
732 */
733 /* ??? If we were to vectorize this by hand the reduction ordering
734 * would change. For integer operands, this is perfectly fine.
735 */
736 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
737 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
738 { \
739 intptr_t i, opr_sz = simd_oprsz(desc); \
740 TYPERED ret = INIT; \
741 for (i = 0; i < opr_sz; ) { \
742 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
743 do { \
744 if (pg & 1) { \
745 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
746 ret = OP(ret, nn); \
747 } \
748 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
749 } while (i & 15); \
750 } \
751 return (TYPERET)ret; \
752 }
753
754 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
755 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
756 { \
757 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
758 TYPEE *n = vn; \
759 uint8_t *pg = vg; \
760 TYPER ret = INIT; \
761 for (i = 0; i < opr_sz; i += 1) { \
762 if (pg[H1(i)] & 1) { \
763 TYPEE nn = n[i]; \
764 ret = OP(ret, nn); \
765 } \
766 } \
767 return ret; \
768 }
769
770 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
771 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
772 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
773 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
774
775 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
776 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
777 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
778 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
779
780 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
781 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
782 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
783 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
784
785 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
786 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
787 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
788
789 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
790 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
791 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
792 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
793
794 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
795 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
796 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
797 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
798
799 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
800 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
801 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
802 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
803
804 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
805 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
806 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
807 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
808
809 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
810 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
811 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
812 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
813
814 #undef DO_VPZ
815 #undef DO_VPZ_D
816
817 /* Two vector operand, one scalar operand, unpredicated. */
818 #define DO_ZZI(NAME, TYPE, OP) \
819 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
820 { \
821 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
822 TYPE s = s64, *d = vd, *n = vn; \
823 for (i = 0; i < opr_sz; ++i) { \
824 d[i] = OP(n[i], s); \
825 } \
826 }
827
828 #define DO_SUBR(X, Y) (Y - X)
829
830 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
831 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
832 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
833 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
834
835 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
836 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
837 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
838 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
839
840 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
841 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
842 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
843 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
844
845 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
846 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
847 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
848 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
849
850 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
851 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
852 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
853 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
854
855 #undef DO_ZZI
856
857 #undef DO_AND
858 #undef DO_ORR
859 #undef DO_EOR
860 #undef DO_BIC
861 #undef DO_ADD
862 #undef DO_SUB
863 #undef DO_MAX
864 #undef DO_MIN
865 #undef DO_ABD
866 #undef DO_MUL
867 #undef DO_DIV
868 #undef DO_ASR
869 #undef DO_LSR
870 #undef DO_LSL
871 #undef DO_SUBR
872
873 /* Similar to the ARM LastActiveElement pseudocode function, except the
874 result is multiplied by the element size. This includes the not found
875 indication; e.g. not found for esz=3 is -8. */
876 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
877 {
878 uint64_t mask = pred_esz_masks[esz];
879 intptr_t i = words;
880
881 do {
882 uint64_t this_g = g[--i] & mask;
883 if (this_g) {
884 return i * 64 + (63 - clz64(this_g));
885 }
886 } while (i > 0);
887 return (intptr_t)-1 << esz;
888 }
889
890 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t words)
891 {
892 uint32_t flags = PREDTEST_INIT;
893 uint64_t *d = vd, *g = vg;
894 intptr_t i = 0;
895
896 do {
897 uint64_t this_d = d[i];
898 uint64_t this_g = g[i];
899
900 if (this_g) {
901 if (!(flags & 4)) {
902 /* Set in D the first bit of G. */
903 this_d |= this_g & -this_g;
904 d[i] = this_d;
905 }
906 flags = iter_predtest_fwd(this_d, this_g, flags);
907 }
908 } while (++i < words);
909
910 return flags;
911 }
912
913 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
914 {
915 intptr_t words = extract32(pred_desc, 0, SIMD_OPRSZ_BITS);
916 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
917 uint32_t flags = PREDTEST_INIT;
918 uint64_t *d = vd, *g = vg, esz_mask;
919 intptr_t i, next;
920
921 next = last_active_element(vd, words, esz) + (1 << esz);
922 esz_mask = pred_esz_masks[esz];
923
924 /* Similar to the pseudocode for pnext, but scaled by ESZ
925 so that we find the correct bit. */
926 if (next < words * 64) {
927 uint64_t mask = -1;
928
929 if (next & 63) {
930 mask = ~((1ull << (next & 63)) - 1);
931 next &= -64;
932 }
933 do {
934 uint64_t this_g = g[next / 64] & esz_mask & mask;
935 if (this_g != 0) {
936 next = (next & -64) + ctz64(this_g);
937 break;
938 }
939 next += 64;
940 mask = -1;
941 } while (next < words * 64);
942 }
943
944 i = 0;
945 do {
946 uint64_t this_d = 0;
947 if (i == next / 64) {
948 this_d = 1ull << (next & 63);
949 }
950 d[i] = this_d;
951 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
952 } while (++i < words);
953
954 return flags;
955 }
956
957 /* Store zero into every active element of Zd. We will use this for two
958 * and three-operand predicated instructions for which logic dictates a
959 * zero result. In particular, logical shift by element size, which is
960 * otherwise undefined on the host.
961 *
962 * For element sizes smaller than uint64_t, we use tables to expand
963 * the N bits of the controlling predicate to a byte mask, and clear
964 * those bytes.
965 */
966 void HELPER(sve_clr_b)(void *vd, void *vg, uint32_t desc)
967 {
968 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
969 uint64_t *d = vd;
970 uint8_t *pg = vg;
971 for (i = 0; i < opr_sz; i += 1) {
972 d[i] &= ~expand_pred_b(pg[H1(i)]);
973 }
974 }
975
976 void HELPER(sve_clr_h)(void *vd, void *vg, uint32_t desc)
977 {
978 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
979 uint64_t *d = vd;
980 uint8_t *pg = vg;
981 for (i = 0; i < opr_sz; i += 1) {
982 d[i] &= ~expand_pred_h(pg[H1(i)]);
983 }
984 }
985
986 void HELPER(sve_clr_s)(void *vd, void *vg, uint32_t desc)
987 {
988 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
989 uint64_t *d = vd;
990 uint8_t *pg = vg;
991 for (i = 0; i < opr_sz; i += 1) {
992 d[i] &= ~expand_pred_s(pg[H1(i)]);
993 }
994 }
995
996 void HELPER(sve_clr_d)(void *vd, void *vg, uint32_t desc)
997 {
998 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
999 uint64_t *d = vd;
1000 uint8_t *pg = vg;
1001 for (i = 0; i < opr_sz; i += 1) {
1002 if (pg[H1(i)] & 1) {
1003 d[i] = 0;
1004 }
1005 }
1006 }
1007
1008 /* Copy Zn into Zd, and store zero into inactive elements. */
1009 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1010 {
1011 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1012 uint64_t *d = vd, *n = vn;
1013 uint8_t *pg = vg;
1014 for (i = 0; i < opr_sz; i += 1) {
1015 d[i] = n[i] & expand_pred_b(pg[H1(i)]);
1016 }
1017 }
1018
1019 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1020 {
1021 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1022 uint64_t *d = vd, *n = vn;
1023 uint8_t *pg = vg;
1024 for (i = 0; i < opr_sz; i += 1) {
1025 d[i] = n[i] & expand_pred_h(pg[H1(i)]);
1026 }
1027 }
1028
1029 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
1030 {
1031 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1032 uint64_t *d = vd, *n = vn;
1033 uint8_t *pg = vg;
1034 for (i = 0; i < opr_sz; i += 1) {
1035 d[i] = n[i] & expand_pred_s(pg[H1(i)]);
1036 }
1037 }
1038
1039 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1040 {
1041 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1042 uint64_t *d = vd, *n = vn;
1043 uint8_t *pg = vg;
1044 for (i = 0; i < opr_sz; i += 1) {
1045 d[i] = n[i] & -(uint64_t)(pg[H1(i)] & 1);
1046 }
1047 }
1048
1049 /* Three-operand expander, immediate operand, controlled by a predicate.
1050 */
1051 #define DO_ZPZI(NAME, TYPE, H, OP) \
1052 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1053 { \
1054 intptr_t i, opr_sz = simd_oprsz(desc); \
1055 TYPE imm = simd_data(desc); \
1056 for (i = 0; i < opr_sz; ) { \
1057 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1058 do { \
1059 if (pg & 1) { \
1060 TYPE nn = *(TYPE *)(vn + H(i)); \
1061 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
1062 } \
1063 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1064 } while (i & 15); \
1065 } \
1066 }
1067
1068 /* Similarly, specialized for 64-bit operands. */
1069 #define DO_ZPZI_D(NAME, TYPE, OP) \
1070 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1071 { \
1072 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1073 TYPE *d = vd, *n = vn; \
1074 TYPE imm = simd_data(desc); \
1075 uint8_t *pg = vg; \
1076 for (i = 0; i < opr_sz; i += 1) { \
1077 if (pg[H1(i)] & 1) { \
1078 TYPE nn = n[i]; \
1079 d[i] = OP(nn, imm); \
1080 } \
1081 } \
1082 }
1083
1084 #define DO_SHR(N, M) (N >> M)
1085 #define DO_SHL(N, M) (N << M)
1086
1087 /* Arithmetic shift right for division. This rounds negative numbers
1088 toward zero as per signed division. Therefore before shifting,
1089 when N is negative, add 2**M-1. */
1090 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
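
/* E.g. DO_ASRD(-7, 2) computes (-7 + 3) >> 2 == -1, matching -7 / 4 rounded
 * toward zero, where a plain arithmetic shift would give -2.
 */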
1091
1092 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
1093 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
1094 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
1095 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
1096
1097 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
1098 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
1099 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
1100 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
1101
1102 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
1103 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
1104 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
1105 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
1106
1107 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
1108 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
1109 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
1110 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
1111
1112 #undef DO_SHR
1113 #undef DO_SHL
1114 #undef DO_ASRD
1115 #undef DO_ZPZI
1116 #undef DO_ZPZI_D
1117
1118 /* Fully general four-operand expander, controlled by a predicate.
1119 */
1120 #define DO_ZPZZZ(NAME, TYPE, H, OP) \
1121 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1122 void *vg, uint32_t desc) \
1123 { \
1124 intptr_t i, opr_sz = simd_oprsz(desc); \
1125 for (i = 0; i < opr_sz; ) { \
1126 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1127 do { \
1128 if (pg & 1) { \
1129 TYPE nn = *(TYPE *)(vn + H(i)); \
1130 TYPE mm = *(TYPE *)(vm + H(i)); \
1131 TYPE aa = *(TYPE *)(va + H(i)); \
1132 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
1133 } \
1134 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1135 } while (i & 15); \
1136 } \
1137 }
1138
1139 /* Similarly, specialized for 64-bit operands. */
1140 #define DO_ZPZZZ_D(NAME, TYPE, OP) \
1141 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1142 void *vg, uint32_t desc) \
1143 { \
1144 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1145 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
1146 uint8_t *pg = vg; \
1147 for (i = 0; i < opr_sz; i += 1) { \
1148 if (pg[H1(i)] & 1) { \
1149 TYPE aa = a[i], nn = n[i], mm = m[i]; \
1150 d[i] = OP(aa, nn, mm); \
1151 } \
1152 } \
1153 }
1154
1155 #define DO_MLA(A, N, M) (A + N * M)
1156 #define DO_MLS(A, N, M) (A - N * M)
1157
1158 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
1159 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
1160
1161 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
1162 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
1163
1164 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
1165 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
1166
1167 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
1168 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
1169
1170 #undef DO_MLA
1171 #undef DO_MLS
1172 #undef DO_ZPZZZ
1173 #undef DO_ZPZZZ_D
1174
1175 void HELPER(sve_index_b)(void *vd, uint32_t start,
1176 uint32_t incr, uint32_t desc)
1177 {
1178 intptr_t i, opr_sz = simd_oprsz(desc);
1179 uint8_t *d = vd;
1180 for (i = 0; i < opr_sz; i += 1) {
1181 d[H1(i)] = start + i * incr;
1182 }
1183 }
1184
1185 void HELPER(sve_index_h)(void *vd, uint32_t start,
1186 uint32_t incr, uint32_t desc)
1187 {
1188 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1189 uint16_t *d = vd;
1190 for (i = 0; i < opr_sz; i += 1) {
1191 d[H2(i)] = start + i * incr;
1192 }
1193 }
1194
1195 void HELPER(sve_index_s)(void *vd, uint32_t start,
1196 uint32_t incr, uint32_t desc)
1197 {
1198 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1199 uint32_t *d = vd;
1200 for (i = 0; i < opr_sz; i += 1) {
1201 d[H4(i)] = start + i * incr;
1202 }
1203 }
1204
1205 void HELPER(sve_index_d)(void *vd, uint64_t start,
1206 uint64_t incr, uint32_t desc)
1207 {
1208 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1209 uint64_t *d = vd;
1210 for (i = 0; i < opr_sz; i += 1) {
1211 d[i] = start + i * incr;
1212 }
1213 }
1214
1215 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
1216 {
1217 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1218 uint32_t sh = simd_data(desc);
1219 uint32_t *d = vd, *n = vn, *m = vm;
1220 for (i = 0; i < opr_sz; i += 1) {
1221 d[i] = n[i] + (m[i] << sh);
1222 }
1223 }
1224
1225 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
1226 {
1227 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1228 uint64_t sh = simd_data(desc);
1229 uint64_t *d = vd, *n = vn, *m = vm;
1230 for (i = 0; i < opr_sz; i += 1) {
1231 d[i] = n[i] + (m[i] << sh);
1232 }
1233 }
1234
1235 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
1236 {
1237 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1238 uint64_t sh = simd_data(desc);
1239 uint64_t *d = vd, *n = vn, *m = vm;
1240 for (i = 0; i < opr_sz; i += 1) {
1241 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
1242 }
1243 }
1244
1245 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
1246 {
1247 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1248 uint64_t sh = simd_data(desc);
1249 uint64_t *d = vd, *n = vn, *m = vm;
1250 for (i = 0; i < opr_sz; i += 1) {
1251 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
1252 }
1253 }
1254
1255 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
1256 {
1257 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1258 static const uint16_t coeff[] = {
1259 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
1260 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
1261 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
1262 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
1263 };
1264 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1265 uint16_t *d = vd, *n = vn;
1266
1267 for (i = 0; i < opr_sz; i++) {
1268 uint16_t nn = n[i];
1269 intptr_t idx = extract32(nn, 0, 5);
1270 uint16_t exp = extract32(nn, 5, 5);
1271 d[i] = coeff[idx] | (exp << 10);
1272 }
1273 }
1274
1275 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
1276 {
1277 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1278 static const uint32_t coeff[] = {
1279 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
1280 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
1281 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
1282 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
1283 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
1284 0x1ef532, 0x20b051, 0x227043, 0x243516,
1285 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
1286 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
1287 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
1288 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
1289 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
1290 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
1291 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
1292 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
1293 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
1294 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
1295 };
1296 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1297 uint32_t *d = vd, *n = vn;
1298
1299 for (i = 0; i < opr_sz; i++) {
1300 uint32_t nn = n[i];
1301 intptr_t idx = extract32(nn, 0, 6);
1302 uint32_t exp = extract32(nn, 6, 8);
1303 d[i] = coeff[idx] | (exp << 23);
1304 }
1305 }
1306
1307 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
1308 {
1309 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1310 static const uint64_t coeff[] = {
1311 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
1312 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
1313 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
1314 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
1315 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
1316 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
1317 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
1318 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
1319 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
1320 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
1321 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
1322 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
1323 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
1324 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
1325 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
1326 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
1327 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
1328 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
1329 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
1330 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
1331 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
1332 0xFA7C1819E90D8ull,
1333 };
1334 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1335 uint64_t *d = vd, *n = vn;
1336
1337 for (i = 0; i < opr_sz; i++) {
1338 uint64_t nn = n[i];
1339 intptr_t idx = extract32(nn, 0, 6);
1340 uint64_t exp = extract32(nn, 6, 11);
1341 d[i] = coeff[idx] | (exp << 52);
1342 }
1343 }
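
/* In the three fexpa helpers above, the low bits of each element index a
 * table holding the fraction field of 2**(idx/32) (half precision) or
 * 2**(idx/64) (single and double precision), and the remaining input bits
 * are placed directly into the exponent field of the result.
 */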
1344
1345 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
1346 {
1347 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1348 uint16_t *d = vd, *n = vn, *m = vm;
1349 for (i = 0; i < opr_sz; i += 1) {
1350 uint16_t nn = n[i];
1351 uint16_t mm = m[i];
1352 if (mm & 1) {
1353 nn = float16_one;
1354 }
1355 d[i] = nn ^ (mm & 2) << 14;
1356 }
1357 }
1358
1359 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
1360 {
1361 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1362 uint32_t *d = vd, *n = vn, *m = vm;
1363 for (i = 0; i < opr_sz; i += 1) {
1364 uint32_t nn = n[i];
1365 uint32_t mm = m[i];
1366 if (mm & 1) {
1367 nn = float32_one;
1368 }
1369 d[i] = nn ^ (mm & 2) << 30;
1370 }
1371 }
1372
1373 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
1374 {
1375 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1376 uint64_t *d = vd, *n = vn, *m = vm;
1377 for (i = 0; i < opr_sz; i += 1) {
1378 uint64_t nn = n[i];
1379 uint64_t mm = m[i];
1380 if (mm & 1) {
1381 nn = float64_one;
1382 }
1383 d[i] = nn ^ (mm & 2) << 62;
1384 }
1385 }
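
/* In the ftssel helpers above, bit 0 of the second operand selects the
 * constant 1.0 in place of the input, and bit 1 flips the floating-point
 * sign bit (shifted to bit 15, 31 or 63 respectively).
 */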
1386
1387 /*
1388 * Signed saturating addition with scalar operand.
1389 */
1390
1391 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1392 {
1393 intptr_t i, oprsz = simd_oprsz(desc);
1394
1395 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
1396 int r = *(int8_t *)(a + i) + b;
1397 if (r > INT8_MAX) {
1398 r = INT8_MAX;
1399 } else if (r < INT8_MIN) {
1400 r = INT8_MIN;
1401 }
1402 *(int8_t *)(d + i) = r;
1403 }
1404 }
1405
1406 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1407 {
1408 intptr_t i, oprsz = simd_oprsz(desc);
1409
1410 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
1411 int r = *(int16_t *)(a + i) + b;
1412 if (r > INT16_MAX) {
1413 r = INT16_MAX;
1414 } else if (r < INT16_MIN) {
1415 r = INT16_MIN;
1416 }
1417 *(int16_t *)(d + i) = r;
1418 }
1419 }
1420
1421 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1422 {
1423 intptr_t i, oprsz = simd_oprsz(desc);
1424
1425 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
1426 int64_t r = *(int32_t *)(a + i) + b;
1427 if (r > INT32_MAX) {
1428 r = INT32_MAX;
1429 } else if (r < INT32_MIN) {
1430 r = INT32_MIN;
1431 }
1432 *(int32_t *)(d + i) = r;
1433 }
1434 }
1435
1436 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
1437 {
1438 intptr_t i, oprsz = simd_oprsz(desc);
1439
1440 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
1441 int64_t ai = *(int64_t *)(a + i);
1442 int64_t r = ai + b;
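/* Signed overflow occurred iff ai and b have the same sign but r differs:
 * exactly then the top bit of (r ^ ai) & ~(ai ^ b) is set, making the
 * value negative.
 */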
1443 if (((r ^ ai) & ~(ai ^ b)) < 0) {
1444 /* Signed overflow. */
1445 r = (r < 0 ? INT64_MAX : INT64_MIN);
1446 }
1447 *(int64_t *)(d + i) = r;
1448 }
1449 }
1450
1451 /*
1452 * Unsigned saturating addition with scalar operand.
1453 */
1454
1455 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1456 {
1457 intptr_t i, oprsz = simd_oprsz(desc);
1458
1459 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
1460 int r = *(uint8_t *)(a + i) + b;
1461 if (r > UINT8_MAX) {
1462 r = UINT8_MAX;
1463 } else if (r < 0) {
1464 r = 0;
1465 }
1466 *(uint8_t *)(d + i) = r;
1467 }
1468 }
1469
1470 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1471 {
1472 intptr_t i, oprsz = simd_oprsz(desc);
1473
1474 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
1475 int r = *(uint16_t *)(a + i) + b;
1476 if (r > UINT16_MAX) {
1477 r = UINT16_MAX;
1478 } else if (r < 0) {
1479 r = 0;
1480 }
1481 *(uint16_t *)(d + i) = r;
1482 }
1483 }
1484
1485 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1486 {
1487 intptr_t i, oprsz = simd_oprsz(desc);
1488
1489 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
1490 int64_t r = *(uint32_t *)(a + i) + b;
1491 if (r > UINT32_MAX) {
1492 r = UINT32_MAX;
1493 } else if (r < 0) {
1494 r = 0;
1495 }
1496 *(uint32_t *)(d + i) = r;
1497 }
1498 }
1499
1500 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1501 {
1502 intptr_t i, oprsz = simd_oprsz(desc);
1503
1504 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1505 uint64_t r = *(uint64_t *)(a + i) + b;
1506 if (r < b) {
1507 r = UINT64_MAX;
1508 }
1509 *(uint64_t *)(d + i) = r;
1510 }
1511 }
1512
1513 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1514 {
1515 intptr_t i, oprsz = simd_oprsz(desc);
1516
1517 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1518 uint64_t ai = *(uint64_t *)(a + i);
1519 *(uint64_t *)(d + i) = (ai < b ? 0 : ai - b);
1520 }
1521 }
1522
1523 /* Two operand predicated copy immediate with merge. All valid immediates
1524 * can fit within 17 signed bits in the simd_data field.
1525 */
1526 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
1527 uint64_t mm, uint32_t desc)
1528 {
1529 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1530 uint64_t *d = vd, *n = vn;
1531 uint8_t *pg = vg;
1532
1533 mm = dup_const(MO_8, mm);
1534 for (i = 0; i < opr_sz; i += 1) {
1535 uint64_t nn = n[i];
1536 uint64_t pp = expand_pred_b(pg[H1(i)]);
1537 d[i] = (mm & pp) | (nn & ~pp);
1538 }
1539 }
1540
1541 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
1542 uint64_t mm, uint32_t desc)
1543 {
1544 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1545 uint64_t *d = vd, *n = vn;
1546 uint8_t *pg = vg;
1547
1548 mm = dup_const(MO_16, mm);
1549 for (i = 0; i < opr_sz; i += 1) {
1550 uint64_t nn = n[i];
1551 uint64_t pp = expand_pred_h(pg[H1(i)]);
1552 d[i] = (mm & pp) | (nn & ~pp);
1553 }
1554 }
1555
1556 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
1557 uint64_t mm, uint32_t desc)
1558 {
1559 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1560 uint64_t *d = vd, *n = vn;
1561 uint8_t *pg = vg;
1562
1563 mm = dup_const(MO_32, mm);
1564 for (i = 0; i < opr_sz; i += 1) {
1565 uint64_t nn = n[i];
1566 uint64_t pp = expand_pred_s(pg[H1(i)]);
1567 d[i] = (mm & pp) | (nn & ~pp);
1568 }
1569 }
1570
1571 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
1572 uint64_t mm, uint32_t desc)
1573 {
1574 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1575 uint64_t *d = vd, *n = vn;
1576 uint8_t *pg = vg;
1577
1578 for (i = 0; i < opr_sz; i += 1) {
1579 uint64_t nn = n[i];
1580 d[i] = (pg[H1(i)] & 1 ? mm : nn);
1581 }
1582 }
1583
1584 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
1585 {
1586 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1587 uint64_t *d = vd;
1588 uint8_t *pg = vg;
1589
1590 val = dup_const(MO_8, val);
1591 for (i = 0; i < opr_sz; i += 1) {
1592 d[i] = val & expand_pred_b(pg[H1(i)]);
1593 }
1594 }
1595
1596 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
1597 {
1598 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1599 uint64_t *d = vd;
1600 uint8_t *pg = vg;
1601
1602 val = dup_const(MO_16, val);
1603 for (i = 0; i < opr_sz; i += 1) {
1604 d[i] = val & expand_pred_h(pg[H1(i)]);
1605 }
1606 }
1607
1608 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
1609 {
1610 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1611 uint64_t *d = vd;
1612 uint8_t *pg = vg;
1613
1614 val = dup_const(MO_32, val);
1615 for (i = 0; i < opr_sz; i += 1) {
1616 d[i] = val & expand_pred_s(pg[H1(i)]);
1617 }
1618 }
1619
1620 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
1621 {
1622 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1623 uint64_t *d = vd;
1624 uint8_t *pg = vg;
1625
1626 for (i = 0; i < opr_sz; i += 1) {
1627 d[i] = (pg[H1(i)] & 1 ? val : 0);
1628 }
1629 }
1630
1631 /* Big-endian hosts need to frob the byte indices. If the copy
1632 * happens to be 8-byte aligned, then no frobbing necessary.
1633 */
1634 static void swap_memmove(void *vd, void *vs, size_t n)
1635 {
1636 uintptr_t d = (uintptr_t)vd;
1637 uintptr_t s = (uintptr_t)vs;
1638 uintptr_t o = (d | s | n) & 7;
1639 size_t i;
1640
1641 #ifndef HOST_WORDS_BIGENDIAN
1642 o = 0;
1643 #endif
1644 switch (o) {
1645 case 0:
1646 memmove(vd, vs, n);
1647 break;
1648
1649 case 4:
1650 if (d < s || d >= s + n) {
1651 for (i = 0; i < n; i += 4) {
1652 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1653 }
1654 } else {
1655 for (i = n; i > 0; ) {
1656 i -= 4;
1657 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1658 }
1659 }
1660 break;
1661
1662 case 2:
1663 case 6:
1664 if (d < s || d >= s + n) {
1665 for (i = 0; i < n; i += 2) {
1666 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1667 }
1668 } else {
1669 for (i = n; i > 0; ) {
1670 i -= 2;
1671 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1672 }
1673 }
1674 break;
1675
1676 default:
1677 if (d < s || d >= s + n) {
1678 for (i = 0; i < n; i++) {
1679 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1680 }
1681 } else {
1682 for (i = n; i > 0; ) {
1683 i -= 1;
1684 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1685 }
1686 }
1687 break;
1688 }
1689 }
1690
1691 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
1692 {
1693 intptr_t opr_sz = simd_oprsz(desc);
1694 size_t n_ofs = simd_data(desc);
1695 size_t n_siz = opr_sz - n_ofs;
1696
1697 if (vd != vm) {
1698 swap_memmove(vd, vn + n_ofs, n_siz);
1699 swap_memmove(vd + n_siz, vm, n_ofs);
1700 } else if (vd != vn) {
1701 swap_memmove(vd + n_siz, vd, n_ofs);
1702 swap_memmove(vd, vn + n_ofs, n_siz);
1703 } else {
1704 /* vd == vn == vm. Need temp space. */
1705 ARMVectorReg tmp;
1706 swap_memmove(&tmp, vm, n_ofs);
1707 swap_memmove(vd, vd + n_ofs, n_siz);
1708 memcpy(vd + n_siz, &tmp, n_ofs);
1709 }
1710 }
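
/* EXT above treats Zn followed by Zm as one concatenated vector and extracts
 * opr_sz bytes starting at byte n_ofs; the three branches order the moves so
 * that overlapping source and destination registers are never clobbered
 * before they are read, falling back to a temporary only when all three alias.
 */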
1711
1712 #define DO_INSR(NAME, TYPE, H) \
1713 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
1714 { \
1715 intptr_t opr_sz = simd_oprsz(desc); \
1716 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
1717 *(TYPE *)(vd + H(0)) = val; \
1718 }
1719
1720 DO_INSR(sve_insr_b, uint8_t, H1)
1721 DO_INSR(sve_insr_h, uint16_t, H1_2)
1722 DO_INSR(sve_insr_s, uint32_t, H1_4)
1723 DO_INSR(sve_insr_d, uint64_t, )
1724
1725 #undef DO_INSR
1726
1727 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
1728 {
1729 intptr_t i, j, opr_sz = simd_oprsz(desc);
1730 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1731 uint64_t f = *(uint64_t *)(vn + i);
1732 uint64_t b = *(uint64_t *)(vn + j);
1733 *(uint64_t *)(vd + i) = bswap64(b);
1734 *(uint64_t *)(vd + j) = bswap64(f);
1735 }
1736 }
1737
1738 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
1739 {
1740 intptr_t i, j, opr_sz = simd_oprsz(desc);
1741 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1742 uint64_t f = *(uint64_t *)(vn + i);
1743 uint64_t b = *(uint64_t *)(vn + j);
1744 *(uint64_t *)(vd + i) = hswap64(b);
1745 *(uint64_t *)(vd + j) = hswap64(f);
1746 }
1747 }
1748
1749 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
1750 {
1751 intptr_t i, j, opr_sz = simd_oprsz(desc);
1752 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1753 uint64_t f = *(uint64_t *)(vn + i);
1754 uint64_t b = *(uint64_t *)(vn + j);
1755 *(uint64_t *)(vd + i) = rol64(b, 32);
1756 *(uint64_t *)(vd + j) = rol64(f, 32);
1757 }
1758 }
1759
1760 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
1761 {
1762 intptr_t i, j, opr_sz = simd_oprsz(desc);
1763 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1764 uint64_t f = *(uint64_t *)(vn + i);
1765 uint64_t b = *(uint64_t *)(vn + j);
1766 *(uint64_t *)(vd + i) = b;
1767 *(uint64_t *)(vd + j) = f;
1768 }
1769 }
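/* Illustrative note (not part of the upstream source): each REV flavour
 * above swaps the 64-bit chunks end-for-end and then reverses the
 * elements within each chunk: bswap64 for bytes, hswap64 for halfwords,
 * a 32-bit rotate for words, and nothing further for doublewords.
 */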
1770
1771 #define DO_TBL(NAME, TYPE, H) \
1772 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1773 { \
1774 intptr_t i, opr_sz = simd_oprsz(desc); \
1775 uintptr_t elem = opr_sz / sizeof(TYPE); \
1776 TYPE *d = vd, *n = vn, *m = vm; \
1777 ARMVectorReg tmp; \
1778 if (unlikely(vd == vn)) { \
1779 n = memcpy(&tmp, vn, opr_sz); \
1780 } \
1781 for (i = 0; i < elem; i++) { \
1782 TYPE j = m[H(i)]; \
1783 d[H(i)] = j < elem ? n[H(j)] : 0; \
1784 } \
1785 }
1786
1787 DO_TBL(sve_tbl_b, uint8_t, H1)
1788 DO_TBL(sve_tbl_h, uint16_t, H2)
1789 DO_TBL(sve_tbl_s, uint32_t, H4)
1790 DO_TBL(sve_tbl_d, uint64_t, )
1791
1792 #undef DO_TBL
1793
1794 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
1795 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1796 { \
1797 intptr_t i, opr_sz = simd_oprsz(desc); \
1798 TYPED *d = vd; \
1799 TYPES *n = vn; \
1800 ARMVectorReg tmp; \
1801 if (unlikely(vn - vd < opr_sz)) { \
1802 n = memcpy(&tmp, n, opr_sz / 2); \
1803 } \
1804 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
1805 d[HD(i)] = n[HS(i)]; \
1806 } \
1807 }
1808
1809 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
1810 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
1811 DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)
1812
1813 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
1814 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
1815 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
1816
1817 #undef DO_UNPK
1818
1819 /* Mask of bits included in the even numbered predicates of width esz.
1820 * We also use this for expand_bits/compress_bits, and so extend the
1821 * same pattern out to 16-bit units.
1822 */
1823 static const uint64_t even_bit_esz_masks[5] = {
1824 0x5555555555555555ull,
1825 0x3333333333333333ull,
1826 0x0f0f0f0f0f0f0f0full,
1827 0x00ff00ff00ff00ffull,
1828 0x0000ffff0000ffffull,
1829 };
1830
1831 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
1832 * For N==0, this corresponds to the operation that in qemu/bitops.h
1833 * we call half_shuffle64; this algorithm is from Hacker's Delight,
1834 * section 7-2 Shuffling Bits.
1835 */
1836 static uint64_t expand_bits(uint64_t x, int n)
1837 {
1838 int i;
1839
1840 x &= 0xffffffffu;
1841 for (i = 4; i >= n; i--) {
1842 int sh = 1 << i;
1843 x = ((x << sh) | x) & even_bit_esz_masks[i];
1844 }
1845 return x;
1846 }
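/* Worked example (illustrative, not part of the upstream source):
 * expand_bits(0b1011, 0) moves bit K to bit 2*K, giving 0b01000101.
 * A naive bit-by-bit equivalent of the N == 0 case, handy for checking
 * the shuffle above, might look like this:
 */
#if 0
static uint64_t expand_bits_naive(uint64_t x)
{
    uint64_t r = 0;
    int i;

    /* Spread each of the low 32 bits of X to every other bit of R. */
    for (i = 0; i < 32; i++) {
        r |= ((x >> i) & 1) << (2 * i);
    }
    return r;
}
#endif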
1847
1848 /* Compress units of 2**(N+1) bits to units of 2**N bits.
1849 * For N==0, this corresponds to the operation that in qemu/bitops.h
1850 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
1851 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
1852 */
1853 static uint64_t compress_bits(uint64_t x, int n)
1854 {
1855 int i;
1856
1857 for (i = n; i <= 4; i++) {
1858 int sh = 1 << i;
1859 x &= even_bit_esz_masks[i];
1860 x = (x >> sh) | x;
1861 }
1862 return x & 0xffffffffu;
1863 }
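/* Worked example (illustrative, not part of the upstream source):
 * compress_bits inverts expand_bits for the same N, e.g.
 * compress_bits(0b01000101, 0) == 0b1011; the interleaved odd bits are
 * masked away and each remaining bit 2*K moves back to bit K.
 */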
1864
1865 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1866 {
1867 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1868 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1869 intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1870 uint64_t *d = vd;
1871 intptr_t i;
1872
1873 if (oprsz <= 8) {
1874 uint64_t nn = *(uint64_t *)vn;
1875 uint64_t mm = *(uint64_t *)vm;
1876 int half = 4 * oprsz;
1877
1878 nn = extract64(nn, high * half, half);
1879 mm = extract64(mm, high * half, half);
1880 nn = expand_bits(nn, esz);
1881 mm = expand_bits(mm, esz);
1882 d[0] = nn + (mm << (1 << esz));
1883 } else {
1884 ARMPredicateReg tmp_n, tmp_m;
1885
1886 /* We produce output faster than we consume input.
1887 Therefore we must be mindful of possible overlap. */
1888 if ((vn - vd) < (uintptr_t)oprsz) {
1889 vn = memcpy(&tmp_n, vn, oprsz);
1890 }
1891 if ((vm - vd) < (uintptr_t)oprsz) {
1892 vm = memcpy(&tmp_m, vm, oprsz);
1893 }
1894 if (high) {
1895 high = oprsz >> 1;
1896 }
1897
1898 if ((high & 3) == 0) {
1899 uint32_t *n = vn, *m = vm;
1900 high >>= 2;
1901
1902 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
1903 uint64_t nn = n[H4(high + i)];
1904 uint64_t mm = m[H4(high + i)];
1905
1906 nn = expand_bits(nn, esz);
1907 mm = expand_bits(mm, esz);
1908 d[i] = nn + (mm << (1 << esz));
1909 }
1910 } else {
1911 uint8_t *n = vn, *m = vm;
1912 uint16_t *d16 = vd;
1913
1914 for (i = 0; i < oprsz / 2; i++) {
1915 uint16_t nn = n[H1(high + i)];
1916 uint16_t mm = m[H1(high + i)];
1917
1918 nn = expand_bits(nn, esz);
1919 mm = expand_bits(mm, esz);
1920 d16[H2(i)] = nn + (mm << (1 << esz));
1921 }
1922 }
1923 }
1924 }
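/* Illustrative example (not part of the upstream source): for esz == 0,
 * ZIP interleaves the predicate bits of N and M as n0 m0 n1 m1 ...
 * With nn == 0b0011 and mm == 0b0101:
 *   expand_bits(nn, 0) + (expand_bits(mm, 0) << 1)
 *     == 0x05 + (0x11 << 1) == 0x27 == 0b00100111.
 */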
1925
1926 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1927 {
1928 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1929 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1930 int odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1) << esz;
1931 uint64_t *d = vd, *n = vn, *m = vm;
1932 uint64_t l, h;
1933 intptr_t i;
1934
1935 if (oprsz <= 8) {
1936 l = compress_bits(n[0] >> odd, esz);
1937 h = compress_bits(m[0] >> odd, esz);
1938 d[0] = extract64(l + (h << (4 * oprsz)), 0, 8 * oprsz);
1939 } else {
1940 ARMPredicateReg tmp_m;
1941 intptr_t oprsz_16 = oprsz / 16;
1942
1943 if ((vm - vd) < (uintptr_t)oprsz) {
1944 m = memcpy(&tmp_m, vm, oprsz);
1945 }
1946
1947 for (i = 0; i < oprsz_16; i++) {
1948 l = n[2 * i + 0];
1949 h = n[2 * i + 1];
1950 l = compress_bits(l >> odd, esz);
1951 h = compress_bits(h >> odd, esz);
1952 d[i] = l + (h << 32);
1953 }
1954
1955 /* For VL which is not a power of 2, the results from M do not
1956 align nicely with the uint64_t for D. Put the aligned results
1957 from M into TMP_M and then copy it into place afterward. */
1958 if (oprsz & 15) {
1959 d[i] = compress_bits(n[2 * i] >> odd, esz);
1960
1961 for (i = 0; i < oprsz_16; i++) {
1962 l = m[2 * i + 0];
1963 h = m[2 * i + 1];
1964 l = compress_bits(l >> odd, esz);
1965 h = compress_bits(h >> odd, esz);
1966 tmp_m.p[i] = l + (h << 32);
1967 }
1968 tmp_m.p[i] = compress_bits(m[2 * i] >> odd, esz);
1969
1970 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
1971 } else {
1972 for (i = 0; i < oprsz_16; i++) {
1973 l = m[2 * i + 0];
1974 h = m[2 * i + 1];
1975 l = compress_bits(l >> odd, esz);
1976 h = compress_bits(h >> odd, esz);
1977 d[oprsz_16 + i] = l + (h << 32);
1978 }
1979 }
1980 }
1981 }
1982
1983 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1984 {
1985 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1986 uintptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1987 bool odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1988 uint64_t *d = vd, *n = vn, *m = vm;
1989 uint64_t mask;
1990 int shr, shl;
1991 intptr_t i;
1992
1993 shl = 1 << esz;
1994 shr = 0;
1995 mask = even_bit_esz_masks[esz];
1996 if (odd) {
1997 mask <<= shl;
1998 shr = shl;
1999 shl = 0;
2000 }
2001
2002 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2003 uint64_t nn = (n[i] & mask) >> shr;
2004 uint64_t mm = (m[i] & mask) << shl;
2005 d[i] = nn + mm;
2006 }
2007 }
2008
2009 /* Reverse units of 2**N bits. */
2010 static uint64_t reverse_bits_64(uint64_t x, int n)
2011 {
2012 int i, sh;
2013
2014 x = bswap64(x);
2015 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2016 uint64_t mask = even_bit_esz_masks[i];
2017 x = ((x & mask) << sh) | ((x >> sh) & mask);
2018 }
2019 return x;
2020 }
2021
2022 static uint8_t reverse_bits_8(uint8_t x, int n)
2023 {
2024 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
2025 int i, sh;
2026
2027 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2028 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
2029 }
2030 return x;
2031 }
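/* Worked example (illustrative, not part of the upstream source):
 * reverse_bits_8(0x01, 0) == 0x80 (full bit reverse), while
 * reverse_bits_8(0x01, 2) == 0x10 only swaps the two 4-bit units.
 * reverse_bits_64 works the same way once the initial bswap64 has
 * reversed the byte order.
 */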
2032
2033 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
2034 {
2035 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2036 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2037 intptr_t i, oprsz_2 = oprsz / 2;
2038
2039 if (oprsz <= 8) {
2040 uint64_t l = *(uint64_t *)vn;
2041 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
2042 *(uint64_t *)vd = l;
2043 } else if ((oprsz & 15) == 0) {
2044 for (i = 0; i < oprsz_2; i += 8) {
2045 intptr_t ih = oprsz - 8 - i;
2046 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
2047 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
2048 *(uint64_t *)(vd + i) = h;
2049 *(uint64_t *)(vd + ih) = l;
2050 }
2051 } else {
2052 for (i = 0; i < oprsz_2; i += 1) {
2053 intptr_t il = H1(i);
2054 intptr_t ih = H1(oprsz - 1 - i);
2055 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
2056 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
2057 *(uint8_t *)(vd + il) = h;
2058 *(uint8_t *)(vd + ih) = l;
2059 }
2060 }
2061 }
2062
2063 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
2064 {
2065 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2066 intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
2067 uint64_t *d = vd;
2068 intptr_t i;
2069
2070 if (oprsz <= 8) {
2071 uint64_t nn = *(uint64_t *)vn;
2072 int half = 4 * oprsz;
2073
2074 nn = extract64(nn, high * half, half);
2075 nn = expand_bits(nn, 0);
2076 d[0] = nn;
2077 } else {
2078 ARMPredicateReg tmp_n;
2079
2080 /* We produce output faster than we consume input.
2081 Therefore we must be mindful of possible overlap. */
2082 if ((vn - vd) < (uintptr_t)oprsz) {
2083 vn = memcpy(&tmp_n, vn, oprsz);
2084 }
2085 if (high) {
2086 high = oprsz >> 1;
2087 }
2088
2089 if ((high & 3) == 0) {
2090 uint32_t *n = vn;
2091 high >>= 2;
2092
2093 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2094 uint64_t nn = n[H4(high + i)];
2095 d[i] = expand_bits(nn, 0);
2096 }
2097 } else {
2098 uint16_t *d16 = vd;
2099 uint8_t *n = vn;
2100
2101 for (i = 0; i < oprsz / 2; i++) {
2102 uint16_t nn = n[H1(high + i)];
2103 d16[H2(i)] = expand_bits(nn, 0);
2104 }
2105 }
2106 }
2107 }
2108
2109 #define DO_ZIP(NAME, TYPE, H) \
2110 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2111 { \
2112 intptr_t oprsz = simd_oprsz(desc); \
2113 intptr_t i, oprsz_2 = oprsz / 2; \
2114 ARMVectorReg tmp_n, tmp_m; \
2115 /* We produce output faster than we consume input. \
2116 Therefore we must be mindful of possible overlap. */ \
2117 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
2118 vn = memcpy(&tmp_n, vn, oprsz_2); \
2119 } \
2120 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2121 vm = memcpy(&tmp_m, vm, oprsz_2); \
2122 } \
2123 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2124 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
2125 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
2126 } \
2127 }
2128
2129 DO_ZIP(sve_zip_b, uint8_t, H1)
2130 DO_ZIP(sve_zip_h, uint16_t, H1_2)
2131 DO_ZIP(sve_zip_s, uint32_t, H1_4)
2132 DO_ZIP(sve_zip_d, uint64_t, )
2133
2134 #define DO_UZP(NAME, TYPE, H) \
2135 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2136 { \
2137 intptr_t oprsz = simd_oprsz(desc); \
2138 intptr_t oprsz_2 = oprsz / 2; \
2139 intptr_t odd_ofs = simd_data(desc); \
2140 intptr_t i; \
2141 ARMVectorReg tmp_m; \
2142 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2143 vm = memcpy(&tmp_m, vm, oprsz); \
2144 } \
2145 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2146 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs)); \
2147 } \
2148 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2149 *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
2150 } \
2151 }
2152
2153 DO_UZP(sve_uzp_b, uint8_t, H1)
2154 DO_UZP(sve_uzp_h, uint16_t, H1_2)
2155 DO_UZP(sve_uzp_s, uint32_t, H1_4)
2156 DO_UZP(sve_uzp_d, uint64_t, )
2157
2158 #define DO_TRN(NAME, TYPE, H) \
2159 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2160 { \
2161 intptr_t oprsz = simd_oprsz(desc); \
2162 intptr_t odd_ofs = simd_data(desc); \
2163 intptr_t i; \
2164 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
2165 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
2166 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
2167 *(TYPE *)(vd + H(i + 0)) = ae; \
2168 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
2169 } \
2170 }
2171
2172 DO_TRN(sve_trn_b, uint8_t, H1)
2173 DO_TRN(sve_trn_h, uint16_t, H1_2)
2174 DO_TRN(sve_trn_s, uint32_t, H1_4)
2175 DO_TRN(sve_trn_d, uint64_t, )
2176
2177 #undef DO_ZIP
2178 #undef DO_UZP
2179 #undef DO_TRN
2180
2181 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
2182 {
2183 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
2184 uint32_t *d = vd, *n = vn;
2185 uint8_t *pg = vg;
2186
2187 for (i = j = 0; i < opr_sz; i++) {
2188 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
2189 d[H4(j)] = n[H4(i)];
2190 j++;
2191 }
2192 }
2193 for (; j < opr_sz; j++) {
2194 d[H4(j)] = 0;
2195 }
2196 }
2197
2198 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
2199 {
2200 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
2201 uint64_t *d = vd, *n = vn;
2202 uint8_t *pg = vg;
2203
2204 for (i = j = 0; i < opr_sz; i++) {
2205 if (pg[H1(i)] & 1) {
2206 d[j] = n[i];
2207 j++;
2208 }
2209 }
2210 for (; j < opr_sz; j++) {
2211 d[j] = 0;
2212 }
2213 }
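/* Illustrative example (not part of the upstream source): COMPACT packs
 * the active elements to the low end of the destination and zeroes the
 * remainder, e.g. .S elements { 1, 2, 3, 4 } under predicate
 * { 1, 0, 1, 0 } become { 1, 3, 0, 0 }.
 */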
2214
2215 /* Similar to the ARM LastActiveElement pseudocode function, except the
2216 * result is multiplied by the element size. This includes the not found
2217 * indication; e.g. not found for esz=3 is -8.
2218 */
2219 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
2220 {
2221 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2222 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2223
2224 return last_active_element(vg, DIV_ROUND_UP(oprsz, 8), esz);
2225 }
2226
2227 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
2228 {
2229 intptr_t opr_sz = simd_oprsz(desc) / 8;
2230 int esz = simd_data(desc);
2231 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
2232 intptr_t i, first_i, last_i;
2233 ARMVectorReg tmp;
2234
2235 first_i = last_i = 0;
2236 first_g = last_g = 0;
2237
2238 /* Find the extent of the active elements within VG. */
2239 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
2240 pg = *(uint64_t *)(vg + i) & mask;
2241 if (pg) {
2242 if (last_g == 0) {
2243 last_g = pg;
2244 last_i = i;
2245 }
2246 first_g = pg;
2247 first_i = i;
2248 }
2249 }
2250
2251 len = 0;
2252 if (first_g != 0) {
2253 first_i = first_i * 8 + ctz64(first_g);
2254 last_i = last_i * 8 + 63 - clz64(last_g);
2255 len = last_i - first_i + (1 << esz);
2256 if (vd == vm) {
2257 vm = memcpy(&tmp, vm, opr_sz * 8);
2258 }
2259 swap_memmove(vd, vn + first_i, len);
2260 }
2261 swap_memmove(vd + len, vm, opr_sz * 8 - len);
2262 }
2263
2264 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
2265 void *vg, uint32_t desc)
2266 {
2267 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2268 uint64_t *d = vd, *n = vn, *m = vm;
2269 uint8_t *pg = vg;
2270
2271 for (i = 0; i < opr_sz; i += 1) {
2272 uint64_t nn = n[i], mm = m[i];
2273 uint64_t pp = expand_pred_b(pg[H1(i)]);
2274 d[i] = (nn & pp) | (mm & ~pp);
2275 }
2276 }
2277
2278 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
2279 void *vg, uint32_t desc)
2280 {
2281 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2282 uint64_t *d = vd, *n = vn, *m = vm;
2283 uint8_t *pg = vg;
2284
2285 for (i = 0; i < opr_sz; i += 1) {
2286 uint64_t nn = n[i], mm = m[i];
2287 uint64_t pp = expand_pred_h(pg[H1(i)]);
2288 d[i] = (nn & pp) | (mm & ~pp);
2289 }
2290 }
2291
2292 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
2293 void *vg, uint32_t desc)
2294 {
2295 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2296 uint64_t *d = vd, *n = vn, *m = vm;
2297 uint8_t *pg = vg;
2298
2299 for (i = 0; i < opr_sz; i += 1) {
2300 uint64_t nn = n[i], mm = m[i];
2301 uint64_t pp = expand_pred_s(pg[H1(i)]);
2302 d[i] = (nn & pp) | (mm & ~pp);
2303 }
2304 }
2305
2306 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
2307 void *vg, uint32_t desc)
2308 {
2309 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2310 uint64_t *d = vd, *n = vn, *m = vm;
2311 uint8_t *pg = vg;
2312
2313 for (i = 0; i < opr_sz; i += 1) {
2314 uint64_t nn = n[i], mm = m[i];
2315 d[i] = (pg[H1(i)] & 1 ? nn : mm);
2316 }
2317 }
2318
2319 /* Two operand comparison controlled by a predicate.
2320 * ??? It is very tempting to expand this inline
2321 * with x86 instructions, e.g.
2322 *
2323 * vcmpeqw zm, zn, %ymm0
2324 * vpmovmskb %ymm0, %eax
2325 * and $0x5555, %eax
2326 * and pg, %eax
2327 *
2328 * or even aarch64, e.g.
2329 *
2330 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
2331 * cmeq v0.8h, zn, zm
2332 * and v0.8h, v0.8h, mask
2333 * addv h0, v0.8h
2334 * and v0.8b, pg
2335 *
2336 * However, coming up with an abstraction that allows vector inputs and
2337 * a scalar output, and also handles the byte-ordering of sub-uint64_t
2338 * scalar outputs, is tricky.
2339 */
2340 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
2341 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2342 { \
2343 intptr_t opr_sz = simd_oprsz(desc); \
2344 uint32_t flags = PREDTEST_INIT; \
2345 intptr_t i = opr_sz; \
2346 do { \
2347 uint64_t out = 0, pg; \
2348 do { \
2349 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2350 TYPE nn = *(TYPE *)(vn + H(i)); \
2351 TYPE mm = *(TYPE *)(vm + H(i)); \
2352 out |= nn OP mm; \
2353 } while (i & 63); \
2354 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2355 out &= pg; \
2356 *(uint64_t *)(vd + (i >> 3)) = out; \
2357 flags = iter_predtest_bwd(out, pg, flags); \
2358 } while (i > 0); \
2359 return flags; \
2360 }
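/* Illustrative note (not part of the upstream source): the inner loop
 * walks one 64-byte block from the top down, shifting OUT left by
 * sizeof(TYPE) per element, so the result for the element at byte offset
 * K lands in bit K, the lowest bit of that element's predicate field.
 * MASK then restricts the governing predicate to those significant bits
 * before the AND and the flags computation.
 */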
2361
2362 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
2363 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2364 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
2365 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2366 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
2367 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2368 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
2369 DO_CMP_PPZZ(NAME, TYPE, OP, , 0x0101010101010101ull)
2370
2371 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
2372 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
2373 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
2374 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
2375
2376 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
2377 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
2378 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
2379 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
2380
2381 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
2382 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
2383 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
2384 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
2385
2386 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
2387 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
2388 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
2389 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
2390
2391 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
2392 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
2393 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
2394 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
2395
2396 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
2397 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
2398 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
2399 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
2400
2401 #undef DO_CMP_PPZZ_B
2402 #undef DO_CMP_PPZZ_H
2403 #undef DO_CMP_PPZZ_S
2404 #undef DO_CMP_PPZZ_D
2405 #undef DO_CMP_PPZZ
2406
2407 /* Similar, but the second source is "wide". */
2408 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
2409 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2410 { \
2411 intptr_t opr_sz = simd_oprsz(desc); \
2412 uint32_t flags = PREDTEST_INIT; \
2413 intptr_t i = opr_sz; \
2414 do { \
2415 uint64_t out = 0, pg; \
2416 do { \
2417 TYPEW mm = *(TYPEW *)(vm + i - 8); \
2418 do { \
2419 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2420 TYPE nn = *(TYPE *)(vn + H(i)); \
2421 out |= nn OP mm; \
2422 } while (i & 7); \
2423 } while (i & 63); \
2424 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2425 out &= pg; \
2426 *(uint64_t *)(vd + (i >> 3)) = out; \
2427 flags = iter_predtest_bwd(out, pg, flags); \
2428 } while (i > 0); \
2429 return flags; \
2430 }
2431
2432 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
2433 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
2434 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
2435 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
2436 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
2437 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
2438
2439 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, uint8_t, uint64_t, ==)
2440 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, uint16_t, uint64_t, ==)
2441 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, uint32_t, uint64_t, ==)
2442
2443 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, uint8_t, uint64_t, !=)
2444 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, uint16_t, uint64_t, !=)
2445 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, uint32_t, uint64_t, !=)
2446
2447 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
2448 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
2449 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
2450
2451 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
2452 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
2453 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
2454
2455 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
2456 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
2457 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
2458
2459 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
2460 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
2461 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
2462
2463 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
2464 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
2465 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
2466
2467 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
2468 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
2469 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
2470
2471 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
2472 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
2473 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
2474
2475 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
2476 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
2477 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
2478
2479 #undef DO_CMP_PPZW_B
2480 #undef DO_CMP_PPZW_H
2481 #undef DO_CMP_PPZW_S
2482 #undef DO_CMP_PPZW
2483
2484 /* Similar, but the second source is immediate. */
2485 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
2486 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2487 { \
2488 intptr_t opr_sz = simd_oprsz(desc); \
2489 uint32_t flags = PREDTEST_INIT; \
2490 TYPE mm = simd_data(desc); \
2491 intptr_t i = opr_sz; \
2492 do { \
2493 uint64_t out = 0, pg; \
2494 do { \
2495 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2496 TYPE nn = *(TYPE *)(vn + H(i)); \
2497 out |= nn OP mm; \
2498 } while (i & 63); \
2499 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2500 out &= pg; \
2501 *(uint64_t *)(vd + (i >> 3)) = out; \
2502 flags = iter_predtest_bwd(out, pg, flags); \
2503 } while (i > 0); \
2504 return flags; \
2505 }
2506
2507 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
2508 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2509 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
2510 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2511 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
2512 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2513 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
2514 DO_CMP_PPZI(NAME, TYPE, OP, , 0x0101010101010101ull)
2515
2516 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
2517 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
2518 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
2519 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
2520
2521 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
2522 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
2523 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
2524 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
2525
2526 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
2527 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
2528 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
2529 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
2530
2531 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
2532 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
2533 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
2534 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
2535
2536 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
2537 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
2538 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
2539 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
2540
2541 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
2542 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
2543 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
2544 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
2545
2546 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
2547 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
2548 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
2549 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
2550
2551 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
2552 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
2553 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
2554 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
2555
2556 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
2557 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
2558 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
2559 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
2560
2561 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
2562 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
2563 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
2564 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
2565
2566 #undef DO_CMP_PPZI_B
2567 #undef DO_CMP_PPZI_H
2568 #undef DO_CMP_PPZI_S
2569 #undef DO_CMP_PPZI_D
2570 #undef DO_CMP_PPZI
2571
2572 /* Similar to the ARM LastActive pseudocode function. */
2573 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
2574 {
2575 intptr_t i;
2576
2577 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
2578 uint64_t pg = *(uint64_t *)(vg + i);
2579 if (pg) {
2580 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
2581 }
2582 }
2583 return 0;
2584 }
2585
2586 /* Compute a mask into RETB that is true for all G, up to and including
2587 * (if after) or excluding (if !after) the first G & N.
2588 * Return true if BRK found.
2589 */
2590 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
2591 bool brk, bool after)
2592 {
2593 uint64_t b;
2594
2595 if (brk) {
2596 b = 0;
2597 } else if ((g & n) == 0) {
2598 /* For all G, no N are set; break not found. */
2599 b = g;
2600 } else {
2601 /* Break somewhere in N. Locate it. */
2602 b = g & n; /* guard true, pred true */
2603 b = b & -b; /* first such */
2604 if (after) {
2605 b = b | (b - 1); /* break after same */
2606 } else {
2607 b = b - 1; /* break before same */
2608 }
2609 brk = true;
2610 }
2611
2612 *retb = b;
2613 return brk;
2614 }
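/* Worked example (illustrative, not part of the upstream source): with
 * g == 0xff and n == 0x10, the first active bit of G & N is isolated as
 * b == 0x10; "break after" keeps bits 0..4 (b | (b - 1) == 0x1f), while
 * "break before" keeps bits 0..3 (b - 1 == 0x0f).
 */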
2615
2616 /* Compute a zeroing BRK. */
2617 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
2618 intptr_t oprsz, bool after)
2619 {
2620 bool brk = false;
2621 intptr_t i;
2622
2623 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2624 uint64_t this_b, this_g = g[i];
2625
2626 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2627 d[i] = this_b & this_g;
2628 }
2629 }
2630
2631 /* Likewise, but also compute flags. */
2632 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
2633 intptr_t oprsz, bool after)
2634 {
2635 uint32_t flags = PREDTEST_INIT;
2636 bool brk = false;
2637 intptr_t i;
2638
2639 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2640 uint64_t this_b, this_d, this_g = g[i];
2641
2642 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2643 d[i] = this_d = this_b & this_g;
2644 flags = iter_predtest_fwd(this_d, this_g, flags);
2645 }
2646 return flags;
2647 }
2648
2649 /* Compute a merging BRK. */
2650 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
2651 intptr_t oprsz, bool after)
2652 {
2653 bool brk = false;
2654 intptr_t i;
2655
2656 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2657 uint64_t this_b, this_g = g[i];
2658
2659 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2660 d[i] = (this_b & this_g) | (d[i] & ~this_g);
2661 }
2662 }
2663
2664 /* Likewise, but also compute flags. */
2665 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
2666 intptr_t oprsz, bool after)
2667 {
2668 uint32_t flags = PREDTEST_INIT;
2669 bool brk = false;
2670 intptr_t i;
2671
2672 for (i = 0; i < oprsz / 8; ++i) {
2673 uint64_t this_b, this_d = d[i], this_g = g[i];
2674
2675 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2676 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
2677 flags = iter_predtest_fwd(this_d, this_g, flags);
2678 }
2679 return flags;
2680 }
2681
2682 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
2683 {
2684 /* It is quicker to zero the whole predicate than to loop over OPRSZ.
2685 * The compiler should turn this into 4 64-bit integer stores.
2686 */
2687 memset(d, 0, sizeof(ARMPredicateReg));
2688 return PREDTEST_INIT;
2689 }
2690
2691 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
2692 uint32_t pred_desc)
2693 {
2694 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2695 if (last_active_pred(vn, vg, oprsz)) {
2696 compute_brk_z(vd, vm, vg, oprsz, true);
2697 } else {
2698 do_zero(vd, oprsz);
2699 }
2700 }
2701
2702 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
2703 uint32_t pred_desc)
2704 {
2705 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2706 if (last_active_pred(vn, vg, oprsz)) {
2707 return compute_brks_z(vd, vm, vg, oprsz, true);
2708 } else {
2709 return do_zero(vd, oprsz);
2710 }
2711 }
2712
2713 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
2714 uint32_t pred_desc)
2715 {
2716 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2717 if (last_active_pred(vn, vg, oprsz)) {
2718 compute_brk_z(vd, vm, vg, oprsz, false);
2719 } else {
2720 do_zero(vd, oprsz);
2721 }
2722 }
2723
2724 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
2725 uint32_t pred_desc)
2726 {
2727 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2728 if (last_active_pred(vn, vg, oprsz)) {
2729 return compute_brks_z(vd, vm, vg, oprsz, false);
2730 } else {
2731 return do_zero(vd, oprsz);
2732 }
2733 }
2734
2735 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2736 {
2737 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2738 compute_brk_z(vd, vn, vg, oprsz, true);
2739 }
2740
2741 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2742 {
2743 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2744 return compute_brks_z(vd, vn, vg, oprsz, true);
2745 }
2746
2747 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2748 {
2749 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2750 compute_brk_z(vd, vn, vg, oprsz, false);
2751 }
2752
2753 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2754 {
2755 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2756 return compute_brks_z(vd, vn, vg, oprsz, false);
2757 }
2758
2759 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2760 {
2761 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2762 compute_brk_m(vd, vn, vg, oprsz, true);
2763 }
2764
2765 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2766 {
2767 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2768 return compute_brks_m(vd, vn, vg, oprsz, true);
2769 }
2770
2771 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2772 {
2773 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2774 compute_brk_m(vd, vn, vg, oprsz, false);
2775 }
2776
2777 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2778 {
2779 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2780 return compute_brks_m(vd, vn, vg, oprsz, false);
2781 }
2782
2783 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2784 {
2785 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2786
2787 if (!last_active_pred(vn, vg, oprsz)) {
2788 do_zero(vd, oprsz);
2789 }
2790 }
2791
2792 /* As if PredTest(Ones(PL), D, esz). */
2793 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
2794 uint64_t esz_mask)
2795 {
2796 uint32_t flags = PREDTEST_INIT;
2797 intptr_t i;
2798
2799 for (i = 0; i < oprsz / 8; i++) {
2800 flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
2801 }
2802 if (oprsz & 7) {
2803 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
2804 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
2805 }
2806 return flags;
2807 }
2808
2809 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2810 {
2811 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2812
2813 if (last_active_pred(vn, vg, oprsz)) {
2814 return predtest_ones(vd, oprsz, -1);
2815 } else {
2816 return do_zero(vd, oprsz);
2817 }
2818 }
2819
2820 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
2821 {
2822 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2823 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2824 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
2825 intptr_t i;
2826
2827 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2828 uint64_t t = n[i] & g[i] & mask;
2829 sum += ctpop64(t);
2830 }
2831 return sum;
2832 }
2833
2834 uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc)
2835 {
2836 uintptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2837 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2838 uint64_t esz_mask = pred_esz_masks[esz];
2839 ARMPredicateReg *d = vd;
2840 uint32_t flags;
2841 intptr_t i;
2842
2843 /* Begin with a zero predicate register. */
2844 flags = do_zero(d, oprsz);
2845 if (count == 0) {
2846 return flags;
2847 }
2848
2849 /* Scale from predicate element count to bits. */
2850 count <<= esz;
2851 /* Bound to the bits in the predicate. */
2852 count = MIN(count, oprsz * 8);
2853
2854 /* Set all of the requested bits. */
2855 for (i = 0; i < count / 64; ++i) {
2856 d->p[i] = esz_mask;
2857 }
2858 if (count & 63) {
2859 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
2860 }
2861
2862 return predtest_ones(d, oprsz, esz_mask);
2863 }
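/* Illustrative example (not part of the upstream source): for count == 3
 * and esz == 1 (halfword elements), count scales to 6 predicate bits and
 * d->p[0] becomes MAKE_64BIT_MASK(0, 6) & 0x5555555555555555ull == 0x15,
 * i.e. three active halfword elements.
 */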
2864
2865 /* Recursive reduction using a given binary function;
2866 * cf. the ARM ARM function ReducePredicated.
2867 *
2868 * While it would be possible to write this without the DATA temporary,
2869 * it is much simpler to process the predicate register this way.
2870 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
2871 * little to gain with a more complex non-recursive form.
2872 */
2873 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
2874 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
2875 { \
2876 if (n == 1) { \
2877 return *data; \
2878 } else { \
2879 uintptr_t half = n / 2; \
2880 TYPE lo = NAME##_reduce(data, status, half); \
2881 TYPE hi = NAME##_reduce(data + half, status, half); \
2882 return TYPE##_##FUNC(lo, hi, status); \
2883 } \
2884 } \
2885 uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
2886 { \
2887 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_maxsz(desc); \
2888 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
2889 for (i = 0; i < oprsz; ) { \
2890 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2891 do { \
2892 TYPE nn = *(TYPE *)(vn + H(i)); \
2893 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
2894 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2895 } while (i & 15); \
2896 } \
2897 for (; i < maxsz; i += sizeof(TYPE)) { \
2898 *(TYPE *)((void *)data + i) = IDENT; \
2899 } \
2900 return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
2901 }
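/* Illustrative note (not part of the upstream source): inactive lanes are
 * replaced by IDENT and the tail is padded with IDENT out to MAXSZ, so
 * the reduction forms a pairwise tree rather than a left-to-right fold;
 * e.g. four elements combine as (d0 OP d1) OP (d2 OP d3), the order
 * described by ReducePredicated, which can matter for FP rounding.
 */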
2902
2903 DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
2904 DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
2905 DO_REDUCE(sve_faddv_d, float64, , add, float64_zero)
2906
2907 /* Identity is floatN_default_nan, without the function call. */
2908 DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
2909 DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
2910 DO_REDUCE(sve_fminnmv_d, float64, , minnum, 0x7FF8000000000000ULL)
2911
2912 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
2913 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
2914 DO_REDUCE(sve_fmaxnmv_d, float64, , maxnum, 0x7FF8000000000000ULL)
2915
2916 DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
2917 DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
2918 DO_REDUCE(sve_fminv_d, float64, , min, float64_infinity)
2919
2920 DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
2921 DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
2922 DO_REDUCE(sve_fmaxv_d, float64, , max, float64_chs(float64_infinity))
2923
2924 #undef DO_REDUCE
2925
2926 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
2927 void *status, uint32_t desc)
2928 {
2929 intptr_t i = 0, opr_sz = simd_oprsz(desc);
2930 float16 result = nn;
2931
2932 do {
2933 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2934 do {
2935 if (pg & 1) {
2936 float16 mm = *(float16 *)(vm + H1_2(i));
2937 result = float16_add(result, mm, status);
2938 }
2939 i += sizeof(float16), pg >>= sizeof(float16);
2940 } while (i & 15);
2941 } while (i < opr_sz);
2942
2943 return result;
2944 }
2945
2946 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
2947 void *status, uint32_t desc)
2948 {
2949 intptr_t i = 0, opr_sz = simd_oprsz(desc);
2950 float32 result = nn;
2951
2952 do {
2953 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2954 do {
2955 if (pg & 1) {
2956 float32 mm = *(float32 *)(vm + H1_2(i));
2957 result = float32_add(result, mm, status);
2958 }
2959 i += sizeof(float32), pg >>= sizeof(float32);
2960 } while (i & 15);
2961 } while (i < opr_sz);
2962
2963 return result;
2964 }
2965
2966 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
2967 void *status, uint32_t desc)
2968 {
2969 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
2970 uint64_t *m = vm;
2971 uint8_t *pg = vg;
2972
2973 for (i = 0; i < opr_sz; i++) {
2974 if (pg[H1(i)] & 1) {
2975 nn = float64_add(nn, m[i], status);
2976 }
2977 }
2978
2979 return nn;
2980 }
2981
2982 /* Fully general three-operand expander, controlled by a predicate,
2983 * with the extra float_status parameter.
2984 */
2985 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
2986 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
2987 void *status, uint32_t desc) \
2988 { \
2989 intptr_t i = simd_oprsz(desc); \
2990 uint64_t *g = vg; \
2991 do { \
2992 uint64_t pg = g[(i - 1) >> 6]; \
2993 do { \
2994 i -= sizeof(TYPE); \
2995 if (likely((pg >> (i & 63)) & 1)) { \
2996 TYPE nn = *(TYPE *)(vn + H(i)); \
2997 TYPE mm = *(TYPE *)(vm + H(i)); \
2998 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
2999 } \
3000 } while (i & 63); \
3001 } while (i != 0); \
3002 }
3003
3004 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
3005 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
3006 DO_ZPZZ_FP(sve_fadd_d, uint64_t, , float64_add)
3007
3008 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
3009 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
3010 DO_ZPZZ_FP(sve_fsub_d, uint64_t, , float64_sub)
3011
3012 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
3013 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
3014 DO_ZPZZ_FP(sve_fmul_d, uint64_t, , float64_mul)
3015
3016 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
3017 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
3018 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, , float64_div)
3019
3020 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
3021 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
3022 DO_ZPZZ_FP(sve_fmin_d, uint64_t, , float64_min)
3023
3024 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
3025 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
3026 DO_ZPZZ_FP(sve_fmax_d, uint64_t, , float64_max)
3027
3028 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
3029 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
3030 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, , float64_minnum)
3031
3032 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
3033 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
3034 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, , float64_maxnum)
3035
3036 static inline float16 abd_h(float16 a, float16 b, float_status *s)
3037 {
3038 return float16_abs(float16_sub(a, b, s));
3039 }
3040
3041 static inline float32 abd_s(float32 a, float32 b, float_status *s)
3042 {
3043 return float32_abs(float32_sub(a, b, s));
3044 }
3045
3046 static inline float64 abd_d(float64 a, float64 b, float_status *s)
3047 {
3048 return float64_abs(float64_sub(a, b, s));
3049 }
3050
3051 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
3052 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
3053 DO_ZPZZ_FP(sve_fabd_d, uint64_t, , abd_d)
3054
3055 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
3056 {
3057 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
3058 return float64_scalbn(a, b_int, s);
3059 }
3060
3061 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
3062 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
3063 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, , scalbn_d)
3064
3065 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
3066 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
3067 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, , helper_vfp_mulxd)
3068
3069 #undef DO_ZPZZ_FP
3070
3071 /* Three-operand expander, with one scalar operand, controlled by
3072 * a predicate, with the extra float_status parameter.
3073 */
3074 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
3075 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
3076 void *status, uint32_t desc) \
3077 { \
3078 intptr_t i = simd_oprsz(desc); \
3079 uint64_t *g = vg; \
3080 TYPE mm = scalar; \
3081 do { \
3082 uint64_t pg = g[(i - 1) >> 6]; \
3083 do { \
3084 i -= sizeof(TYPE); \
3085 if (likely((pg >> (i & 63)) & 1)) { \
3086 TYPE nn = *(TYPE *)(vn + H(i)); \
3087 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
3088 } \
3089 } while (i & 63); \
3090 } while (i != 0); \
3091 }
3092
3093 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
3094 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
3095 DO_ZPZS_FP(sve_fadds_d, float64, , float64_add)
3096
3097 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
3098 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
3099 DO_ZPZS_FP(sve_fsubs_d, float64, , float64_sub)
3100
3101 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
3102 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
3103 DO_ZPZS_FP(sve_fmuls_d, float64, , float64_mul)
3104
3105 static inline float16 subr_h(float16 a, float16 b, float_status *s)
3106 {
3107 return float16_sub(b, a, s);
3108 }
3109
3110 static inline float32 subr_s(float32 a, float32 b, float_status *s)
3111 {
3112 return float32_sub(b, a, s);
3113 }
3114
3115 static inline float64 subr_d(float64 a, float64 b, float_status *s)
3116 {
3117 return float64_sub(b, a, s);
3118 }
3119
3120 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
3121 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
3122 DO_ZPZS_FP(sve_fsubrs_d, float64, , subr_d)
3123
3124 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
3125 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
3126 DO_ZPZS_FP(sve_fmaxnms_d, float64, , float64_maxnum)
3127
3128 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
3129 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
3130 DO_ZPZS_FP(sve_fminnms_d, float64, , float64_minnum)
3131
3132 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
3133 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
3134 DO_ZPZS_FP(sve_fmaxs_d, float64, , float64_max)
3135
3136 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
3137 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
3138 DO_ZPZS_FP(sve_fmins_d, float64, , float64_min)
3139
3140 /* Fully general two-operand expander, controlled by a predicate,
3141 * with the extra float_status parameter.
3142 */
3143 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \
3144 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
3145 { \
3146 intptr_t i = simd_oprsz(desc); \
3147 uint64_t *g = vg; \
3148 do { \
3149 uint64_t pg = g[(i - 1) >> 6]; \
3150 do { \
3151 i -= sizeof(TYPE); \
3152 if (likely((pg >> (i & 63)) & 1)) { \
3153 TYPE nn = *(TYPE *)(vn + H(i)); \
3154 *(TYPE *)(vd + H(i)) = OP(nn, status); \
3155 } \
3156 } while (i & 63); \
3157 } while (i != 0); \
3158 }
3159
3160 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
3161 * FZ16. When converting from fp16, this affects flushing input denormals;
3162 * when converting to fp16, this affects flushing output denormals.
3163 */
3164 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
3165 {
3166 flag save = get_flush_inputs_to_zero(fpst);
3167 float32 ret;
3168
3169 set_flush_inputs_to_zero(false, fpst);
3170 ret = float16_to_float32(f, true, fpst);
3171 set_flush_inputs_to_zero(save, fpst);
3172 return ret;
3173 }
3174
3175 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
3176 {
3177 flag save = get_flush_inputs_to_zero(fpst);
3178 float64 ret;
3179
3180 set_flush_inputs_to_zero(false, fpst);
3181 ret = float16_to_float64(f, true, fpst);
3182 set_flush_inputs_to_zero(save, fpst);
3183 return ret;
3184 }
3185
3186 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
3187 {
3188 flag save = get_flush_to_zero(fpst);
3189 float16 ret;
3190
3191 set_flush_to_zero(false, fpst);
3192 ret = float32_to_float16(f, true, fpst);
3193 set_flush_to_zero(save, fpst);
3194 return ret;
3195 }
3196
3197 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
3198 {
3199 flag save = get_flush_to_zero(fpst);
3200 float16 ret;
3201
3202 set_flush_to_zero(false, fpst);
3203 ret = float64_to_float16(f, true, fpst);
3204 set_flush_to_zero(save, fpst);
3205 return ret;
3206 }
3207
3208 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
3209 {
3210 if (float16_is_any_nan(f)) {
3211 float_raise(float_flag_invalid, s);
3212 return 0;
3213 }
3214 return float16_to_int16_round_to_zero(f, s);
3215 }
3216
3217 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
3218 {
3219 if (float16_is_any_nan(f)) {
3220 float_raise(float_flag_invalid, s);
3221 return 0;
3222 }
3223 return float16_to_int64_round_to_zero(f, s);
3224 }
3225
3226 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
3227 {
3228 if (float32_is_any_nan(f)) {
3229 float_raise(float_flag_invalid, s);
3230 return 0;
3231 }
3232 return float32_to_int64_round_to_zero(f, s);
3233 }
3234
3235 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
3236 {
3237 if (float64_is_any_nan(f)) {
3238 float_raise(float_flag_invalid, s);
3239 return 0;
3240 }
3241 return float64_to_int64_round_to_zero(f, s);
3242 }
3243
3244 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
3245 {
3246 if (float16_is_any_nan(f)) {
3247 float_raise(float_flag_invalid, s);
3248 return 0;
3249 }
3250 return float16_to_uint16_round_to_zero(f, s);
3251 }
3252
3253 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
3254 {
3255 if (float16_is_any_nan(f)) {
3256 float_raise(float_flag_invalid, s);
3257 return 0;
3258 }
3259 return float16_to_uint64_round_to_zero(f, s);
3260 }
3261
3262 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
3263 {
3264 if (float32_is_any_nan(f)) {
3265 float_raise(float_flag_invalid, s);
3266 return 0;
3267 }
3268 return float32_to_uint64_round_to_zero(f, s);
3269 }
3270
3271 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
3272 {
3273 if (float64_is_any_nan(f)) {
3274 float_raise(float_flag_invalid, s);
3275 return 0;
3276 }
3277 return float64_to_uint64_round_to_zero(f, s);
3278 }
3279
3280 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
3281 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
3282 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, , sve_f64_to_f16)
3283 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, , sve_f16_to_f64)
3284 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, , float64_to_float32)
3285 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, , float32_to_float64)
3286
3287 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
3288 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
3289 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
3290 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, , vfp_float16_to_int64_rtz)
3291 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, , vfp_float32_to_int64_rtz)
3292 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, , helper_vfp_tosizd)
3293 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, , vfp_float64_to_int64_rtz)
3294
3295 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
3296 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
3297 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
3298 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, , vfp_float16_to_uint64_rtz)
3299 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, , vfp_float32_to_uint64_rtz)
3300 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, , helper_vfp_touizd)
3301 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, , vfp_float64_to_uint64_rtz)
3302
3303 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
3304 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
3305 DO_ZPZ_FP(sve_frint_d, uint64_t, , helper_rintd)
3306
3307 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
3308 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
3309 DO_ZPZ_FP(sve_frintx_d, uint64_t, , float64_round_to_int)
3310
3311 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
3312 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
3313 DO_ZPZ_FP(sve_frecpx_d, uint64_t, , helper_frecpx_f64)
3314
3315 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
3316 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
3317 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, , float64_sqrt)
3318
3319 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
3320 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
3321 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
3322 DO_ZPZ_FP(sve_scvt_sd, uint64_t, , int32_to_float64)
3323 DO_ZPZ_FP(sve_scvt_dh, uint64_t, , int64_to_float16)
3324 DO_ZPZ_FP(sve_scvt_ds, uint64_t, , int64_to_float32)
3325 DO_ZPZ_FP(sve_scvt_dd, uint64_t, , int64_to_float64)
3326
3327 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
3328 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
3329 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
3330 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, , uint32_to_float64)
3331 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, , uint64_to_float16)
3332 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, , uint64_to_float32)
3333 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, , uint64_to_float64)
3334
3335 #undef DO_ZPZ_FP
3336
3337 /* 4-operand predicated multiply-add. This requires 7 operands to pass
3338 * "properly", so we need to encode some of the registers into DESC.
3339 */
3340 QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT + 20 > 32);
3341
3342 static void do_fmla_zpzzz_h(CPUARMState *env, void *vg, uint32_t desc,
3343 uint16_t neg1, uint16_t neg3)
3344 {
3345 intptr_t i = simd_oprsz(desc);
3346 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3347 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3348 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3349 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3350 void *vd = &env->vfp.zregs[rd];
3351 void *vn = &env->vfp.zregs[rn];
3352 void *vm = &env->vfp.zregs[rm];
3353 void *va = &env->vfp.zregs[ra];
3354 uint64_t *g = vg;
3355
3356 do {
3357 uint64_t pg = g[(i - 1) >> 6];
3358 do {
3359 i -= 2;
3360 if (likely((pg >> (i & 63)) & 1)) {
3361 float16 e1, e2, e3, r;
3362
3363 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
3364 e2 = *(uint16_t *)(vm + H1_2(i));
3365 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
3366 r = float16_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
3367 *(uint16_t *)(vd + H1_2(i)) = r;
3368 }
3369 } while (i & 63);
3370 } while (i != 0);
3371 }
3372
3373 void HELPER(sve_fmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3374 {
3375 do_fmla_zpzzz_h(env, vg, desc, 0, 0);
3376 }
3377
3378 void HELPER(sve_fmls_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3379 {
3380 do_fmla_zpzzz_h(env, vg, desc, 0x8000, 0);
3381 }
3382
3383 void HELPER(sve_fnmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3384 {
3385 do_fmla_zpzzz_h(env, vg, desc, 0x8000, 0x8000);
3386 }
3387
3388 void HELPER(sve_fnmls_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3389 {
3390 do_fmla_zpzzz_h(env, vg, desc, 0, 0x8000);
3391 }
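/* Illustrative note (not part of the upstream source): the four variants
 * above differ only in which sign bits are flipped before the fused
 * multiply-add: FMLA computes a + n * m, FMLS negates N (a - n * m),
 * FNMLA negates both N and A (-a - n * m), and FNMLS negates A only
 * (-a + n * m).  The _s and _d versions below use the same scheme with
 * 0x80000000 and INT64_MIN respectively.
 */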
3392
3393 static void do_fmla_zpzzz_s(CPUARMState *env, void *vg, uint32_t desc,
3394 uint32_t neg1, uint32_t neg3)
3395 {
3396 intptr_t i = simd_oprsz(desc);
3397 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3398 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3399 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3400 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3401 void *vd = &env->vfp.zregs[rd];
3402 void *vn = &env->vfp.zregs[rn];
3403 void *vm = &env->vfp.zregs[rm];
3404 void *va = &env->vfp.zregs[ra];
3405 uint64_t *g = vg;
3406
3407 do {
3408 uint64_t pg = g[(i - 1) >> 6];
3409 do {
3410 i -= 4;
3411 if (likely((pg >> (i & 63)) & 1)) {
3412 float32 e1, e2, e3, r;
3413
3414 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
3415 e2 = *(uint32_t *)(vm + H1_4(i));
3416 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
3417 r = float32_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
3418 *(uint32_t *)(vd + H1_4(i)) = r;
3419 }
3420 } while (i & 63);
3421 } while (i != 0);
3422 }
3423
3424 void HELPER(sve_fmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3425 {
3426 do_fmla_zpzzz_s(env, vg, desc, 0, 0);
3427 }
3428
3429 void HELPER(sve_fmls_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3430 {
3431 do_fmla_zpzzz_s(env, vg, desc, 0x80000000, 0);
3432 }
3433
3434 void HELPER(sve_fnmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3435 {
3436 do_fmla_zpzzz_s(env, vg, desc, 0x80000000, 0x80000000);
3437 }
3438
3439 void HELPER(sve_fnmls_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3440 {
3441 do_fmla_zpzzz_s(env, vg, desc, 0, 0x80000000);
3442 }
3443
3444 static void do_fmla_zpzzz_d(CPUARMState *env, void *vg, uint32_t desc,
3445 uint64_t neg1, uint64_t neg3)
3446 {
3447 intptr_t i = simd_oprsz(desc);
3448 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3449 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3450 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3451 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3452 void *vd = &env->vfp.zregs[rd];
3453 void *vn = &env->vfp.zregs[rn];
3454 void *vm = &env->vfp.zregs[rm];
3455 void *va = &env->vfp.zregs[ra];
3456 uint64_t *g = vg;
3457
3458 do {
3459 uint64_t pg = g[(i - 1) >> 6];
3460 do {
3461 i -= 8;
3462 if (likely((pg >> (i & 63)) & 1)) {
3463 float64 e1, e2, e3, r;
3464
3465 e1 = *(uint64_t *)(vn + i) ^ neg1;
3466 e2 = *(uint64_t *)(vm + i);
3467 e3 = *(uint64_t *)(va + i) ^ neg3;
3468 r = float64_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
3469 *(uint64_t *)(vd + i) = r;
3470 }
3471 } while (i & 63);
3472 } while (i != 0);
3473 }
3474
3475 void HELPER(sve_fmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3476 {
3477 do_fmla_zpzzz_d(env, vg, desc, 0, 0);
3478 }
3479
3480 void HELPER(sve_fmls_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3481 {
3482 do_fmla_zpzzz_d(env, vg, desc, INT64_MIN, 0);
3483 }
3484
3485 void HELPER(sve_fnmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3486 {
3487 do_fmla_zpzzz_d(env, vg, desc, INT64_MIN, INT64_MIN);
3488 }
3489
3490 void HELPER(sve_fnmls_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3491 {
3492 do_fmla_zpzzz_d(env, vg, desc, 0, INT64_MIN);
3493 }
3494
3495 /* Two operand floating-point comparison controlled by a predicate.
3496 * Unlike the integer version, we are not allowed to optimistically
3497 * compare operands, since the comparison may have side effects wrt
3498 * the FPSR.
3499 */
3500 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
3501 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
3502 void *status, uint32_t desc) \
3503 { \
3504 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
3505 uint64_t *d = vd, *g = vg; \
3506 do { \
3507 uint64_t out = 0, pg = g[j]; \
3508 do { \
3509 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3510 if (likely((pg >> (i & 63)) & 1)) { \
3511 TYPE nn = *(TYPE *)(vn + H(i)); \
3512 TYPE mm = *(TYPE *)(vm + H(i)); \
3513 out |= OP(TYPE, nn, mm, status); \
3514 } \
3515 } while (i & 63); \
3516 d[j--] = out; \
3517 } while (i > 0); \
3518 }
3519
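/* For illustration only: the 32-bit instance
 * DO_FPCMP_PPZZ(sve_fcmge_s, float32, H1_4, DO_FCMGE) expands to
 * approximately the following.  Elements are visited from the top of
 * the vector down, and each result bit is shifted so that bit (i & 63)
 * of d[j] corresponds to the element at byte offset i, matching the
 * predicate layout used elsewhere in this file.
 *
 *   void HELPER(sve_fcmge_s)(void *vd, void *vn, void *vm, void *vg,
 *                            void *status, uint32_t desc)
 *   {
 *       intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;
 *       uint64_t *d = vd, *g = vg;
 *       do {
 *           uint64_t out = 0, pg = g[j];
 *           do {
 *               i -= 4, out <<= 4;
 *               if (likely((pg >> (i & 63)) & 1)) {
 *                   float32 nn = *(float32 *)(vn + H1_4(i));
 *                   float32 mm = *(float32 *)(vm + H1_4(i));
 *                   out |= float32_compare(mm, nn, status) <= 0;
 *               }
 *           } while (i & 63);
 *           d[j--] = out;
 *       } while (i > 0);
 *   }
 */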
3520 #define DO_FPCMP_PPZZ_H(NAME, OP) \
3521 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
3522 #define DO_FPCMP_PPZZ_S(NAME, OP) \
3523 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
3524 #define DO_FPCMP_PPZZ_D(NAME, OP) \
3525 DO_FPCMP_PPZZ(NAME##_d, float64, , OP)
3526
3527 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
3528 DO_FPCMP_PPZZ_H(NAME, OP) \
3529 DO_FPCMP_PPZZ_S(NAME, OP) \
3530 DO_FPCMP_PPZZ_D(NAME, OP)
3531
3532 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
3533 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
3534 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
3535 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
3536 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
3537 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
3538 #define DO_FCMUO(TYPE, X, Y, ST) \
3539 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
3540 #define DO_FACGE(TYPE, X, Y, ST) \
3541 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
3542 #define DO_FACGT(TYPE, X, Y, ST) \
3543 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
3544
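/* Note that the ordered comparisons (GE, GT, LE, LT) and the absolute
 * comparisons (ACGE, ACGT) use the signalling compare, so any NaN input
 * raises Invalid Operation, whereas EQ, NE and UO use the quiet compare
 * and only trap on signalling NaNs, as the architecture requires for
 * the corresponding FCM and FAC instructions.
 */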
3545 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
3546 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
3547 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
3548 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
3549 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
3550 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
3551 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
3552
3553 #undef DO_FPCMP_PPZZ_ALL
3554 #undef DO_FPCMP_PPZZ_D
3555 #undef DO_FPCMP_PPZZ_S
3556 #undef DO_FPCMP_PPZZ_H
3557 #undef DO_FPCMP_PPZZ
3558
3559 /* One operand floating-point comparison against zero, controlled
3560 * by a predicate.
3561 */
3562 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
3563 void HELPER(NAME)(void *vd, void *vn, void *vg, \
3564 void *status, uint32_t desc) \
3565 { \
3566 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
3567 uint64_t *d = vd, *g = vg; \
3568 do { \
3569 uint64_t out = 0, pg = g[j]; \
3570 do { \
3571 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3572 if ((pg >> (i & 63)) & 1) { \
3573 TYPE nn = *(TYPE *)(vn + H(i)); \
3574 out |= OP(TYPE, nn, 0, status); \
3575 } \
3576 } while (i & 63); \
3577 d[j--] = out; \
3578 } while (i > 0); \
3579 }
3580
3581 #define DO_FPCMP_PPZ0_H(NAME, OP) \
3582 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
3583 #define DO_FPCMP_PPZ0_S(NAME, OP) \
3584 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
3585 #define DO_FPCMP_PPZ0_D(NAME, OP) \
3586 DO_FPCMP_PPZ0(NAME##_d, float64, , OP)
3587
3588 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
3589 DO_FPCMP_PPZ0_H(NAME, OP) \
3590 DO_FPCMP_PPZ0_S(NAME, OP) \
3591 DO_FPCMP_PPZ0_D(NAME, OP)
3592
3593 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
3594 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
3595 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
3596 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
3597 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
3598 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
3599
3600 /* FP Trig Multiply-Add. */
3601
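/* The coeff[] tables below hold the fixed coefficient table from the
 * FTMAD instruction description: entries 0..7 are (approximately) the
 * Taylor-series terms for sine (1, -1/3!, 1/5!, ...) and entries 8..15
 * those for cosine (1, -1/2!, 1/4!, ...), stored as raw bit patterns of
 * the element type.  A negative multiplier operand selects the cosine
 * half of the table via the xx += 8 adjustment.
 */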
3602 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3603 {
3604 static const float16 coeff[16] = {
3605 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3606 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3607 };
3608 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
3609 intptr_t x = simd_data(desc);
3610 float16 *d = vd, *n = vn, *m = vm;
3611 for (i = 0; i < opr_sz; i++) {
3612 float16 mm = m[i];
3613 intptr_t xx = x;
3614 if (float16_is_neg(mm)) {
3615 mm = float16_abs(mm);
3616 xx += 8;
3617 }
3618 d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
3619 }
3620 }
3621
3622 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3623 {
3624 static const float32 coeff[16] = {
3625 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
3626 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
3627 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
3628 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
3629 };
3630 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
3631 intptr_t x = simd_data(desc);
3632 float32 *d = vd, *n = vn, *m = vm;
3633 for (i = 0; i < opr_sz; i++) {
3634 float32 mm = m[i];
3635 intptr_t xx = x;
3636 if (float32_is_neg(mm)) {
3637 mm = float32_abs(mm);
3638 xx += 8;
3639 }
3640 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
3641 }
3642 }
3643
3644 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3645 {
3646 static const float64 coeff[16] = {
3647 0x3ff0000000000000ull, 0xbfc5555555555543ull,
3648 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
3649 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
3650 0x3de5d8408868552full, 0x0000000000000000ull,
3651 0x3ff0000000000000ull, 0xbfe0000000000000ull,
3652 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
3653 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
3654 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
3655 };
3656 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
3657 intptr_t x = simd_data(desc);
3658 float64 *d = vd, *n = vn, *m = vm;
3659 for (i = 0; i < opr_sz; i++) {
3660 float64 mm = m[i];
3661 intptr_t xx = x;
3662 if (float64_is_neg(mm)) {
3663 mm = float64_abs(mm);
3664 xx += 8;
3665 }
3666 d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
3667 }
3668 }
3669
3670 /*
3671 * FP Complex Add
3672 */
3673
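/* FCADD adds the first operand to the second rotated by 90 or 270
 * degrees in the complex plane.  simd_data(desc) selects the rotation:
 * with it clear the result is (n.re - m.im, n.im + m.re), i.e. the #90
 * form, and with it set (n.re + m.im, n.im - m.re), the #270 form.
 * The neg_real/neg_imag constants fold the required sign change into
 * the second operand by XORing its sign bit before the additions.
 */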
3674 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
3675 void *vs, uint32_t desc)
3676 {
3677 intptr_t j, i = simd_oprsz(desc);
3678 uint64_t *g = vg;
3679 float16 neg_imag = float16_set_sign(0, simd_data(desc));
3680 float16 neg_real = float16_chs(neg_imag);
3681
3682 do {
3683 uint64_t pg = g[(i - 1) >> 6];
3684 do {
3685 float16 e0, e1, e2, e3;
3686
3687 /* I holds the real index; J holds the imag index. */
3688 j = i - sizeof(float16);
3689 i -= 2 * sizeof(float16);
3690
3691 e0 = *(float16 *)(vn + H1_2(i));
3692 e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
3693 e2 = *(float16 *)(vn + H1_2(j));
3694 e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
3695
3696 if (likely((pg >> (i & 63)) & 1)) {
3697 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
3698 }
3699 if (likely((pg >> (j & 63)) & 1)) {
3700 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
3701 }
3702 } while (i & 63);
3703 } while (i != 0);
3704 }
3705
3706 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
3707 void *vs, uint32_t desc)
3708 {
3709 intptr_t j, i = simd_oprsz(desc);
3710 uint64_t *g = vg;
3711 float32 neg_imag = float32_set_sign(0, simd_data(desc));
3712 float32 neg_real = float32_chs(neg_imag);
3713
3714 do {
3715 uint64_t pg = g[(i - 1) >> 6];
3716 do {
3717 float32 e0, e1, e2, e3;
3718
3719 /* I holds the real index; J holds the imag index. */
3720 j = i - sizeof(float32);
3721 i -= 2 * sizeof(float32);
3722
3723 e0 = *(float32 *)(vn + H1_4(i));
3724 e1 = *(float32 *)(vm + H1_4(j)) ^ neg_real;
3725 e2 = *(float32 *)(vn + H1_4(j));
3726 e3 = *(float32 *)(vm + H1_4(i)) ^ neg_imag;
3727
3728 if (likely((pg >> (i & 63)) & 1)) {
3729 *(float32 *)(vd + H1_4(i)) = float32_add(e0, e1, vs);
3730 }
3731 if (likely((pg >> (j & 63)) & 1)) {
3732 *(float32 *)(vd + H1_4(j)) = float32_add(e2, e3, vs);
3733 }
3734 } while (i & 63);
3735 } while (i != 0);
3736 }
3737
3738 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
3739 void *vs, uint32_t desc)
3740 {
3741 intptr_t j, i = simd_oprsz(desc);
3742 uint64_t *g = vg;
3743 float64 neg_imag = float64_set_sign(0, simd_data(desc));
3744 float64 neg_real = float64_chs(neg_imag);
3745
3746 do {
3747 uint64_t pg = g[(i - 1) >> 6];
3748 do {
3749 float64 e0, e1, e2, e3;
3750
3751 /* I holds the real index; J holds the imag index. */
3752 j = i - sizeof(float64);
3753 i -= 2 * sizeof(float64);
3754
3755 e0 = *(float64 *)(vn + i);
3756 e1 = *(float64 *)(vm + j) ^ neg_real;
3757 e2 = *(float64 *)(vn + j);
3758 e3 = *(float64 *)(vm + i) ^ neg_imag;
3759
3760 if (likely((pg >> (i & 63)) & 1)) {
3761 *(float64 *)(vd + i) = float64_add(e0, e1, vs);
3762 }
3763 if (likely((pg >> (j & 63)) & 1)) {
3764 *(float64 *)(vd + j) = float64_add(e2, e3, vs);
3765 }
3766 } while (i & 63);
3767 } while (i != 0);
3768 }
3769
3770 /*
3771 * FP Complex Multiply
3772 */
3773
3774 QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT + 22 > 32);
3775
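/* FCMLA accumulates n * m with m rotated by rot * 90 degrees.  Writing
 * (nr, ni) and (mr, mi) for the real and imaginary parts, the four
 * rotations produce the partial products
 *   rot 0:  d.re += nr * mr;  d.im += nr * mi;
 *   rot 1:  d.re -= ni * mi;  d.im += ni * mr;
 *   rot 2:  d.re -= nr * mr;  d.im -= nr * mi;
 *   rot 3:  d.re += ni * mi;  d.im -= ni * mr;
 * which the helpers below implement by selecting nr or ni with flip and
 * folding the signs into the m operand via neg_real and neg_imag.
 */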
3776 void HELPER(sve_fcmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3777 {
3778 intptr_t j, i = simd_oprsz(desc);
3779 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3780 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3781 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3782 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3783 unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
3784 bool flip = rot & 1;
3785 float16 neg_imag, neg_real;
3786 void *vd = &env->vfp.zregs[rd];
3787 void *vn = &env->vfp.zregs[rn];
3788 void *vm = &env->vfp.zregs[rm];
3789 void *va = &env->vfp.zregs[ra];
3790 uint64_t *g = vg;
3791
3792 neg_imag = float16_set_sign(0, (rot & 2) != 0);
3793 neg_real = float16_set_sign(0, rot == 1 || rot == 2);
3794
3795 do {
3796 uint64_t pg = g[(i - 1) >> 6];
3797 do {
3798 float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
3799
3800 /* I holds the real index; J holds the imag index. */
3801 j = i - sizeof(float16);
3802 i -= 2 * sizeof(float16);
3803
3804 nr = *(float16 *)(vn + H1_2(i));
3805 ni = *(float16 *)(vn + H1_2(j));
3806 mr = *(float16 *)(vm + H1_2(i));
3807 mi = *(float16 *)(vm + H1_2(j));
3808
3809 e2 = (flip ? ni : nr);
3810 e1 = (flip ? mi : mr) ^ neg_real;
3811 e4 = e2;
3812 e3 = (flip ? mr : mi) ^ neg_imag;
3813
3814 if (likely((pg >> (i & 63)) & 1)) {
3815 d = *(float16 *)(va + H1_2(i));
3816 d = float16_muladd(e2, e1, d, 0, &env->vfp.fp_status_f16);
3817 *(float16 *)(vd + H1_2(i)) = d;
3818 }
3819 if (likely((pg >> (j & 63)) & 1)) {
3820 d = *(float16 *)(va + H1_2(j));
3821 d = float16_muladd(e4, e3, d, 0, &env->vfp.fp_status_f16);
3822 *(float16 *)(vd + H1_2(j)) = d;
3823 }
3824 } while (i & 63);
3825 } while (i != 0);
3826 }
3827
3828 void HELPER(sve_fcmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3829 {
3830 intptr_t j, i = simd_oprsz(desc);
3831 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3832 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3833 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3834 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3835 unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
3836 bool flip = rot & 1;
3837 float32 neg_imag, neg_real;
3838 void *vd = &env->vfp.zregs[rd];
3839 void *vn = &env->vfp.zregs[rn];
3840 void *vm = &env->vfp.zregs[rm];
3841 void *va = &env->vfp.zregs[ra];
3842 uint64_t *g = vg;
3843
3844 neg_imag = float32_set_sign(0, (rot & 2) != 0);
3845 neg_real = float32_set_sign(0, rot == 1 || rot == 2);
3846
3847 do {
3848 uint64_t pg = g[(i - 1) >> 6];
3849 do {
3850 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
3851
3852 /* I holds the real index; J holds the imag index. */
3853 j = i - sizeof(float32);
3854 i -= 2 * sizeof(float32);
3855
3856 nr = *(float32 *)(vn + H1_4(i));
3857 ni = *(float32 *)(vn + H1_4(j));
3858 mr = *(float32 *)(vm + H1_4(i));
3859 mi = *(float32 *)(vm + H1_4(j));
3860
3861 e2 = (flip ? ni : nr);
3862 e1 = (flip ? mi : mr) ^ neg_real;
3863 e4 = e2;
3864 e3 = (flip ? mr : mi) ^ neg_imag;
3865
3866 if (likely((pg >> (i & 63)) & 1)) {
3867 d = *(float32 *)(va + H1_4(i));
3868 d = float32_muladd(e2, e1, d, 0, &env->vfp.fp_status);
3869 *(float32 *)(vd + H1_4(i)) = d;
3870 }
3871 if (likely((pg >> (j & 63)) & 1)) {
3872 d = *(float32 *)(va + H1_4(j));
3873 d = float32_muladd(e4, e3, d, 0, &env->vfp.fp_status);
3874 *(float32 *)(vd + H1_4(j)) = d;
3875 }
3876 } while (i & 63);
3877 } while (i != 0);
3878 }
3879
3880 void HELPER(sve_fcmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3881 {
3882 intptr_t j, i = simd_oprsz(desc);
3883 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3884 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3885 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3886 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3887 unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
3888 bool flip = rot & 1;
3889 float64 neg_imag, neg_real;
3890 void *vd = &env->vfp.zregs[rd];
3891 void *vn = &env->vfp.zregs[rn];
3892 void *vm = &env->vfp.zregs[rm];
3893 void *va = &env->vfp.zregs[ra];
3894 uint64_t *g = vg;
3895
3896 neg_imag = float64_set_sign(0, (rot & 2) != 0);
3897 neg_real = float64_set_sign(0, rot == 1 || rot == 2);
3898
3899 do {
3900 uint64_t pg = g[(i - 1) >> 6];
3901 do {
3902 float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
3903
3904 /* I holds the real index; J holds the imag index. */
3905 j = i - sizeof(float64);
3906 i -= 2 * sizeof(float64);
3907
3908 nr = *(float64 *)(vn + i);
3909 ni = *(float64 *)(vn + j);
3910 mr = *(float64 *)(vm + i);
3911 mi = *(float64 *)(vm + j);
3912
3913 e2 = (flip ? ni : nr);
3914 e1 = (flip ? mi : mr) ^ neg_real;
3915 e4 = e2;
3916 e3 = (flip ? mr : mi) ^ neg_imag;
3917
3918 if (likely((pg >> (i & 63)) & 1)) {
3919 d = *(float64 *)(va + i);
3920 d = float64_muladd(e2, e1, d, 0, &env->vfp.fp_status);
3921 *(float64 *)(vd + i) = d;
3922 }
3923 if (likely((pg >> (j & 63)) & 1)) {
3924 d = *(float64 *)(va + j);
3925 d = float64_muladd(e4, e3, d, 0, &env->vfp.fp_status);
3926 *(float64 *)(vd + j) = d;
3927 }
3928 } while (i & 63);
3929 } while (i != 0);
3930 }
3931
3932 /*
3933 * Load contiguous data, protected by a governing predicate.
3934 */
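/* In the macros below the governing predicate is consumed 16 bits at a
 * time: one predicate bit per byte of the vector, of which only the
 * least significant bit of each element-sized group is tested (pg & 1
 * after shifting pg right by sizeof(TYPEE) per element).
 */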
3935 #define DO_LD1(NAME, FN, TYPEE, TYPEM, H) \
3936 static void do_##NAME(CPUARMState *env, void *vd, void *vg, \
3937 target_ulong addr, intptr_t oprsz, \
3938 uintptr_t ra) \
3939 { \
3940 intptr_t i = 0; \
3941 do { \
3942 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
3943 do { \
3944 TYPEM m = 0; \
3945 if (pg & 1) { \
3946 m = FN(env, addr, ra); \
3947 } \
3948 *(TYPEE *)(vd + H(i)) = m; \
3949 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
3950 addr += sizeof(TYPEM); \
3951 } while (i & 15); \
3952 } while (i < oprsz); \
3953 } \
3954 void HELPER(NAME)(CPUARMState *env, void *vg, \
3955 target_ulong addr, uint32_t desc) \
3956 { \
3957 do_##NAME(env, &env->vfp.zregs[simd_data(desc)], vg, \
3958 addr, simd_oprsz(desc), GETPC()); \
3959 }
3960
3961 #define DO_LD2(NAME, FN, TYPEE, TYPEM, H) \
3962 void HELPER(NAME)(CPUARMState *env, void *vg, \
3963 target_ulong addr, uint32_t desc) \
3964 { \
3965 intptr_t i, oprsz = simd_oprsz(desc); \
3966 intptr_t ra = GETPC(); \
3967 unsigned rd = simd_data(desc); \
3968 void *d1 = &env->vfp.zregs[rd]; \
3969 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
3970 for (i = 0; i < oprsz; ) { \
3971 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
3972 do { \
3973 TYPEM m1 = 0, m2 = 0; \
3974 if (pg & 1) { \
3975 m1 = FN(env, addr, ra); \
3976 m2 = FN(env, addr + sizeof(TYPEM), ra); \
3977 } \
3978 *(TYPEE *)(d1 + H(i)) = m1; \
3979 *(TYPEE *)(d2 + H(i)) = m2; \
3980 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
3981 addr += 2 * sizeof(TYPEM); \
3982 } while (i & 15); \
3983 } \
3984 }
3985
3986 #define DO_LD3(NAME, FN, TYPEE, TYPEM, H) \
3987 void HELPER(NAME)(CPUARMState *env, void *vg, \
3988 target_ulong addr, uint32_t desc) \
3989 { \
3990 intptr_t i, oprsz = simd_oprsz(desc); \
3991 intptr_t ra = GETPC(); \
3992 unsigned rd = simd_data(desc); \
3993 void *d1 = &env->vfp.zregs[rd]; \
3994 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
3995 void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \
3996 for (i = 0; i < oprsz; ) { \
3997 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
3998 do { \
3999 TYPEM m1 = 0, m2 = 0, m3 = 0; \
4000 if (pg & 1) { \
4001 m1 = FN(env, addr, ra); \
4002 m2 = FN(env, addr + sizeof(TYPEM), ra); \
4003 m3 = FN(env, addr + 2 * sizeof(TYPEM), ra); \
4004 } \
4005 *(TYPEE *)(d1 + H(i)) = m1; \
4006 *(TYPEE *)(d2 + H(i)) = m2; \
4007 *(TYPEE *)(d3 + H(i)) = m3; \
4008 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
4009 addr += 3 * sizeof(TYPEM); \
4010 } while (i & 15); \
4011 } \
4012 }
4013
4014 #define DO_LD4(NAME, FN, TYPEE, TYPEM, H) \
4015 void HELPER(NAME)(CPUARMState *env, void *vg, \
4016 target_ulong addr, uint32_t desc) \
4017 { \
4018 intptr_t i, oprsz = simd_oprsz(desc); \
4019 intptr_t ra = GETPC(); \
4020 unsigned rd = simd_data(desc); \
4021 void *d1 = &env->vfp.zregs[rd]; \
4022 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
4023 void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \
4024 void *d4 = &env->vfp.zregs[(rd + 3) & 31]; \
4025 for (i = 0; i < oprsz; ) { \
4026 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4027 do { \
4028 TYPEM m1 = 0, m2 = 0, m3 = 0, m4 = 0; \
4029 if (pg & 1) { \
4030 m1 = FN(env, addr, ra); \
4031 m2 = FN(env, addr + sizeof(TYPEM), ra); \
4032 m3 = FN(env, addr + 2 * sizeof(TYPEM), ra); \
4033 m4 = FN(env, addr + 3 * sizeof(TYPEM), ra); \
4034 } \
4035 *(TYPEE *)(d1 + H(i)) = m1; \
4036 *(TYPEE *)(d2 + H(i)) = m2; \
4037 *(TYPEE *)(d3 + H(i)) = m3; \
4038 *(TYPEE *)(d4 + H(i)) = m4; \
4039 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
4040 addr += 4 * sizeof(TYPEM); \
4041 } while (i & 15); \
4042 } \
4043 }
4044
4045 DO_LD1(sve_ld1bhu_r, cpu_ldub_data_ra, uint16_t, uint8_t, H1_2)
4046 DO_LD1(sve_ld1bhs_r, cpu_ldsb_data_ra, uint16_t, int8_t, H1_2)
4047 DO_LD1(sve_ld1bsu_r, cpu_ldub_data_ra, uint32_t, uint8_t, H1_4)
4048 DO_LD1(sve_ld1bss_r, cpu_ldsb_data_ra, uint32_t, int8_t, H1_4)
4049 DO_LD1(sve_ld1bdu_r, cpu_ldub_data_ra, uint64_t, uint8_t, )
4050 DO_LD1(sve_ld1bds_r, cpu_ldsb_data_ra, uint64_t, int8_t, )
4051
4052 DO_LD1(sve_ld1hsu_r, cpu_lduw_data_ra, uint32_t, uint16_t, H1_4)
4053 DO_LD1(sve_ld1hss_r, cpu_ldsw_data_ra, uint32_t, int16_t, H1_4)
4054 DO_LD1(sve_ld1hdu_r, cpu_lduw_data_ra, uint64_t, uint16_t, )
4055 DO_LD1(sve_ld1hds_r, cpu_ldsw_data_ra, uint64_t, int16_t, )
4056
4057 DO_LD1(sve_ld1sdu_r, cpu_ldl_data_ra, uint64_t, uint32_t, )
4058 DO_LD1(sve_ld1sds_r, cpu_ldl_data_ra, uint64_t, int32_t, )
4059
4060 DO_LD1(sve_ld1bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
4061 DO_LD2(sve_ld2bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
4062 DO_LD3(sve_ld3bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
4063 DO_LD4(sve_ld4bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
4064
4065 DO_LD1(sve_ld1hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
4066 DO_LD2(sve_ld2hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
4067 DO_LD3(sve_ld3hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
4068 DO_LD4(sve_ld4hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
4069
4070 DO_LD1(sve_ld1ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
4071 DO_LD2(sve_ld2ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
4072 DO_LD3(sve_ld3ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
4073 DO_LD4(sve_ld4ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
4074
4075 DO_LD1(sve_ld1dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
4076 DO_LD2(sve_ld2dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
4077 DO_LD3(sve_ld3dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
4078 DO_LD4(sve_ld4dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
4079
4080 #undef DO_LD1
4081 #undef DO_LD2
4082 #undef DO_LD3
4083 #undef DO_LD4
4084
4085 /*
4086 * Load contiguous data, first-fault and no-fault.
4087 */
4088
4089 #ifdef CONFIG_USER_ONLY
4090
4091 /* Fault on byte I. All bits in FFR from I are cleared. The vector
4092 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
4093 * option, which leaves subsequent data unchanged.
4094 */
4095 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
4096 {
4097 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
4098
4099 if (i & 63) {
4100 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
4101 i = ROUND_UP(i, 64);
4102 }
4103 for (; i < oprsz; i += 64) {
4104 ffr[i / 64] = 0;
4105 }
4106 }
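/* For example, a fault at byte i = 70 with oprsz = 256 keeps bits 0..5
 * of ffr word 1, clears its remaining bits via MAKE_64BIT_MASK(0, 6),
 * and then zeroes ffr words 2 and 3 outright; word 0 is untouched.
 */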
4107
4108 /* Hold the mmap lock during the operation so that there is no race
4109 * between page_check_range and the load operation. We expect the
4110 * usual case to have no faults at all, so we check the whole range
4111 * first and if successful defer to the normal load operation.
4112 *
4113 * TODO: Change mmap_lock to a rwlock so that multiple readers
4114 * can run simultaneously. This will probably help other uses
4115 * within QEMU as well.
4116 */
4117 #define DO_LDFF1(PART, FN, TYPEE, TYPEM, H) \
4118 static void do_sve_ldff1##PART(CPUARMState *env, void *vd, void *vg, \
4119 target_ulong addr, intptr_t oprsz, \
4120 bool first, uintptr_t ra) \
4121 { \
4122 intptr_t i = 0; \
4123 do { \
4124 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4125 do { \
4126 TYPEM m = 0; \
4127 if (pg & 1) { \
4128 if (!first && \
4129 unlikely(page_check_range(addr, sizeof(TYPEM), \
4130 PAGE_READ))) { \
4131 record_fault(env, i, oprsz); \
4132 return; \
4133 } \
4134 m = FN(env, addr, ra); \
4135 first = false; \
4136 } \
4137 *(TYPEE *)(vd + H(i)) = m; \
4138 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
4139 addr += sizeof(TYPEM); \
4140 } while (i & 15); \
4141 } while (i < oprsz); \
4142 } \
4143 void HELPER(sve_ldff1##PART)(CPUARMState *env, void *vg, \
4144 target_ulong addr, uint32_t desc) \
4145 { \
4146 intptr_t oprsz = simd_oprsz(desc); \
4147 unsigned rd = simd_data(desc); \
4148 void *vd = &env->vfp.zregs[rd]; \
4149 mmap_lock(); \
4150 if (likely(page_check_range(addr, oprsz, PAGE_READ) == 0)) { \
4151 do_sve_ld1##PART(env, vd, vg, addr, oprsz, GETPC()); \
4152 } else { \
4153 do_sve_ldff1##PART(env, vd, vg, addr, oprsz, true, GETPC()); \
4154 } \
4155 mmap_unlock(); \
4156 }
4157
4158 /* No-fault loads are like first-fault loads without the
4159 * first faulting special case.
4160 */
4161 #define DO_LDNF1(PART) \
4162 void HELPER(sve_ldnf1##PART)(CPUARMState *env, void *vg, \
4163 target_ulong addr, uint32_t desc) \
4164 { \
4165 intptr_t oprsz = simd_oprsz(desc); \
4166 unsigned rd = simd_data(desc); \
4167 void *vd = &env->vfp.zregs[rd]; \
4168 mmap_lock(); \
4169 if (likely(page_check_range(addr, oprsz, PAGE_READ) == 0)) { \
4170 do_sve_ld1##PART(env, vd, vg, addr, oprsz, GETPC()); \
4171 } else { \
4172 do_sve_ldff1##PART(env, vd, vg, addr, oprsz, false, GETPC()); \
4173 } \
4174 mmap_unlock(); \
4175 }
4176
4177 #else
4178
4179 /* TODO: System mode is not yet supported.
4180 * This would probably use tlb_vaddr_to_host.
4181 */
4182 #define DO_LDFF1(PART, FN, TYPEE, TYPEM, H) \
4183 void HELPER(sve_ldff1##PART)(CPUARMState *env, void *vg, \
4184 target_ulong addr, uint32_t desc) \
4185 { \
4186 g_assert_not_reached(); \
4187 }
4188
4189 #define DO_LDNF1(PART) \
4190 void HELPER(sve_ldnf1##PART)(CPUARMState *env, void *vg, \
4191 target_ulong addr, uint32_t desc) \
4192 { \
4193 g_assert_not_reached(); \
4194 }
4195
4196 #endif
4197
4198 DO_LDFF1(bb_r, cpu_ldub_data_ra, uint8_t, uint8_t, H1)
4199 DO_LDFF1(bhu_r, cpu_ldub_data_ra, uint16_t, uint8_t, H1_2)
4200 DO_LDFF1(bhs_r, cpu_ldsb_data_ra, uint16_t, int8_t, H1_2)
4201 DO_LDFF1(bsu_r, cpu_ldub_data_ra, uint32_t, uint8_t, H1_4)
4202 DO_LDFF1(bss_r, cpu_ldsb_data_ra, uint32_t, int8_t, H1_4)
4203 DO_LDFF1(bdu_r, cpu_ldub_data_ra, uint64_t, uint8_t, )
4204 DO_LDFF1(bds_r, cpu_ldsb_data_ra, uint64_t, int8_t, )
4205
4206 DO_LDFF1(hh_r, cpu_lduw_data_ra, uint16_t, uint16_t, H1_2)
4207 DO_LDFF1(hsu_r, cpu_lduw_data_ra, uint32_t, uint16_t, H1_4)
4208 DO_LDFF1(hss_r, cpu_ldsw_data_ra, uint32_t, int16_t, H1_4)
4209 DO_LDFF1(hdu_r, cpu_lduw_data_ra, uint64_t, uint16_t, )
4210 DO_LDFF1(hds_r, cpu_ldsw_data_ra, uint64_t, int16_t, )
4211
4212 DO_LDFF1(ss_r, cpu_ldl_data_ra, uint32_t, uint32_t, H1_4)
4213 DO_LDFF1(sdu_r, cpu_ldl_data_ra, uint64_t, uint32_t, )
4214 DO_LDFF1(sds_r, cpu_ldl_data_ra, uint64_t, int32_t, )
4215
4216 DO_LDFF1(dd_r, cpu_ldq_data_ra, uint64_t, uint64_t, )
4217
4218 #undef DO_LDFF1
4219
4220 DO_LDNF1(bb_r)
4221 DO_LDNF1(bhu_r)
4222 DO_LDNF1(bhs_r)
4223 DO_LDNF1(bsu_r)
4224 DO_LDNF1(bss_r)
4225 DO_LDNF1(bdu_r)
4226 DO_LDNF1(bds_r)
4227
4228 DO_LDNF1(hh_r)
4229 DO_LDNF1(hsu_r)
4230 DO_LDNF1(hss_r)
4231 DO_LDNF1(hdu_r)
4232 DO_LDNF1(hds_r)
4233
4234 DO_LDNF1(ss_r)
4235 DO_LDNF1(sdu_r)
4236 DO_LDNF1(sds_r)
4237
4238 DO_LDNF1(dd_r)
4239
4240 #undef DO_LDNF1
4241
4242 /*
4243 * Store contiguous data, protected by a governing predicate.
4244 */
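/* Unlike the contiguous loads above, which write zero to inactive
 * destination elements, inactive elements here make no memory access at
 * all; only the address is advanced past them.
 */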
4245 #define DO_ST1(NAME, FN, TYPEE, TYPEM, H) \
4246 void HELPER(NAME)(CPUARMState *env, void *vg, \
4247 target_ulong addr, uint32_t desc) \
4248 { \
4249 intptr_t i, oprsz = simd_oprsz(desc); \
4250 intptr_t ra = GETPC(); \
4251 unsigned rd = simd_data(desc); \
4252 void *vd = &env->vfp.zregs[rd]; \
4253 for (i = 0; i < oprsz; ) { \
4254 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4255 do { \
4256 if (pg & 1) { \
4257 TYPEM m = *(TYPEE *)(vd + H(i)); \
4258 FN(env, addr, m, ra); \
4259 } \
4260 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
4261 addr += sizeof(TYPEM); \
4262 } while (i & 15); \
4263 } \
4264 }
4265
4266 #define DO_ST1_D(NAME, FN, TYPEM) \
4267 void HELPER(NAME)(CPUARMState *env, void *vg, \
4268 target_ulong addr, uint32_t desc) \
4269 { \
4270 intptr_t i, oprsz = simd_oprsz(desc) / 8; \
4271 intptr_t ra = GETPC(); \
4272 unsigned rd = simd_data(desc); \
4273 uint64_t *d = &env->vfp.zregs[rd].d[0]; \
4274 uint8_t *pg = vg; \
4275 for (i = 0; i < oprsz; i += 1) { \
4276 if (pg[H1(i)] & 1) { \
4277 FN(env, addr, d[i], ra); \
4278 } \
4279 addr += sizeof(TYPEM); \
4280 } \
4281 }
4282
4283 #define DO_ST2(NAME, FN, TYPEE, TYPEM, H) \
4284 void HELPER(NAME)(CPUARMState *env, void *vg, \
4285 target_ulong addr, uint32_t desc) \
4286 { \
4287 intptr_t i, oprsz = simd_oprsz(desc); \
4288 intptr_t ra = GETPC(); \
4289 unsigned rd = simd_data(desc); \
4290 void *d1 = &env->vfp.zregs[rd]; \
4291 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
4292 for (i = 0; i < oprsz; ) { \
4293 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4294 do { \
4295 if (pg & 1) { \
4296 TYPEM m1 = *(TYPEE *)(d1 + H(i)); \
4297 TYPEM m2 = *(TYPEE *)(d2 + H(i)); \
4298 FN(env, addr, m1, ra); \
4299 FN(env, addr + sizeof(TYPEM), m2, ra); \
4300 } \
4301 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
4302 addr += 2 * sizeof(TYPEM); \
4303 } while (i & 15); \
4304 } \
4305 }
4306
4307 #define DO_ST3(NAME, FN, TYPEE, TYPEM, H) \
4308 void HELPER(NAME)(CPUARMState *env, void *vg, \
4309 target_ulong addr, uint32_t desc) \
4310 { \
4311 intptr_t i, oprsz = simd_oprsz(desc); \
4312 intptr_t ra = GETPC(); \
4313 unsigned rd = simd_data(desc); \
4314 void *d1 = &env->vfp.zregs[rd]; \
4315 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
4316 void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \
4317 for (i = 0; i < oprsz; ) { \
4318 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4319 do { \
4320 if (pg & 1) { \
4321 TYPEM m1 = *(TYPEE *)(d1 + H(i)); \
4322 TYPEM m2 = *(TYPEE *)(d2 + H(i)); \
4323 TYPEM m3 = *(TYPEE *)(d3 + H(i)); \
4324 FN(env, addr, m1, ra); \
4325 FN(env, addr + sizeof(TYPEM), m2, ra); \
4326 FN(env, addr + 2 * sizeof(TYPEM), m3, ra); \
4327 } \
4328 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
4329 addr += 3 * sizeof(TYPEM); \
4330 } while (i & 15); \
4331 } \
4332 }
4333
4334 #define DO_ST4(NAME, FN, TYPEE, TYPEM, H) \
4335 void HELPER(NAME)(CPUARMState *env, void *vg, \
4336 target_ulong addr, uint32_t desc) \
4337 { \
4338 intptr_t i, oprsz = simd_oprsz(desc); \
4339 intptr_t ra = GETPC(); \
4340 unsigned rd = simd_data(desc); \
4341 void *d1 = &env->vfp.zregs[rd]; \
4342 void *d2 = &env->vfp.zregs[(rd + 1) & 31]; \
4343 void *d3 = &env->vfp.zregs[(rd + 2) & 31]; \
4344 void *d4 = &env->vfp.zregs[(rd + 3) & 31]; \
4345 for (i = 0; i < oprsz; ) { \
4346 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4347 do { \
4348 if (pg & 1) { \
4349 TYPEM m1 = *(TYPEE *)(d1 + H(i)); \
4350 TYPEM m2 = *(TYPEE *)(d2 + H(i)); \
4351 TYPEM m3 = *(TYPEE *)(d3 + H(i)); \
4352 TYPEM m4 = *(TYPEE *)(d4 + H(i)); \
4353 FN(env, addr, m1, ra); \
4354 FN(env, addr + sizeof(TYPEM), m2, ra); \
4355 FN(env, addr + 2 * sizeof(TYPEM), m3, ra); \
4356 FN(env, addr + 3 * sizeof(TYPEM), m4, ra); \
4357 } \
4358 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
4359 addr += 4 * sizeof(TYPEM); \
4360 } while (i & 15); \
4361 } \
4362 }
4363
4364 DO_ST1(sve_st1bh_r, cpu_stb_data_ra, uint16_t, uint8_t, H1_2)
4365 DO_ST1(sve_st1bs_r, cpu_stb_data_ra, uint32_t, uint8_t, H1_4)
4366 DO_ST1_D(sve_st1bd_r, cpu_stb_data_ra, uint8_t)
4367
4368 DO_ST1(sve_st1hs_r, cpu_stw_data_ra, uint32_t, uint16_t, H1_4)
4369 DO_ST1_D(sve_st1hd_r, cpu_stw_data_ra, uint16_t)
4370
4371 DO_ST1_D(sve_st1sd_r, cpu_stl_data_ra, uint32_t)
4372
4373 DO_ST1(sve_st1bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
4374 DO_ST2(sve_st2bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
4375 DO_ST3(sve_st3bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
4376 DO_ST4(sve_st4bb_r, cpu_stb_data_ra, uint8_t, uint8_t, H1)
4377
4378 DO_ST1(sve_st1hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
4379 DO_ST2(sve_st2hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
4380 DO_ST3(sve_st3hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
4381 DO_ST4(sve_st4hh_r, cpu_stw_data_ra, uint16_t, uint16_t, H1_2)
4382
4383 DO_ST1(sve_st1ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
4384 DO_ST2(sve_st2ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
4385 DO_ST3(sve_st3ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
4386 DO_ST4(sve_st4ss_r, cpu_stl_data_ra, uint32_t, uint32_t, H1_4)
4387
4388 DO_ST1_D(sve_st1dd_r, cpu_stq_data_ra, uint64_t)
4389
4390 void HELPER(sve_st2dd_r)(CPUARMState *env, void *vg,
4391 target_ulong addr, uint32_t desc)
4392 {
4393 intptr_t i, oprsz = simd_oprsz(desc) / 8;
4394 intptr_t ra = GETPC();
4395 unsigned rd = simd_data(desc);
4396 uint64_t *d1 = &env->vfp.zregs[rd].d[0];
4397 uint64_t *d2 = &env->vfp.zregs[(rd + 1) & 31].d[0];
4398 uint8_t *pg = vg;
4399
4400 for (i = 0; i < oprsz; i += 1) {
4401 if (pg[H1(i)] & 1) {
4402 cpu_stq_data_ra(env, addr, d1[i], ra);
4403 cpu_stq_data_ra(env, addr + 8, d2[i], ra);
4404 }
4405 addr += 2 * 8;
4406 }
4407 }
4408
4409 void HELPER(sve_st3dd_r)(CPUARMState *env, void *vg,
4410 target_ulong addr, uint32_t desc)
4411 {
4412 intptr_t i, oprsz = simd_oprsz(desc) / 8;
4413 intptr_t ra = GETPC();
4414 unsigned rd = simd_data(desc);
4415 uint64_t *d1 = &env->vfp.zregs[rd].d[0];
4416 uint64_t *d2 = &env->vfp.zregs[(rd + 1) & 31].d[0];
4417 uint64_t *d3 = &env->vfp.zregs[(rd + 2) & 31].d[0];
4418 uint8_t *pg = vg;
4419
4420 for (i = 0; i < oprsz; i += 1) {
4421 if (pg[H1(i)] & 1) {
4422 cpu_stq_data_ra(env, addr, d1[i], ra);
4423 cpu_stq_data_ra(env, addr + 8, d2[i], ra);
4424 cpu_stq_data_ra(env, addr + 16, d3[i], ra);
4425 }
4426 addr += 3 * 8;
4427 }
4428 }
4429
4430 void HELPER(sve_st4dd_r)(CPUARMState *env, void *vg,
4431 target_ulong addr, uint32_t desc)
4432 {
4433 intptr_t i, oprsz = simd_oprsz(desc) / 8;
4434 intptr_t ra = GETPC();
4435 unsigned rd = simd_data(desc);
4436 uint64_t *d1 = &env->vfp.zregs[rd].d[0];
4437 uint64_t *d2 = &env->vfp.zregs[(rd + 1) & 31].d[0];
4438 uint64_t *d3 = &env->vfp.zregs[(rd + 2) & 31].d[0];
4439 uint64_t *d4 = &env->vfp.zregs[(rd + 3) & 31].d[0];
4440 uint8_t *pg = vg;
4441
4442 for (i = 0; i < oprsz; i += 1) {
4443 if (pg[H1(i)] & 1) {
4444 cpu_stq_data_ra(env, addr, d1[i], ra);
4445 cpu_stq_data_ra(env, addr + 8, d2[i], ra);
4446 cpu_stq_data_ra(env, addr + 16, d3[i], ra);
4447 cpu_stq_data_ra(env, addr + 24, d4[i], ra);
4448 }
4449 addr += 4 * 8;
4450 }
4451 }
4452
4453 /* Loads with a vector index. */
4454
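/* Each active element loads from base + (off << scale), where off is
 * the corresponding element of the index vector interpreted as TYPEI:
 * uint32_t or int32_t for the zero- and sign-extended 32-bit index
 * forms (_zsu, _zss) and uint64_t for the 64-bit form (_zd).  scale
 * comes from simd_data(desc), presumably 0 for the unscaled-offset
 * instruction forms.
 */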
4455 #define DO_LD1_ZPZ_S(NAME, TYPEI, TYPEM, FN) \
4456 void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
4457 target_ulong base, uint32_t desc) \
4458 { \
4459 intptr_t i, oprsz = simd_oprsz(desc); \
4460 unsigned scale = simd_data(desc); \
4461 uintptr_t ra = GETPC(); \
4462 for (i = 0; i < oprsz; ) { \
4463 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4464 do { \
4465 TYPEM m = 0; \
4466 if (pg & 1) { \
4467 target_ulong off = *(TYPEI *)(vm + H1_4(i)); \
4468 m = FN(env, base + (off << scale), ra); \
4469 } \
4470 *(uint32_t *)(vd + H1_4(i)) = m; \
4471 i += 4, pg >>= 4; \
4472 } while (i & 15); \
4473 } \
4474 }
4475
4476 #define DO_LD1_ZPZ_D(NAME, TYPEI, TYPEM, FN) \
4477 void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
4478 target_ulong base, uint32_t desc) \
4479 { \
4480 intptr_t i, oprsz = simd_oprsz(desc) / 8; \
4481 unsigned scale = simd_data(desc); \
4482 uintptr_t ra = GETPC(); \
4483 uint64_t *d = vd, *m = vm; uint8_t *pg = vg; \
4484 for (i = 0; i < oprsz; i++) { \
4485 TYPEM mm = 0; \
4486 if (pg[H1(i)] & 1) { \
4487 target_ulong off = (TYPEI)m[i]; \
4488 mm = FN(env, base + (off << scale), ra); \
4489 } \
4490 d[i] = mm; \
4491 } \
4492 }
4493
4494 DO_LD1_ZPZ_S(sve_ldbsu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
4495 DO_LD1_ZPZ_S(sve_ldhsu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
4496 DO_LD1_ZPZ_S(sve_ldssu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
4497 DO_LD1_ZPZ_S(sve_ldbss_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
4498 DO_LD1_ZPZ_S(sve_ldhss_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
4499
4500 DO_LD1_ZPZ_S(sve_ldbsu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
4501 DO_LD1_ZPZ_S(sve_ldhsu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
4502 DO_LD1_ZPZ_S(sve_ldssu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
4503 DO_LD1_ZPZ_S(sve_ldbss_zss, int32_t, int8_t, cpu_ldub_data_ra)
4504 DO_LD1_ZPZ_S(sve_ldhss_zss, int32_t, int16_t, cpu_lduw_data_ra)
4505
4506 DO_LD1_ZPZ_D(sve_ldbdu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
4507 DO_LD1_ZPZ_D(sve_ldhdu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
4508 DO_LD1_ZPZ_D(sve_ldsdu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
4509 DO_LD1_ZPZ_D(sve_ldddu_zsu, uint32_t, uint64_t, cpu_ldq_data_ra)
4510 DO_LD1_ZPZ_D(sve_ldbds_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
4511 DO_LD1_ZPZ_D(sve_ldhds_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
4512 DO_LD1_ZPZ_D(sve_ldsds_zsu, uint32_t, int32_t, cpu_ldl_data_ra)
4513
4514 DO_LD1_ZPZ_D(sve_ldbdu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
4515 DO_LD1_ZPZ_D(sve_ldhdu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
4516 DO_LD1_ZPZ_D(sve_ldsdu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
4517 DO_LD1_ZPZ_D(sve_ldddu_zss, int32_t, uint64_t, cpu_ldq_data_ra)
4518 DO_LD1_ZPZ_D(sve_ldbds_zss, int32_t, int8_t, cpu_ldub_data_ra)
4519 DO_LD1_ZPZ_D(sve_ldhds_zss, int32_t, int16_t, cpu_lduw_data_ra)
4520 DO_LD1_ZPZ_D(sve_ldsds_zss, int32_t, int32_t, cpu_ldl_data_ra)
4521
4522 DO_LD1_ZPZ_D(sve_ldbdu_zd, uint64_t, uint8_t, cpu_ldub_data_ra)
4523 DO_LD1_ZPZ_D(sve_ldhdu_zd, uint64_t, uint16_t, cpu_lduw_data_ra)
4524 DO_LD1_ZPZ_D(sve_ldsdu_zd, uint64_t, uint32_t, cpu_ldl_data_ra)
4525 DO_LD1_ZPZ_D(sve_ldddu_zd, uint64_t, uint64_t, cpu_ldq_data_ra)
4526 DO_LD1_ZPZ_D(sve_ldbds_zd, uint64_t, int8_t, cpu_ldub_data_ra)
4527 DO_LD1_ZPZ_D(sve_ldhds_zd, uint64_t, int16_t, cpu_lduw_data_ra)
4528 DO_LD1_ZPZ_D(sve_ldsds_zd, uint64_t, int32_t, cpu_ldl_data_ra)
4529
4530 /* First fault loads with a vector index. */
4531
4532 #ifdef CONFIG_USER_ONLY
4533
4534 #define DO_LDFF1_ZPZ(NAME, TYPEE, TYPEI, TYPEM, FN, H) \
4535 void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
4536 target_ulong base, uint32_t desc) \
4537 { \
4538 intptr_t i, oprsz = simd_oprsz(desc); \
4539 unsigned scale = simd_data(desc); \
4540 uintptr_t ra = GETPC(); \
4541 bool first = true; \
4542 mmap_lock(); \
4543 for (i = 0; i < oprsz; ) { \
4544 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4545 do { \
4546 TYPEM m = 0; \
4547 if (pg & 1) { \
4548 target_ulong off = *(TYPEI *)(vm + H(i)); \
4549 target_ulong addr = base + (off << scale); \
4550 if (!first && \
4551 page_check_range(addr, sizeof(TYPEM), PAGE_READ)) { \
4552 record_fault(env, i, oprsz); \
4553 goto exit; \
4554 } \
4555 m = FN(env, addr, ra); \
4556 first = false; \
4557 } \
4558 *(TYPEE *)(vd + H(i)) = m; \
4559 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
4560 } while (i & 15); \
4561 } \
4562 exit: \
4563 mmap_unlock(); \
4564 }
4565
4566 #else
4567
4568 #define DO_LDFF1_ZPZ(NAME, TYPEE, TYPEI, TYPEM, FN, H) \
4569 void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
4570 target_ulong base, uint32_t desc) \
4571 { \
4572 g_assert_not_reached(); \
4573 }
4574
4575 #endif
4576
4577 #define DO_LDFF1_ZPZ_S(NAME, TYPEI, TYPEM, FN) \
4578 DO_LDFF1_ZPZ(NAME, uint32_t, TYPEI, TYPEM, FN, H1_4)
4579 #define DO_LDFF1_ZPZ_D(NAME, TYPEI, TYPEM, FN) \
4580 DO_LDFF1_ZPZ(NAME, uint64_t, TYPEI, TYPEM, FN, )
4581
4582 DO_LDFF1_ZPZ_S(sve_ldffbsu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
4583 DO_LDFF1_ZPZ_S(sve_ldffhsu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
4584 DO_LDFF1_ZPZ_S(sve_ldffssu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
4585 DO_LDFF1_ZPZ_S(sve_ldffbss_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
4586 DO_LDFF1_ZPZ_S(sve_ldffhss_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
4587
4588 DO_LDFF1_ZPZ_S(sve_ldffbsu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
4589 DO_LDFF1_ZPZ_S(sve_ldffhsu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
4590 DO_LDFF1_ZPZ_S(sve_ldffssu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
4591 DO_LDFF1_ZPZ_S(sve_ldffbss_zss, int32_t, int8_t, cpu_ldub_data_ra)
4592 DO_LDFF1_ZPZ_S(sve_ldffhss_zss, int32_t, int16_t, cpu_lduw_data_ra)
4593
4594 DO_LDFF1_ZPZ_D(sve_ldffbdu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
4595 DO_LDFF1_ZPZ_D(sve_ldffhdu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
4596 DO_LDFF1_ZPZ_D(sve_ldffsdu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
4597 DO_LDFF1_ZPZ_D(sve_ldffddu_zsu, uint32_t, uint64_t, cpu_ldq_data_ra)
4598 DO_LDFF1_ZPZ_D(sve_ldffbds_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
4599 DO_LDFF1_ZPZ_D(sve_ldffhds_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
4600 DO_LDFF1_ZPZ_D(sve_ldffsds_zsu, uint32_t, int32_t, cpu_ldl_data_ra)
4601
4602 DO_LDFF1_ZPZ_D(sve_ldffbdu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
4603 DO_LDFF1_ZPZ_D(sve_ldffhdu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
4604 DO_LDFF1_ZPZ_D(sve_ldffsdu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
4605 DO_LDFF1_ZPZ_D(sve_ldffddu_zss, int32_t, uint64_t, cpu_ldq_data_ra)
4606 DO_LDFF1_ZPZ_D(sve_ldffbds_zss, int32_t, int8_t, cpu_ldub_data_ra)
4607 DO_LDFF1_ZPZ_D(sve_ldffhds_zss, int32_t, int16_t, cpu_lduw_data_ra)
4608 DO_LDFF1_ZPZ_D(sve_ldffsds_zss, int32_t, int32_t, cpu_ldl_data_ra)
4609
4610 DO_LDFF1_ZPZ_D(sve_ldffbdu_zd, uint64_t, uint8_t, cpu_ldub_data_ra)
4611 DO_LDFF1_ZPZ_D(sve_ldffhdu_zd, uint64_t, uint16_t, cpu_lduw_data_ra)
4612 DO_LDFF1_ZPZ_D(sve_ldffsdu_zd, uint64_t, uint32_t, cpu_ldl_data_ra)
4613 DO_LDFF1_ZPZ_D(sve_ldffddu_zd, uint64_t, uint64_t, cpu_ldq_data_ra)
4614 DO_LDFF1_ZPZ_D(sve_ldffbds_zd, uint64_t, int8_t, cpu_ldub_data_ra)
4615 DO_LDFF1_ZPZ_D(sve_ldffhds_zd, uint64_t, int16_t, cpu_lduw_data_ra)
4616 DO_LDFF1_ZPZ_D(sve_ldffsds_zd, uint64_t, int32_t, cpu_ldl_data_ra)
4617
4618 /* Stores with a vector index. */
4619
4620 #define DO_ST1_ZPZ_S(NAME, TYPEI, FN) \
4621 void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
4622 target_ulong base, uint32_t desc) \
4623 { \
4624 intptr_t i, oprsz = simd_oprsz(desc); \
4625 unsigned scale = simd_data(desc); \
4626 uintptr_t ra = GETPC(); \
4627 for (i = 0; i < oprsz; ) { \
4628 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4629 do { \
4630 if (likely(pg & 1)) { \
4631 target_ulong off = *(TYPEI *)(vm + H1_4(i)); \
4632 uint32_t d = *(uint32_t *)(vd + H1_4(i)); \
4633 FN(env, base + (off << scale), d, ra); \
4634 } \
4635 i += sizeof(uint32_t), pg >>= sizeof(uint32_t); \
4636 } while (i & 15); \
4637 } \
4638 }
4639
4640 #define DO_ST1_ZPZ_D(NAME, TYPEI, FN) \
4641 void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
4642 target_ulong base, uint32_t desc) \
4643 { \
4644 intptr_t i, oprsz = simd_oprsz(desc) / 8; \
4645 unsigned scale = simd_data(desc); \
4646 uintptr_t ra = GETPC(); \
4647 uint64_t *d = vd, *m = vm; uint8_t *pg = vg; \
4648 for (i = 0; i < oprsz; i++) { \
4649 if (likely(pg[H1(i)] & 1)) { \
4650 target_ulong off = (target_ulong)(TYPEI)m[i] << scale; \
4651 FN(env, base + off, d[i], ra); \
4652 } \
4653 } \
4654 }
4655
4656 DO_ST1_ZPZ_S(sve_stbs_zsu, uint32_t, cpu_stb_data_ra)
4657 DO_ST1_ZPZ_S(sve_sths_zsu, uint32_t, cpu_stw_data_ra)
4658 DO_ST1_ZPZ_S(sve_stss_zsu, uint32_t, cpu_stl_data_ra)
4659
4660 DO_ST1_ZPZ_S(sve_stbs_zss, int32_t, cpu_stb_data_ra)
4661 DO_ST1_ZPZ_S(sve_sths_zss, int32_t, cpu_stw_data_ra)
4662 DO_ST1_ZPZ_S(sve_stss_zss, int32_t, cpu_stl_data_ra)
4663
4664 DO_ST1_ZPZ_D(sve_stbd_zsu, uint32_t, cpu_stb_data_ra)
4665 DO_ST1_ZPZ_D(sve_sthd_zsu, uint32_t, cpu_stw_data_ra)
4666 DO_ST1_ZPZ_D(sve_stsd_zsu, uint32_t, cpu_stl_data_ra)
4667 DO_ST1_ZPZ_D(sve_stdd_zsu, uint32_t, cpu_stq_data_ra)
4668
4669 DO_ST1_ZPZ_D(sve_stbd_zss, int32_t, cpu_stb_data_ra)
4670 DO_ST1_ZPZ_D(sve_sthd_zss, int32_t, cpu_stw_data_ra)
4671 DO_ST1_ZPZ_D(sve_stsd_zss, int32_t, cpu_stl_data_ra)
4672 DO_ST1_ZPZ_D(sve_stdd_zss, int32_t, cpu_stq_data_ra)
4673
4674 DO_ST1_ZPZ_D(sve_stbd_zd, uint64_t, cpu_stb_data_ra)
4675 DO_ST1_ZPZ_D(sve_sthd_zd, uint64_t, cpu_stw_data_ra)
4676 DO_ST1_ZPZ_D(sve_stsd_zd, uint64_t, cpu_stl_data_ra)
4677 DO_ST1_ZPZ_D(sve_stdd_zd, uint64_t, cpu_stq_data_ra)