1 /*
2 * ARM SVE Operations
3 *
4 * Copyright (c) 2018 Linaro, Ltd.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "exec/exec-all.h"
23 #include "exec/cpu_ldst.h"
24 #include "exec/helper-proto.h"
25 #include "tcg/tcg-gvec-desc.h"
26 #include "fpu/softfloat.h"
27
28
29 /* Note that vector data is stored in host-endian 64-bit chunks,
30 so addressing units smaller than that need a host-endian fixup. */
31 #ifdef HOST_WORDS_BIGENDIAN
32 #define H1(x) ((x) ^ 7)
33 #define H1_2(x) ((x) ^ 6)
34 #define H1_4(x) ((x) ^ 4)
35 #define H2(x) ((x) ^ 3)
36 #define H4(x) ((x) ^ 1)
37 #else
38 #define H1(x) (x)
39 #define H1_2(x) (x)
40 #define H1_4(x) (x)
41 #define H2(x) (x)
42 #define H4(x) (x)
43 #endif
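/* Example of the fixups above, worked by hand: logical byte element 1 is
 * always read via a byte offset of H1(1).  On a little-endian host that is
 * offset 1; on a big-endian host it is 1 ^ 7 == 6, which is where the
 * second-least-significant byte of the containing host uint64_t chunk lives.
 * H1_2 and H1_4 do the same for 16-bit and 32-bit accesses made through
 * byte offsets, while H2 and H4 adjust indices into uint16_t and uint32_t
 * arrays respectively.
 */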
44
45 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
46 *
47 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
48 * and bit 0 set if C is set. Compare the definitions of these variables
49 * within CPUARMState.
50 */
51
52 /* For no G bits set, NZCV = C. */
53 #define PREDTEST_INIT 1
54
55 /* This is an iterative function, called for each Pd and Pg word
56 * moving forward.
57 */
58 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
59 {
60 if (likely(g)) {
61 /* Compute N from first D & G.
62 Use bit 2 to signal first G bit seen. */
63 if (!(flags & 4)) {
64 flags |= ((d & (g & -g)) != 0) << 31;
65 flags |= 4;
66 }
67
68 /* Accumulate Z from each D & G. */
69 flags |= ((d & g) != 0) << 1;
70
71 /* Compute C from last !(D & G). Replace previous. */
72 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
73 }
74 return flags;
75 }
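/* Worked example of the encoding: iter_predtest_fwd(0b0101, 0b1111,
 * PREDTEST_INIT) returns 0x80000007.  Bit 31 (N) is set because the first
 * active element is set in D, bit 1 is set because some active element is
 * set (Z clear), and bit 0 (C) is set because the last active element is
 * clear in D.  Bit 2 is only the internal "first G bit seen" marker.
 */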
76
77 /* This is an iterative function, called for each Pd and Pg word
78 * moving backward.
79 */
80 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
81 {
82 if (likely(g)) {
83 /* Compute C from first (i.e. last) !(D & G).
84 Use bit 2 to signal first G bit seen. */
85 if (!(flags & 4)) {
86 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
87 flags |= (d & pow2floor(g)) == 0;
88 }
89
90 /* Accumulate Z from each D & G. */
91 flags |= ((d & g) != 0) << 1;
92
93 /* Compute N from last (i.e. first) D & G. Replace previous. */
94 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
95 }
96 return flags;
97 }
98
99 /* The same for a single word predicate. */
100 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
101 {
102 return iter_predtest_fwd(d, g, PREDTEST_INIT);
103 }
104
105 /* The same for a multi-word predicate. */
106 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
107 {
108 uint32_t flags = PREDTEST_INIT;
109 uint64_t *d = vd, *g = vg;
110 uintptr_t i = 0;
111
112 do {
113 flags = iter_predtest_fwd(d[i], g[i], flags);
114 } while (++i < words);
115
116 return flags;
117 }
118
119 /* Expand active predicate bits to bytes, for byte elements.
120 * for (i = 0; i < 256; ++i) {
121 * unsigned long m = 0;
122 * for (j = 0; j < 8; j++) {
123 * if ((i >> j) & 1) {
124 * m |= 0xfful << (j << 3);
125 * }
126 * }
127 * printf("0x%016lx,\n", m);
128 * }
129 */
130 static inline uint64_t expand_pred_b(uint8_t byte)
131 {
132 static const uint64_t word[256] = {
133 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
134 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
135 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
136 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
137 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
138 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
139 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
140 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
141 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
142 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
143 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
144 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
145 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
146 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
147 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
148 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
149 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
150 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
151 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
152 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
153 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
154 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
155 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
156 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
157 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
158 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
159 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
160 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
161 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
162 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
163 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
164 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
165 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
166 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
167 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
168 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
169 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
170 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
171 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
172 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
173 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
174 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
175 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
176 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
177 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
178 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
179 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
180 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
181 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
182 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
183 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
184 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
185 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
186 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
187 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
188 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
189 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
190 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
191 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
192 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
193 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
194 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
195 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
196 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
197 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
198 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
199 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
200 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
201 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
202 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
203 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
204 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
205 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
206 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
207 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
208 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
209 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
210 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
211 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
212 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
213 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
214 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
215 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
216 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
217 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
218 0xffffffffffffffff,
219 };
220 return word[byte];
221 }
222
223 /* Similarly for half-word elements.
224 * for (i = 0; i < 256; ++i) {
225 * unsigned long m = 0;
226 * if (i & 0xaa) {
227 * continue;
228 * }
229 * for (j = 0; j < 8; j += 2) {
230 * if ((i >> j) & 1) {
231 * m |= 0xfffful << (j << 3);
232 * }
233 * }
234 * printf("[0x%x] = 0x%016lx,\n", i, m);
235 * }
236 */
237 static inline uint64_t expand_pred_h(uint8_t byte)
238 {
239 static const uint64_t word[] = {
240 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
241 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
242 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
243 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
244 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
245 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
246 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
247 [0x55] = 0xffffffffffffffff,
248 };
249 return word[byte & 0x55];
250 }
251
252 /* Similarly for single word elements. */
253 static inline uint64_t expand_pred_s(uint8_t byte)
254 {
255 static const uint64_t word[] = {
256 [0x01] = 0x00000000ffffffffull,
257 [0x10] = 0xffffffff00000000ull,
258 [0x11] = 0xffffffffffffffffull,
259 };
260 return word[byte & 0x11];
261 }
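/* Examples worked from the tables above:
 * expand_pred_b(0x05) == 0x0000000000ff00ff (byte elements 0 and 2 active),
 * expand_pred_h(0x05) == 0x00000000ffffffff (halfword elements 0 and 1),
 * expand_pred_s(0x11) == 0xffffffffffffffff (word elements 0 and 1).
 * Only the lowest predicate bit of each element is honoured, matching
 * SVE's one predicate bit per byte of vector data.
 */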
262
263 /* Swap 16-bit words within a 32-bit word. */
264 static inline uint32_t hswap32(uint32_t h)
265 {
266 return rol32(h, 16);
267 }
268
269 /* Swap 16-bit words within a 64-bit word. */
270 static inline uint64_t hswap64(uint64_t h)
271 {
272 uint64_t m = 0x0000ffff0000ffffull;
273 h = rol64(h, 32);
274 return ((h & m) << 16) | ((h >> 16) & m);
275 }
276
277 /* Swap 32-bit words within a 64-bit word. */
278 static inline uint64_t wswap64(uint64_t h)
279 {
280 return rol64(h, 32);
281 }
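/* For example, hswap64(0x0011223344556677) == 0x6677445522330011 (the four
 * 16-bit halfwords reversed), while wswap64 of the same value is
 * 0x4455667700112233 (the two 32-bit words exchanged).
 */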
282
283 #define LOGICAL_PPPP(NAME, FUNC) \
284 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
285 { \
286 uintptr_t opr_sz = simd_oprsz(desc); \
287 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
288 uintptr_t i; \
289 for (i = 0; i < opr_sz / 8; ++i) { \
290 d[i] = FUNC(n[i], m[i], g[i]); \
291 } \
292 }
293
294 #define DO_AND(N, M, G) (((N) & (M)) & (G))
295 #define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
296 #define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
297 #define DO_ORR(N, M, G) (((N) | (M)) & (G))
298 #define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
299 #define DO_NOR(N, M, G) (~((N) | (M)) & (G))
300 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
301 #define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
302
303 LOGICAL_PPPP(sve_and_pppp, DO_AND)
304 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
305 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
306 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
307 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
308 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
309 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
310 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
311
312 #undef DO_AND
313 #undef DO_BIC
314 #undef DO_EOR
315 #undef DO_ORR
316 #undef DO_ORN
317 #undef DO_NOR
318 #undef DO_NAND
319 #undef DO_SEL
320 #undef LOGICAL_PPPP
321
322 /* Fully general three-operand expander, controlled by a predicate.
323 * This is complicated by the host-endian storage of the register file.
324 */
325 /* ??? I don't expect the compiler could ever vectorize this itself.
326 * With some tables we can convert bit masks to byte masks, and with
327 * extra care wrt byte/word ordering we could use gcc generic vectors
328 * and do 16 bytes at a time.
329 */
330 #define DO_ZPZZ(NAME, TYPE, H, OP) \
331 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
332 { \
333 intptr_t i, opr_sz = simd_oprsz(desc); \
334 for (i = 0; i < opr_sz; ) { \
335 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
336 do { \
337 if (pg & 1) { \
338 TYPE nn = *(TYPE *)(vn + H(i)); \
339 TYPE mm = *(TYPE *)(vm + H(i)); \
340 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
341 } \
342 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
343 } while (i & 15); \
344 } \
345 }
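/* As an example of how the predicate is consumed above:
 * DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD) walks the vector in
 * 16-byte segments, loading 16 predicate bits into pg each time; only the
 * low bit of each 4-bit group (the predicate bit of the element's first
 * byte) gates the operation, matching SVE's one-predicate-bit-per-byte
 * encoding.
 */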
346
347 /* Similarly, specialized for 64-bit operands. */
348 #define DO_ZPZZ_D(NAME, TYPE, OP) \
349 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
350 { \
351 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
352 TYPE *d = vd, *n = vn, *m = vm; \
353 uint8_t *pg = vg; \
354 for (i = 0; i < opr_sz; i += 1) { \
355 if (pg[H1(i)] & 1) { \
356 TYPE nn = n[i], mm = m[i]; \
357 d[i] = OP(nn, mm); \
358 } \
359 } \
360 }
361
362 #define DO_AND(N, M) (N & M)
363 #define DO_EOR(N, M) (N ^ M)
364 #define DO_ORR(N, M) (N | M)
365 #define DO_BIC(N, M) (N & ~M)
366 #define DO_ADD(N, M) (N + M)
367 #define DO_SUB(N, M) (N - M)
368 #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
369 #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
370 #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
371 #define DO_MUL(N, M) (N * M)
372
373
374 /*
375 * We must avoid the C undefined behaviour cases: division by
376 * zero and signed division of INT_MIN by -1. Both of these
377 * have architecturally defined required results for Arm.
378 * We special case all signed divisions by -1 to avoid having
379 * to deduce the minimum integer for the type involved.
380 */
381 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
382 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
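/* For reference, the architected results being implemented: any division by
 * zero yields 0, and dividing the minimum signed value by -1 yields that
 * same minimum (the negation wraps), e.g. guest SDIV of INT32_MIN by -1
 * returns INT32_MIN.
 */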
383
384 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
385 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
386 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
387 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
388
389 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
390 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
391 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
392 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
393
394 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
395 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
396 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
397 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
398
399 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
400 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
401 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
402 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
403
404 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
405 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
406 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
407 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
408
409 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
410 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
411 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
412 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
413
414 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
415 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
416 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
417 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
418
419 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
420 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
421 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
422 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
423
424 DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
425 DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
426 DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
427 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
428
429 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
430 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
431 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
432 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
433
434 DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
435 DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
436 DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
437 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
438
439 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
440 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
441 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
442 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
443
444 /* Because the computation type is at least twice as large as required,
445 these work for both signed and unsigned source types. */
446 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
447 {
448 return (n * m) >> 8;
449 }
450
451 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
452 {
453 return (n * m) >> 16;
454 }
455
456 static inline uint32_t do_mulh_s(int64_t n, int64_t m)
457 {
458 return (n * m) >> 32;
459 }
460
461 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
462 {
463 uint64_t lo, hi;
464 muls64(&lo, &hi, n, m);
465 return hi;
466 }
467
468 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
469 {
470 uint64_t lo, hi;
471 mulu64(&lo, &hi, n, m);
472 return hi;
473 }
474
475 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
476 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
477 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
478 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
479
480 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
481 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
482 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
483 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
484
485 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
486 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
487 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
488 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
489
490 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
491 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
492
493 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
494 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
495
496 /* Note that all bits of the shift are significant
497 and not modulo the element size. */
498 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
499 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
500 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
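/* For example, with 8-bit elements: DO_LSR(0x80, 8) and DO_LSL(0x01, 9) are
 * both 0, because counts of the element width or more are not truncated,
 * while DO_ASR clamps the count, so DO_ASR of a negative byte by 100 is -1
 * (all sign bits).
 */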
501
502 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
503 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
504 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)
505
506 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
507 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
508 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)
509
510 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
511 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
512 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
513
514 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
515 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
516 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
517
518 #undef DO_ZPZZ
519 #undef DO_ZPZZ_D
520
521 /* Three-operand expander, controlled by a predicate, in which the
522 * third operand is "wide". That is, for D = N op M, the same 64-bit
523 * value of M is used with all of the narrower values of N.
524 */
525 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
526 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
527 { \
528 intptr_t i, opr_sz = simd_oprsz(desc); \
529 for (i = 0; i < opr_sz; ) { \
530 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
531 TYPEW mm = *(TYPEW *)(vm + i); \
532 do { \
533 if (pg & 1) { \
534 TYPE nn = *(TYPE *)(vn + H(i)); \
535 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
536 } \
537 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
538 } while (i & 7); \
539 } \
540 }
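/* Concretely: in sve_lsl_zpzw_b the eight byte elements that share a 64-bit
 * lane are all shifted by the same 64-bit count, taken from the
 * corresponding lane of Zm; the inner "while (i & 7)" loop covers exactly
 * one wide element.
 */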
541
542 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
543 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
544 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
545
546 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
547 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
548 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
549
550 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
551 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
552 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
553
554 #undef DO_ZPZW
555
556 /* Fully general two-operand expander, controlled by a predicate.
557 */
558 #define DO_ZPZ(NAME, TYPE, H, OP) \
559 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
560 { \
561 intptr_t i, opr_sz = simd_oprsz(desc); \
562 for (i = 0; i < opr_sz; ) { \
563 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
564 do { \
565 if (pg & 1) { \
566 TYPE nn = *(TYPE *)(vn + H(i)); \
567 *(TYPE *)(vd + H(i)) = OP(nn); \
568 } \
569 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
570 } while (i & 15); \
571 } \
572 }
573
574 /* Similarly, specialized for 64-bit operands. */
575 #define DO_ZPZ_D(NAME, TYPE, OP) \
576 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
577 { \
578 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
579 TYPE *d = vd, *n = vn; \
580 uint8_t *pg = vg; \
581 for (i = 0; i < opr_sz; i += 1) { \
582 if (pg[H1(i)] & 1) { \
583 TYPE nn = n[i]; \
584 d[i] = OP(nn); \
585 } \
586 } \
587 }
588
589 #define DO_CLS_B(N) (clrsb32(N) - 24)
590 #define DO_CLS_H(N) (clrsb32(N) - 16)
591
592 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
593 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
594 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
595 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
596
597 #define DO_CLZ_B(N) (clz32(N) - 24)
598 #define DO_CLZ_H(N) (clz32(N) - 16)
599
600 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
601 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
602 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
603 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
604
605 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
606 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
607 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
608 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
609
610 #define DO_CNOT(N) (N == 0)
611
612 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
613 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
614 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
615 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
616
617 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
618
619 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
620 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
621 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
622
623 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
624
625 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
626 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
627 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
628
629 #define DO_NOT(N) (~N)
630
631 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
632 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
633 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
634 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
635
636 #define DO_SXTB(N) ((int8_t)N)
637 #define DO_SXTH(N) ((int16_t)N)
638 #define DO_SXTS(N) ((int32_t)N)
639 #define DO_UXTB(N) ((uint8_t)N)
640 #define DO_UXTH(N) ((uint16_t)N)
641 #define DO_UXTS(N) ((uint32_t)N)
642
643 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
644 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
645 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
646 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
647 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
648 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
649
650 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
651 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
652 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
653 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
654 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
655 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
656
657 #define DO_ABS(N) (N < 0 ? -N : N)
658
659 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
660 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
661 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
662 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
663
664 #define DO_NEG(N) (-N)
665
666 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
667 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
668 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
669 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
670
671 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
672 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
673 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
674
675 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
676 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
677
678 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
679
680 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
681 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
682 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
683 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
684
685 /* Three-operand expander, unpredicated, in which the third operand is "wide".
686 */
687 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
688 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
689 { \
690 intptr_t i, opr_sz = simd_oprsz(desc); \
691 for (i = 0; i < opr_sz; ) { \
692 TYPEW mm = *(TYPEW *)(vm + i); \
693 do { \
694 TYPE nn = *(TYPE *)(vn + H(i)); \
695 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
696 i += sizeof(TYPE); \
697 } while (i & 7); \
698 } \
699 }
700
701 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
702 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
703 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
704
705 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
706 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
707 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
708
709 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
710 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
711 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
712
713 #undef DO_ZZW
714
715 #undef DO_CLS_B
716 #undef DO_CLS_H
717 #undef DO_CLZ_B
718 #undef DO_CLZ_H
719 #undef DO_CNOT
720 #undef DO_FABS
721 #undef DO_FNEG
722 #undef DO_ABS
723 #undef DO_NEG
724 #undef DO_ZPZ
725 #undef DO_ZPZ_D
726
727 /* Two-operand reduction expander, controlled by a predicate.
728 * The difference between TYPERED and TYPERET has to do with
729 * sign-extension. E.g. for SMAX, TYPERED must be signed,
730 * but TYPERET must be unsigned so that e.g. a 32-bit value
731 * is not sign-extended to the ABI uint64_t return type.
732 */
733 /* ??? If we were to vectorize this by hand the reduction ordering
734 * would change. For integer operands, this is perfectly fine.
735 */
736 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
737 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
738 { \
739 intptr_t i, opr_sz = simd_oprsz(desc); \
740 TYPERED ret = INIT; \
741 for (i = 0; i < opr_sz; ) { \
742 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
743 do { \
744 if (pg & 1) { \
745 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
746 ret = OP(ret, nn); \
747 } \
748 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
749 } while (i & 15); \
750 } \
751 return (TYPERET)ret; \
752 }
753
754 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
755 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
756 { \
757 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
758 TYPEE *n = vn; \
759 uint8_t *pg = vg; \
760 TYPER ret = INIT; \
761 for (i = 0; i < opr_sz; i += 1) { \
762 if (pg[H1(i)] & 1) { \
763 TYPEE nn = n[i]; \
764 ret = OP(ret, nn); \
765 } \
766 } \
767 return ret; \
768 }
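/* The INIT arguments below are the identity values for each operation
 * (0 for ORR, EOR, the ADDs and unsigned MAX; -1 for AND and unsigned MIN;
 * the type's minimum or maximum for signed MAX/MIN), so inactive elements
 * leave the reduction unchanged.  As an example of the TYPERED/TYPERET
 * distinction: sve_smaxv_s compares as int32_t but returns the result as
 * uint32_t, so a negative maximum is not sign-extended into the uint64_t
 * ABI return value.
 */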
769
770 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
771 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
772 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
773 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
774
775 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
776 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
777 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
778 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
779
780 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
781 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
782 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
783 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
784
785 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
786 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
787 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
788
789 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
790 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
791 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
792 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
793
794 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
795 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
796 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
797 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
798
799 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
800 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
801 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
802 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
803
804 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
805 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
806 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
807 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
808
809 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
810 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
811 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
812 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
813
814 #undef DO_VPZ
815 #undef DO_VPZ_D
816
817 /* Two vector operand, one scalar operand, unpredicated. */
818 #define DO_ZZI(NAME, TYPE, OP) \
819 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
820 { \
821 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
822 TYPE s = s64, *d = vd, *n = vn; \
823 for (i = 0; i < opr_sz; ++i) { \
824 d[i] = OP(n[i], s); \
825 } \
826 }
827
828 #define DO_SUBR(X, Y) (Y - X)
829
830 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
831 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
832 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
833 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
834
835 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
836 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
837 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
838 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
839
840 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
841 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
842 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
843 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
844
845 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
846 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
847 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
848 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
849
850 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
851 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
852 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
853 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
854
855 #undef DO_ZZI
856
857 #undef DO_AND
858 #undef DO_ORR
859 #undef DO_EOR
860 #undef DO_BIC
861 #undef DO_ADD
862 #undef DO_SUB
863 #undef DO_MAX
864 #undef DO_MIN
865 #undef DO_ABD
866 #undef DO_MUL
867 #undef DO_SDIV
#undef DO_UDIV
868 #undef DO_ASR
869 #undef DO_LSR
870 #undef DO_LSL
871 #undef DO_SUBR
872
873 /* Similar to the ARM LastActiveElement pseudocode function, except the
874 result is multiplied by the element size. This includes the not found
875 indication; e.g. not found for esz=3 is -8. */
876 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
877 {
878 uint64_t mask = pred_esz_masks[esz];
879 intptr_t i = words;
880
881 do {
882 uint64_t this_g = g[--i] & mask;
883 if (this_g) {
884 return i * 64 + (63 - clz64(this_g));
885 }
886 } while (i > 0);
887 return (intptr_t)-1 << esz;
888 }
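/* E.g. with esz == 2 (word elements), a lone active element at index 3 has
 * its predicate bit at bit 12, so the return value is 12 == 3 << 2; with no
 * active bits the return value is -4, i.e. "element -1" scaled by the
 * element size.
 */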
889
890 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t words)
891 {
892 uint32_t flags = PREDTEST_INIT;
893 uint64_t *d = vd, *g = vg;
894 intptr_t i = 0;
895
896 do {
897 uint64_t this_d = d[i];
898 uint64_t this_g = g[i];
899
900 if (this_g) {
901 if (!(flags & 4)) {
902 /* Set in D the first bit of G. */
903 this_d |= this_g & -this_g;
904 d[i] = this_d;
905 }
906 flags = iter_predtest_fwd(this_d, this_g, flags);
907 }
908 } while (++i < words);
909
910 return flags;
911 }
912
913 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
914 {
915 intptr_t words = extract32(pred_desc, 0, SIMD_OPRSZ_BITS);
916 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
917 uint32_t flags = PREDTEST_INIT;
918 uint64_t *d = vd, *g = vg, esz_mask;
919 intptr_t i, next;
920
921 next = last_active_element(vd, words, esz) + (1 << esz);
922 esz_mask = pred_esz_masks[esz];
923
924 /* Similar to the pseudocode for pnext, but scaled by ESZ
925 so that we find the correct bit. */
926 if (next < words * 64) {
927 uint64_t mask = -1;
928
929 if (next & 63) {
930 mask = ~((1ull << (next & 63)) - 1);
931 next &= -64;
932 }
933 do {
934 uint64_t this_g = g[next / 64] & esz_mask & mask;
935 if (this_g != 0) {
936 next = (next & -64) + ctz64(this_g);
937 break;
938 }
939 next += 64;
940 mask = -1;
941 } while (next < words * 64);
942 }
943
944 i = 0;
945 do {
946 uint64_t this_d = 0;
947 if (i == next / 64) {
948 this_d = 1ull << (next & 63);
949 }
950 d[i] = this_d;
951 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
952 } while (++i < words);
953
954 return flags;
955 }
956
957 /* Store zero into every active element of Zd. We will use this for two
958 * and three-operand predicated instructions for which logic dictates a
959 * zero result. In particular, logical shift by element size, which is
960 * otherwise undefined on the host.
961 *
962 * For element sizes smaller than uint64_t, we use tables to expand
963 * the N bits of the controlling predicate to a byte mask, and clear
964 * those bytes.
965 */
966 void HELPER(sve_clr_b)(void *vd, void *vg, uint32_t desc)
967 {
968 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
969 uint64_t *d = vd;
970 uint8_t *pg = vg;
971 for (i = 0; i < opr_sz; i += 1) {
972 d[i] &= ~expand_pred_b(pg[H1(i)]);
973 }
974 }
975
976 void HELPER(sve_clr_h)(void *vd, void *vg, uint32_t desc)
977 {
978 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
979 uint64_t *d = vd;
980 uint8_t *pg = vg;
981 for (i = 0; i < opr_sz; i += 1) {
982 d[i] &= ~expand_pred_h(pg[H1(i)]);
983 }
984 }
985
986 void HELPER(sve_clr_s)(void *vd, void *vg, uint32_t desc)
987 {
988 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
989 uint64_t *d = vd;
990 uint8_t *pg = vg;
991 for (i = 0; i < opr_sz; i += 1) {
992 d[i] &= ~expand_pred_s(pg[H1(i)]);
993 }
994 }
995
996 void HELPER(sve_clr_d)(void *vd, void *vg, uint32_t desc)
997 {
998 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
999 uint64_t *d = vd;
1000 uint8_t *pg = vg;
1001 for (i = 0; i < opr_sz; i += 1) {
1002 if (pg[H1(i)] & 1) {
1003 d[i] = 0;
1004 }
1005 }
1006 }
1007
1008 /* Copy Zn into Zd, and store zero into inactive elements. */
1009 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1010 {
1011 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1012 uint64_t *d = vd, *n = vn;
1013 uint8_t *pg = vg;
1014 for (i = 0; i < opr_sz; i += 1) {
1015 d[i] = n[i] & expand_pred_b(pg[H1(i)]);
1016 }
1017 }
1018
1019 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1020 {
1021 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1022 uint64_t *d = vd, *n = vn;
1023 uint8_t *pg = vg;
1024 for (i = 0; i < opr_sz; i += 1) {
1025 d[i] = n[i] & expand_pred_h(pg[H1(i)]);
1026 }
1027 }
1028
1029 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
1030 {
1031 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1032 uint64_t *d = vd, *n = vn;
1033 uint8_t *pg = vg;
1034 for (i = 0; i < opr_sz; i += 1) {
1035 d[i] = n[i] & expand_pred_s(pg[H1(i)]);
1036 }
1037 }
1038
1039 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1040 {
1041 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1042 uint64_t *d = vd, *n = vn;
1043 uint8_t *pg = vg;
1044 for (i = 0; i < opr_sz; i += 1) {
1045 d[i] = n[i] & -(uint64_t)(pg[H1(i)] & 1);
1046 }
1047 }
1048
1049 /* Three-operand expander, immediate operand, controlled by a predicate.
1050 */
1051 #define DO_ZPZI(NAME, TYPE, H, OP) \
1052 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1053 { \
1054 intptr_t i, opr_sz = simd_oprsz(desc); \
1055 TYPE imm = simd_data(desc); \
1056 for (i = 0; i < opr_sz; ) { \
1057 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1058 do { \
1059 if (pg & 1) { \
1060 TYPE nn = *(TYPE *)(vn + H(i)); \
1061 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
1062 } \
1063 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1064 } while (i & 15); \
1065 } \
1066 }
1067
1068 /* Similarly, specialized for 64-bit operands. */
1069 #define DO_ZPZI_D(NAME, TYPE, OP) \
1070 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1071 { \
1072 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1073 TYPE *d = vd, *n = vn; \
1074 TYPE imm = simd_data(desc); \
1075 uint8_t *pg = vg; \
1076 for (i = 0; i < opr_sz; i += 1) { \
1077 if (pg[H1(i)] & 1) { \
1078 TYPE nn = n[i]; \
1079 d[i] = OP(nn, imm); \
1080 } \
1081 } \
1082 }
1083
1084 #define DO_SHR(N, M) (N >> M)
1085 #define DO_SHL(N, M) (N << M)
1086
1087 /* Arithmetic shift right for division. This rounds negative numbers
1088 toward zero as per signed division. Therefore before shifting,
1089 when N is negative, add 2**M-1. */
1090 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
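/* Worked example: DO_ASRD(-7, 2) first adds (1 << 2) - 1 == 3, then shifts:
 * -4 >> 2 == -1, matching -7 / 4 truncated toward zero, where a plain
 * arithmetic shift would have produced -2.
 */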
1091
1092 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
1093 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
1094 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
1095 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
1096
1097 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
1098 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
1099 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
1100 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
1101
1102 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
1103 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
1104 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
1105 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
1106
1107 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
1108 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
1109 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
1110 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
1111
1112 #undef DO_SHR
1113 #undef DO_SHL
1114 #undef DO_ASRD
1115 #undef DO_ZPZI
1116 #undef DO_ZPZI_D
1117
1118 /* Fully general four-operand expander, controlled by a predicate.
1119 */
1120 #define DO_ZPZZZ(NAME, TYPE, H, OP) \
1121 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1122 void *vg, uint32_t desc) \
1123 { \
1124 intptr_t i, opr_sz = simd_oprsz(desc); \
1125 for (i = 0; i < opr_sz; ) { \
1126 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1127 do { \
1128 if (pg & 1) { \
1129 TYPE nn = *(TYPE *)(vn + H(i)); \
1130 TYPE mm = *(TYPE *)(vm + H(i)); \
1131 TYPE aa = *(TYPE *)(va + H(i)); \
1132 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
1133 } \
1134 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1135 } while (i & 15); \
1136 } \
1137 }
1138
1139 /* Similarly, specialized for 64-bit operands. */
1140 #define DO_ZPZZZ_D(NAME, TYPE, OP) \
1141 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1142 void *vg, uint32_t desc) \
1143 { \
1144 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1145 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
1146 uint8_t *pg = vg; \
1147 for (i = 0; i < opr_sz; i += 1) { \
1148 if (pg[H1(i)] & 1) { \
1149 TYPE aa = a[i], nn = n[i], mm = m[i]; \
1150 d[i] = OP(aa, nn, mm); \
1151 } \
1152 } \
1153 }
1154
1155 #define DO_MLA(A, N, M) (A + N * M)
1156 #define DO_MLS(A, N, M) (A - N * M)
1157
1158 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
1159 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
1160
1161 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
1162 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
1163
1164 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
1165 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
1166
1167 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
1168 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
1169
1170 #undef DO_MLA
1171 #undef DO_MLS
1172 #undef DO_ZPZZZ
1173 #undef DO_ZPZZZ_D
1174
1175 void HELPER(sve_index_b)(void *vd, uint32_t start,
1176 uint32_t incr, uint32_t desc)
1177 {
1178 intptr_t i, opr_sz = simd_oprsz(desc);
1179 uint8_t *d = vd;
1180 for (i = 0; i < opr_sz; i += 1) {
1181 d[H1(i)] = start + i * incr;
1182 }
1183 }
1184
1185 void HELPER(sve_index_h)(void *vd, uint32_t start,
1186 uint32_t incr, uint32_t desc)
1187 {
1188 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1189 uint16_t *d = vd;
1190 for (i = 0; i < opr_sz; i += 1) {
1191 d[H2(i)] = start + i * incr;
1192 }
1193 }
1194
1195 void HELPER(sve_index_s)(void *vd, uint32_t start,
1196 uint32_t incr, uint32_t desc)
1197 {
1198 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1199 uint32_t *d = vd;
1200 for (i = 0; i < opr_sz; i += 1) {
1201 d[H4(i)] = start + i * incr;
1202 }
1203 }
1204
1205 void HELPER(sve_index_d)(void *vd, uint64_t start,
1206 uint64_t incr, uint32_t desc)
1207 {
1208 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1209 uint64_t *d = vd;
1210 for (i = 0; i < opr_sz; i += 1) {
1211 d[i] = start + i * incr;
1212 }
1213 }
1214
1215 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
1216 {
1217 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1218 uint32_t sh = simd_data(desc);
1219 uint32_t *d = vd, *n = vn, *m = vm;
1220 for (i = 0; i < opr_sz; i += 1) {
1221 d[i] = n[i] + (m[i] << sh);
1222 }
1223 }
1224
1225 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
1226 {
1227 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1228 uint64_t sh = simd_data(desc);
1229 uint64_t *d = vd, *n = vn, *m = vm;
1230 for (i = 0; i < opr_sz; i += 1) {
1231 d[i] = n[i] + (m[i] << sh);
1232 }
1233 }
1234
1235 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
1236 {
1237 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1238 uint64_t sh = simd_data(desc);
1239 uint64_t *d = vd, *n = vn, *m = vm;
1240 for (i = 0; i < opr_sz; i += 1) {
1241 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
1242 }
1243 }
1244
1245 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
1246 {
1247 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1248 uint64_t sh = simd_data(desc);
1249 uint64_t *d = vd, *n = vn, *m = vm;
1250 for (i = 0; i < opr_sz; i += 1) {
1251 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
1252 }
1253 }
1254
1255 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
1256 {
1257 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1258 static const uint16_t coeff[] = {
1259 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
1260 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
1261 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
1262 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
1263 };
1264 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1265 uint16_t *d = vd, *n = vn;
1266
1267 for (i = 0; i < opr_sz; i++) {
1268 uint16_t nn = n[i];
1269 intptr_t idx = extract32(nn, 0, 5);
1270 uint16_t exp = extract32(nn, 5, 5);
1271 d[i] = coeff[idx] | (exp << 10);
1272 }
1273 }
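/* For example, an input of 0x01e0 (idx = 0, exp = 15) yields 0x3c00, i.e.
 * float16 1.0: coeff[idx] supplies the fraction bits of 2^(idx/32) and the
 * exp field is pasted directly into the float16 exponent.
 */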
1274
1275 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
1276 {
1277 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1278 static const uint32_t coeff[] = {
1279 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
1280 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
1281 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
1282 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
1283 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
1284 0x1ef532, 0x20b051, 0x227043, 0x243516,
1285 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
1286 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
1287 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
1288 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
1289 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
1290 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
1291 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
1292 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
1293 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
1294 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
1295 };
1296 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1297 uint32_t *d = vd, *n = vn;
1298
1299 for (i = 0; i < opr_sz; i++) {
1300 uint32_t nn = n[i];
1301 intptr_t idx = extract32(nn, 0, 6);
1302 uint32_t exp = extract32(nn, 6, 8);
1303 d[i] = coeff[idx] | (exp << 23);
1304 }
1305 }
1306
1307 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
1308 {
1309 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1310 static const uint64_t coeff[] = {
1311 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
1312 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
1313 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
1314 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
1315 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
1316 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
1317 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
1318 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
1319 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
1320 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
1321 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
1322 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
1323 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
1324 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
1325 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
1326 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
1327 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
1328 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
1329 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
1330 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
1331 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
1332 0xFA7C1819E90D8ull,
1333 };
1334 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1335 uint64_t *d = vd, *n = vn;
1336
1337 for (i = 0; i < opr_sz; i++) {
1338 uint64_t nn = n[i];
1339 intptr_t idx = extract32(nn, 0, 6);
1340 uint64_t exp = extract32(nn, 6, 11);
1341 d[i] = coeff[idx] | (exp << 52);
1342 }
1343 }
1344
1345 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
1346 {
1347 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1348 uint16_t *d = vd, *n = vn, *m = vm;
1349 for (i = 0; i < opr_sz; i += 1) {
1350 uint16_t nn = n[i];
1351 uint16_t mm = m[i];
1352 if (mm & 1) {
1353 nn = float16_one;
1354 }
1355 d[i] = nn ^ (mm & 2) << 14;
1356 }
1357 }
1358
1359 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
1360 {
1361 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1362 uint32_t *d = vd, *n = vn, *m = vm;
1363 for (i = 0; i < opr_sz; i += 1) {
1364 uint32_t nn = n[i];
1365 uint32_t mm = m[i];
1366 if (mm & 1) {
1367 nn = float32_one;
1368 }
1369 d[i] = nn ^ (mm & 2) << 30;
1370 }
1371 }
1372
1373 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
1374 {
1375 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1376 uint64_t *d = vd, *n = vn, *m = vm;
1377 for (i = 0; i < opr_sz; i += 1) {
1378 uint64_t nn = n[i];
1379 uint64_t mm = m[i];
1380 if (mm & 1) {
1381 nn = float64_one;
1382 }
1383 d[i] = nn ^ (mm & 2) << 62;
1384 }
1385 }
1386
1387 /*
1388 * Signed saturating addition with scalar operand.
1389 */
1390
1391 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1392 {
1393 intptr_t i, oprsz = simd_oprsz(desc);
1394
1395 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
1396 int r = *(int8_t *)(a + i) + b;
1397 if (r > INT8_MAX) {
1398 r = INT8_MAX;
1399 } else if (r < INT8_MIN) {
1400 r = INT8_MIN;
1401 }
1402 *(int8_t *)(d + i) = r;
1403 }
1404 }
1405
1406 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1407 {
1408 intptr_t i, oprsz = simd_oprsz(desc);
1409
1410 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
1411 int r = *(int16_t *)(a + i) + b;
1412 if (r > INT16_MAX) {
1413 r = INT16_MAX;
1414 } else if (r < INT16_MIN) {
1415 r = INT16_MIN;
1416 }
1417 *(int16_t *)(d + i) = r;
1418 }
1419 }
1420
1421 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1422 {
1423 intptr_t i, oprsz = simd_oprsz(desc);
1424
1425 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
1426 int64_t r = *(int32_t *)(a + i) + b;
1427 if (r > INT32_MAX) {
1428 r = INT32_MAX;
1429 } else if (r < INT32_MIN) {
1430 r = INT32_MIN;
1431 }
1432 *(int32_t *)(d + i) = r;
1433 }
1434 }
1435
1436 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
1437 {
1438 intptr_t i, oprsz = simd_oprsz(desc);
1439
1440 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
1441 int64_t ai = *(int64_t *)(a + i);
1442 int64_t r = ai + b;
1443 if (((r ^ ai) & ~(ai ^ b)) < 0) {
1444 /* Signed overflow. */
1445 r = (r < 0 ? INT64_MAX : INT64_MIN);
1446 }
1447 *(int64_t *)(d + i) = r;
1448 }
1449 }
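/* The test above is the usual branch-free signed-overflow check: overflow
 * occurred iff the addends have the same sign and the sum's sign differs.
 * E.g. ai == INT64_MAX, b == 1 wraps to a negative sum, the test fires,
 * and the result saturates to INT64_MAX.
 */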
1450
1451 /*
1452 * Unsigned saturating addition with scalar operand.
1453 */
1454
1455 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1456 {
1457 intptr_t i, oprsz = simd_oprsz(desc);
1458
1459 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
1460 int r = *(uint8_t *)(a + i) + b;
1461 if (r > UINT8_MAX) {
1462 r = UINT8_MAX;
1463 } else if (r < 0) {
1464 r = 0;
1465 }
1466 *(uint8_t *)(d + i) = r;
1467 }
1468 }
1469
1470 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1471 {
1472 intptr_t i, oprsz = simd_oprsz(desc);
1473
1474 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
1475 int r = *(uint16_t *)(a + i) + b;
1476 if (r > UINT16_MAX) {
1477 r = UINT16_MAX;
1478 } else if (r < 0) {
1479 r = 0;
1480 }
1481 *(uint16_t *)(d + i) = r;
1482 }
1483 }
1484
1485 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1486 {
1487 intptr_t i, oprsz = simd_oprsz(desc);
1488
1489 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
1490 int64_t r = *(uint32_t *)(a + i) + b;
1491 if (r > UINT32_MAX) {
1492 r = UINT32_MAX;
1493 } else if (r < 0) {
1494 r = 0;
1495 }
1496 *(uint32_t *)(d + i) = r;
1497 }
1498 }
1499
1500 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1501 {
1502 intptr_t i, oprsz = simd_oprsz(desc);
1503
1504 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1505 uint64_t r = *(uint64_t *)(a + i) + b;
1506 if (r < b) {
1507 r = UINT64_MAX;
1508 }
1509 *(uint64_t *)(d + i) = r;
1510 }
1511 }
1512
1513 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1514 {
1515 intptr_t i, oprsz = simd_oprsz(desc);
1516
1517 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1518 uint64_t ai = *(uint64_t *)(a + i);
1519 *(uint64_t *)(d + i) = (ai < b ? 0 : ai - b);
1520 }
1521 }
1522
1523 /* Two operand predicated copy immediate with merge. All valid immediates
1524 * can fit within 17 signed bits in the simd_data field.
1525 */
1526 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
1527 uint64_t mm, uint32_t desc)
1528 {
1529 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1530 uint64_t *d = vd, *n = vn;
1531 uint8_t *pg = vg;
1532
1533 mm = dup_const(MO_8, mm);
1534 for (i = 0; i < opr_sz; i += 1) {
1535 uint64_t nn = n[i];
1536 uint64_t pp = expand_pred_b(pg[H1(i)]);
1537 d[i] = (mm & pp) | (nn & ~pp);
1538 }
1539 }
1540
1541 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
1542 uint64_t mm, uint32_t desc)
1543 {
1544 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1545 uint64_t *d = vd, *n = vn;
1546 uint8_t *pg = vg;
1547
1548 mm = dup_const(MO_16, mm);
1549 for (i = 0; i < opr_sz; i += 1) {
1550 uint64_t nn = n[i];
1551 uint64_t pp = expand_pred_h(pg[H1(i)]);
1552 d[i] = (mm & pp) | (nn & ~pp);
1553 }
1554 }
1555
1556 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
1557 uint64_t mm, uint32_t desc)
1558 {
1559 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1560 uint64_t *d = vd, *n = vn;
1561 uint8_t *pg = vg;
1562
1563 mm = dup_const(MO_32, mm);
1564 for (i = 0; i < opr_sz; i += 1) {
1565 uint64_t nn = n[i];
1566 uint64_t pp = expand_pred_s(pg[H1(i)]);
1567 d[i] = (mm & pp) | (nn & ~pp);
1568 }
1569 }
1570
1571 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
1572 uint64_t mm, uint32_t desc)
1573 {
1574 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1575 uint64_t *d = vd, *n = vn;
1576 uint8_t *pg = vg;
1577
1578 for (i = 0; i < opr_sz; i += 1) {
1579 uint64_t nn = n[i];
1580 d[i] = (pg[H1(i)] & 1 ? mm : nn);
1581 }
1582 }
1583
1584 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
1585 {
1586 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1587 uint64_t *d = vd;
1588 uint8_t *pg = vg;
1589
1590 val = dup_const(MO_8, val);
1591 for (i = 0; i < opr_sz; i += 1) {
1592 d[i] = val & expand_pred_b(pg[H1(i)]);
1593 }
1594 }
1595
1596 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
1597 {
1598 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1599 uint64_t *d = vd;
1600 uint8_t *pg = vg;
1601
1602 val = dup_const(MO_16, val);
1603 for (i = 0; i < opr_sz; i += 1) {
1604 d[i] = val & expand_pred_h(pg[H1(i)]);
1605 }
1606 }
1607
1608 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
1609 {
1610 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1611 uint64_t *d = vd;
1612 uint8_t *pg = vg;
1613
1614 val = dup_const(MO_32, val);
1615 for (i = 0; i < opr_sz; i += 1) {
1616 d[i] = val & expand_pred_s(pg[H1(i)]);
1617 }
1618 }
1619
1620 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
1621 {
1622 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1623 uint64_t *d = vd;
1624 uint8_t *pg = vg;
1625
1626 for (i = 0; i < opr_sz; i += 1) {
1627 d[i] = (pg[H1(i)] & 1 ? val : 0);
1628 }
1629 }
1630
1631 /* Big-endian hosts need to frob the byte indices. If the copy
1632 * happens to be 8-byte aligned, then no frobbing is necessary.
1633 */
1634 static void swap_memmove(void *vd, void *vs, size_t n)
1635 {
1636 uintptr_t d = (uintptr_t)vd;
1637 uintptr_t s = (uintptr_t)vs;
1638 uintptr_t o = (d | s | n) & 7;
1639 size_t i;
1640
1641 #ifndef HOST_WORDS_BIGENDIAN
1642 o = 0;
1643 #endif
1644 switch (o) {
1645 case 0:
1646 memmove(vd, vs, n);
1647 break;
1648
1649 case 4:
1650 if (d < s || d >= s + n) {
1651 for (i = 0; i < n; i += 4) {
1652 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1653 }
1654 } else {
1655 for (i = n; i > 0; ) {
1656 i -= 4;
1657 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1658 }
1659 }
1660 break;
1661
1662 case 2:
1663 case 6:
1664 if (d < s || d >= s + n) {
1665 for (i = 0; i < n; i += 2) {
1666 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1667 }
1668 } else {
1669 for (i = n; i > 0; ) {
1670 i -= 2;
1671 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1672 }
1673 }
1674 break;
1675
1676 default:
1677 if (d < s || d >= s + n) {
1678 for (i = 0; i < n; i++) {
1679 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1680 }
1681 } else {
1682 for (i = n; i > 0; ) {
1683 i -= 1;
1684 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1685 }
1686 }
1687 break;
1688 }
1689 }
1690
1691 /* Similarly for memset of 0. */
1692 static void swap_memzero(void *vd, size_t n)
1693 {
1694 uintptr_t d = (uintptr_t)vd;
1695 uintptr_t o = (d | n) & 7;
1696 size_t i;
1697
1698 /* Usually, the first bit of a predicate is set, so N is 0. */
1699 if (likely(n == 0)) {
1700 return;
1701 }
1702
1703 #ifndef HOST_WORDS_BIGENDIAN
1704 o = 0;
1705 #endif
1706 switch (o) {
1707 case 0:
1708 memset(vd, 0, n);
1709 break;
1710
1711 case 4:
1712 for (i = 0; i < n; i += 4) {
1713 *(uint32_t *)H1_4(d + i) = 0;
1714 }
1715 break;
1716
1717 case 2:
1718 case 6:
1719 for (i = 0; i < n; i += 2) {
1720 *(uint16_t *)H1_2(d + i) = 0;
1721 }
1722 break;
1723
1724 default:
1725 for (i = 0; i < n; i++) {
1726 *(uint8_t *)H1(d + i) = 0;
1727 }
1728 break;
1729 }
1730 }
1731
1732 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
1733 {
1734 intptr_t opr_sz = simd_oprsz(desc);
1735 size_t n_ofs = simd_data(desc);
1736 size_t n_siz = opr_sz - n_ofs;
1737
1738 if (vd != vm) {
1739 swap_memmove(vd, vn + n_ofs, n_siz);
1740 swap_memmove(vd + n_siz, vm, n_ofs);
1741 } else if (vd != vn) {
1742 swap_memmove(vd + n_siz, vd, n_ofs);
1743 swap_memmove(vd, vn + n_ofs, n_siz);
1744 } else {
1745 /* vd == vn == vm. Need temp space. */
1746 ARMVectorReg tmp;
1747 swap_memmove(&tmp, vm, n_ofs);
1748 swap_memmove(vd, vd + n_ofs, n_siz);
1749 memcpy(vd + n_siz, &tmp, n_ofs);
1750 }
1751 }
1752
1753 #define DO_INSR(NAME, TYPE, H) \
1754 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
1755 { \
1756 intptr_t opr_sz = simd_oprsz(desc); \
1757 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
1758 *(TYPE *)(vd + H(0)) = val; \
1759 }
1760
1761 DO_INSR(sve_insr_b, uint8_t, H1)
1762 DO_INSR(sve_insr_h, uint16_t, H1_2)
1763 DO_INSR(sve_insr_s, uint32_t, H1_4)
1764 DO_INSR(sve_insr_d, uint64_t, )
1765
1766 #undef DO_INSR
1767
1768 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
1769 {
1770 intptr_t i, j, opr_sz = simd_oprsz(desc);
1771 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1772 uint64_t f = *(uint64_t *)(vn + i);
1773 uint64_t b = *(uint64_t *)(vn + j);
1774 *(uint64_t *)(vd + i) = bswap64(b);
1775 *(uint64_t *)(vd + j) = bswap64(f);
1776 }
1777 }
1778
1779 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
1780 {
1781 intptr_t i, j, opr_sz = simd_oprsz(desc);
1782 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1783 uint64_t f = *(uint64_t *)(vn + i);
1784 uint64_t b = *(uint64_t *)(vn + j);
1785 *(uint64_t *)(vd + i) = hswap64(b);
1786 *(uint64_t *)(vd + j) = hswap64(f);
1787 }
1788 }
1789
1790 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
1791 {
1792 intptr_t i, j, opr_sz = simd_oprsz(desc);
1793 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1794 uint64_t f = *(uint64_t *)(vn + i);
1795 uint64_t b = *(uint64_t *)(vn + j);
1796 *(uint64_t *)(vd + i) = rol64(b, 32);
1797 *(uint64_t *)(vd + j) = rol64(f, 32);
1798 }
1799 }
1800
1801 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
1802 {
1803 intptr_t i, j, opr_sz = simd_oprsz(desc);
1804 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1805 uint64_t f = *(uint64_t *)(vn + i);
1806 uint64_t b = *(uint64_t *)(vn + j);
1807 *(uint64_t *)(vd + i) = b;
1808 *(uint64_t *)(vd + j) = f;
1809 }
1810 }
1811
1812 #define DO_TBL(NAME, TYPE, H) \
1813 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1814 { \
1815 intptr_t i, opr_sz = simd_oprsz(desc); \
1816 uintptr_t elem = opr_sz / sizeof(TYPE); \
1817 TYPE *d = vd, *n = vn, *m = vm; \
1818 ARMVectorReg tmp; \
1819 if (unlikely(vd == vn)) { \
1820 n = memcpy(&tmp, vn, opr_sz); \
1821 } \
1822 for (i = 0; i < elem; i++) { \
1823 TYPE j = m[H(i)]; \
1824 d[H(i)] = j < elem ? n[H(j)] : 0; \
1825 } \
1826 }
1827
1828 DO_TBL(sve_tbl_b, uint8_t, H1)
1829 DO_TBL(sve_tbl_h, uint16_t, H2)
1830 DO_TBL(sve_tbl_s, uint32_t, H4)
1831 DO_TBL(sve_tbl_d, uint64_t, )
1832
1833 #undef DO_TBL
1834
1835 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
1836 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1837 { \
1838 intptr_t i, opr_sz = simd_oprsz(desc); \
1839 TYPED *d = vd; \
1840 TYPES *n = vn; \
1841 ARMVectorReg tmp; \
1842 if (unlikely(vn - vd < opr_sz)) { \
1843 n = memcpy(&tmp, n, opr_sz / 2); \
1844 } \
1845 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
1846 d[HD(i)] = n[HS(i)]; \
1847 } \
1848 }
1849
1850 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
1851 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
1852 DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)
1853
1854 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
1855 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
1856 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
1857
1858 #undef DO_UNPK
1859
1860 /* Mask of bits included in the even-numbered predicates of width esz.
1861 * We also use this for expand_bits/compress_bits, and so extend the
1862 * same pattern out to 16-bit units.
1863 */
1864 static const uint64_t even_bit_esz_masks[5] = {
1865 0x5555555555555555ull,
1866 0x3333333333333333ull,
1867 0x0f0f0f0f0f0f0f0full,
1868 0x00ff00ff00ff00ffull,
1869 0x0000ffff0000ffffull,
1870 };
1871
1872 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
1873 * For N==0, this corresponds to the operation that in qemu/bitops.h
1874 * we call half_shuffle64; this algorithm is from Hacker's Delight,
1875 * section 7-2 Shuffling Bits.
1876 */
1877 static uint64_t expand_bits(uint64_t x, int n)
1878 {
1879 int i;
1880
1881 x &= 0xffffffffu;
1882 for (i = 4; i >= n; i--) {
1883 int sh = 1 << i;
1884 x = ((x << sh) | x) & even_bit_esz_masks[i];
1885 }
1886 return x;
1887 }
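
/* Worked example (illustrative only): for N == 0 each input bit i moves
 * to output bit 2*i, so expand_bits(0xf, 0) == 0x55 and, one level up,
 * expand_bits(0x3, 1) == 0x03 (a 2-bit unit widens into the low half of
 * a 4-bit unit). A naive per-bit sketch of the N == 0 case:
 */
static inline uint64_t expand_bits_n0_ref(uint32_t x)
{
    uint64_t r = 0;
    int i;

    for (i = 0; i < 32; i++) {
        r |= (uint64_t)((x >> i) & 1) << (2 * i);
    }
    return r;
}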
1888
1889 /* Compress units of 2**(N+1) bits to units of 2**N bits.
1890 * For N==0, this corresponds to the operation that in qemu/bitops.h
1891 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
1892 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
1893 */
1894 static uint64_t compress_bits(uint64_t x, int n)
1895 {
1896 int i;
1897
1898 for (i = n; i <= 4; i++) {
1899 int sh = 1 << i;
1900 x &= even_bit_esz_masks[i];
1901 x = (x >> sh) | x;
1902 }
1903 return x & 0xffffffffu;
1904 }
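
/* Worked example (illustrative only): compress_bits undoes expand_bits
 * for the same N, e.g. compress_bits(0x55, 0) == 0xf, and the intent is
 * that compress_bits(expand_bits(x, n), n) == x for any 32-bit x.
 */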
1905
1906 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1907 {
1908 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1909 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1910 intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1911 uint64_t *d = vd;
1912 intptr_t i;
1913
1914 if (oprsz <= 8) {
1915 uint64_t nn = *(uint64_t *)vn;
1916 uint64_t mm = *(uint64_t *)vm;
1917 int half = 4 * oprsz;
1918
1919 nn = extract64(nn, high * half, half);
1920 mm = extract64(mm, high * half, half);
1921 nn = expand_bits(nn, esz);
1922 mm = expand_bits(mm, esz);
1923 d[0] = nn + (mm << (1 << esz));
1924 } else {
1925 ARMPredicateReg tmp_n, tmp_m;
1926
1927 /* We produce output faster than we consume input.
1928 Therefore we must be mindful of possible overlap. */
1929 if ((vn - vd) < (uintptr_t)oprsz) {
1930 vn = memcpy(&tmp_n, vn, oprsz);
1931 }
1932 if ((vm - vd) < (uintptr_t)oprsz) {
1933 vm = memcpy(&tmp_m, vm, oprsz);
1934 }
1935 if (high) {
1936 high = oprsz >> 1;
1937 }
1938
1939 if ((high & 3) == 0) {
1940 uint32_t *n = vn, *m = vm;
1941 high >>= 2;
1942
1943 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
1944 uint64_t nn = n[H4(high + i)];
1945 uint64_t mm = m[H4(high + i)];
1946
1947 nn = expand_bits(nn, esz);
1948 mm = expand_bits(mm, esz);
1949 d[i] = nn + (mm << (1 << esz));
1950 }
1951 } else {
1952 uint8_t *n = vn, *m = vm;
1953 uint16_t *d16 = vd;
1954
1955 for (i = 0; i < oprsz / 2; i++) {
1956 uint16_t nn = n[H1(high + i)];
1957 uint16_t mm = m[H1(high + i)];
1958
1959 nn = expand_bits(nn, esz);
1960 mm = expand_bits(mm, esz);
1961 d16[H2(i)] = nn + (mm << (1 << esz));
1962 }
1963 }
1964 }
1965 }
1966
1967 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1968 {
1969 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1970 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1971 int odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1) << esz;
1972 uint64_t *d = vd, *n = vn, *m = vm;
1973 uint64_t l, h;
1974 intptr_t i;
1975
1976 if (oprsz <= 8) {
1977 l = compress_bits(n[0] >> odd, esz);
1978 h = compress_bits(m[0] >> odd, esz);
1979 d[0] = extract64(l + (h << (4 * oprsz)), 0, 8 * oprsz);
1980 } else {
1981 ARMPredicateReg tmp_m;
1982 intptr_t oprsz_16 = oprsz / 16;
1983
1984 if ((vm - vd) < (uintptr_t)oprsz) {
1985 m = memcpy(&tmp_m, vm, oprsz);
1986 }
1987
1988 for (i = 0; i < oprsz_16; i++) {
1989 l = n[2 * i + 0];
1990 h = n[2 * i + 1];
1991 l = compress_bits(l >> odd, esz);
1992 h = compress_bits(h >> odd, esz);
1993 d[i] = l + (h << 32);
1994 }
1995
1996 /* For VL which is not a power of 2, the results from M do not
1997 align nicely with the uint64_t for D. Put the aligned results
1998 from M into TMP_M and then copy it into place afterward. */
1999 if (oprsz & 15) {
2000 d[i] = compress_bits(n[2 * i] >> odd, esz);
2001
2002 for (i = 0; i < oprsz_16; i++) {
2003 l = m[2 * i + 0];
2004 h = m[2 * i + 1];
2005 l = compress_bits(l >> odd, esz);
2006 h = compress_bits(h >> odd, esz);
2007 tmp_m.p[i] = l + (h << 32);
2008 }
2009 tmp_m.p[i] = compress_bits(m[2 * i] >> odd, esz);
2010
2011 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
2012 } else {
2013 for (i = 0; i < oprsz_16; i++) {
2014 l = m[2 * i + 0];
2015 h = m[2 * i + 1];
2016 l = compress_bits(l >> odd, esz);
2017 h = compress_bits(h >> odd, esz);
2018 d[oprsz_16 + i] = l + (h << 32);
2019 }
2020 }
2021 }
2022 }
2023
2024 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
2025 {
2026 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2027 uintptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2028 bool odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
2029 uint64_t *d = vd, *n = vn, *m = vm;
2030 uint64_t mask;
2031 int shr, shl;
2032 intptr_t i;
2033
2034 shl = 1 << esz;
2035 shr = 0;
2036 mask = even_bit_esz_masks[esz];
2037 if (odd) {
2038 mask <<= shl;
2039 shr = shl;
2040 shl = 0;
2041 }
2042
2043 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2044 uint64_t nn = (n[i] & mask) >> shr;
2045 uint64_t mm = (m[i] & mask) << shl;
2046 d[i] = nn + mm;
2047 }
2048 }
2049
2050 /* Reverse units of 2**N bits. */
2051 static uint64_t reverse_bits_64(uint64_t x, int n)
2052 {
2053 int i, sh;
2054
2055 x = bswap64(x);
2056 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2057 uint64_t mask = even_bit_esz_masks[i];
2058 x = ((x & mask) << sh) | ((x >> sh) & mask);
2059 }
2060 return x;
2061 }
2062
2063 static uint8_t reverse_bits_8(uint8_t x, int n)
2064 {
2065 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
2066 int i, sh;
2067
2068 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2069 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
2070 }
2071 return x;
2072 }
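
/* Worked examples (illustrative only): reverse_bits_64(x, 0) is a full
 * 64-bit bit reversal, while reverse_bits_64(x, 3) degenerates to
 * bswap64(x). Within a byte, reverse_bits_8(0x01, 0) == 0x80 and
 * reverse_bits_8(0x12, 2) == 0x21 (a nibble swap).
 */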
2073
2074 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
2075 {
2076 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2077 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2078 intptr_t i, oprsz_2 = oprsz / 2;
2079
2080 if (oprsz <= 8) {
2081 uint64_t l = *(uint64_t *)vn;
2082 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
2083 *(uint64_t *)vd = l;
2084 } else if ((oprsz & 15) == 0) {
2085 for (i = 0; i < oprsz_2; i += 8) {
2086 intptr_t ih = oprsz - 8 - i;
2087 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
2088 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
2089 *(uint64_t *)(vd + i) = h;
2090 *(uint64_t *)(vd + ih) = l;
2091 }
2092 } else {
2093 for (i = 0; i < oprsz_2; i += 1) {
2094 intptr_t il = H1(i);
2095 intptr_t ih = H1(oprsz - 1 - i);
2096 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
2097 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
2098 *(uint8_t *)(vd + il) = h;
2099 *(uint8_t *)(vd + ih) = l;
2100 }
2101 }
2102 }
2103
2104 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
2105 {
2106 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2107 intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
2108 uint64_t *d = vd;
2109 intptr_t i;
2110
2111 if (oprsz <= 8) {
2112 uint64_t nn = *(uint64_t *)vn;
2113 int half = 4 * oprsz;
2114
2115 nn = extract64(nn, high * half, half);
2116 nn = expand_bits(nn, 0);
2117 d[0] = nn;
2118 } else {
2119 ARMPredicateReg tmp_n;
2120
2121 /* We produce output faster than we consume input.
2122 Therefore we must be mindful of possible overlap. */
2123 if ((vn - vd) < (uintptr_t)oprsz) {
2124 vn = memcpy(&tmp_n, vn, oprsz);
2125 }
2126 if (high) {
2127 high = oprsz >> 1;
2128 }
2129
2130 if ((high & 3) == 0) {
2131 uint32_t *n = vn;
2132 high >>= 2;
2133
2134 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2135 uint64_t nn = n[H4(high + i)];
2136 d[i] = expand_bits(nn, 0);
2137 }
2138 } else {
2139 uint16_t *d16 = vd;
2140 uint8_t *n = vn;
2141
2142 for (i = 0; i < oprsz / 2; i++) {
2143 uint16_t nn = n[H1(high + i)];
2144 d16[H2(i)] = expand_bits(nn, 0);
2145 }
2146 }
2147 }
2148 }
2149
2150 #define DO_ZIP(NAME, TYPE, H) \
2151 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2152 { \
2153 intptr_t oprsz = simd_oprsz(desc); \
2154 intptr_t i, oprsz_2 = oprsz / 2; \
2155 ARMVectorReg tmp_n, tmp_m; \
2156 /* We produce output faster than we consume input. \
2157 Therefore we must be mindful of possible overlap. */ \
2158 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
2159 vn = memcpy(&tmp_n, vn, oprsz_2); \
2160 } \
2161 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2162 vm = memcpy(&tmp_m, vm, oprsz_2); \
2163 } \
2164 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2165 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
2166 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
2167 } \
2168 }
2169
2170 DO_ZIP(sve_zip_b, uint8_t, H1)
2171 DO_ZIP(sve_zip_h, uint16_t, H1_2)
2172 DO_ZIP(sve_zip_s, uint32_t, H1_4)
2173 DO_ZIP(sve_zip_d, uint64_t, )
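
/* For example, with byte elements and a 16-byte vector, sve_zip_b
 * produces Zd = { Zn[0], Zm[0], Zn[1], Zm[1], ..., Zn[7], Zm[7] }.
 * When Zd == Zn, writing output byte 1 (= Zm[0]) would clobber Zn[1]
 * before it is read, hence the defensive copy of the low half above.
 */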
2174
2175 #define DO_UZP(NAME, TYPE, H) \
2176 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2177 { \
2178 intptr_t oprsz = simd_oprsz(desc); \
2179 intptr_t oprsz_2 = oprsz / 2; \
2180 intptr_t odd_ofs = simd_data(desc); \
2181 intptr_t i; \
2182 ARMVectorReg tmp_m; \
2183 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2184 vm = memcpy(&tmp_m, vm, oprsz); \
2185 } \
2186 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2187 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs)); \
2188 } \
2189 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2190 *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
2191 } \
2192 }
2193
2194 DO_UZP(sve_uzp_b, uint8_t, H1)
2195 DO_UZP(sve_uzp_h, uint16_t, H1_2)
2196 DO_UZP(sve_uzp_s, uint32_t, H1_4)
2197 DO_UZP(sve_uzp_d, uint64_t, )
2198
2199 #define DO_TRN(NAME, TYPE, H) \
2200 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2201 { \
2202 intptr_t oprsz = simd_oprsz(desc); \
2203 intptr_t odd_ofs = simd_data(desc); \
2204 intptr_t i; \
2205 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
2206 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
2207 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
2208 *(TYPE *)(vd + H(i + 0)) = ae; \
2209 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
2210 } \
2211 }
2212
2213 DO_TRN(sve_trn_b, uint8_t, H1)
2214 DO_TRN(sve_trn_h, uint16_t, H1_2)
2215 DO_TRN(sve_trn_s, uint32_t, H1_4)
2216 DO_TRN(sve_trn_d, uint64_t, )
2217
2218 #undef DO_ZIP
2219 #undef DO_UZP
2220 #undef DO_TRN
2221
2222 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
2223 {
2224 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
2225 uint32_t *d = vd, *n = vn;
2226 uint8_t *pg = vg;
2227
2228 for (i = j = 0; i < opr_sz; i++) {
2229 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
2230 d[H4(j)] = n[H4(i)];
2231 j++;
2232 }
2233 }
2234 for (; j < opr_sz; j++) {
2235 d[H4(j)] = 0;
2236 }
2237 }
2238
2239 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
2240 {
2241 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
2242 uint64_t *d = vd, *n = vn;
2243 uint8_t *pg = vg;
2244
2245 for (i = j = 0; i < opr_sz; i++) {
2246 if (pg[H1(i)] & 1) {
2247 d[j] = n[i];
2248 j++;
2249 }
2250 }
2251 for (; j < opr_sz; j++) {
2252 d[j] = 0;
2253 }
2254 }
2255
2256 /* Similar to the ARM LastActiveElement pseudocode function, except the
2257 * result is multiplied by the element size. This includes the not found
2258 * indication; e.g. not found for esz=3 is -8.
2259 */
2260 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
2261 {
2262 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2263 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2264
2265 return last_active_element(vg, DIV_ROUND_UP(oprsz, 8), esz);
2266 }
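
/* For example, with esz == 2 (32-bit elements) an active last element
 * at index 3 yields 3 * 4 == 12, while no active element yields -4.
 */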
2267
2268 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
2269 {
2270 intptr_t opr_sz = simd_oprsz(desc) / 8;
2271 int esz = simd_data(desc);
2272 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
2273 intptr_t i, first_i, last_i;
2274 ARMVectorReg tmp;
2275
2276 first_i = last_i = 0;
2277 first_g = last_g = 0;
2278
2279 /* Find the extent of the active elements within VG. */
2280 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
2281 pg = *(uint64_t *)(vg + i) & mask;
2282 if (pg) {
2283 if (last_g == 0) {
2284 last_g = pg;
2285 last_i = i;
2286 }
2287 first_g = pg;
2288 first_i = i;
2289 }
2290 }
2291
2292 len = 0;
2293 if (first_g != 0) {
2294 first_i = first_i * 8 + ctz64(first_g);
2295 last_i = last_i * 8 + 63 - clz64(last_g);
2296 len = last_i - first_i + (1 << esz);
2297 if (vd == vm) {
2298 vm = memcpy(&tmp, vm, opr_sz * 8);
2299 }
2300 swap_memmove(vd, vn + first_i, len);
2301 }
2302 swap_memmove(vd + len, vm, opr_sz * 8 - len);
2303 }
2304
2305 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
2306 void *vg, uint32_t desc)
2307 {
2308 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2309 uint64_t *d = vd, *n = vn, *m = vm;
2310 uint8_t *pg = vg;
2311
2312 for (i = 0; i < opr_sz; i += 1) {
2313 uint64_t nn = n[i], mm = m[i];
2314 uint64_t pp = expand_pred_b(pg[H1(i)]);
2315 d[i] = (nn & pp) | (mm & ~pp);
2316 }
2317 }
2318
2319 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
2320 void *vg, uint32_t desc)
2321 {
2322 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2323 uint64_t *d = vd, *n = vn, *m = vm;
2324 uint8_t *pg = vg;
2325
2326 for (i = 0; i < opr_sz; i += 1) {
2327 uint64_t nn = n[i], mm = m[i];
2328 uint64_t pp = expand_pred_h(pg[H1(i)]);
2329 d[i] = (nn & pp) | (mm & ~pp);
2330 }
2331 }
2332
2333 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
2334 void *vg, uint32_t desc)
2335 {
2336 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2337 uint64_t *d = vd, *n = vn, *m = vm;
2338 uint8_t *pg = vg;
2339
2340 for (i = 0; i < opr_sz; i += 1) {
2341 uint64_t nn = n[i], mm = m[i];
2342 uint64_t pp = expand_pred_s(pg[H1(i)]);
2343 d[i] = (nn & pp) | (mm & ~pp);
2344 }
2345 }
2346
2347 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
2348 void *vg, uint32_t desc)
2349 {
2350 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2351 uint64_t *d = vd, *n = vn, *m = vm;
2352 uint8_t *pg = vg;
2353
2354 for (i = 0; i < opr_sz; i += 1) {
2355 uint64_t nn = n[i], mm = m[i];
2356 d[i] = (pg[H1(i)] & 1 ? nn : mm);
2357 }
2358 }
2359
2360 /* Two operand comparison controlled by a predicate.
2361 * ??? It is very tempting to want to be able to expand this inline
2362 * with x86 instructions, e.g.
2363 *
2364 * vcmpeqw zm, zn, %ymm0
2365 * vpmovmskb %ymm0, %eax
2366 * and $0x5555, %eax
2367 * and pg, %eax
2368 *
2369 * or even aarch64, e.g.
2370 *
2371 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
2372 * cmeq v0.8h, zn, zm
2373 * and v0.8h, v0.8h, mask
2374 * addv h0, v0.8h
2375 * and v0.8b, pg
2376 *
2377 * However, coming up with an abstraction that allows vector inputs and
2378 * a scalar output, and also handles the byte-ordering of sub-uint64_t
2379 * scalar outputs, is tricky.
2380 */
2381 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
2382 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2383 { \
2384 intptr_t opr_sz = simd_oprsz(desc); \
2385 uint32_t flags = PREDTEST_INIT; \
2386 intptr_t i = opr_sz; \
2387 do { \
2388 uint64_t out = 0, pg; \
2389 do { \
2390 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2391 TYPE nn = *(TYPE *)(vn + H(i)); \
2392 TYPE mm = *(TYPE *)(vm + H(i)); \
2393 out |= nn OP mm; \
2394 } while (i & 63); \
2395 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2396 out &= pg; \
2397 *(uint64_t *)(vd + (i >> 3)) = out; \
2398 flags = iter_predtest_bwd(out, pg, flags); \
2399 } while (i > 0); \
2400 return flags; \
2401 }
2402
2403 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
2404 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2405 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
2406 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2407 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
2408 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2409 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
2410 DO_CMP_PPZZ(NAME, TYPE, OP, , 0x0101010101010101ull)
2411
2412 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
2413 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
2414 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
2415 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
2416
2417 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
2418 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
2419 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
2420 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
2421
2422 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
2423 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
2424 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
2425 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
2426
2427 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
2428 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
2429 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
2430 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
2431
2432 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
2433 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
2434 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
2435 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
2436
2437 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
2438 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
2439 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
2440 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
2441
2442 #undef DO_CMP_PPZZ_B
2443 #undef DO_CMP_PPZZ_H
2444 #undef DO_CMP_PPZZ_S
2445 #undef DO_CMP_PPZZ_D
2446 #undef DO_CMP_PPZZ
2447
2448 /* Similar, but the second source is "wide". */
2449 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
2450 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2451 { \
2452 intptr_t opr_sz = simd_oprsz(desc); \
2453 uint32_t flags = PREDTEST_INIT; \
2454 intptr_t i = opr_sz; \
2455 do { \
2456 uint64_t out = 0, pg; \
2457 do { \
2458 TYPEW mm = *(TYPEW *)(vm + i - 8); \
2459 do { \
2460 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2461 TYPE nn = *(TYPE *)(vn + H(i)); \
2462 out |= nn OP mm; \
2463 } while (i & 7); \
2464 } while (i & 63); \
2465 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2466 out &= pg; \
2467 *(uint64_t *)(vd + (i >> 3)) = out; \
2468 flags = iter_predtest_bwd(out, pg, flags); \
2469 } while (i > 0); \
2470 return flags; \
2471 }
2472
2473 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
2474 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
2475 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
2476 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
2477 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
2478 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
2479
2480 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
2481 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
2482 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
2483
2484 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
2485 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
2486 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
2487
2488 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
2489 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
2490 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
2491
2492 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
2493 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
2494 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
2495
2496 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
2497 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
2498 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
2499
2500 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
2501 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
2502 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
2503
2504 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
2505 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
2506 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
2507
2508 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
2509 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
2510 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
2511
2512 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
2513 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
2514 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
2515
2516 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
2517 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
2518 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
2519
2520 #undef DO_CMP_PPZW_B
2521 #undef DO_CMP_PPZW_H
2522 #undef DO_CMP_PPZW_S
2523 #undef DO_CMP_PPZW
2524
2525 /* Similar, but the second source is immediate. */
2526 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
2527 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2528 { \
2529 intptr_t opr_sz = simd_oprsz(desc); \
2530 uint32_t flags = PREDTEST_INIT; \
2531 TYPE mm = simd_data(desc); \
2532 intptr_t i = opr_sz; \
2533 do { \
2534 uint64_t out = 0, pg; \
2535 do { \
2536 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2537 TYPE nn = *(TYPE *)(vn + H(i)); \
2538 out |= nn OP mm; \
2539 } while (i & 63); \
2540 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2541 out &= pg; \
2542 *(uint64_t *)(vd + (i >> 3)) = out; \
2543 flags = iter_predtest_bwd(out, pg, flags); \
2544 } while (i > 0); \
2545 return flags; \
2546 }
2547
2548 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
2549 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2550 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
2551 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2552 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
2553 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2554 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
2555 DO_CMP_PPZI(NAME, TYPE, OP, , 0x0101010101010101ull)
2556
2557 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
2558 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
2559 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
2560 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
2561
2562 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
2563 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
2564 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
2565 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
2566
2567 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
2568 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
2569 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
2570 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
2571
2572 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
2573 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
2574 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
2575 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
2576
2577 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
2578 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
2579 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
2580 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
2581
2582 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
2583 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
2584 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
2585 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
2586
2587 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
2588 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
2589 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
2590 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
2591
2592 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
2593 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
2594 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
2595 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
2596
2597 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
2598 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
2599 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
2600 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
2601
2602 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
2603 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
2604 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
2605 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
2606
2607 #undef DO_CMP_PPZI_B
2608 #undef DO_CMP_PPZI_H
2609 #undef DO_CMP_PPZI_S
2610 #undef DO_CMP_PPZI_D
2611 #undef DO_CMP_PPZI
2612
2613 /* Similar to the ARM LastActive pseudocode function. */
2614 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
2615 {
2616 intptr_t i;
2617
2618 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
2619 uint64_t pg = *(uint64_t *)(vg + i);
2620 if (pg) {
2621 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
2622 }
2623 }
2624 return 0;
2625 }
2626
2627 /* Compute a mask into RETB that is true for all G, up to and including
2628 * (if after) or excluding (if !after) the first G & N.
2629 * Return true if BRK found.
2630 */
2631 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
2632 bool brk, bool after)
2633 {
2634 uint64_t b;
2635
2636 if (brk) {
2637 b = 0;
2638 } else if ((g & n) == 0) {
2639 /* For all G, no N are set; break not found. */
2640 b = g;
2641 } else {
2642 /* Break somewhere in N. Locate it. */
2643 b = g & n; /* guard true, pred true */
2644 b = b & -b; /* first such */
2645 if (after) {
2646 b = b | (b - 1); /* break after same */
2647 } else {
2648 b = b - 1; /* break before same */
2649 }
2650 brk = true;
2651 }
2652
2653 *retb = b;
2654 return brk;
2655 }
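
/* Worked example: with g == 0xff and n == 0x24, the first active true
 * element is bit 2, so the "after" form (BRKA) produces b == 0x07 and
 * the "before" form (BRKB) produces b == 0x03. When (g & n) == 0 the
 * whole guard is propagated and BRK remains false.
 */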
2656
2657 /* Compute a zeroing BRK. */
2658 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
2659 intptr_t oprsz, bool after)
2660 {
2661 bool brk = false;
2662 intptr_t i;
2663
2664 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2665 uint64_t this_b, this_g = g[i];
2666
2667 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2668 d[i] = this_b & this_g;
2669 }
2670 }
2671
2672 /* Likewise, but also compute flags. */
2673 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
2674 intptr_t oprsz, bool after)
2675 {
2676 uint32_t flags = PREDTEST_INIT;
2677 bool brk = false;
2678 intptr_t i;
2679
2680 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2681 uint64_t this_b, this_d, this_g = g[i];
2682
2683 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2684 d[i] = this_d = this_b & this_g;
2685 flags = iter_predtest_fwd(this_d, this_g, flags);
2686 }
2687 return flags;
2688 }
2689
2690 /* Compute a merging BRK. */
2691 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
2692 intptr_t oprsz, bool after)
2693 {
2694 bool brk = false;
2695 intptr_t i;
2696
2697 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2698 uint64_t this_b, this_g = g[i];
2699
2700 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2701 d[i] = (this_b & this_g) | (d[i] & ~this_g);
2702 }
2703 }
2704
2705 /* Likewise, but also compute flags. */
2706 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
2707 intptr_t oprsz, bool after)
2708 {
2709 uint32_t flags = PREDTEST_INIT;
2710 bool brk = false;
2711 intptr_t i;
2712
2713 for (i = 0; i < oprsz / 8; ++i) {
2714 uint64_t this_b, this_d = d[i], this_g = g[i];
2715
2716 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2717 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
2718 flags = iter_predtest_fwd(this_d, this_g, flags);
2719 }
2720 return flags;
2721 }
2722
2723 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
2724 {
2725 /* It is quicker to zero the whole predicate than loop on OPRSZ.
2726 * The compiler should turn this into 4 64-bit integer stores.
2727 */
2728 memset(d, 0, sizeof(ARMPredicateReg));
2729 return PREDTEST_INIT;
2730 }
2731
2732 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
2733 uint32_t pred_desc)
2734 {
2735 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2736 if (last_active_pred(vn, vg, oprsz)) {
2737 compute_brk_z(vd, vm, vg, oprsz, true);
2738 } else {
2739 do_zero(vd, oprsz);
2740 }
2741 }
2742
2743 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
2744 uint32_t pred_desc)
2745 {
2746 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2747 if (last_active_pred(vn, vg, oprsz)) {
2748 return compute_brks_z(vd, vm, vg, oprsz, true);
2749 } else {
2750 return do_zero(vd, oprsz);
2751 }
2752 }
2753
2754 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
2755 uint32_t pred_desc)
2756 {
2757 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2758 if (last_active_pred(vn, vg, oprsz)) {
2759 compute_brk_z(vd, vm, vg, oprsz, false);
2760 } else {
2761 do_zero(vd, oprsz);
2762 }
2763 }
2764
2765 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
2766 uint32_t pred_desc)
2767 {
2768 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2769 if (last_active_pred(vn, vg, oprsz)) {
2770 return compute_brks_z(vd, vm, vg, oprsz, false);
2771 } else {
2772 return do_zero(vd, oprsz);
2773 }
2774 }
2775
2776 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2777 {
2778 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2779 compute_brk_z(vd, vn, vg, oprsz, true);
2780 }
2781
2782 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2783 {
2784 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2785 return compute_brks_z(vd, vn, vg, oprsz, true);
2786 }
2787
2788 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2789 {
2790 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2791 compute_brk_z(vd, vn, vg, oprsz, false);
2792 }
2793
2794 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2795 {
2796 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2797 return compute_brks_z(vd, vn, vg, oprsz, false);
2798 }
2799
2800 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2801 {
2802 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2803 compute_brk_m(vd, vn, vg, oprsz, true);
2804 }
2805
2806 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2807 {
2808 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2809 return compute_brks_m(vd, vn, vg, oprsz, true);
2810 }
2811
2812 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2813 {
2814 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2815 compute_brk_m(vd, vn, vg, oprsz, false);
2816 }
2817
2818 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2819 {
2820 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2821 return compute_brks_m(vd, vn, vg, oprsz, false);
2822 }
2823
2824 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2825 {
2826 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2827
2828 if (!last_active_pred(vn, vg, oprsz)) {
2829 do_zero(vd, oprsz);
2830 }
2831 }
2832
2833 /* As if PredTest(Ones(PL), D, esz). */
2834 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
2835 uint64_t esz_mask)
2836 {
2837 uint32_t flags = PREDTEST_INIT;
2838 intptr_t i;
2839
2840 for (i = 0; i < oprsz / 8; i++) {
2841 flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
2842 }
2843 if (oprsz & 7) {
2844 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
2845 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
2846 }
2847 return flags;
2848 }
2849
2850 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2851 {
2852 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2853
2854 if (last_active_pred(vn, vg, oprsz)) {
2855 return predtest_ones(vd, oprsz, -1);
2856 } else {
2857 return do_zero(vd, oprsz);
2858 }
2859 }
2860
2861 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
2862 {
2863 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2864 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2865 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
2866 intptr_t i;
2867
2868 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2869 uint64_t t = n[i] & g[i] & mask;
2870 sum += ctpop64(t);
2871 }
2872 return sum;
2873 }
2874
2875 uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc)
2876 {
2877 uintptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2878 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2879 uint64_t esz_mask = pred_esz_masks[esz];
2880 ARMPredicateReg *d = vd;
2881 uint32_t flags;
2882 intptr_t i;
2883
2884 /* Begin with a zero predicate register. */
2885 flags = do_zero(d, oprsz);
2886 if (count == 0) {
2887 return flags;
2888 }
2889
2890 /* Set all of the requested bits. */
2891 for (i = 0; i < count / 64; ++i) {
2892 d->p[i] = esz_mask;
2893 }
2894 if (count & 63) {
2895 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
2896 }
2897
2898 return predtest_ones(d, oprsz, esz_mask);
2899 }
2900
2901 /* Recursive reduction on a function;
2902 * Cf. the ARM ARM function ReducePredicated.
2903 *
2904 * While it would be possible to write this without the DATA temporary,
2905 * it is much simpler to process the predicate register this way.
2906 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
2907 * little to gain with a more complex non-recursive form.
2908 */
2909 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
2910 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
2911 { \
2912 if (n == 1) { \
2913 return *data; \
2914 } else { \
2915 uintptr_t half = n / 2; \
2916 TYPE lo = NAME##_reduce(data, status, half); \
2917 TYPE hi = NAME##_reduce(data + half, status, half); \
2918 return TYPE##_##FUNC(lo, hi, status); \
2919 } \
2920 } \
2921 uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
2922 { \
2923 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_maxsz(desc); \
2924 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
2925 for (i = 0; i < oprsz; ) { \
2926 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2927 do { \
2928 TYPE nn = *(TYPE *)(vn + H(i)); \
2929 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
2930 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2931 } while (i & 15); \
2932 } \
2933 for (; i < maxsz; i += sizeof(TYPE)) { \
2934 *(TYPE *)((void *)data + i) = IDENT; \
2935 } \
2936 return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
2937 }
2938
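/* For example, with four elements the reduction is evaluated as
 * (d[0] op d[1]) op (d[2] op d[3]); inactive and trailing elements are
 * padded with IDENT, so the shape of the tree depends only on MAXSZ.
 */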
2939 DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
2940 DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
2941 DO_REDUCE(sve_faddv_d, float64, , add, float64_zero)
2942
2943 /* Identity is floatN_default_nan, without the function call. */
2944 DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
2945 DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
2946 DO_REDUCE(sve_fminnmv_d, float64, , minnum, 0x7FF8000000000000ULL)
2947
2948 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
2949 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
2950 DO_REDUCE(sve_fmaxnmv_d, float64, , maxnum, 0x7FF8000000000000ULL)
2951
2952 DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
2953 DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
2954 DO_REDUCE(sve_fminv_d, float64, , min, float64_infinity)
2955
2956 DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
2957 DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
2958 DO_REDUCE(sve_fmaxv_d, float64, , max, float64_chs(float64_infinity))
2959
2960 #undef DO_REDUCE
2961
2962 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
2963 void *status, uint32_t desc)
2964 {
2965 intptr_t i = 0, opr_sz = simd_oprsz(desc);
2966 float16 result = nn;
2967
2968 do {
2969 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2970 do {
2971 if (pg & 1) {
2972 float16 mm = *(float16 *)(vm + H1_2(i));
2973 result = float16_add(result, mm, status);
2974 }
2975 i += sizeof(float16), pg >>= sizeof(float16);
2976 } while (i & 15);
2977 } while (i < opr_sz);
2978
2979 return result;
2980 }
2981
2982 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
2983 void *status, uint32_t desc)
2984 {
2985 intptr_t i = 0, opr_sz = simd_oprsz(desc);
2986 float32 result = nn;
2987
2988 do {
2989 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2990 do {
2991 if (pg & 1) {
2992 float32 mm = *(float32 *)(vm + H1_2(i));
2993 result = float32_add(result, mm, status);
2994 }
2995 i += sizeof(float32), pg >>= sizeof(float32);
2996 } while (i & 15);
2997 } while (i < opr_sz);
2998
2999 return result;
3000 }
3001
3002 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
3003 void *status, uint32_t desc)
3004 {
3005 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
3006 uint64_t *m = vm;
3007 uint8_t *pg = vg;
3008
3009 for (i = 0; i < opr_sz; i++) {
3010 if (pg[H1(i)] & 1) {
3011 nn = float64_add(nn, m[i], status);
3012 }
3013 }
3014
3015 return nn;
3016 }
3017
3018 /* Fully general three-operand expander, controlled by a predicate,
3019 * with the extra float_status parameter.
3020 */
3021 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
3022 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
3023 void *status, uint32_t desc) \
3024 { \
3025 intptr_t i = simd_oprsz(desc); \
3026 uint64_t *g = vg; \
3027 do { \
3028 uint64_t pg = g[(i - 1) >> 6]; \
3029 do { \
3030 i -= sizeof(TYPE); \
3031 if (likely((pg >> (i & 63)) & 1)) { \
3032 TYPE nn = *(TYPE *)(vn + H(i)); \
3033 TYPE mm = *(TYPE *)(vm + H(i)); \
3034 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
3035 } \
3036 } while (i & 63); \
3037 } while (i != 0); \
3038 }
3039
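/* The guard bit consulted above sits at the element's starting byte
 * offset: for 32-bit elements only bits 0, 4, 8, ... of each predicate
 * word are significant, so e.g. the element at byte offset 8 is
 * governed by bit 8 of g[0].
 */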
3040 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
3041 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
3042 DO_ZPZZ_FP(sve_fadd_d, uint64_t, , float64_add)
3043
3044 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
3045 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
3046 DO_ZPZZ_FP(sve_fsub_d, uint64_t, , float64_sub)
3047
3048 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
3049 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
3050 DO_ZPZZ_FP(sve_fmul_d, uint64_t, , float64_mul)
3051
3052 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
3053 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
3054 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, , float64_div)
3055
3056 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
3057 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
3058 DO_ZPZZ_FP(sve_fmin_d, uint64_t, , float64_min)
3059
3060 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
3061 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
3062 DO_ZPZZ_FP(sve_fmax_d, uint64_t, , float64_max)
3063
3064 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
3065 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
3066 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, , float64_minnum)
3067
3068 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
3069 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
3070 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, , float64_maxnum)
3071
3072 static inline float16 abd_h(float16 a, float16 b, float_status *s)
3073 {
3074 return float16_abs(float16_sub(a, b, s));
3075 }
3076
3077 static inline float32 abd_s(float32 a, float32 b, float_status *s)
3078 {
3079 return float32_abs(float32_sub(a, b, s));
3080 }
3081
3082 static inline float64 abd_d(float64 a, float64 b, float_status *s)
3083 {
3084 return float64_abs(float64_sub(a, b, s));
3085 }
3086
3087 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
3088 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
3089 DO_ZPZZ_FP(sve_fabd_d, uint64_t, , abd_d)
3090
3091 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
3092 {
3093 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
3094 return float64_scalbn(a, b_int, s);
3095 }
3096
3097 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
3098 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
3099 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, , scalbn_d)
3100
3101 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
3102 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
3103 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, , helper_vfp_mulxd)
3104
3105 #undef DO_ZPZZ_FP
3106
3107 /* Three-operand expander, with one scalar operand, controlled by
3108 * a predicate, with the extra float_status parameter.
3109 */
3110 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
3111 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
3112 void *status, uint32_t desc) \
3113 { \
3114 intptr_t i = simd_oprsz(desc); \
3115 uint64_t *g = vg; \
3116 TYPE mm = scalar; \
3117 do { \
3118 uint64_t pg = g[(i - 1) >> 6]; \
3119 do { \
3120 i -= sizeof(TYPE); \
3121 if (likely((pg >> (i & 63)) & 1)) { \
3122 TYPE nn = *(TYPE *)(vn + H(i)); \
3123 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
3124 } \
3125 } while (i & 63); \
3126 } while (i != 0); \
3127 }
3128
3129 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
3130 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
3131 DO_ZPZS_FP(sve_fadds_d, float64, , float64_add)
3132
3133 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
3134 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
3135 DO_ZPZS_FP(sve_fsubs_d, float64, , float64_sub)
3136
3137 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
3138 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
3139 DO_ZPZS_FP(sve_fmuls_d, float64, , float64_mul)
3140
3141 static inline float16 subr_h(float16 a, float16 b, float_status *s)
3142 {
3143 return float16_sub(b, a, s);
3144 }
3145
3146 static inline float32 subr_s(float32 a, float32 b, float_status *s)
3147 {
3148 return float32_sub(b, a, s);
3149 }
3150
3151 static inline float64 subr_d(float64 a, float64 b, float_status *s)
3152 {
3153 return float64_sub(b, a, s);
3154 }
3155
3156 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
3157 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
3158 DO_ZPZS_FP(sve_fsubrs_d, float64, , subr_d)
3159
3160 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
3161 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
3162 DO_ZPZS_FP(sve_fmaxnms_d, float64, , float64_maxnum)
3163
3164 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
3165 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
3166 DO_ZPZS_FP(sve_fminnms_d, float64, , float64_minnum)
3167
3168 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
3169 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
3170 DO_ZPZS_FP(sve_fmaxs_d, float64, , float64_max)
3171
3172 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
3173 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
3174 DO_ZPZS_FP(sve_fmins_d, float64, , float64_min)
3175
3176 /* Fully general two-operand expander, controlled by a predicate,
3177 * with the extra float_status parameter.
3178 */
3179 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \
3180 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
3181 { \
3182 intptr_t i = simd_oprsz(desc); \
3183 uint64_t *g = vg; \
3184 do { \
3185 uint64_t pg = g[(i - 1) >> 6]; \
3186 do { \
3187 i -= sizeof(TYPE); \
3188 if (likely((pg >> (i & 63)) & 1)) { \
3189 TYPE nn = *(TYPE *)(vn + H(i)); \
3190 *(TYPE *)(vd + H(i)) = OP(nn, status); \
3191 } \
3192 } while (i & 63); \
3193 } while (i != 0); \
3194 }
3195
3196 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
3197 * FZ16. When converting from fp16, this affects flushing input denormals;
3198 * when converting to fp16, this affects flushing output denormals.
3199 */
3200 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
3201 {
3202 flag save = get_flush_inputs_to_zero(fpst);
3203 float32 ret;
3204
3205 set_flush_inputs_to_zero(false, fpst);
3206 ret = float16_to_float32(f, true, fpst);
3207 set_flush_inputs_to_zero(save, fpst);
3208 return ret;
3209 }
3210
3211 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
3212 {
3213 flag save = get_flush_inputs_to_zero(fpst);
3214 float64 ret;
3215
3216 set_flush_inputs_to_zero(false, fpst);
3217 ret = float16_to_float64(f, true, fpst);
3218 set_flush_inputs_to_zero(save, fpst);
3219 return ret;
3220 }
3221
3222 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
3223 {
3224 flag save = get_flush_to_zero(fpst);
3225 float16 ret;
3226
3227 set_flush_to_zero(false, fpst);
3228 ret = float32_to_float16(f, true, fpst);
3229 set_flush_to_zero(save, fpst);
3230 return ret;
3231 }
3232
3233 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
3234 {
3235 flag save = get_flush_to_zero(fpst);
3236 float16 ret;
3237
3238 set_flush_to_zero(false, fpst);
3239 ret = float64_to_float16(f, true, fpst);
3240 set_flush_to_zero(save, fpst);
3241 return ret;
3242 }
3243
3244 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
3245 {
3246 if (float16_is_any_nan(f)) {
3247 float_raise(float_flag_invalid, s);
3248 return 0;
3249 }
3250 return float16_to_int16_round_to_zero(f, s);
3251 }
3252
3253 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
3254 {
3255 if (float16_is_any_nan(f)) {
3256 float_raise(float_flag_invalid, s);
3257 return 0;
3258 }
3259 return float16_to_int64_round_to_zero(f, s);
3260 }
3261
3262 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
3263 {
3264 if (float32_is_any_nan(f)) {
3265 float_raise(float_flag_invalid, s);
3266 return 0;
3267 }
3268 return float32_to_int64_round_to_zero(f, s);
3269 }
3270
3271 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
3272 {
3273 if (float64_is_any_nan(f)) {
3274 float_raise(float_flag_invalid, s);
3275 return 0;
3276 }
3277 return float64_to_int64_round_to_zero(f, s);
3278 }
3279
3280 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
3281 {
3282 if (float16_is_any_nan(f)) {
3283 float_raise(float_flag_invalid, s);
3284 return 0;
3285 }
3286 return float16_to_uint16_round_to_zero(f, s);
3287 }
3288
3289 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
3290 {
3291 if (float16_is_any_nan(f)) {
3292 float_raise(float_flag_invalid, s);
3293 return 0;
3294 }
3295 return float16_to_uint64_round_to_zero(f, s);
3296 }
3297
3298 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
3299 {
3300 if (float32_is_any_nan(f)) {
3301 float_raise(float_flag_invalid, s);
3302 return 0;
3303 }
3304 return float32_to_uint64_round_to_zero(f, s);
3305 }
3306
3307 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
3308 {
3309 if (float64_is_any_nan(f)) {
3310 float_raise(float_flag_invalid, s);
3311 return 0;
3312 }
3313 return float64_to_uint64_round_to_zero(f, s);
3314 }
3315
3316 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
3317 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
3318 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, , sve_f64_to_f16)
3319 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, , sve_f16_to_f64)
3320 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, , float64_to_float32)
3321 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, , float32_to_float64)
3322
3323 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
3324 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
3325 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
3326 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, , vfp_float16_to_int64_rtz)
3327 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, , vfp_float32_to_int64_rtz)
3328 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, , helper_vfp_tosizd)
3329 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, , vfp_float64_to_int64_rtz)
3330
3331 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
3332 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
3333 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
3334 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, , vfp_float16_to_uint64_rtz)
3335 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, , vfp_float32_to_uint64_rtz)
3336 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, , helper_vfp_touizd)
3337 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, , vfp_float64_to_uint64_rtz)
3338
3339 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
3340 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
3341 DO_ZPZ_FP(sve_frint_d, uint64_t, , helper_rintd)
3342
3343 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
3344 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
3345 DO_ZPZ_FP(sve_frintx_d, uint64_t, , float64_round_to_int)
3346
3347 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
3348 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
3349 DO_ZPZ_FP(sve_frecpx_d, uint64_t, , helper_frecpx_f64)
3350
3351 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
3352 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
3353 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, , float64_sqrt)
3354
3355 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
3356 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
3357 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
3358 DO_ZPZ_FP(sve_scvt_sd, uint64_t, , int32_to_float64)
3359 DO_ZPZ_FP(sve_scvt_dh, uint64_t, , int64_to_float16)
3360 DO_ZPZ_FP(sve_scvt_ds, uint64_t, , int64_to_float32)
3361 DO_ZPZ_FP(sve_scvt_dd, uint64_t, , int64_to_float64)
3362
3363 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
3364 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
3365 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
3366 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, , uint32_to_float64)
3367 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, , uint64_to_float16)
3368 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, , uint64_to_float32)
3369 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, , uint64_to_float64)
3370
3371 #undef DO_ZPZ_FP
3372
3373 /* 4-operand predicated multiply-add. This requires 7 operands to pass
3374 * "properly", so we need to encode some of the registers into DESC.
3375 */
3376 QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT + 20 > 32);
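
/* A minimal sketch of the packing this layout implies (the translator
 * is responsible for the actual encoding; vsz here stands for the
 * vector length in bytes):
 *
 *     uint32_t data = rd | (rn << 5) | (rm << 10) | (ra << 15);
 *     uint32_t desc = simd_desc(vsz, vsz, data);
 */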
3377
3378 static void do_fmla_zpzzz_h(CPUARMState *env, void *vg, uint32_t desc,
3379 uint16_t neg1, uint16_t neg3)
3380 {
3381 intptr_t i = simd_oprsz(desc);
3382 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3383 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3384 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3385 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3386 void *vd = &env->vfp.zregs[rd];
3387 void *vn = &env->vfp.zregs[rn];
3388 void *vm = &env->vfp.zregs[rm];
3389 void *va = &env->vfp.zregs[ra];
3390 uint64_t *g = vg;
3391
3392 do {
3393 uint64_t pg = g[(i - 1) >> 6];
3394 do {
3395 i -= 2;
3396 if (likely((pg >> (i & 63)) & 1)) {
3397 float16 e1, e2, e3, r;
3398
3399 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
3400 e2 = *(uint16_t *)(vm + H1_2(i));
3401 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
3402 r = float16_muladd(e1, e2, e3, 0, &env->vfp.fp_status_f16);
3403 *(uint16_t *)(vd + H1_2(i)) = r;
3404 }
3405 } while (i & 63);
3406 } while (i != 0);
3407 }
3408
3409 void HELPER(sve_fmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3410 {
3411 do_fmla_zpzzz_h(env, vg, desc, 0, 0);
3412 }
3413
3414 void HELPER(sve_fmls_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3415 {
3416 do_fmla_zpzzz_h(env, vg, desc, 0x8000, 0);
3417 }
3418
3419 void HELPER(sve_fnmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3420 {
3421 do_fmla_zpzzz_h(env, vg, desc, 0x8000, 0x8000);
3422 }
3423
3424 void HELPER(sve_fnmls_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3425 {
3426 do_fmla_zpzzz_h(env, vg, desc, 0, 0x8000);
3427 }
3428
3429 static void do_fmla_zpzzz_s(CPUARMState *env, void *vg, uint32_t desc,
3430 uint32_t neg1, uint32_t neg3)
3431 {
3432 intptr_t i = simd_oprsz(desc);
3433 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3434 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3435 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3436 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3437 void *vd = &env->vfp.zregs[rd];
3438 void *vn = &env->vfp.zregs[rn];
3439 void *vm = &env->vfp.zregs[rm];
3440 void *va = &env->vfp.zregs[ra];
3441 uint64_t *g = vg;
3442
3443 do {
3444 uint64_t pg = g[(i - 1) >> 6];
3445 do {
3446 i -= 4;
3447 if (likely((pg >> (i & 63)) & 1)) {
3448 float32 e1, e2, e3, r;
3449
3450 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
3451 e2 = *(uint32_t *)(vm + H1_4(i));
3452 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
3453 r = float32_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
3454 *(uint32_t *)(vd + H1_4(i)) = r;
3455 }
3456 } while (i & 63);
3457 } while (i != 0);
3458 }
3459
3460 void HELPER(sve_fmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3461 {
3462 do_fmla_zpzzz_s(env, vg, desc, 0, 0);
3463 }
3464
3465 void HELPER(sve_fmls_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3466 {
3467 do_fmla_zpzzz_s(env, vg, desc, 0x80000000, 0);
3468 }
3469
3470 void HELPER(sve_fnmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3471 {
3472 do_fmla_zpzzz_s(env, vg, desc, 0x80000000, 0x80000000);
3473 }
3474
3475 void HELPER(sve_fnmls_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3476 {
3477 do_fmla_zpzzz_s(env, vg, desc, 0, 0x80000000);
3478 }
3479
3480 static void do_fmla_zpzzz_d(CPUARMState *env, void *vg, uint32_t desc,
3481 uint64_t neg1, uint64_t neg3)
3482 {
3483 intptr_t i = simd_oprsz(desc);
3484 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3485 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3486 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3487 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3488 void *vd = &env->vfp.zregs[rd];
3489 void *vn = &env->vfp.zregs[rn];
3490 void *vm = &env->vfp.zregs[rm];
3491 void *va = &env->vfp.zregs[ra];
3492 uint64_t *g = vg;
3493
3494 do {
3495 uint64_t pg = g[(i - 1) >> 6];
3496 do {
3497 i -= 8;
3498 if (likely((pg >> (i & 63)) & 1)) {
3499 float64 e1, e2, e3, r;
3500
3501 e1 = *(uint64_t *)(vn + i) ^ neg1;
3502 e2 = *(uint64_t *)(vm + i);
3503 e3 = *(uint64_t *)(va + i) ^ neg3;
3504 r = float64_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
3505 *(uint64_t *)(vd + i) = r;
3506 }
3507 } while (i & 63);
3508 } while (i != 0);
3509 }
3510
3511 void HELPER(sve_fmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3512 {
3513 do_fmla_zpzzz_d(env, vg, desc, 0, 0);
3514 }
3515
3516 void HELPER(sve_fmls_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3517 {
3518 do_fmla_zpzzz_d(env, vg, desc, INT64_MIN, 0);
3519 }
3520
3521 void HELPER(sve_fnmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3522 {
3523 do_fmla_zpzzz_d(env, vg, desc, INT64_MIN, INT64_MIN);
3524 }
3525
3526 void HELPER(sve_fnmls_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3527 {
3528 do_fmla_zpzzz_d(env, vg, desc, 0, INT64_MIN);
3529 }
3530
3531 /* Two operand floating-point comparison controlled by a predicate.
3532 * Unlike the integer version, we are not allowed to optimistically
3533 * compare operands, since the comparison may have side effects wrt
3534 * the FPSR.
3535 */
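/* Results are accumulated one 64-bit predicate word at a time: OUT is shifted
 * left by sizeof(TYPE) as I counts down, so the comparison result for the
 * element at byte offset I lands in bit (I & 63) of the output word. */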
3536 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
3537 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
3538 void *status, uint32_t desc) \
3539 { \
3540 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
3541 uint64_t *d = vd, *g = vg; \
3542 do { \
3543 uint64_t out = 0, pg = g[j]; \
3544 do { \
3545 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3546 if (likely((pg >> (i & 63)) & 1)) { \
3547 TYPE nn = *(TYPE *)(vn + H(i)); \
3548 TYPE mm = *(TYPE *)(vm + H(i)); \
3549 out |= OP(TYPE, nn, mm, status); \
3550 } \
3551 } while (i & 63); \
3552 d[j--] = out; \
3553 } while (i > 0); \
3554 }
3555
3556 #define DO_FPCMP_PPZZ_H(NAME, OP) \
3557 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
3558 #define DO_FPCMP_PPZZ_S(NAME, OP) \
3559 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
3560 #define DO_FPCMP_PPZZ_D(NAME, OP) \
3561 DO_FPCMP_PPZZ(NAME##_d, float64, , OP)
3562
3563 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
3564 DO_FPCMP_PPZZ_H(NAME, OP) \
3565 DO_FPCMP_PPZZ_S(NAME, OP) \
3566 DO_FPCMP_PPZZ_D(NAME, OP)
3567
3568 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
3569 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
3570 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
3571 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
3572 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
3573 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
3574 #define DO_FCMUO(TYPE, X, Y, ST) \
3575 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
3576 #define DO_FACGE(TYPE, X, Y, ST) \
3577 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
3578 #define DO_FACGT(TYPE, X, Y, ST) \
3579 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
3580
3581 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
3582 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
3583 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
3584 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
3585 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
3586 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
3587 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
3588
3589 #undef DO_FPCMP_PPZZ_ALL
3590 #undef DO_FPCMP_PPZZ_D
3591 #undef DO_FPCMP_PPZZ_S
3592 #undef DO_FPCMP_PPZZ_H
3593 #undef DO_FPCMP_PPZZ
3594
3595 /* One operand floating-point comparison against zero, controlled
3596 * by a predicate.
3597 */
3598 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
3599 void HELPER(NAME)(void *vd, void *vn, void *vg, \
3600 void *status, uint32_t desc) \
3601 { \
3602 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
3603 uint64_t *d = vd, *g = vg; \
3604 do { \
3605 uint64_t out = 0, pg = g[j]; \
3606 do { \
3607 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3608 if ((pg >> (i & 63)) & 1) { \
3609 TYPE nn = *(TYPE *)(vn + H(i)); \
3610 out |= OP(TYPE, nn, 0, status); \
3611 } \
3612 } while (i & 63); \
3613 d[j--] = out; \
3614 } while (i > 0); \
3615 }
3616
3617 #define DO_FPCMP_PPZ0_H(NAME, OP) \
3618 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
3619 #define DO_FPCMP_PPZ0_S(NAME, OP) \
3620 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
3621 #define DO_FPCMP_PPZ0_D(NAME, OP) \
3622 DO_FPCMP_PPZ0(NAME##_d, float64, , OP)
3623
3624 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
3625 DO_FPCMP_PPZ0_H(NAME, OP) \
3626 DO_FPCMP_PPZ0_S(NAME, OP) \
3627 DO_FPCMP_PPZ0_D(NAME, OP)
3628
3629 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
3630 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
3631 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
3632 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
3633 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
3634 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
3635
3636 /* FP Trig Multiply-Add. */
3637
3638 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3639 {
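/* FTMAD coefficient table: the first eight entries are the sine polynomial
 * terms, the second eight the cosine terms; a negative multiplicand selects
 * the cosine half (xx += 8 below). */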
3640 static const float16 coeff[16] = {
3641 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3642 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3643 };
3644 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
3645 intptr_t x = simd_data(desc);
3646 float16 *d = vd, *n = vn, *m = vm;
3647 for (i = 0; i < opr_sz; i++) {
3648 float16 mm = m[i];
3649 intptr_t xx = x;
3650 if (float16_is_neg(mm)) {
3651 mm = float16_abs(mm);
3652 xx += 8;
3653 }
3654 d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
3655 }
3656 }
3657
3658 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3659 {
3660 static const float32 coeff[16] = {
3661 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
3662 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
3663 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
3664 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
3665 };
3666 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
3667 intptr_t x = simd_data(desc);
3668 float32 *d = vd, *n = vn, *m = vm;
3669 for (i = 0; i < opr_sz; i++) {
3670 float32 mm = m[i];
3671 intptr_t xx = x;
3672 if (float32_is_neg(mm)) {
3673 mm = float32_abs(mm);
3674 xx += 8;
3675 }
3676 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
3677 }
3678 }
3679
3680 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3681 {
3682 static const float64 coeff[16] = {
3683 0x3ff0000000000000ull, 0xbfc5555555555543ull,
3684 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
3685 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
3686 0x3de5d8408868552full, 0x0000000000000000ull,
3687 0x3ff0000000000000ull, 0xbfe0000000000000ull,
3688 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
3689 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
3690 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
3691 };
3692 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
3693 intptr_t x = simd_data(desc);
3694 float64 *d = vd, *n = vn, *m = vm;
3695 for (i = 0; i < opr_sz; i++) {
3696 float64 mm = m[i];
3697 intptr_t xx = x;
3698 if (float64_is_neg(mm)) {
3699 mm = float64_abs(mm);
3700 xx += 8;
3701 }
3702 d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
3703 }
3704 }
3705
3706 /*
3707 * FP Complex Add
3708 */
3709
3710 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
3711 void *vs, uint32_t desc)
3712 {
3713 intptr_t j, i = simd_oprsz(desc);
3714 uint64_t *g = vg;
3715 float16 neg_imag = float16_set_sign(0, simd_data(desc));
3716 float16 neg_real = float16_chs(neg_imag);
3717
3718 do {
3719 uint64_t pg = g[(i - 1) >> 6];
3720 do {
3721 float16 e0, e1, e2, e3;
3722
3723 /* I holds the real index; J holds the imag index. */
3724 j = i - sizeof(float16);
3725 i -= 2 * sizeof(float16);
3726
3727 e0 = *(float16 *)(vn + H1_2(i));
3728 e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
3729 e2 = *(float16 *)(vn + H1_2(j));
3730 e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
3731
3732 if (likely((pg >> (i & 63)) & 1)) {
3733 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
3734 }
3735 if (likely((pg >> (j & 63)) & 1)) {
3736 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
3737 }
3738 } while (i & 63);
3739 } while (i != 0);
3740 }
3741
3742 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
3743 void *vs, uint32_t desc)
3744 {
3745 intptr_t j, i = simd_oprsz(desc);
3746 uint64_t *g = vg;
3747 float32 neg_imag = float32_set_sign(0, simd_data(desc));
3748 float32 neg_real = float32_chs(neg_imag);
3749
3750 do {
3751 uint64_t pg = g[(i - 1) >> 6];
3752 do {
3753 float32 e0, e1, e2, e3;
3754
3755 /* I holds the real index; J holds the imag index. */
3756 j = i - sizeof(float32);
3757 i -= 2 * sizeof(float32);
3758
3759 e0 = *(float32 *)(vn + H1_4(i));
3760 e1 = *(float32 *)(vm + H1_4(j)) ^ neg_real;
3761 e2 = *(float32 *)(vn + H1_4(j));
3762 e3 = *(float32 *)(vm + H1_4(i)) ^ neg_imag;
3763
3764 if (likely((pg >> (i & 63)) & 1)) {
3765 *(float32 *)(vd + H1_4(i)) = float32_add(e0, e1, vs);
3766 }
3767 if (likely((pg >> (j & 63)) & 1)) {
3768 *(float32 *)(vd + H1_4(j)) = float32_add(e2, e3, vs);
3769 }
3770 } while (i & 63);
3771 } while (i != 0);
3772 }
3773
3774 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
3775 void *vs, uint32_t desc)
3776 {
3777 intptr_t j, i = simd_oprsz(desc);
3778 uint64_t *g = vg;
3779 float64 neg_imag = float64_set_sign(0, simd_data(desc));
3780 float64 neg_real = float64_chs(neg_imag);
3781
3782 do {
3783 uint64_t pg = g[(i - 1) >> 6];
3784 do {
3785 float64 e0, e1, e2, e3;
3786
3787 /* I holds the real index; J holds the imag index. */
3788 j = i - sizeof(float64);
3789 i -= 2 * sizeof(float64);
3790
3791 e0 = *(float64 *)(vn + i);
3792 e1 = *(float64 *)(vm + j) ^ neg_real;
3793 e2 = *(float64 *)(vn + j);
3794 e3 = *(float64 *)(vm + i) ^ neg_imag;
3795
3796 if (likely((pg >> (i & 63)) & 1)) {
3797 *(float64 *)(vd + i) = float64_add(e0, e1, vs);
3798 }
3799 if (likely((pg >> (j & 63)) & 1)) {
3800 *(float64 *)(vd + j) = float64_add(e2, e3, vs);
3801 }
3802 } while (i & 63);
3803 } while (i != 0);
3804 }
3805
3806 /*
3807 * FP Complex Multiply
3808 */
3809
3810 QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT + 22 > 32);
3811
3812 void HELPER(sve_fcmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3813 {
3814 intptr_t j, i = simd_oprsz(desc);
3815 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3816 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3817 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3818 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3819 unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
3820 bool flip = rot & 1;
3821 float16 neg_imag, neg_real;
3822 void *vd = &env->vfp.zregs[rd];
3823 void *vn = &env->vfp.zregs[rn];
3824 void *vm = &env->vfp.zregs[rm];
3825 void *va = &env->vfp.zregs[ra];
3826 uint64_t *g = vg;
3827
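/* ROT encodes the rotation in multiples of 90 degrees. Odd rotations (flip)
 * take the products from the imaginary element of Zn and the swapped elements
 * of Zm; neg_real/neg_imag carry the sign folded into the terms accumulated
 * into the real and imaginary parts respectively. */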
3828 neg_imag = float16_set_sign(0, (rot & 2) != 0);
3829 neg_real = float16_set_sign(0, rot == 1 || rot == 2);
3830
3831 do {
3832 uint64_t pg = g[(i - 1) >> 6];
3833 do {
3834 float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
3835
3836 /* I holds the real index; J holds the imag index. */
3837 j = i - sizeof(float16);
3838 i -= 2 * sizeof(float16);
3839
3840 nr = *(float16 *)(vn + H1_2(i));
3841 ni = *(float16 *)(vn + H1_2(j));
3842 mr = *(float16 *)(vm + H1_2(i));
3843 mi = *(float16 *)(vm + H1_2(j));
3844
3845 e2 = (flip ? ni : nr);
3846 e1 = (flip ? mi : mr) ^ neg_real;
3847 e4 = e2;
3848 e3 = (flip ? mr : mi) ^ neg_imag;
3849
3850 if (likely((pg >> (i & 63)) & 1)) {
3851 d = *(float16 *)(va + H1_2(i));
3852 d = float16_muladd(e2, e1, d, 0, &env->vfp.fp_status_f16);
3853 *(float16 *)(vd + H1_2(i)) = d;
3854 }
3855 if (likely((pg >> (j & 63)) & 1)) {
3856 d = *(float16 *)(va + H1_2(j));
3857 d = float16_muladd(e4, e3, d, 0, &env->vfp.fp_status_f16);
3858 *(float16 *)(vd + H1_2(j)) = d;
3859 }
3860 } while (i & 63);
3861 } while (i != 0);
3862 }
3863
3864 void HELPER(sve_fcmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3865 {
3866 intptr_t j, i = simd_oprsz(desc);
3867 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3868 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3869 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3870 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3871 unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
3872 bool flip = rot & 1;
3873 float32 neg_imag, neg_real;
3874 void *vd = &env->vfp.zregs[rd];
3875 void *vn = &env->vfp.zregs[rn];
3876 void *vm = &env->vfp.zregs[rm];
3877 void *va = &env->vfp.zregs[ra];
3878 uint64_t *g = vg;
3879
3880 neg_imag = float32_set_sign(0, (rot & 2) != 0);
3881 neg_real = float32_set_sign(0, rot == 1 || rot == 2);
3882
3883 do {
3884 uint64_t pg = g[(i - 1) >> 6];
3885 do {
3886 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
3887
3888 /* I holds the real index; J holds the imag index. */
3889 j = i - sizeof(float32);
3890 i -= 2 * sizeof(float32);
3891
3892 nr = *(float32 *)(vn + H1_4(i));
3893 ni = *(float32 *)(vn + H1_4(j));
3894 mr = *(float32 *)(vm + H1_4(i));
3895 mi = *(float32 *)(vm + H1_4(j));
3896
3897 e2 = (flip ? ni : nr);
3898 e1 = (flip ? mi : mr) ^ neg_real;
3899 e4 = e2;
3900 e3 = (flip ? mr : mi) ^ neg_imag;
3901
3902 if (likely((pg >> (i & 63)) & 1)) {
3903 d = *(float32 *)(va + H1_4(i));
3904 d = float32_muladd(e2, e1, d, 0, &env->vfp.fp_status);
3905 *(float32 *)(vd + H1_4(i)) = d;
3906 }
3907 if (likely((pg >> (j & 63)) & 1)) {
3908 d = *(float32 *)(va + H1_4(j));
3909 d = float32_muladd(e4, e3, d, 0, &env->vfp.fp_status);
3910 *(float32 *)(vd + H1_4(j)) = d;
3911 }
3912 } while (i & 63);
3913 } while (i != 0);
3914 }
3915
3916 void HELPER(sve_fcmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3917 {
3918 intptr_t j, i = simd_oprsz(desc);
3919 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3920 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3921 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3922 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3923 unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
3924 bool flip = rot & 1;
3925 float64 neg_imag, neg_real;
3926 void *vd = &env->vfp.zregs[rd];
3927 void *vn = &env->vfp.zregs[rn];
3928 void *vm = &env->vfp.zregs[rm];
3929 void *va = &env->vfp.zregs[ra];
3930 uint64_t *g = vg;
3931
3932 neg_imag = float64_set_sign(0, (rot & 2) != 0);
3933 neg_real = float64_set_sign(0, rot == 1 || rot == 2);
3934
3935 do {
3936 uint64_t pg = g[(i - 1) >> 6];
3937 do {
3938 float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
3939
3940 /* I holds the real index; J holds the imag index. */
3941 j = i - sizeof(float64);
3942 i -= 2 * sizeof(float64);
3943
3944 nr = *(float64 *)(vn + i);
3945 ni = *(float64 *)(vn + j);
3946 mr = *(float64 *)(vm + i);
3947 mi = *(float64 *)(vm + j);
3948
3949 e2 = (flip ? ni : nr);
3950 e1 = (flip ? mi : mr) ^ neg_real;
3951 e4 = e2;
3952 e3 = (flip ? mr : mi) ^ neg_imag;
3953
3954 if (likely((pg >> (i & 63)) & 1)) {
3955 d = *(float64 *)(va + i);
3956 d = float64_muladd(e2, e1, d, 0, &env->vfp.fp_status);
3957 *(float64 *)(vd + i) = d;
3958 }
3959 if (likely((pg >> (j & 63)) & 1)) {
3960 d = *(float64 *)(va + j);
3961 d = float64_muladd(e4, e3, d, 0, &env->vfp.fp_status);
3962 *(float64 *)(vd + j) = d;
3963 }
3964 } while (i & 63);
3965 } while (i != 0);
3966 }
3967
3968 /*
3969 * Load contiguous data, protected by a governing predicate.
3970 */
3971
3972 /*
3973 * Load elements into @vd, controlled by @vg, from @host + @mem_ofs.
3974 * Memory is valid through @host + @mem_max. The register element
3975 * indices are inferred from @mem_ofs, as modified by the types for
3976 * which the helper is built. Return the @mem_ofs of the first element
3977 * not loaded (which is @mem_max if they are all loaded).
3978 *
3979 * For softmmu, we have fully validated the guest page. For user-only,
3980 * we cannot fully validate without taking the mmap lock, but since we
3981 * know the access is within one host page, if any access is valid they
3982 * all must be valid. However, when @vg is all false, it may be that
3983 * no access is valid.
3984 */
3985 typedef intptr_t sve_ld1_host_fn(void *vd, void *vg, void *host,
3986 intptr_t mem_ofs, intptr_t mem_max);
3987
3988 /*
3989 * Load one element into @vd + @reg_off from (@env, @vaddr, @ra).
3990 * The controlling predicate is known to be true.
3991 */
3992 typedef void sve_ld1_tlb_fn(CPUARMState *env, void *vd, intptr_t reg_off,
3993 target_ulong vaddr, int mmu_idx, uintptr_t ra);
3994 typedef sve_ld1_tlb_fn sve_st1_tlb_fn;
3995
3996 /*
3997 * Generate the above primitives.
3998 */
3999
4000 #define DO_LD_HOST(NAME, H, TYPEE, TYPEM, HOST) \
4001 static intptr_t sve_##NAME##_host(void *vd, void *vg, void *host, \
4002 intptr_t mem_off, const intptr_t mem_max) \
4003 { \
4004 intptr_t reg_off = mem_off * (sizeof(TYPEE) / sizeof(TYPEM)); \
4005 uint64_t *pg = vg; \
4006 while (mem_off + sizeof(TYPEM) <= mem_max) { \
4007 TYPEM val = 0; \
4008 if (likely((pg[reg_off >> 6] >> (reg_off & 63)) & 1)) { \
4009 val = HOST(host + mem_off); \
4010 } \
4011 *(TYPEE *)(vd + H(reg_off)) = val; \
4012 mem_off += sizeof(TYPEM), reg_off += sizeof(TYPEE); \
4013 } \
4014 return mem_off; \
4015 }
4016
4017 #ifdef CONFIG_SOFTMMU
4018 #define DO_LD_TLB(NAME, H, TYPEE, TYPEM, HOST, MOEND, TLB) \
4019 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
4020 target_ulong addr, int mmu_idx, uintptr_t ra) \
4021 { \
4022 TCGMemOpIdx oi = make_memop_idx(ctz32(sizeof(TYPEM)) | MOEND, mmu_idx); \
4023 TYPEM val = TLB(env, addr, oi, ra); \
4024 *(TYPEE *)(vd + H(reg_off)) = val; \
4025 }
4026 #else
4027 #define DO_LD_TLB(NAME, H, TYPEE, TYPEM, HOST, MOEND, TLB) \
4028 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
4029 target_ulong addr, int mmu_idx, uintptr_t ra) \
4030 { \
4031 TYPEM val = HOST(g2h(addr)); \
4032 *(TYPEE *)(vd + H(reg_off)) = val; \
4033 }
4034 #endif
4035
4036 #define DO_LD_PRIM_1(NAME, H, TE, TM) \
4037 DO_LD_HOST(NAME, H, TE, TM, ldub_p) \
4038 DO_LD_TLB(NAME, H, TE, TM, ldub_p, 0, helper_ret_ldub_mmu)
4039
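/* Naming: ld1<m><e>[u|s] loads memory elements of size <m> into vector elements
 * of size <e> (b, h, s, d), zero- (u) or sign- (s) extending when the sizes
 * differ, e.g. ld1bhu loads bytes zero-extended into halfword elements. */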
4040 DO_LD_PRIM_1(ld1bb, H1, uint8_t, uint8_t)
4041 DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t)
4042 DO_LD_PRIM_1(ld1bhs, H1_2, uint16_t, int8_t)
4043 DO_LD_PRIM_1(ld1bsu, H1_4, uint32_t, uint8_t)
4044 DO_LD_PRIM_1(ld1bss, H1_4, uint32_t, int8_t)
4045 DO_LD_PRIM_1(ld1bdu, , uint64_t, uint8_t)
4046 DO_LD_PRIM_1(ld1bds, , uint64_t, int8_t)
4047
4048 #define DO_LD_PRIM_2(NAME, end, MOEND, H, TE, TM, PH, PT) \
4049 DO_LD_HOST(NAME##_##end, H, TE, TM, PH##_##end##_p) \
4050 DO_LD_TLB(NAME##_##end, H, TE, TM, PH##_##end##_p, \
4051 MOEND, helper_##end##_##PT##_mmu)
4052
4053 DO_LD_PRIM_2(ld1hh, le, MO_LE, H1_2, uint16_t, uint16_t, lduw, lduw)
4054 DO_LD_PRIM_2(ld1hsu, le, MO_LE, H1_4, uint32_t, uint16_t, lduw, lduw)
4055 DO_LD_PRIM_2(ld1hss, le, MO_LE, H1_4, uint32_t, int16_t, lduw, lduw)
4056 DO_LD_PRIM_2(ld1hdu, le, MO_LE, , uint64_t, uint16_t, lduw, lduw)
4057 DO_LD_PRIM_2(ld1hds, le, MO_LE, , uint64_t, int16_t, lduw, lduw)
4058
4059 DO_LD_PRIM_2(ld1ss, le, MO_LE, H1_4, uint32_t, uint32_t, ldl, ldul)
4060 DO_LD_PRIM_2(ld1sdu, le, MO_LE, , uint64_t, uint32_t, ldl, ldul)
4061 DO_LD_PRIM_2(ld1sds, le, MO_LE, , uint64_t, int32_t, ldl, ldul)
4062
4063 DO_LD_PRIM_2(ld1dd, le, MO_LE, , uint64_t, uint64_t, ldq, ldq)
4064
4065 DO_LD_PRIM_2(ld1hh, be, MO_BE, H1_2, uint16_t, uint16_t, lduw, lduw)
4066 DO_LD_PRIM_2(ld1hsu, be, MO_BE, H1_4, uint32_t, uint16_t, lduw, lduw)
4067 DO_LD_PRIM_2(ld1hss, be, MO_BE, H1_4, uint32_t, int16_t, lduw, lduw)
4068 DO_LD_PRIM_2(ld1hdu, be, MO_BE, , uint64_t, uint16_t, lduw, lduw)
4069 DO_LD_PRIM_2(ld1hds, be, MO_BE, , uint64_t, int16_t, lduw, lduw)
4070
4071 DO_LD_PRIM_2(ld1ss, be, MO_BE, H1_4, uint32_t, uint32_t, ldl, ldul)
4072 DO_LD_PRIM_2(ld1sdu, be, MO_BE, , uint64_t, uint32_t, ldl, ldul)
4073 DO_LD_PRIM_2(ld1sds, be, MO_BE, , uint64_t, int32_t, ldl, ldul)
4074
4075 DO_LD_PRIM_2(ld1dd, be, MO_BE, , uint64_t, uint64_t, ldq, ldq)
4076
4077 #undef DO_LD_TLB
4078 #undef DO_LD_HOST
4079 #undef DO_LD_PRIM_1
4080 #undef DO_LD_PRIM_2
4081
4082 /*
4083 * Skip through a sequence of inactive elements in the guarding predicate @vg,
4084 * beginning at @reg_off bounded by @reg_max. Return the offset of the first
4085 * active element >= @reg_off, or @reg_max if there were no active elements at all.
4086 */
4087 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
4088 intptr_t reg_max, int esz)
4089 {
4090 uint64_t pg_mask = pred_esz_masks[esz];
4091 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
4092
4093 /* In normal usage, the first element is active. */
4094 if (likely(pg & 1)) {
4095 return reg_off;
4096 }
4097
4098 if (pg == 0) {
4099 reg_off &= -64;
4100 do {
4101 reg_off += 64;
4102 if (unlikely(reg_off >= reg_max)) {
4103 /* The entire predicate was false. */
4104 return reg_max;
4105 }
4106 pg = vg[reg_off >> 6] & pg_mask;
4107 } while (pg == 0);
4108 }
4109 reg_off += ctz64(pg);
4110
4111 /* We should never see an out of range predicate bit set. */
4112 tcg_debug_assert(reg_off < reg_max);
4113 return reg_off;
4114 }
4115
4116 /*
4117 * Return the maximum offset <= @mem_max which is still within the page
4118 * referenced by @base + @mem_off.
4119 */
4120 static intptr_t max_for_page(target_ulong base, intptr_t mem_off,
4121 intptr_t mem_max)
4122 {
4123 target_ulong addr = base + mem_off;
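/* ADDR | TARGET_PAGE_MASK has every bit above the in-page offset set, so its
 * negation (as a signed value) is the number of bytes from ADDR to the end of
 * its page. */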
4124 intptr_t split = -(intptr_t)(addr | TARGET_PAGE_MASK);
4125 return MIN(split, mem_max - mem_off) + mem_off;
4126 }
4127
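/* For user-only, record the helper return address so that a signal taken during
 * the guest access can be unwound to the correct guest insn; softmmu does not
 * need this. */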
4128 static inline void set_helper_retaddr(uintptr_t ra)
4129 {
4130 #ifdef CONFIG_USER_ONLY
4131 helper_retaddr = ra;
4132 #endif
4133 }
4134
4135 /*
4136 * The result of tlb_vaddr_to_host for user-only is just g2h(x),
4137 * which is always non-null. Elide the useless test.
4138 */
4139 static inline bool test_host_page(void *host)
4140 {
4141 #ifdef CONFIG_USER_ONLY
4142 return true;
4143 #else
4144 return likely(host != NULL);
4145 #endif
4146 }
4147
4148 /*
4149 * Common helper for all contiguous one-register predicated loads.
4150 */
4151 static void sve_ld1_r(CPUARMState *env, void *vg, const target_ulong addr,
4152 uint32_t desc, const uintptr_t retaddr,
4153 const int esz, const int msz,
4154 sve_ld1_host_fn *host_fn,
4155 sve_ld1_tlb_fn *tlb_fn)
4156 {
4157 void *vd = &env->vfp.zregs[simd_data(desc)];
4158 const int diffsz = esz - msz;
4159 const intptr_t reg_max = simd_oprsz(desc);
4160 const intptr_t mem_max = reg_max >> diffsz;
4161 const int mmu_idx = cpu_mmu_index(env, false);
4162 ARMVectorReg scratch;
4163 void *host;
4164 intptr_t split, reg_off, mem_off;
4165
4166 /* Find the first active element. */
4167 reg_off = find_next_active(vg, 0, reg_max, esz);
4168 if (unlikely(reg_off == reg_max)) {
4169 /* The entire predicate was false; no load occurs. */
4170 memset(vd, 0, reg_max);
4171 return;
4172 }
4173 mem_off = reg_off >> diffsz;
4174 set_helper_retaddr(retaddr);
4175
4176 /*
4177 * If the (remaining) load is entirely within a single page, then:
4178 * For softmmu, if the tlb hits, then no faults will occur;
4179 * For user-only, either the first load will fault or none will.
4180 * We can thus perform the load directly to the destination and
4181 * Vd will be unmodified on any exception path.
4182 */
4183 split = max_for_page(addr, mem_off, mem_max);
4184 if (likely(split == mem_max)) {
4185 host = tlb_vaddr_to_host(env, addr + mem_off, MMU_DATA_LOAD, mmu_idx);
4186 if (test_host_page(host)) {
4187 mem_off = host_fn(vd, vg, host - mem_off, mem_off, mem_max);
4188 tcg_debug_assert(mem_off == mem_max);
4189 set_helper_retaddr(0);
4190 /* After having taken any fault, zero leading inactive elements. */
4191 swap_memzero(vd, reg_off);
4192 return;
4193 }
4194 }
4195
4196 /*
4197 * Perform the predicated read into a temporary, thus ensuring
4198 * if the load of the last element faults, Vd is not modified.
4199 */
4200 #ifdef CONFIG_USER_ONLY
4201 swap_memzero(&scratch, reg_off);
4202 host_fn(&scratch, vg, g2h(addr), mem_off, mem_max);
4203 #else
4204 memset(&scratch, 0, reg_max);
4205 goto start;
4206 while (1) {
4207 reg_off = find_next_active(vg, reg_off, reg_max, esz);
4208 if (reg_off >= reg_max) {
4209 break;
4210 }
4211 mem_off = reg_off >> diffsz;
4212 split = max_for_page(addr, mem_off, mem_max);
4213
4214 start:
4215 if (split - mem_off >= (1 << msz)) {
4216 /* At least one whole element on this page. */
4217 host = tlb_vaddr_to_host(env, addr + mem_off,
4218 MMU_DATA_LOAD, mmu_idx);
4219 if (host) {
4220 mem_off = host_fn(&scratch, vg, host - mem_off,
4221 mem_off, split);
4222 reg_off = mem_off << diffsz;
4223 continue;
4224 }
4225 }
4226
4227 /*
4228 * Perform one normal read. This may fault, longjmping out to the
4229 * main loop in order to raise an exception. It may succeed, and
4230 * as a side-effect load the TLB entry for the next round. Finally,
4231 * in the extremely unlikely case we're performing this operation
4232 * on I/O memory, it may succeed but not bring in the TLB entry.
4233 * But even then we have still made forward progress.
4234 */
4235 tlb_fn(env, &scratch, reg_off, addr + mem_off, mmu_idx, retaddr);
4236 reg_off += 1 << esz;
4237 }
4238 #endif
4239
4240 set_helper_retaddr(0);
4241 memcpy(vd, &scratch, reg_max);
4242 }
4243
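/* ESZ and MSZ are log2 of the vector element size and of the memory access
 * size respectively. */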
4244 #define DO_LD1_1(NAME, ESZ) \
4245 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \
4246 target_ulong addr, uint32_t desc) \
4247 { \
4248 sve_ld1_r(env, vg, addr, desc, GETPC(), ESZ, 0, \
4249 sve_##NAME##_host, sve_##NAME##_tlb); \
4250 }
4251
4252 #define DO_LD1_2(NAME, ESZ, MSZ) \
4253 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \
4254 target_ulong addr, uint32_t desc) \
4255 { \
4256 sve_ld1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, \
4257 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
4258 } \
4259 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \
4260 target_ulong addr, uint32_t desc) \
4261 { \
4262 sve_ld1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, \
4263 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
4264 }
4265
4266 DO_LD1_1(ld1bb, 0)
4267 DO_LD1_1(ld1bhu, 1)
4268 DO_LD1_1(ld1bhs, 1)
4269 DO_LD1_1(ld1bsu, 2)
4270 DO_LD1_1(ld1bss, 2)
4271 DO_LD1_1(ld1bdu, 3)
4272 DO_LD1_1(ld1bds, 3)
4273
4274 DO_LD1_2(ld1hh, 1, 1)
4275 DO_LD1_2(ld1hsu, 2, 1)
4276 DO_LD1_2(ld1hss, 2, 1)
4277 DO_LD1_2(ld1hdu, 3, 1)
4278 DO_LD1_2(ld1hds, 3, 1)
4279
4280 DO_LD1_2(ld1ss, 2, 2)
4281 DO_LD1_2(ld1sdu, 3, 2)
4282 DO_LD1_2(ld1sds, 3, 2)
4283
4284 DO_LD1_2(ld1dd, 3, 3)
4285
4286 #undef DO_LD1_1
4287 #undef DO_LD1_2
4288
4289 /*
4290 * Common helpers for all contiguous 2,3,4-register predicated loads.
4291 */
4292 static void sve_ld2_r(CPUARMState *env, void *vg, target_ulong addr,
4293 uint32_t desc, int size, uintptr_t ra,
4294 sve_ld1_tlb_fn *tlb_fn)
4295 {
4296 const int mmu_idx = cpu_mmu_index(env, false);
4297 intptr_t i, oprsz = simd_oprsz(desc);
4298 unsigned rd = simd_data(desc);
4299 ARMVectorReg scratch[2] = { };
4300
4301 set_helper_retaddr(ra);
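/* One predicate bit governs each vector byte; each outer iteration consumes the
 * 16 bits covering the next 16 bytes of the registers, testing only the bit for
 * an element's first byte. */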
4302 for (i = 0; i < oprsz; ) {
4303 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4304 do {
4305 if (pg & 1) {
4306 tlb_fn(env, &scratch[0], i, addr, mmu_idx, ra);
4307 tlb_fn(env, &scratch[1], i, addr + size, mmu_idx, ra);
4308 }
4309 i += size, pg >>= size;
4310 addr += 2 * size;
4311 } while (i & 15);
4312 }
4313 set_helper_retaddr(0);
4314
4315 /* Wait until all exceptions have been raised to write back. */
4316 memcpy(&env->vfp.zregs[rd], &scratch[0], oprsz);
4317 memcpy(&env->vfp.zregs[(rd + 1) & 31], &scratch[1], oprsz);
4318 }
4319
4320 static void sve_ld3_r(CPUARMState *env, void *vg, target_ulong addr,
4321 uint32_t desc, int size, uintptr_t ra,
4322 sve_ld1_tlb_fn *tlb_fn)
4323 {
4324 const int mmu_idx = cpu_mmu_index(env, false);
4325 intptr_t i, oprsz = simd_oprsz(desc);
4326 unsigned rd = simd_data(desc);
4327 ARMVectorReg scratch[3] = { };
4328
4329 set_helper_retaddr(ra);
4330 for (i = 0; i < oprsz; ) {
4331 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4332 do {
4333 if (pg & 1) {
4334 tlb_fn(env, &scratch[0], i, addr, mmu_idx, ra);
4335 tlb_fn(env, &scratch[1], i, addr + size, mmu_idx, ra);
4336 tlb_fn(env, &scratch[2], i, addr + 2 * size, mmu_idx, ra);
4337 }
4338 i += size, pg >>= size;
4339 addr += 3 * size;
4340 } while (i & 15);
4341 }
4342 set_helper_retaddr(0);
4343
4344 /* Wait until all exceptions have been raised to write back. */
4345 memcpy(&env->vfp.zregs[rd], &scratch[0], oprsz);
4346 memcpy(&env->vfp.zregs[(rd + 1) & 31], &scratch[1], oprsz);
4347 memcpy(&env->vfp.zregs[(rd + 2) & 31], &scratch[2], oprsz);
4348 }
4349
4350 static void sve_ld4_r(CPUARMState *env, void *vg, target_ulong addr,
4351 uint32_t desc, int size, uintptr_t ra,
4352 sve_ld1_tlb_fn *tlb_fn)
4353 {
4354 const int mmu_idx = cpu_mmu_index(env, false);
4355 intptr_t i, oprsz = simd_oprsz(desc);
4356 unsigned rd = simd_data(desc);
4357 ARMVectorReg scratch[4] = { };
4358
4359 set_helper_retaddr(ra);
4360 for (i = 0; i < oprsz; ) {
4361 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4362 do {
4363 if (pg & 1) {
4364 tlb_fn(env, &scratch[0], i, addr, mmu_idx, ra);
4365 tlb_fn(env, &scratch[1], i, addr + size, mmu_idx, ra);
4366 tlb_fn(env, &scratch[2], i, addr + 2 * size, mmu_idx, ra);
4367 tlb_fn(env, &scratch[3], i, addr + 3 * size, mmu_idx, ra);
4368 }
4369 i += size, pg >>= size;
4370 addr += 4 * size;
4371 } while (i & 15);
4372 }
4373 set_helper_retaddr(0);
4374
4375 /* Wait until all exceptions have been raised to write back. */
4376 memcpy(&env->vfp.zregs[rd], &scratch[0], oprsz);
4377 memcpy(&env->vfp.zregs[(rd + 1) & 31], &scratch[1], oprsz);
4378 memcpy(&env->vfp.zregs[(rd + 2) & 31], &scratch[2], oprsz);
4379 memcpy(&env->vfp.zregs[(rd + 3) & 31], &scratch[3], oprsz);
4380 }
4381
4382 #define DO_LDN_1(N) \
4383 void __attribute__((flatten)) HELPER(sve_ld##N##bb_r) \
4384 (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
4385 { \
4386 sve_ld##N##_r(env, vg, addr, desc, 1, GETPC(), sve_ld1bb_tlb); \
4387 }
4388
4389 #define DO_LDN_2(N, SUFF, SIZE) \
4390 void __attribute__((flatten)) HELPER(sve_ld##N##SUFF##_le_r) \
4391 (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
4392 { \
4393 sve_ld##N##_r(env, vg, addr, desc, SIZE, GETPC(), \
4394 sve_ld1##SUFF##_le_tlb); \
4395 } \
4396 void __attribute__((flatten)) HELPER(sve_ld##N##SUFF##_be_r) \
4397 (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
4398 { \
4399 sve_ld##N##_r(env, vg, addr, desc, SIZE, GETPC(), \
4400 sve_ld1##SUFF##_be_tlb); \
4401 }
4402
4403 DO_LDN_1(2)
4404 DO_LDN_1(3)
4405 DO_LDN_1(4)
4406
4407 DO_LDN_2(2, hh, 2)
4408 DO_LDN_2(3, hh, 2)
4409 DO_LDN_2(4, hh, 2)
4410
4411 DO_LDN_2(2, ss, 4)
4412 DO_LDN_2(3, ss, 4)
4413 DO_LDN_2(4, ss, 4)
4414
4415 DO_LDN_2(2, dd, 8)
4416 DO_LDN_2(3, dd, 8)
4417 DO_LDN_2(4, dd, 8)
4418
4419 #undef DO_LDN_1
4420 #undef DO_LDN_2
4421
4422 /*
4423 * Load contiguous data, first-fault and no-fault.
4424 *
4425 * For user-only, one could argue that we should hold the mmap_lock during
4426 * the operation so that there is no race between page_check_range and the
4427 * load operation. However, unmapping pages out from under a running thread
4428 * is extraordinarily unlikely. This theoretical race condition also affects
4429 * linux-user/ in its get_user/put_user macros.
4430 *
4431 * TODO: Construct some helpers, written in assembly, that interact with
4432 * handle_cpu_signal to produce memory ops which can properly report errors
4433 * without racing.
4434 */
4435
4436 /* Fault on byte I. All bits in FFR from I are cleared. The vector
4437 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
4438 * option, which leaves subsequent data unchanged.
4439 */
4440 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
4441 {
4442 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
4443
4444 if (i & 63) {
4445 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
4446 i = ROUND_UP(i, 64);
4447 }
4448 for (; i < oprsz; i += 64) {
4449 ffr[i / 64] = 0;
4450 }
4451 }
4452
4453 /*
4454 * Common helper for all contiguous first-fault loads.
4455 */
4456 static void sve_ldff1_r(CPUARMState *env, void *vg, const target_ulong addr,
4457 uint32_t desc, const uintptr_t retaddr,
4458 const int esz, const int msz,
4459 sve_ld1_host_fn *host_fn,
4460 sve_ld1_tlb_fn *tlb_fn)
4461 {
4462 void *vd = &env->vfp.zregs[simd_data(desc)];
4463 const int diffsz = esz - msz;
4464 const intptr_t reg_max = simd_oprsz(desc);
4465 const intptr_t mem_max = reg_max >> diffsz;
4466 const int mmu_idx = cpu_mmu_index(env, false);
4467 intptr_t split, reg_off, mem_off;
4468 void *host;
4469
4470 /* Skip to the first active element. */
4471 reg_off = find_next_active(vg, 0, reg_max, esz);
4472 if (unlikely(reg_off == reg_max)) {
4473 /* The entire predicate was false; no load occurs. */
4474 memset(vd, 0, reg_max);
4475 return;
4476 }
4477 mem_off = reg_off >> diffsz;
4478 set_helper_retaddr(retaddr);
4479
4480 /*
4481 * If the (remaining) load is entirely within a single page, then:
4482 * For softmmu, if the tlb hits, then no faults will occur;
4483 * For user-only, either the first load will fault or none will.
4484 * We can thus perform the load directly to the destination and
4485 * Vd will be unmodified on any exception path.
4486 */
4487 split = max_for_page(addr, mem_off, mem_max);
4488 if (likely(split == mem_max)) {
4489 host = tlb_vaddr_to_host(env, addr + mem_off, MMU_DATA_LOAD, mmu_idx);
4490 if (test_host_page(host)) {
4491 mem_off = host_fn(vd, vg, host - mem_off, mem_off, mem_max);
4492 tcg_debug_assert(mem_off == mem_max);
4493 set_helper_retaddr(0);
4494 /* After any fault, zero any leading inactive elements. */
4495 swap_memzero(vd, reg_off);
4496 return;
4497 }
4498 }
4499
4500 #ifdef CONFIG_USER_ONLY
4501 /*
4502 * The page(s) containing this first element at ADDR+MEM_OFF must
4503 * be valid. Considering that this first element may be misaligned
4504 * and cross a page boundary itself, take the rest of the page from
4505 * the last byte of the element.
4506 */
4507 split = max_for_page(addr, mem_off + (1 << msz) - 1, mem_max);
4508 mem_off = host_fn(vd, vg, g2h(addr), mem_off, split);
4509
4510 /* After any fault, zero any leading inactive elements. */
4511 swap_memzero(vd, reg_off);
4512 reg_off = mem_off << diffsz;
4513 #else
4514 /*
4515 * Perform one normal read, which will fault or not.
4516 * But it is likely to bring the page into the tlb.
4517 */
4518 tlb_fn(env, vd, reg_off, addr + mem_off, mmu_idx, retaddr);
4519
4520 /* After any fault, zero any leading predicated false elts. */
4521 swap_memzero(vd, reg_off);
4522 mem_off += 1 << msz;
4523 reg_off += 1 << esz;
4524
4525 /* Try again to read the balance of the page. */
4526 split = max_for_page(addr, mem_off - 1, mem_max);
4527 if (split >= (1 << msz)) {
4528 host = tlb_vaddr_to_host(env, addr + mem_off, MMU_DATA_LOAD, mmu_idx);
4529 if (host) {
4530 mem_off = host_fn(vd, vg, host - mem_off, mem_off, split);
4531 reg_off = mem_off << diffsz;
4532 }
4533 }
4534 #endif
4535
4536 set_helper_retaddr(0);
4537 record_fault(env, reg_off, reg_max);
4538 }
4539
4540 /*
4541 * Common helper for all contiguous no-fault loads.
4542 */
4543 static void sve_ldnf1_r(CPUARMState *env, void *vg, const target_ulong addr,
4544 uint32_t desc, const int esz, const int msz,
4545 sve_ld1_host_fn *host_fn)
4546 {
4547 void *vd = &env->vfp.zregs[simd_data(desc)];
4548 const int diffsz = esz - msz;
4549 const intptr_t reg_max = simd_oprsz(desc);
4550 const intptr_t mem_max = reg_max >> diffsz;
4551 const int mmu_idx = cpu_mmu_index(env, false);
4552 intptr_t split, reg_off, mem_off;
4553 void *host;
4554
4555 #ifdef CONFIG_USER_ONLY
4556 host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD, mmu_idx);
4557 if (likely(page_check_range(addr, mem_max, PAGE_READ) == 0)) {
4558 /* The entire operation is valid and will not fault. */
4559 host_fn(vd, vg, host, 0, mem_max);
4560 return;
4561 }
4562 #endif
4563
4564 /* There will be no fault, so we may modify in advance. */
4565 memset(vd, 0, reg_max);
4566
4567 /* Skip to the first active element. */
4568 reg_off = find_next_active(vg, 0, reg_max, esz);
4569 if (unlikely(reg_off == reg_max)) {
4570 /* The entire predicate was false; no load occurs. */
4571 return;
4572 }
4573 mem_off = reg_off >> diffsz;
4574
4575 #ifdef CONFIG_USER_ONLY
4576 if (page_check_range(addr + mem_off, 1 << msz, PAGE_READ) == 0) {
4577 /* At least one load is valid; take the rest of the page. */
4578 split = max_for_page(addr, mem_off + (1 << msz) - 1, mem_max);
4579 mem_off = host_fn(vd, vg, host, mem_off, split);
4580 reg_off = mem_off << diffsz;
4581 }
4582 #else
4583 /*
4584 * If the address is not in the TLB, we have no way to bring the
4585 * entry into the TLB without also risking a fault. Note that
4586 * the corollary is that we never load from an address not in RAM.
4587 *
4588 * This last is out of spec, in a weird corner case.
4589 * Per the MemNF/MemSingleNF pseudocode, a NF load from Device memory
4590 * must not actually hit the bus -- it returns UNKNOWN data instead.
4591 * But if you map non-RAM with Normal memory attributes and do a NF
4592 * load then it should access the bus. (Nobody ought actually to do this
4593 * in the real world, obviously.)
4594 *
4595 * Then there are the annoying special cases with watchpoints...
4596 *
4597 * TODO: Add a form of tlb_fill that does not raise an exception,
4598 * with a form of tlb_vaddr_to_host and a set of loads to match.
4599 * The non_fault_vaddr_to_host would handle everything, usually,
4600 * and the loads would handle the iomem path for watchpoints.
4601 */
4602 host = tlb_vaddr_to_host(env, addr + mem_off, MMU_DATA_LOAD, mmu_idx);
4603 split = max_for_page(addr, mem_off, mem_max);
4604 if (host && split >= (1 << msz)) {
4605 mem_off = host_fn(vd, vg, host - mem_off, mem_off, split);
4606 reg_off = mem_off << diffsz;
4607 }
4608 #endif
4609
4610 record_fault(env, reg_off, reg_max);
4611 }
4612
4613 #define DO_LDFF1_LDNF1_1(PART, ESZ) \
4614 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \
4615 target_ulong addr, uint32_t desc) \
4616 { \
4617 sve_ldff1_r(env, vg, addr, desc, GETPC(), ESZ, 0, \
4618 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
4619 } \
4620 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \
4621 target_ulong addr, uint32_t desc) \
4622 { \
4623 sve_ldnf1_r(env, vg, addr, desc, ESZ, 0, sve_ld1##PART##_host); \
4624 }
4625
4626 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
4627 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \
4628 target_ulong addr, uint32_t desc) \
4629 { \
4630 sve_ldff1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, \
4631 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
4632 } \
4633 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \
4634 target_ulong addr, uint32_t desc) \
4635 { \
4636 sve_ldnf1_r(env, vg, addr, desc, ESZ, MSZ, sve_ld1##PART##_le_host); \
4637 } \
4638 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \
4639 target_ulong addr, uint32_t desc) \
4640 { \
4641 sve_ldff1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, \
4642 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
4643 } \
4644 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \
4645 target_ulong addr, uint32_t desc) \
4646 { \
4647 sve_ldnf1_r(env, vg, addr, desc, ESZ, MSZ, sve_ld1##PART##_be_host); \
4648 }
4649
4650 DO_LDFF1_LDNF1_1(bb, 0)
4651 DO_LDFF1_LDNF1_1(bhu, 1)
4652 DO_LDFF1_LDNF1_1(bhs, 1)
4653 DO_LDFF1_LDNF1_1(bsu, 2)
4654 DO_LDFF1_LDNF1_1(bss, 2)
4655 DO_LDFF1_LDNF1_1(bdu, 3)
4656 DO_LDFF1_LDNF1_1(bds, 3)
4657
4658 DO_LDFF1_LDNF1_2(hh, 1, 1)
4659 DO_LDFF1_LDNF1_2(hsu, 2, 1)
4660 DO_LDFF1_LDNF1_2(hss, 2, 1)
4661 DO_LDFF1_LDNF1_2(hdu, 3, 1)
4662 DO_LDFF1_LDNF1_2(hds, 3, 1)
4663
4664 DO_LDFF1_LDNF1_2(ss, 2, 2)
4665 DO_LDFF1_LDNF1_2(sdu, 3, 2)
4666 DO_LDFF1_LDNF1_2(sds, 3, 2)
4667
4668 DO_LDFF1_LDNF1_2(dd, 3, 3)
4669
4670 #undef DO_LDFF1_LDNF1_1
4671 #undef DO_LDFF1_LDNF1_2
4672
4673 /*
4674 * Store contiguous data, protected by a governing predicate.
4675 */
4676
4677 #ifdef CONFIG_SOFTMMU
4678 #define DO_ST_TLB(NAME, H, TYPEM, HOST, MOEND, TLB) \
4679 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
4680 target_ulong addr, int mmu_idx, uintptr_t ra) \
4681 { \
4682 TCGMemOpIdx oi = make_memop_idx(ctz32(sizeof(TYPEM)) | MOEND, mmu_idx); \
4683 TLB(env, addr, *(TYPEM *)(vd + H(reg_off)), oi, ra); \
4684 }
4685 #else
4686 #define DO_ST_TLB(NAME, H, TYPEM, HOST, MOEND, TLB) \
4687 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
4688 target_ulong addr, int mmu_idx, uintptr_t ra) \
4689 { \
4690 HOST(g2h(addr), *(TYPEM *)(vd + H(reg_off))); \
4691 }
4692 #endif
4693
4694 DO_ST_TLB(st1bb, H1, uint8_t, stb_p, 0, helper_ret_stb_mmu)
4695 DO_ST_TLB(st1bh, H1_2, uint16_t, stb_p, 0, helper_ret_stb_mmu)
4696 DO_ST_TLB(st1bs, H1_4, uint32_t, stb_p, 0, helper_ret_stb_mmu)
4697 DO_ST_TLB(st1bd, , uint64_t, stb_p, 0, helper_ret_stb_mmu)
4698
4699 DO_ST_TLB(st1hh_le, H1_2, uint16_t, stw_le_p, MO_LE, helper_le_stw_mmu)
4700 DO_ST_TLB(st1hs_le, H1_4, uint32_t, stw_le_p, MO_LE, helper_le_stw_mmu)
4701 DO_ST_TLB(st1hd_le, , uint64_t, stw_le_p, MO_LE, helper_le_stw_mmu)
4702
4703 DO_ST_TLB(st1ss_le, H1_4, uint32_t, stl_le_p, MO_LE, helper_le_stl_mmu)
4704 DO_ST_TLB(st1sd_le, , uint64_t, stl_le_p, MO_LE, helper_le_stl_mmu)
4705
4706 DO_ST_TLB(st1dd_le, , uint64_t, stq_le_p, MO_LE, helper_le_stq_mmu)
4707
4708 DO_ST_TLB(st1hh_be, H1_2, uint16_t, stw_be_p, MO_BE, helper_be_stw_mmu)
4709 DO_ST_TLB(st1hs_be, H1_4, uint32_t, stw_be_p, MO_BE, helper_be_stw_mmu)
4710 DO_ST_TLB(st1hd_be, , uint64_t, stw_be_p, MO_BE, helper_be_stw_mmu)
4711
4712 DO_ST_TLB(st1ss_be, H1_4, uint32_t, stl_be_p, MO_BE, helper_be_stl_mmu)
4713 DO_ST_TLB(st1sd_be, , uint64_t, stl_be_p, MO_BE, helper_be_stl_mmu)
4714
4715 DO_ST_TLB(st1dd_be, , uint64_t, stq_be_p, MO_BE, helper_be_stq_mmu)
4716
4717 #undef DO_ST_TLB
4718
4719 /*
4720 * Common helpers for all contiguous 1,2,3,4-register predicated stores.
4721 */
4722 static void sve_st1_r(CPUARMState *env, void *vg, target_ulong addr,
4723 uint32_t desc, const uintptr_t ra,
4724 const int esize, const int msize,
4725 sve_st1_tlb_fn *tlb_fn)
4726 {
4727 const int mmu_idx = cpu_mmu_index(env, false);
4728 intptr_t i, oprsz = simd_oprsz(desc);
4729 unsigned rd = simd_data(desc);
4730 void *vd = &env->vfp.zregs[rd];
4731
4732 set_helper_retaddr(ra);
4733 for (i = 0; i < oprsz; ) {
4734 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4735 do {
4736 if (pg & 1) {
4737 tlb_fn(env, vd, i, addr, mmu_idx, ra);
4738 }
4739 i += esize, pg >>= esize;
4740 addr += msize;
4741 } while (i & 15);
4742 }
4743 set_helper_retaddr(0);
4744 }
4745
4746 static void sve_st2_r(CPUARMState *env, void *vg, target_ulong addr,
4747 uint32_t desc, const uintptr_t ra,
4748 const int esize, const int msize,
4749 sve_st1_tlb_fn *tlb_fn)
4750 {
4751 const int mmu_idx = cpu_mmu_index(env, false);
4752 intptr_t i, oprsz = simd_oprsz(desc);
4753 unsigned rd = simd_data(desc);
4754 void *d1 = &env->vfp.zregs[rd];
4755 void *d2 = &env->vfp.zregs[(rd + 1) & 31];
4756
4757 set_helper_retaddr(ra);
4758 for (i = 0; i < oprsz; ) {
4759 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4760 do {
4761 if (pg & 1) {
4762 tlb_fn(env, d1, i, addr, mmu_idx, ra);
4763 tlb_fn(env, d2, i, addr + msize, mmu_idx, ra);
4764 }
4765 i += esize, pg >>= esize;
4766 addr += 2 * msize;
4767 } while (i & 15);
4768 }
4769 set_helper_retaddr(0);
4770 }
4771
4772 static void sve_st3_r(CPUARMState *env, void *vg, target_ulong addr,
4773 uint32_t desc, const uintptr_t ra,
4774 const int esize, const int msize,
4775 sve_st1_tlb_fn *tlb_fn)
4776 {
4777 const int mmu_idx = cpu_mmu_index(env, false);
4778 intptr_t i, oprsz = simd_oprsz(desc);
4779 unsigned rd = simd_data(desc);
4780 void *d1 = &env->vfp.zregs[rd];
4781 void *d2 = &env->vfp.zregs[(rd + 1) & 31];
4782 void *d3 = &env->vfp.zregs[(rd + 2) & 31];
4783
4784 set_helper_retaddr(ra);
4785 for (i = 0; i < oprsz; ) {
4786 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4787 do {
4788 if (pg & 1) {
4789 tlb_fn(env, d1, i, addr, mmu_idx, ra);
4790 tlb_fn(env, d2, i, addr + msize, mmu_idx, ra);
4791 tlb_fn(env, d3, i, addr + 2 * msize, mmu_idx, ra);
4792 }
4793 i += esize, pg >>= esize;
4794 addr += 3 * msize;
4795 } while (i & 15);
4796 }
4797 set_helper_retaddr(0);
4798 }
4799
4800 static void sve_st4_r(CPUARMState *env, void *vg, target_ulong addr,
4801 uint32_t desc, const uintptr_t ra,
4802 const int esize, const int msize,
4803 sve_st1_tlb_fn *tlb_fn)
4804 {
4805 const int mmu_idx = cpu_mmu_index(env, false);
4806 intptr_t i, oprsz = simd_oprsz(desc);
4807 unsigned rd = simd_data(desc);
4808 void *d1 = &env->vfp.zregs[rd];
4809 void *d2 = &env->vfp.zregs[(rd + 1) & 31];
4810 void *d3 = &env->vfp.zregs[(rd + 2) & 31];
4811 void *d4 = &env->vfp.zregs[(rd + 3) & 31];
4812
4813 set_helper_retaddr(ra);
4814 for (i = 0; i < oprsz; ) {
4815 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4816 do {
4817 if (pg & 1) {
4818 tlb_fn(env, d1, i, addr, mmu_idx, ra);
4819 tlb_fn(env, d2, i, addr + msize, mmu_idx, ra);
4820 tlb_fn(env, d3, i, addr + 2 * msize, mmu_idx, ra);
4821 tlb_fn(env, d4, i, addr + 3 * msize, mmu_idx, ra);
4822 }
4823 i += esize, pg >>= esize;
4824 addr += 4 * msize;
4825 } while (i & 15);
4826 }
4827 set_helper_retaddr(0);
4828 }
4829
4830 #define DO_STN_1(N, NAME, ESIZE) \
4831 void __attribute__((flatten)) HELPER(sve_st##N##NAME##_r) \
4832 (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
4833 { \
4834 sve_st##N##_r(env, vg, addr, desc, GETPC(), ESIZE, 1, \
4835 sve_st1##NAME##_tlb); \
4836 }
4837
4838 #define DO_STN_2(N, NAME, ESIZE, MSIZE) \
4839 void __attribute__((flatten)) HELPER(sve_st##N##NAME##_le_r) \
4840 (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
4841 { \
4842 sve_st##N##_r(env, vg, addr, desc, GETPC(), ESIZE, MSIZE, \
4843 sve_st1##NAME##_le_tlb); \
4844 } \
4845 void __attribute__((flatten)) HELPER(sve_st##N##NAME##_be_r) \
4846 (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
4847 { \
4848 sve_st##N##_r(env, vg, addr, desc, GETPC(), ESIZE, MSIZE, \
4849 sve_st1##NAME##_be_tlb); \
4850 }
4851
4852 DO_STN_1(1, bb, 1)
4853 DO_STN_1(1, bh, 2)
4854 DO_STN_1(1, bs, 4)
4855 DO_STN_1(1, bd, 8)
4856 DO_STN_1(2, bb, 1)
4857 DO_STN_1(3, bb, 1)
4858 DO_STN_1(4, bb, 1)
4859
4860 DO_STN_2(1, hh, 2, 2)
4861 DO_STN_2(1, hs, 4, 2)
4862 DO_STN_2(1, hd, 8, 2)
4863 DO_STN_2(2, hh, 2, 2)
4864 DO_STN_2(3, hh, 2, 2)
4865 DO_STN_2(4, hh, 2, 2)
4866
4867 DO_STN_2(1, ss, 4, 4)
4868 DO_STN_2(1, sd, 8, 4)
4869 DO_STN_2(2, ss, 4, 4)
4870 DO_STN_2(3, ss, 4, 4)
4871 DO_STN_2(4, ss, 4, 4)
4872
4873 DO_STN_2(1, dd, 8, 8)
4874 DO_STN_2(2, dd, 8, 8)
4875 DO_STN_2(3, dd, 8, 8)
4876 DO_STN_2(4, dd, 8, 8)
4877
4878 #undef DO_STN_1
4879 #undef DO_STN_2
4880
4881 /*
4882 * Loads with a vector index.
4883 */
4884
4885 /*
4886 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
4887 */
4888 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
4889
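/* zsu/zss extract a 32-bit offset, zero- or sign-extended; zd uses the full
 * 64-bit offset. The _s forms read 32-bit index elements, the _d forms 64-bit
 * index elements. */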
4890 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
4891 {
4892 return *(uint32_t *)(reg + H1_4(reg_ofs));
4893 }
4894
4895 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
4896 {
4897 return *(int32_t *)(reg + H1_4(reg_ofs));
4898 }
4899
4900 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
4901 {
4902 return (uint32_t)*(uint64_t *)(reg + reg_ofs);
4903 }
4904
4905 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
4906 {
4907 return (int32_t)*(uint64_t *)(reg + reg_ofs);
4908 }
4909
4910 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
4911 {
4912 return *(uint64_t *)(reg + reg_ofs);
4913 }
4914
4915 static void sve_ld1_zs(CPUARMState *env, void *vd, void *vg, void *vm,
4916 target_ulong base, uint32_t desc, uintptr_t ra,
4917 zreg_off_fn *off_fn, sve_ld1_tlb_fn *tlb_fn)
4918 {
4919 const int mmu_idx = cpu_mmu_index(env, false);
4920 intptr_t i, oprsz = simd_oprsz(desc);
4921 unsigned scale = simd_data(desc);
4922 ARMVectorReg scratch = { };
4923
4924 set_helper_retaddr(ra);
4925 for (i = 0; i < oprsz; ) {
4926 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4927 do {
4928 if (likely(pg & 1)) {
4929 target_ulong off = off_fn(vm, i);
4930 tlb_fn(env, &scratch, i, base + (off << scale), mmu_idx, ra);
4931 }
4932 i += 4, pg >>= 4;
4933 } while (i & 15);
4934 }
4935 set_helper_retaddr(0);
4936
4937 /* Wait until all exceptions have been raised to write back. */
4938 memcpy(vd, &scratch, oprsz);
4939 }
4940
4941 static void sve_ld1_zd(CPUARMState *env, void *vd, void *vg, void *vm,
4942 target_ulong base, uint32_t desc, uintptr_t ra,
4943 zreg_off_fn *off_fn, sve_ld1_tlb_fn *tlb_fn)
4944 {
4945 const int mmu_idx = cpu_mmu_index(env, false);
4946 intptr_t i, oprsz = simd_oprsz(desc) / 8;
4947 unsigned scale = simd_data(desc);
4948 ARMVectorReg scratch = { };
4949
4950 set_helper_retaddr(ra);
4951 for (i = 0; i < oprsz; i++) {
4952 uint8_t pg = *(uint8_t *)(vg + H1(i));
4953 if (likely(pg & 1)) {
4954 target_ulong off = off_fn(vm, i * 8);
4955 tlb_fn(env, &scratch, i * 8, base + (off << scale), mmu_idx, ra);
4956 }
4957 }
4958 set_helper_retaddr(0);
4959
4960 /* Wait until all exceptions have been raised to write back. */
4961 memcpy(vd, &scratch, oprsz * 8);
4962 }
4963
4964 #define DO_LD1_ZPZ_S(MEM, OFS) \
4965 void __attribute__((flatten)) HELPER(sve_ld##MEM##_##OFS) \
4966 (CPUARMState *env, void *vd, void *vg, void *vm, \
4967 target_ulong base, uint32_t desc) \
4968 { \
4969 sve_ld1_zs(env, vd, vg, vm, base, desc, GETPC(), \
4970 off_##OFS##_s, sve_ld1##MEM##_tlb); \
4971 }
4972
4973 #define DO_LD1_ZPZ_D(MEM, OFS) \
4974 void __attribute__((flatten)) HELPER(sve_ld##MEM##_##OFS) \
4975 (CPUARMState *env, void *vd, void *vg, void *vm, \
4976 target_ulong base, uint32_t desc) \
4977 { \
4978 sve_ld1_zd(env, vd, vg, vm, base, desc, GETPC(), \
4979 off_##OFS##_d, sve_ld1##MEM##_tlb); \
4980 }
4981
4982 DO_LD1_ZPZ_S(bsu, zsu)
4983 DO_LD1_ZPZ_S(bsu, zss)
4984 DO_LD1_ZPZ_D(bdu, zsu)
4985 DO_LD1_ZPZ_D(bdu, zss)
4986 DO_LD1_ZPZ_D(bdu, zd)
4987
4988 DO_LD1_ZPZ_S(bss, zsu)
4989 DO_LD1_ZPZ_S(bss, zss)
4990 DO_LD1_ZPZ_D(bds, zsu)
4991 DO_LD1_ZPZ_D(bds, zss)
4992 DO_LD1_ZPZ_D(bds, zd)
4993
4994 DO_LD1_ZPZ_S(hsu_le, zsu)
4995 DO_LD1_ZPZ_S(hsu_le, zss)
4996 DO_LD1_ZPZ_D(hdu_le, zsu)
4997 DO_LD1_ZPZ_D(hdu_le, zss)
4998 DO_LD1_ZPZ_D(hdu_le, zd)
4999
5000 DO_LD1_ZPZ_S(hsu_be, zsu)
5001 DO_LD1_ZPZ_S(hsu_be, zss)
5002 DO_LD1_ZPZ_D(hdu_be, zsu)
5003 DO_LD1_ZPZ_D(hdu_be, zss)
5004 DO_LD1_ZPZ_D(hdu_be, zd)
5005
5006 DO_LD1_ZPZ_S(hss_le, zsu)
5007 DO_LD1_ZPZ_S(hss_le, zss)
5008 DO_LD1_ZPZ_D(hds_le, zsu)
5009 DO_LD1_ZPZ_D(hds_le, zss)
5010 DO_LD1_ZPZ_D(hds_le, zd)
5011
5012 DO_LD1_ZPZ_S(hss_be, zsu)
5013 DO_LD1_ZPZ_S(hss_be, zss)
5014 DO_LD1_ZPZ_D(hds_be, zsu)
5015 DO_LD1_ZPZ_D(hds_be, zss)
5016 DO_LD1_ZPZ_D(hds_be, zd)
5017
5018 DO_LD1_ZPZ_S(ss_le, zsu)
5019 DO_LD1_ZPZ_S(ss_le, zss)
5020 DO_LD1_ZPZ_D(sdu_le, zsu)
5021 DO_LD1_ZPZ_D(sdu_le, zss)
5022 DO_LD1_ZPZ_D(sdu_le, zd)
5023
5024 DO_LD1_ZPZ_S(ss_be, zsu)
5025 DO_LD1_ZPZ_S(ss_be, zss)
5026 DO_LD1_ZPZ_D(sdu_be, zsu)
5027 DO_LD1_ZPZ_D(sdu_be, zss)
5028 DO_LD1_ZPZ_D(sdu_be, zd)
5029
5030 DO_LD1_ZPZ_D(sds_le, zsu)
5031 DO_LD1_ZPZ_D(sds_le, zss)
5032 DO_LD1_ZPZ_D(sds_le, zd)
5033
5034 DO_LD1_ZPZ_D(sds_be, zsu)
5035 DO_LD1_ZPZ_D(sds_be, zss)
5036 DO_LD1_ZPZ_D(sds_be, zd)
5037
5038 DO_LD1_ZPZ_D(dd_le, zsu)
5039 DO_LD1_ZPZ_D(dd_le, zss)
5040 DO_LD1_ZPZ_D(dd_le, zd)
5041
5042 DO_LD1_ZPZ_D(dd_be, zsu)
5043 DO_LD1_ZPZ_D(dd_be, zss)
5044 DO_LD1_ZPZ_D(dd_be, zd)
5045
5046 #undef DO_LD1_ZPZ_S
5047 #undef DO_LD1_ZPZ_D
5048
5049 /* First-fault loads with a vector index. */
5050
5051 #ifdef CONFIG_USER_ONLY
5052
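/* For user-only, the first active element is loaded normally and may fault.
 * For each subsequent element, page_check_range is consulted first; a would-be
 * fault instead truncates the FFR via record_fault and terminates the load. */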
5053 #define DO_LDFF1_ZPZ(NAME, TYPEE, TYPEI, TYPEM, FN, H) \
5054 void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
5055 target_ulong base, uint32_t desc) \
5056 { \
5057 intptr_t i, oprsz = simd_oprsz(desc); \
5058 unsigned scale = simd_data(desc); \
5059 uintptr_t ra = GETPC(); \
5060 bool first = true; \
5061 mmap_lock(); \
5062 for (i = 0; i < oprsz; ) { \
5063 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
5064 do { \
5065 TYPEM m = 0; \
5066 if (pg & 1) { \
5067 target_ulong off = *(TYPEI *)(vm + H(i)); \
5068 target_ulong addr = base + (off << scale); \
5069 if (!first && \
5070 page_check_range(addr, sizeof(TYPEM), PAGE_READ)) { \
5071 record_fault(env, i, oprsz); \
5072 goto exit; \
5073 } \
5074 m = FN(env, addr, ra); \
5075 first = false; \
5076 } \
5077 *(TYPEE *)(vd + H(i)) = m; \
5078 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
5079 } while (i & 15); \
5080 } \
5081 exit: \
5082 mmap_unlock(); \
5083 }
5084
5085 #else
5086
5087 #define DO_LDFF1_ZPZ(NAME, TYPEE, TYPEI, TYPEM, FN, H) \
5088 void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
5089 target_ulong base, uint32_t desc) \
5090 { \
5091 g_assert_not_reached(); \
5092 }
5093
5094 #endif
5095
5096 #define DO_LDFF1_ZPZ_S(NAME, TYPEI, TYPEM, FN) \
5097 DO_LDFF1_ZPZ(NAME, uint32_t, TYPEI, TYPEM, FN, H1_4)
5098 #define DO_LDFF1_ZPZ_D(NAME, TYPEI, TYPEM, FN) \
5099 DO_LDFF1_ZPZ(NAME, uint64_t, TYPEI, TYPEM, FN, )
5100
5101 DO_LDFF1_ZPZ_S(sve_ldffbsu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
5102 DO_LDFF1_ZPZ_S(sve_ldffhsu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
5103 DO_LDFF1_ZPZ_S(sve_ldffssu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
5104 DO_LDFF1_ZPZ_S(sve_ldffbss_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
5105 DO_LDFF1_ZPZ_S(sve_ldffhss_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
5106
5107 DO_LDFF1_ZPZ_S(sve_ldffbsu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
5108 DO_LDFF1_ZPZ_S(sve_ldffhsu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
5109 DO_LDFF1_ZPZ_S(sve_ldffssu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
5110 DO_LDFF1_ZPZ_S(sve_ldffbss_zss, int32_t, int8_t, cpu_ldub_data_ra)
5111 DO_LDFF1_ZPZ_S(sve_ldffhss_zss, int32_t, int16_t, cpu_lduw_data_ra)
5112
5113 DO_LDFF1_ZPZ_D(sve_ldffbdu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
5114 DO_LDFF1_ZPZ_D(sve_ldffhdu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
5115 DO_LDFF1_ZPZ_D(sve_ldffsdu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
5116 DO_LDFF1_ZPZ_D(sve_ldffddu_zsu, uint32_t, uint64_t, cpu_ldq_data_ra)
5117 DO_LDFF1_ZPZ_D(sve_ldffbds_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
5118 DO_LDFF1_ZPZ_D(sve_ldffhds_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
5119 DO_LDFF1_ZPZ_D(sve_ldffsds_zsu, uint32_t, int32_t, cpu_ldl_data_ra)
5120
5121 DO_LDFF1_ZPZ_D(sve_ldffbdu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
5122 DO_LDFF1_ZPZ_D(sve_ldffhdu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
5123 DO_LDFF1_ZPZ_D(sve_ldffsdu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
5124 DO_LDFF1_ZPZ_D(sve_ldffddu_zss, int32_t, uint64_t, cpu_ldq_data_ra)
5125 DO_LDFF1_ZPZ_D(sve_ldffbds_zss, int32_t, int8_t, cpu_ldub_data_ra)
5126 DO_LDFF1_ZPZ_D(sve_ldffhds_zss, int32_t, int16_t, cpu_lduw_data_ra)
5127 DO_LDFF1_ZPZ_D(sve_ldffsds_zss, int32_t, int32_t, cpu_ldl_data_ra)
5128
5129 DO_LDFF1_ZPZ_D(sve_ldffbdu_zd, uint64_t, uint8_t, cpu_ldub_data_ra)
5130 DO_LDFF1_ZPZ_D(sve_ldffhdu_zd, uint64_t, uint16_t, cpu_lduw_data_ra)
5131 DO_LDFF1_ZPZ_D(sve_ldffsdu_zd, uint64_t, uint32_t, cpu_ldl_data_ra)
5132 DO_LDFF1_ZPZ_D(sve_ldffddu_zd, uint64_t, uint64_t, cpu_ldq_data_ra)
5133 DO_LDFF1_ZPZ_D(sve_ldffbds_zd, uint64_t, int8_t, cpu_ldub_data_ra)
5134 DO_LDFF1_ZPZ_D(sve_ldffhds_zd, uint64_t, int16_t, cpu_lduw_data_ra)
5135 DO_LDFF1_ZPZ_D(sve_ldffsds_zd, uint64_t, int32_t, cpu_ldl_data_ra)
5136
5137 /* Stores with a vector index. */
5138
5139 #define DO_ST1_ZPZ_S(NAME, TYPEI, FN) \
5140 void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
5141 target_ulong base, uint32_t desc) \
5142 { \
5143 intptr_t i, oprsz = simd_oprsz(desc); \
5144 unsigned scale = simd_data(desc); \
5145 uintptr_t ra = GETPC(); \
5146 for (i = 0; i < oprsz; ) { \
5147 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
5148 do { \
5149 if (likely(pg & 1)) { \
5150 target_ulong off = *(TYPEI *)(vm + H1_4(i)); \
5151 uint32_t d = *(uint32_t *)(vd + H1_4(i)); \
5152 FN(env, base + (off << scale), d, ra); \
5153 } \
5154 i += sizeof(uint32_t), pg >>= sizeof(uint32_t); \
5155 } while (i & 15); \
5156 } \
5157 }
5158
5159 #define DO_ST1_ZPZ_D(NAME, TYPEI, FN) \
5160 void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
5161 target_ulong base, uint32_t desc) \
5162 { \
5163 intptr_t i, oprsz = simd_oprsz(desc) / 8; \
5164 unsigned scale = simd_data(desc); \
5165 uintptr_t ra = GETPC(); \
5166 uint64_t *d = vd, *m = vm; uint8_t *pg = vg; \
5167 for (i = 0; i < oprsz; i++) { \
5168 if (likely(pg[H1(i)] & 1)) { \
5169 target_ulong off = (target_ulong)(TYPEI)m[i] << scale; \
5170 FN(env, base + off, d[i], ra); \
5171 } \
5172 } \
5173 }
5174
5175 DO_ST1_ZPZ_S(sve_stbs_zsu, uint32_t, cpu_stb_data_ra)
5176 DO_ST1_ZPZ_S(sve_sths_zsu, uint32_t, cpu_stw_data_ra)
5177 DO_ST1_ZPZ_S(sve_stss_zsu, uint32_t, cpu_stl_data_ra)
5178
5179 DO_ST1_ZPZ_S(sve_stbs_zss, int32_t, cpu_stb_data_ra)
5180 DO_ST1_ZPZ_S(sve_sths_zss, int32_t, cpu_stw_data_ra)
5181 DO_ST1_ZPZ_S(sve_stss_zss, int32_t, cpu_stl_data_ra)
5182
5183 DO_ST1_ZPZ_D(sve_stbd_zsu, uint32_t, cpu_stb_data_ra)
5184 DO_ST1_ZPZ_D(sve_sthd_zsu, uint32_t, cpu_stw_data_ra)
5185 DO_ST1_ZPZ_D(sve_stsd_zsu, uint32_t, cpu_stl_data_ra)
5186 DO_ST1_ZPZ_D(sve_stdd_zsu, uint32_t, cpu_stq_data_ra)
5187
5188 DO_ST1_ZPZ_D(sve_stbd_zss, int32_t, cpu_stb_data_ra)
5189 DO_ST1_ZPZ_D(sve_sthd_zss, int32_t, cpu_stw_data_ra)
5190 DO_ST1_ZPZ_D(sve_stsd_zss, int32_t, cpu_stl_data_ra)
5191 DO_ST1_ZPZ_D(sve_stdd_zss, int32_t, cpu_stq_data_ra)
5192
5193 DO_ST1_ZPZ_D(sve_stbd_zd, uint64_t, cpu_stb_data_ra)
5194 DO_ST1_ZPZ_D(sve_sthd_zd, uint64_t, cpu_stw_data_ra)
5195 DO_ST1_ZPZ_D(sve_stsd_zd, uint64_t, cpu_stl_data_ra)
5196 DO_ST1_ZPZ_D(sve_stdd_zd, uint64_t, cpu_stq_data_ra)