1 /*
2 * ARM SVE Operations
3 *
4 * Copyright (c) 2018 Linaro, Ltd.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "internals.h"
23 #include "exec/exec-all.h"
24 #include "exec/cpu_ldst.h"
25 #include "exec/helper-proto.h"
26 #include "tcg/tcg-gvec-desc.h"
27 #include "fpu/softfloat.h"
28 #include "tcg/tcg.h"
29
30
31 /* Note that vector data is stored in host-endian 64-bit chunks,
32 so addressing units smaller than that needs a host-endian fixup. */
33 #ifdef HOST_WORDS_BIGENDIAN
34 #define H1(x) ((x) ^ 7)
35 #define H1_2(x) ((x) ^ 6)
36 #define H1_4(x) ((x) ^ 4)
37 #define H2(x) ((x) ^ 3)
38 #define H4(x) ((x) ^ 1)
39 #else
40 #define H1(x) (x)
41 #define H1_2(x) (x)
42 #define H1_4(x) (x)
43 #define H2(x) (x)
44 #define H4(x) (x)
45 #endif
46
47 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
48 *
49 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
50 * and bit 0 set if C is set. Compare the definitions of these variables
51 * within CPUARMState.
52 */
53
54 /* For no G bits set, NZCV = C. */
55 #define PREDTEST_INIT 1
56
57 /* This is an iterative function, called for each Pd and Pg word
58 * moving forward.
59 */
60 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
61 {
62 if (likely(g)) {
63 /* Compute N from first D & G.
64 Use bit 2 to signal first G bit seen. */
65 if (!(flags & 4)) {
66 flags |= ((d & (g & -g)) != 0) << 31;
67 flags |= 4;
68 }
69
70 /* Accumulate Z from each D & G. */
71 flags |= ((d & g) != 0) << 1;
72
73 /* Compute C from last !(D & G). Replace previous. */
74 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
75 }
76 return flags;
77 }
78
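/*
 * Illustrative sketch only, not used by the build: how the flags word
 * produced by the iterators above maps onto an ARM NZCV nibble, per the
 * encoding described before PREDTEST_INIT.  The helper name is made up
 * for this example.
 */
static inline uint32_t __attribute__((unused))
example_predtest_to_nzcv(uint32_t flags)
{
    bool n = (flags >> 31) & 1;   /* bit 31: first active D bit was set */
    bool z = !((flags >> 1) & 1); /* bit 1 set means Z is clear */
    bool c = flags & 1;           /* bit 0: last active D bit was clear */

    return (n << 3) | (z << 2) | c;   /* V is always 0 for PredTest */
}
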
79 /* This is an iterative function, called for each Pd and Pg word
80 * moving backward.
81 */
82 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
83 {
84 if (likely(g)) {
85 /* Compute C from first (i.e. last) !(D & G).
86 Use bit 2 to signal first G bit seen. */
87 if (!(flags & 4)) {
88 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
89 flags |= (d & pow2floor(g)) == 0;
90 }
91
92 /* Accumulate Z from each D & G. */
93 flags |= ((d & g) != 0) << 1;
94
95 /* Compute N from last (i.e. first) D & G. Replace previous. */
96 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
97 }
98 return flags;
99 }
100
101 /* The same for a single word predicate. */
102 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
103 {
104 return iter_predtest_fwd(d, g, PREDTEST_INIT);
105 }
106
107 /* The same for a multi-word predicate. */
108 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
109 {
110 uint32_t flags = PREDTEST_INIT;
111 uint64_t *d = vd, *g = vg;
112 uintptr_t i = 0;
113
114 do {
115 flags = iter_predtest_fwd(d[i], g[i], flags);
116 } while (++i < words);
117
118 return flags;
119 }
120
121 /* Expand active predicate bits to bytes, for byte elements.
122 * for (i = 0; i < 256; ++i) {
123 * unsigned long m = 0;
124 * for (j = 0; j < 8; j++) {
125 * if ((i >> j) & 1) {
126 * m |= 0xfful << (j << 3);
127 * }
128 * }
129 * printf("0x%016lx,\n", m);
130 * }
131 */
132 static inline uint64_t expand_pred_b(uint8_t byte)
133 {
134 static const uint64_t word[256] = {
135 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
136 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
137 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
138 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
139 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
140 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
141 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
142 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
143 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
144 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
145 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
146 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
147 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
148 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
149 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
150 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
151 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
152 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
153 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
154 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
155 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
156 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
157 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
158 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
159 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
160 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
161 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
162 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
163 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
164 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
165 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
166 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
167 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
168 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
169 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
170 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
171 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
172 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
173 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
174 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
175 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
176 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
177 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
178 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
179 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
180 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
181 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
182 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
183 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
184 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
185 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
186 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
187 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
188 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
189 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
190 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
191 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
192 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
193 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
194 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
195 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
196 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
197 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
198 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
199 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
200 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
201 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
202 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
203 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
204 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
205 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
206 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
207 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
208 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
209 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
210 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
211 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
212 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
213 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
214 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
215 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
216 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
217 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
218 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
219 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
220 0xffffffffffffffff,
221 };
222 return word[byte];
223 }
224
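/*
 * Illustrative sketch only, not used by the build: a bit-by-bit
 * reference equivalent of the table lookup above, mirroring the
 * generator loop quoted in the comment.  The function name is made up
 * for this example; the helpers themselves use the table.
 */
static inline uint64_t __attribute__((unused))
example_expand_pred_b_ref(uint8_t byte)
{
    uint64_t m = 0;
    int j;

    for (j = 0; j < 8; j++) {
        if ((byte >> j) & 1) {
            m |= 0xffull << (j << 3);
        }
    }
    return m;
}
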
225 /* Similarly for half-word elements.
226 * for (i = 0; i < 256; ++i) {
227 * unsigned long m = 0;
228 * if (i & 0xaa) {
229 * continue;
230 * }
231 * for (j = 0; j < 8; j += 2) {
232 * if ((i >> j) & 1) {
233 * m |= 0xfffful << (j << 3);
234 * }
235 * }
236 * printf("[0x%x] = 0x%016lx,\n", i, m);
237 * }
238 */
239 static inline uint64_t expand_pred_h(uint8_t byte)
240 {
241 static const uint64_t word[] = {
242 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
243 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
244 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
245 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
246 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
247 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
248 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
249 [0x55] = 0xffffffffffffffff,
250 };
251 return word[byte & 0x55];
252 }
253
254 /* Similarly for single word elements. */
255 static inline uint64_t expand_pred_s(uint8_t byte)
256 {
257 static const uint64_t word[] = {
258 [0x01] = 0x00000000ffffffffull,
259 [0x10] = 0xffffffff00000000ull,
260 [0x11] = 0xffffffffffffffffull,
261 };
262 return word[byte & 0x11];
263 }
264
265 /* Swap 16-bit words within a 32-bit word. */
266 static inline uint32_t hswap32(uint32_t h)
267 {
268 return rol32(h, 16);
269 }
270
271 /* Swap 16-bit words within a 64-bit word. */
272 static inline uint64_t hswap64(uint64_t h)
273 {
274 uint64_t m = 0x0000ffff0000ffffull;
275 h = rol64(h, 32);
276 return ((h & m) << 16) | ((h >> 16) & m);
277 }
278
279 /* Swap 32-bit words within a 64-bit word. */
280 static inline uint64_t wswap64(uint64_t h)
281 {
282 return rol64(h, 32);
283 }
284
285 #define LOGICAL_PPPP(NAME, FUNC) \
286 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
287 { \
288 uintptr_t opr_sz = simd_oprsz(desc); \
289 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
290 uintptr_t i; \
291 for (i = 0; i < opr_sz / 8; ++i) { \
292 d[i] = FUNC(n[i], m[i], g[i]); \
293 } \
294 }
295
296 #define DO_AND(N, M, G) (((N) & (M)) & (G))
297 #define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
298 #define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
299 #define DO_ORR(N, M, G) (((N) | (M)) & (G))
300 #define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
301 #define DO_NOR(N, M, G) (~((N) | (M)) & (G))
302 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
303 #define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
304
305 LOGICAL_PPPP(sve_and_pppp, DO_AND)
306 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
307 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
308 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
309 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
310 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
311 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
312 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
313
314 #undef DO_AND
315 #undef DO_BIC
316 #undef DO_EOR
317 #undef DO_ORR
318 #undef DO_ORN
319 #undef DO_NOR
320 #undef DO_NAND
321 #undef DO_SEL
322 #undef LOGICAL_PPPP
323
324 /* Fully general three-operand expander, controlled by a predicate.
325 * This is complicated by the host-endian storage of the register file.
326 */
327 /* ??? I don't expect the compiler could ever vectorize this itself.
328 * With some tables we can convert bit masks to byte masks, and with
329 * extra care wrt byte/word ordering we could use gcc generic vectors
330 * and do 16 bytes at a time.
331 */
332 #define DO_ZPZZ(NAME, TYPE, H, OP) \
333 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
334 { \
335 intptr_t i, opr_sz = simd_oprsz(desc); \
336 for (i = 0; i < opr_sz; ) { \
337 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
338 do { \
339 if (pg & 1) { \
340 TYPE nn = *(TYPE *)(vn + H(i)); \
341 TYPE mm = *(TYPE *)(vm + H(i)); \
342 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
343 } \
344 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
345 } while (i & 15); \
346 } \
347 }
348
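/*
 * Illustrative sketch only, not used by the build: roughly what
 * DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD) expands to, written out
 * by hand to show how 16 predicate bits at a time govern the next 16
 * byte-sized elements.  The function name is made up for this example.
 */
static inline void __attribute__((unused))
example_zpzz_add_b(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);

    for (i = 0; i < opr_sz; ) {
        /* One predicate bit per byte of vector data, 16 at a time. */
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
        do {
            if (pg & 1) {
                uint8_t nn = *(uint8_t *)(vn + H1(i));
                uint8_t mm = *(uint8_t *)(vm + H1(i));
                *(uint8_t *)(vd + H1(i)) = nn + mm;
            }
            i += sizeof(uint8_t), pg >>= sizeof(uint8_t);
        } while (i & 15);
    }
}
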
349 /* Similarly, specialized for 64-bit operands. */
350 #define DO_ZPZZ_D(NAME, TYPE, OP) \
351 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
352 { \
353 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
354 TYPE *d = vd, *n = vn, *m = vm; \
355 uint8_t *pg = vg; \
356 for (i = 0; i < opr_sz; i += 1) { \
357 if (pg[H1(i)] & 1) { \
358 TYPE nn = n[i], mm = m[i]; \
359 d[i] = OP(nn, mm); \
360 } \
361 } \
362 }
363
364 #define DO_AND(N, M) (N & M)
365 #define DO_EOR(N, M) (N ^ M)
366 #define DO_ORR(N, M) (N | M)
367 #define DO_BIC(N, M) (N & ~M)
368 #define DO_ADD(N, M) (N + M)
369 #define DO_SUB(N, M) (N - M)
370 #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
371 #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
372 #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
373 #define DO_MUL(N, M) (N * M)
374
375
376 /*
377 * We must avoid the C undefined behaviour cases: division by
378 * zero and signed division of INT_MIN by -1. Both of these
379 * have architecturally defined required results for Arm.
380 * We special case all signed divisions by -1 to avoid having
381 * to deduce the minimum integer for the type involved.
382 */
383 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
384 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
385
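/*
 * Illustrative sketch only, not used by the build: the architecturally
 * required results these macros must produce for the two C-undefined
 * cases.  The function name is made up for this example; the wrap of
 * INT_MIN relies on the same two's-complement negation the macro does.
 */
static inline bool __attribute__((unused))
example_sve_div_edge_cases(void)
{
    int64_t min = INT64_MIN;
    int64_t zero = 0;
    uint64_t uzero = 0;

    return DO_SDIV((int64_t)42, zero) == 0      /* x / 0 yields 0 */
        && DO_UDIV((uint64_t)42, uzero) == 0    /* likewise unsigned */
        && DO_SDIV(min, (int64_t)-1) == min;    /* INT_MIN / -1 wraps */
}
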
386 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
387 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
388 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
389 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
390
391 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
392 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
393 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
394 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
395
396 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
397 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
398 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
399 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
400
401 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
402 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
403 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
404 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
405
406 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
407 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
408 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
409 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
410
411 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
412 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
413 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
414 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
415
416 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
417 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
418 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
419 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
420
421 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
422 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
423 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
424 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
425
426 DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
427 DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
428 DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
429 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
430
431 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
432 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
433 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
434 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
435
436 DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
437 DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
438 DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
439 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
440
441 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
442 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
443 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
444 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
445
446 /* Because the computation type is at least twice as large as required,
447 these work for both signed and unsigned source types. */
448 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
449 {
450 return (n * m) >> 8;
451 }
452
453 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
454 {
455 return (n * m) >> 16;
456 }
457
458 static inline uint32_t do_mulh_s(int64_t n, int64_t m)
459 {
460 return (n * m) >> 32;
461 }
462
463 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
464 {
465 uint64_t lo, hi;
466 muls64(&lo, &hi, n, m);
467 return hi;
468 }
469
470 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
471 {
472 uint64_t lo, hi;
473 mulu64(&lo, &hi, n, m);
474 return hi;
475 }
476
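/*
 * Illustrative sketch only, not used by the build: the TYPE used by the
 * DO_ZPZZ instantiation decides whether the source bytes are sign- or
 * zero-extended on the way into do_mulh_b, so one widened multiply
 * serves both SMULH and UMULH.  The function name is made up here.
 */
static inline bool __attribute__((unused))
example_mulh_b_signedness(void)
{
    int8_t sn = -2, sm = 3;     /* signed view of the source bytes */
    uint8_t un = 0xfe, um = 3;  /* unsigned view of the same first byte */

    /* SMULH: -2 * 3 = 0xfffa, high byte 0xff.  UMULH: 254 * 3 = 0x02fa. */
    return do_mulh_b(sn, sm) == 0xff && do_mulh_b(un, um) == 0x02;
}
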
477 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
478 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
479 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
480 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
481
482 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
483 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
484 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
485 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
486
487 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
488 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
489 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
490 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
491
492 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
493 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
494
495 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
496 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
497
498 /* Note that all bits of the shift are significant
499 and not modulo the element size. */
500 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
501 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
502 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
503
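/*
 * Illustrative sketch only, not used by the build: for an 8-bit element
 * a shift count of 8 or more is not reduced modulo the width; LSR/LSL
 * produce 0, while ASR clamps the count to 7 and so replicates the sign
 * bit.  The function name is made up for this example.
 */
static inline bool __attribute__((unused))
example_shift_beyond_width(void)
{
    uint8_t u = 0x80;
    int8_t s = -1;

    return DO_LSR(u, 9) == 0 && DO_LSL(u, 9) == 0 && DO_ASR(s, 9) == -1;
}
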
504 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
505 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
506 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)
507 
508 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
509 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
510 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)
511 
512 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
513 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
514 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
515
516 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
517 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
518 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
519
520 #undef DO_ZPZZ
521 #undef DO_ZPZZ_D
522
523 /* Three-operand expander, controlled by a predicate, in which the
524 * third operand is "wide". That is, for D = N op M, the same 64-bit
525 * value of M is used with all of the narrower values of N.
526 */
527 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
528 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
529 { \
530 intptr_t i, opr_sz = simd_oprsz(desc); \
531 for (i = 0; i < opr_sz; ) { \
532 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
533 TYPEW mm = *(TYPEW *)(vm + i); \
534 do { \
535 if (pg & 1) { \
536 TYPE nn = *(TYPE *)(vn + H(i)); \
537 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
538 } \
539 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
540 } while (i & 7); \
541 } \
542 }
543
544 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
545 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
546 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
547
548 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
549 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
550 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
551
552 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
553 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
554 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
555
556 #undef DO_ZPZW
557
558 /* Fully general two-operand expander, controlled by a predicate.
559 */
560 #define DO_ZPZ(NAME, TYPE, H, OP) \
561 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
562 { \
563 intptr_t i, opr_sz = simd_oprsz(desc); \
564 for (i = 0; i < opr_sz; ) { \
565 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
566 do { \
567 if (pg & 1) { \
568 TYPE nn = *(TYPE *)(vn + H(i)); \
569 *(TYPE *)(vd + H(i)) = OP(nn); \
570 } \
571 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
572 } while (i & 15); \
573 } \
574 }
575
576 /* Similarly, specialized for 64-bit operands. */
577 #define DO_ZPZ_D(NAME, TYPE, OP) \
578 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
579 { \
580 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
581 TYPE *d = vd, *n = vn; \
582 uint8_t *pg = vg; \
583 for (i = 0; i < opr_sz; i += 1) { \
584 if (pg[H1(i)] & 1) { \
585 TYPE nn = n[i]; \
586 d[i] = OP(nn); \
587 } \
588 } \
589 }
590
591 #define DO_CLS_B(N) (clrsb32(N) - 24)
592 #define DO_CLS_H(N) (clrsb32(N) - 16)
593
594 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
595 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
596 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
597 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
598
599 #define DO_CLZ_B(N) (clz32(N) - 24)
600 #define DO_CLZ_H(N) (clz32(N) - 16)
601
602 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
603 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
604 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
605 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
606
607 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
608 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
609 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
610 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
611
612 #define DO_CNOT(N) (N == 0)
613
614 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
615 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
616 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
617 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
618
619 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
620
621 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
622 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
623 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
624
625 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
626
627 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
628 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
629 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
630
631 #define DO_NOT(N) (~N)
632
633 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
634 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
635 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
636 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
637
638 #define DO_SXTB(N) ((int8_t)N)
639 #define DO_SXTH(N) ((int16_t)N)
640 #define DO_SXTS(N) ((int32_t)N)
641 #define DO_UXTB(N) ((uint8_t)N)
642 #define DO_UXTH(N) ((uint16_t)N)
643 #define DO_UXTS(N) ((uint32_t)N)
644
645 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
646 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
647 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
648 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
649 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
650 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
651
652 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
653 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
654 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
655 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
656 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
657 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
658
659 #define DO_ABS(N) (N < 0 ? -N : N)
660
661 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
662 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
663 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
664 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
665
666 #define DO_NEG(N) (-N)
667
668 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
669 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
670 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
671 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
672
673 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
674 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
675 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
676
677 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
678 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
679
680 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
681
682 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
683 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
684 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
685 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
686
687 /* Three-operand expander, unpredicated, in which the third operand is "wide".
688 */
689 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
690 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
691 { \
692 intptr_t i, opr_sz = simd_oprsz(desc); \
693 for (i = 0; i < opr_sz; ) { \
694 TYPEW mm = *(TYPEW *)(vm + i); \
695 do { \
696 TYPE nn = *(TYPE *)(vn + H(i)); \
697 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
698 i += sizeof(TYPE); \
699 } while (i & 7); \
700 } \
701 }
702
703 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
704 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
705 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
706
707 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
708 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
709 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
710
711 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
712 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
713 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
714
715 #undef DO_ZZW
716
717 #undef DO_CLS_B
718 #undef DO_CLS_H
719 #undef DO_CLZ_B
720 #undef DO_CLZ_H
721 #undef DO_CNOT
722 #undef DO_FABS
723 #undef DO_FNEG
724 #undef DO_ABS
725 #undef DO_NEG
726 #undef DO_ZPZ
727 #undef DO_ZPZ_D
728
729 /* Two-operand reduction expander, controlled by a predicate.
730 * The difference between TYPERED and TYPERET has to do with
731 * sign-extension. E.g. for SMAX, TYPERED must be signed,
732 * but TYPERET must be unsigned so that e.g. a 32-bit value
733 * is not sign-extended to the ABI uint64_t return type.
734 */
735 /* ??? If we were to vectorize this by hand the reduction ordering
736 * would change. For integer operands, this is perfectly fine.
737 */
738 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
739 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
740 { \
741 intptr_t i, opr_sz = simd_oprsz(desc); \
742 TYPERED ret = INIT; \
743 for (i = 0; i < opr_sz; ) { \
744 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
745 do { \
746 if (pg & 1) { \
747 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
748 ret = OP(ret, nn); \
749 } \
750 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
751 } while (i & 15); \
752 } \
753 return (TYPERET)ret; \
754 }
755
756 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
757 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
758 { \
759 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
760 TYPEE *n = vn; \
761 uint8_t *pg = vg; \
762 TYPER ret = INIT; \
763 for (i = 0; i < opr_sz; i += 1) { \
764 if (pg[H1(i)] & 1) { \
765 TYPEE nn = n[i]; \
766 ret = OP(ret, nn); \
767 } \
768 } \
769 return ret; \
770 }
771
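/*
 * Illustrative sketch only, not used by the build: why TYPERET must be
 * unsigned.  The helper ABI returns uint64_t, so a negative 32-bit
 * reduction result (e.g. from SMAXV) must be zero-extended, not
 * sign-extended, into the return value.  The function name is made up.
 */
static inline bool __attribute__((unused))
example_vpz_return_widening(void)
{
    int32_t red = -1;   /* a signed 32-bit reduction result */

    return (uint64_t)red == UINT64_MAX              /* sign-extended: wrong */
        && (uint64_t)(uint32_t)red == 0xffffffffull; /* via TYPERET: right */
}
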
772 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
773 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
774 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
775 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
776
777 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
778 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
779 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
780 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
781
782 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
783 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
784 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
785 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
786
787 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
788 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
789 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
790
791 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
792 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
793 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
794 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
795
796 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
797 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
798 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
799 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
800
801 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
802 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
803 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
804 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
805
806 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
807 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
808 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
809 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
810
811 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
812 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
813 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
814 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
815
816 #undef DO_VPZ
817 #undef DO_VPZ_D
818
819 /* Two vector operands, one scalar operand, unpredicated. */
820 #define DO_ZZI(NAME, TYPE, OP) \
821 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
822 { \
823 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
824 TYPE s = s64, *d = vd, *n = vn; \
825 for (i = 0; i < opr_sz; ++i) { \
826 d[i] = OP(n[i], s); \
827 } \
828 }
829
830 #define DO_SUBR(X, Y) (Y - X)
831
832 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
833 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
834 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
835 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
836
837 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
838 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
839 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
840 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
841
842 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
843 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
844 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
845 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
846
847 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
848 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
849 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
850 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
851
852 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
853 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
854 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
855 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
856
857 #undef DO_ZZI
858
859 #undef DO_AND
860 #undef DO_ORR
861 #undef DO_EOR
862 #undef DO_BIC
863 #undef DO_ADD
864 #undef DO_SUB
865 #undef DO_MAX
866 #undef DO_MIN
867 #undef DO_ABD
868 #undef DO_MUL
869 #undef DO_DIV
870 #undef DO_ASR
871 #undef DO_LSR
872 #undef DO_LSL
873 #undef DO_SUBR
874
875 /* Similar to the ARM LastActiveElement pseudocode function, except the
876 result is multiplied by the element size. This includes the not found
877 indication; e.g. not found for esz=3 is -8. */
878 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
879 {
880 uint64_t mask = pred_esz_masks[esz];
881 intptr_t i = words;
882
883 do {
884 uint64_t this_g = g[--i] & mask;
885 if (this_g) {
886 return i * 64 + (63 - clz64(this_g));
887 }
888 } while (i > 0);
889 return (intptr_t)-1 << esz;
890 }
891
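/*
 * Illustrative sketch only, not used by the build: with no active bits
 * the function returns -(1 << esz), so that adding one element size, as
 * sve_pnext does below, restarts the search at element 0.  The function
 * name is made up for this example.
 */
static inline bool __attribute__((unused))
example_last_active_element_none(void)
{
    uint64_t g[1] = { 0 };
    intptr_t esz = 3;   /* 64-bit elements */
    intptr_t last = last_active_element(g, 1, esz);

    return last == -8 && last + (1 << esz) == 0;
}
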
892 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
893 {
894 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
895 uint32_t flags = PREDTEST_INIT;
896 uint64_t *d = vd, *g = vg;
897 intptr_t i = 0;
898
899 do {
900 uint64_t this_d = d[i];
901 uint64_t this_g = g[i];
902
903 if (this_g) {
904 if (!(flags & 4)) {
905 /* Set in D the first bit of G. */
906 this_d |= this_g & -this_g;
907 d[i] = this_d;
908 }
909 flags = iter_predtest_fwd(this_d, this_g, flags);
910 }
911 } while (++i < words);
912
913 return flags;
914 }
915
916 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
917 {
918 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
919 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
920 uint32_t flags = PREDTEST_INIT;
921 uint64_t *d = vd, *g = vg, esz_mask;
922 intptr_t i, next;
923
924 next = last_active_element(vd, words, esz) + (1 << esz);
925 esz_mask = pred_esz_masks[esz];
926
927 /* Similar to the pseudocode for pnext, but scaled by ESZ
928 so that we find the correct bit. */
929 if (next < words * 64) {
930 uint64_t mask = -1;
931
932 if (next & 63) {
933 mask = ~((1ull << (next & 63)) - 1);
934 next &= -64;
935 }
936 do {
937 uint64_t this_g = g[next / 64] & esz_mask & mask;
938 if (this_g != 0) {
939 next = (next & -64) + ctz64(this_g);
940 break;
941 }
942 next += 64;
943 mask = -1;
944 } while (next < words * 64);
945 }
946
947 i = 0;
948 do {
949 uint64_t this_d = 0;
950 if (i == next / 64) {
951 this_d = 1ull << (next & 63);
952 }
953 d[i] = this_d;
954 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
955 } while (++i < words);
956
957 return flags;
958 }
959
960 /*
961 * Copy Zn into Zd, and store zero into inactive elements.
962 * If inv, store zeros into the active elements.
963 */
964 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
965 {
966 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
967 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
968 uint64_t *d = vd, *n = vn;
969 uint8_t *pg = vg;
970
971 for (i = 0; i < opr_sz; i += 1) {
972 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
973 }
974 }
975
976 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
977 {
978 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
979 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
980 uint64_t *d = vd, *n = vn;
981 uint8_t *pg = vg;
982
983 for (i = 0; i < opr_sz; i += 1) {
984 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
985 }
986 }
987
988 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
989 {
990 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
991 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
992 uint64_t *d = vd, *n = vn;
993 uint8_t *pg = vg;
994
995 for (i = 0; i < opr_sz; i += 1) {
996 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
997 }
998 }
999
1000 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1001 {
1002 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1003 uint64_t *d = vd, *n = vn;
1004 uint8_t *pg = vg;
1005 uint8_t inv = simd_data(desc);
1006
1007 for (i = 0; i < opr_sz; i += 1) {
1008 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
1009 }
1010 }
1011
1012 /* Three-operand expander, immediate operand, controlled by a predicate.
1013 */
1014 #define DO_ZPZI(NAME, TYPE, H, OP) \
1015 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1016 { \
1017 intptr_t i, opr_sz = simd_oprsz(desc); \
1018 TYPE imm = simd_data(desc); \
1019 for (i = 0; i < opr_sz; ) { \
1020 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1021 do { \
1022 if (pg & 1) { \
1023 TYPE nn = *(TYPE *)(vn + H(i)); \
1024 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
1025 } \
1026 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1027 } while (i & 15); \
1028 } \
1029 }
1030
1031 /* Similarly, specialized for 64-bit operands. */
1032 #define DO_ZPZI_D(NAME, TYPE, OP) \
1033 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1034 { \
1035 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1036 TYPE *d = vd, *n = vn; \
1037 TYPE imm = simd_data(desc); \
1038 uint8_t *pg = vg; \
1039 for (i = 0; i < opr_sz; i += 1) { \
1040 if (pg[H1(i)] & 1) { \
1041 TYPE nn = n[i]; \
1042 d[i] = OP(nn, imm); \
1043 } \
1044 } \
1045 }
1046
1047 #define DO_SHR(N, M) (N >> M)
1048 #define DO_SHL(N, M) (N << M)
1049
1050 /* Arithmetic shift right for division. This rounds negative numbers
1051 toward zero as per signed division. Therefore before shifting,
1052 when N is negative, add 2**M-1. */
1053 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
1054
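/*
 * Illustrative sketch only, not used by the build: a plain arithmetic
 * shift rounds toward minus infinity, whereas ASRD rounds toward zero,
 * matching signed division by 2**M.  The function name is made up here.
 */
static inline bool __attribute__((unused))
example_asrd_rounding(void)
{
    int8_t n = -7;

    return (n >> 1) == -4          /* >> alone rounds toward -inf */
        && DO_ASRD(n, 1) == -3     /* ASRD matches -7 / 2 */
        && DO_ASRD((int8_t)7, 1) == 3;
}
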
1055 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
1056 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
1057 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
1058 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
1059
1060 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
1061 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
1062 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
1063 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
1064
1065 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
1066 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
1067 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
1068 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
1069
1070 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
1071 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
1072 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
1073 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
1074
1075 #undef DO_SHR
1076 #undef DO_SHL
1077 #undef DO_ASRD
1078 #undef DO_ZPZI
1079 #undef DO_ZPZI_D
1080
1081 /* Fully general four-operand expander, controlled by a predicate.
1082 */
1083 #define DO_ZPZZZ(NAME, TYPE, H, OP) \
1084 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1085 void *vg, uint32_t desc) \
1086 { \
1087 intptr_t i, opr_sz = simd_oprsz(desc); \
1088 for (i = 0; i < opr_sz; ) { \
1089 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1090 do { \
1091 if (pg & 1) { \
1092 TYPE nn = *(TYPE *)(vn + H(i)); \
1093 TYPE mm = *(TYPE *)(vm + H(i)); \
1094 TYPE aa = *(TYPE *)(va + H(i)); \
1095 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
1096 } \
1097 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1098 } while (i & 15); \
1099 } \
1100 }
1101
1102 /* Similarly, specialized for 64-bit operands. */
1103 #define DO_ZPZZZ_D(NAME, TYPE, OP) \
1104 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1105 void *vg, uint32_t desc) \
1106 { \
1107 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1108 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
1109 uint8_t *pg = vg; \
1110 for (i = 0; i < opr_sz; i += 1) { \
1111 if (pg[H1(i)] & 1) { \
1112 TYPE aa = a[i], nn = n[i], mm = m[i]; \
1113 d[i] = OP(aa, nn, mm); \
1114 } \
1115 } \
1116 }
1117
1118 #define DO_MLA(A, N, M) (A + N * M)
1119 #define DO_MLS(A, N, M) (A - N * M)
1120
1121 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
1122 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
1123
1124 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
1125 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
1126
1127 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
1128 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
1129
1130 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
1131 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
1132
1133 #undef DO_MLA
1134 #undef DO_MLS
1135 #undef DO_ZPZZZ
1136 #undef DO_ZPZZZ_D
1137
1138 void HELPER(sve_index_b)(void *vd, uint32_t start,
1139 uint32_t incr, uint32_t desc)
1140 {
1141 intptr_t i, opr_sz = simd_oprsz(desc);
1142 uint8_t *d = vd;
1143 for (i = 0; i < opr_sz; i += 1) {
1144 d[H1(i)] = start + i * incr;
1145 }
1146 }
1147
1148 void HELPER(sve_index_h)(void *vd, uint32_t start,
1149 uint32_t incr, uint32_t desc)
1150 {
1151 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1152 uint16_t *d = vd;
1153 for (i = 0; i < opr_sz; i += 1) {
1154 d[H2(i)] = start + i * incr;
1155 }
1156 }
1157
1158 void HELPER(sve_index_s)(void *vd, uint32_t start,
1159 uint32_t incr, uint32_t desc)
1160 {
1161 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1162 uint32_t *d = vd;
1163 for (i = 0; i < opr_sz; i += 1) {
1164 d[H4(i)] = start + i * incr;
1165 }
1166 }
1167
1168 void HELPER(sve_index_d)(void *vd, uint64_t start,
1169 uint64_t incr, uint32_t desc)
1170 {
1171 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1172 uint64_t *d = vd;
1173 for (i = 0; i < opr_sz; i += 1) {
1174 d[i] = start + i * incr;
1175 }
1176 }
1177
1178 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
1179 {
1180 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1181 uint32_t sh = simd_data(desc);
1182 uint32_t *d = vd, *n = vn, *m = vm;
1183 for (i = 0; i < opr_sz; i += 1) {
1184 d[i] = n[i] + (m[i] << sh);
1185 }
1186 }
1187
1188 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
1189 {
1190 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1191 uint64_t sh = simd_data(desc);
1192 uint64_t *d = vd, *n = vn, *m = vm;
1193 for (i = 0; i < opr_sz; i += 1) {
1194 d[i] = n[i] + (m[i] << sh);
1195 }
1196 }
1197
1198 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
1199 {
1200 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1201 uint64_t sh = simd_data(desc);
1202 uint64_t *d = vd, *n = vn, *m = vm;
1203 for (i = 0; i < opr_sz; i += 1) {
1204 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
1205 }
1206 }
1207
1208 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
1209 {
1210 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1211 uint64_t sh = simd_data(desc);
1212 uint64_t *d = vd, *n = vn, *m = vm;
1213 for (i = 0; i < opr_sz; i += 1) {
1214 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
1215 }
1216 }
1217
1218 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
1219 {
1220 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1221 static const uint16_t coeff[] = {
1222 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
1223 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
1224 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
1225 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
1226 };
1227 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1228 uint16_t *d = vd, *n = vn;
1229
1230 for (i = 0; i < opr_sz; i++) {
1231 uint16_t nn = n[i];
1232 intptr_t idx = extract32(nn, 0, 5);
1233 uint16_t exp = extract32(nn, 5, 5);
1234 d[i] = coeff[idx] | (exp << 10);
1235 }
1236 }
1237
1238 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
1239 {
1240 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1241 static const uint32_t coeff[] = {
1242 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
1243 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
1244 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
1245 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
1246 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
1247 0x1ef532, 0x20b051, 0x227043, 0x243516,
1248 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
1249 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
1250 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
1251 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
1252 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
1253 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
1254 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
1255 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
1256 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
1257 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
1258 };
1259 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1260 uint32_t *d = vd, *n = vn;
1261
1262 for (i = 0; i < opr_sz; i++) {
1263 uint32_t nn = n[i];
1264 intptr_t idx = extract32(nn, 0, 6);
1265 uint32_t exp = extract32(nn, 6, 8);
1266 d[i] = coeff[idx] | (exp << 23);
1267 }
1268 }
1269
1270 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
1271 {
1272 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1273 static const uint64_t coeff[] = {
1274 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
1275 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
1276 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
1277 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
1278 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
1279 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
1280 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
1281 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
1282 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
1283 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
1284 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
1285 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
1286 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
1287 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
1288 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
1289 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
1290 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
1291 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
1292 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
1293 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
1294 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
1295 0xFA7C1819E90D8ull,
1296 };
1297 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1298 uint64_t *d = vd, *n = vn;
1299
1300 for (i = 0; i < opr_sz; i++) {
1301 uint64_t nn = n[i];
1302 intptr_t idx = extract32(nn, 0, 6);
1303 uint64_t exp = extract32(nn, 6, 11);
1304 d[i] = coeff[idx] | (exp << 52);
1305 }
1306 }
1307
1308 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
1309 {
1310 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1311 uint16_t *d = vd, *n = vn, *m = vm;
1312 for (i = 0; i < opr_sz; i += 1) {
1313 uint16_t nn = n[i];
1314 uint16_t mm = m[i];
1315 if (mm & 1) {
1316 nn = float16_one;
1317 }
1318 d[i] = nn ^ (mm & 2) << 14;
1319 }
1320 }
1321
1322 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
1323 {
1324 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1325 uint32_t *d = vd, *n = vn, *m = vm;
1326 for (i = 0; i < opr_sz; i += 1) {
1327 uint32_t nn = n[i];
1328 uint32_t mm = m[i];
1329 if (mm & 1) {
1330 nn = float32_one;
1331 }
1332 d[i] = nn ^ (mm & 2) << 30;
1333 }
1334 }
1335
1336 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
1337 {
1338 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1339 uint64_t *d = vd, *n = vn, *m = vm;
1340 for (i = 0; i < opr_sz; i += 1) {
1341 uint64_t nn = n[i];
1342 uint64_t mm = m[i];
1343 if (mm & 1) {
1344 nn = float64_one;
1345 }
1346 d[i] = nn ^ (mm & 2) << 62;
1347 }
1348 }
1349
1350 /*
1351 * Signed saturating addition with scalar operand.
1352 */
1353
1354 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1355 {
1356 intptr_t i, oprsz = simd_oprsz(desc);
1357
1358 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
1359 int r = *(int8_t *)(a + i) + b;
1360 if (r > INT8_MAX) {
1361 r = INT8_MAX;
1362 } else if (r < INT8_MIN) {
1363 r = INT8_MIN;
1364 }
1365 *(int8_t *)(d + i) = r;
1366 }
1367 }
1368
1369 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1370 {
1371 intptr_t i, oprsz = simd_oprsz(desc);
1372
1373 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
1374 int r = *(int16_t *)(a + i) + b;
1375 if (r > INT16_MAX) {
1376 r = INT16_MAX;
1377 } else if (r < INT16_MIN) {
1378 r = INT16_MIN;
1379 }
1380 *(int16_t *)(d + i) = r;
1381 }
1382 }
1383
1384 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1385 {
1386 intptr_t i, oprsz = simd_oprsz(desc);
1387
1388 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
1389 int64_t r = *(int32_t *)(a + i) + b;
1390 if (r > INT32_MAX) {
1391 r = INT32_MAX;
1392 } else if (r < INT32_MIN) {
1393 r = INT32_MIN;
1394 }
1395 *(int32_t *)(d + i) = r;
1396 }
1397 }
1398
1399 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
1400 {
1401 intptr_t i, oprsz = simd_oprsz(desc);
1402
1403 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
1404 int64_t ai = *(int64_t *)(a + i);
1405 int64_t r = ai + b;
1406 if (((r ^ ai) & ~(ai ^ b)) < 0) {
1407 /* Signed overflow. */
1408 r = (r < 0 ? INT64_MAX : INT64_MIN);
1409 }
1410 *(int64_t *)(d + i) = r;
1411 }
1412 }
1413
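/*
 * Illustrative sketch only, not used by the build: the overflow test
 * used above.  Signed overflow can only occur when both addends have
 * the same sign and the sum's sign differs from it, which is exactly
 * when ((r ^ ai) & ~(ai ^ b)) has its sign bit set.  The addition here
 * is done in unsigned arithmetic so the sketch itself cannot overflow.
 */
static inline bool __attribute__((unused))
example_sqaddi_d_overflow_test(void)
{
    int64_t ai = INT64_MAX, b = 1;
    int64_t r = (int64_t)((uint64_t)ai + (uint64_t)b);  /* wraps to INT64_MIN */

    return ((r ^ ai) & ~(ai ^ b)) < 0;   /* overflow detected -> saturate */
}
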
1414 /*
1415 * Unsigned saturating addition with scalar operand.
1416 */
1417
1418 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1419 {
1420 intptr_t i, oprsz = simd_oprsz(desc);
1421
1422 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
1423 int r = *(uint8_t *)(a + i) + b;
1424 if (r > UINT8_MAX) {
1425 r = UINT8_MAX;
1426 } else if (r < 0) {
1427 r = 0;
1428 }
1429 *(uint8_t *)(d + i) = r;
1430 }
1431 }
1432
1433 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1434 {
1435 intptr_t i, oprsz = simd_oprsz(desc);
1436
1437 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
1438 int r = *(uint16_t *)(a + i) + b;
1439 if (r > UINT16_MAX) {
1440 r = UINT16_MAX;
1441 } else if (r < 0) {
1442 r = 0;
1443 }
1444 *(uint16_t *)(d + i) = r;
1445 }
1446 }
1447
1448 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1449 {
1450 intptr_t i, oprsz = simd_oprsz(desc);
1451
1452 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
1453 int64_t r = *(uint32_t *)(a + i) + b;
1454 if (r > UINT32_MAX) {
1455 r = UINT32_MAX;
1456 } else if (r < 0) {
1457 r = 0;
1458 }
1459 *(uint32_t *)(d + i) = r;
1460 }
1461 }
1462
1463 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1464 {
1465 intptr_t i, oprsz = simd_oprsz(desc);
1466
1467 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1468 uint64_t r = *(uint64_t *)(a + i) + b;
1469 if (r < b) {
1470 r = UINT64_MAX;
1471 }
1472 *(uint64_t *)(d + i) = r;
1473 }
1474 }
1475
1476 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1477 {
1478 intptr_t i, oprsz = simd_oprsz(desc);
1479
1480 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1481 uint64_t ai = *(uint64_t *)(a + i);
1482 *(uint64_t *)(d + i) = (ai < b ? 0 : ai - b);
1483 }
1484 }
1485
1486 /* Two operand predicated copy immediate with merge. All valid immediates
1487 * can fit within 17 signed bits in the simd_data field.
1488 */
1489 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
1490 uint64_t mm, uint32_t desc)
1491 {
1492 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1493 uint64_t *d = vd, *n = vn;
1494 uint8_t *pg = vg;
1495
1496 mm = dup_const(MO_8, mm);
1497 for (i = 0; i < opr_sz; i += 1) {
1498 uint64_t nn = n[i];
1499 uint64_t pp = expand_pred_b(pg[H1(i)]);
1500 d[i] = (mm & pp) | (nn & ~pp);
1501 }
1502 }
1503
1504 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
1505 uint64_t mm, uint32_t desc)
1506 {
1507 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1508 uint64_t *d = vd, *n = vn;
1509 uint8_t *pg = vg;
1510
1511 mm = dup_const(MO_16, mm);
1512 for (i = 0; i < opr_sz; i += 1) {
1513 uint64_t nn = n[i];
1514 uint64_t pp = expand_pred_h(pg[H1(i)]);
1515 d[i] = (mm & pp) | (nn & ~pp);
1516 }
1517 }
1518
1519 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
1520 uint64_t mm, uint32_t desc)
1521 {
1522 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1523 uint64_t *d = vd, *n = vn;
1524 uint8_t *pg = vg;
1525
1526 mm = dup_const(MO_32, mm);
1527 for (i = 0; i < opr_sz; i += 1) {
1528 uint64_t nn = n[i];
1529 uint64_t pp = expand_pred_s(pg[H1(i)]);
1530 d[i] = (mm & pp) | (nn & ~pp);
1531 }
1532 }
1533
1534 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
1535 uint64_t mm, uint32_t desc)
1536 {
1537 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1538 uint64_t *d = vd, *n = vn;
1539 uint8_t *pg = vg;
1540
1541 for (i = 0; i < opr_sz; i += 1) {
1542 uint64_t nn = n[i];
1543 d[i] = (pg[H1(i)] & 1 ? mm : nn);
1544 }
1545 }
1546
1547 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
1548 {
1549 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1550 uint64_t *d = vd;
1551 uint8_t *pg = vg;
1552
1553 val = dup_const(MO_8, val);
1554 for (i = 0; i < opr_sz; i += 1) {
1555 d[i] = val & expand_pred_b(pg[H1(i)]);
1556 }
1557 }
1558
1559 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
1560 {
1561 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1562 uint64_t *d = vd;
1563 uint8_t *pg = vg;
1564
1565 val = dup_const(MO_16, val);
1566 for (i = 0; i < opr_sz; i += 1) {
1567 d[i] = val & expand_pred_h(pg[H1(i)]);
1568 }
1569 }
1570
1571 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
1572 {
1573 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1574 uint64_t *d = vd;
1575 uint8_t *pg = vg;
1576
1577 val = dup_const(MO_32, val);
1578 for (i = 0; i < opr_sz; i += 1) {
1579 d[i] = val & expand_pred_s(pg[H1(i)]);
1580 }
1581 }
1582
1583 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
1584 {
1585 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1586 uint64_t *d = vd;
1587 uint8_t *pg = vg;
1588
1589 for (i = 0; i < opr_sz; i += 1) {
1590 d[i] = (pg[H1(i)] & 1 ? val : 0);
1591 }
1592 }
1593
1594 /* Big-endian hosts need to frob the byte indices. If the copy
1595 * happens to be 8-byte aligned, then no frobbing necessary.
1596 */
1597 static void swap_memmove(void *vd, void *vs, size_t n)
1598 {
1599 uintptr_t d = (uintptr_t)vd;
1600 uintptr_t s = (uintptr_t)vs;
1601 uintptr_t o = (d | s | n) & 7;
1602 size_t i;
1603
1604 #ifndef HOST_WORDS_BIGENDIAN
1605 o = 0;
1606 #endif
1607 switch (o) {
1608 case 0:
1609 memmove(vd, vs, n);
1610 break;
1611
1612 case 4:
1613 if (d < s || d >= s + n) {
1614 for (i = 0; i < n; i += 4) {
1615 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1616 }
1617 } else {
1618 for (i = n; i > 0; ) {
1619 i -= 4;
1620 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1621 }
1622 }
1623 break;
1624
1625 case 2:
1626 case 6:
1627 if (d < s || d >= s + n) {
1628 for (i = 0; i < n; i += 2) {
1629 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1630 }
1631 } else {
1632 for (i = n; i > 0; ) {
1633 i -= 2;
1634 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1635 }
1636 }
1637 break;
1638
1639 default:
1640 if (d < s || d >= s + n) {
1641 for (i = 0; i < n; i++) {
1642 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1643 }
1644 } else {
1645 for (i = n; i > 0; ) {
1646 i -= 1;
1647 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1648 }
1649 }
1650 break;
1651 }
1652 }
1653
1654 /* Similarly for memset of 0. */
1655 static void swap_memzero(void *vd, size_t n)
1656 {
1657 uintptr_t d = (uintptr_t)vd;
1658 uintptr_t o = (d | n) & 7;
1659 size_t i;
1660
1661 /* Usually, the first bit of a predicate is set, so N is 0. */
1662 if (likely(n == 0)) {
1663 return;
1664 }
1665
1666 #ifndef HOST_WORDS_BIGENDIAN
1667 o = 0;
1668 #endif
1669 switch (o) {
1670 case 0:
1671 memset(vd, 0, n);
1672 break;
1673
1674 case 4:
1675 for (i = 0; i < n; i += 4) {
1676 *(uint32_t *)H1_4(d + i) = 0;
1677 }
1678 break;
1679
1680 case 2:
1681 case 6:
1682 for (i = 0; i < n; i += 2) {
1683 *(uint16_t *)H1_2(d + i) = 0;
1684 }
1685 break;
1686
1687 default:
1688 for (i = 0; i < n; i++) {
1689 *(uint8_t *)H1(d + i) = 0;
1690 }
1691 break;
1692 }
1693 }
1694
1695 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
1696 {
1697 intptr_t opr_sz = simd_oprsz(desc);
1698 size_t n_ofs = simd_data(desc);
1699 size_t n_siz = opr_sz - n_ofs;
1700
1701 if (vd != vm) {
1702 swap_memmove(vd, vn + n_ofs, n_siz);
1703 swap_memmove(vd + n_siz, vm, n_ofs);
1704 } else if (vd != vn) {
1705 swap_memmove(vd + n_siz, vd, n_ofs);
1706 swap_memmove(vd, vn + n_ofs, n_siz);
1707 } else {
1708 /* vd == vn == vm. Need temp space. */
1709 ARMVectorReg tmp;
1710 swap_memmove(&tmp, vm, n_ofs);
1711 swap_memmove(vd, vd + n_ofs, n_siz);
1712 memcpy(vd + n_siz, &tmp, n_ofs);
1713 }
1714 }
1715
1716 #define DO_INSR(NAME, TYPE, H) \
1717 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
1718 { \
1719 intptr_t opr_sz = simd_oprsz(desc); \
1720 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
1721 *(TYPE *)(vd + H(0)) = val; \
1722 }
1723
1724 DO_INSR(sve_insr_b, uint8_t, H1)
1725 DO_INSR(sve_insr_h, uint16_t, H1_2)
1726 DO_INSR(sve_insr_s, uint32_t, H1_4)
1727 DO_INSR(sve_insr_d, uint64_t, )
1728
1729 #undef DO_INSR
1730
1731 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
1732 {
1733 intptr_t i, j, opr_sz = simd_oprsz(desc);
1734 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1735 uint64_t f = *(uint64_t *)(vn + i);
1736 uint64_t b = *(uint64_t *)(vn + j);
1737 *(uint64_t *)(vd + i) = bswap64(b);
1738 *(uint64_t *)(vd + j) = bswap64(f);
1739 }
1740 }
1741
1742 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
1743 {
1744 intptr_t i, j, opr_sz = simd_oprsz(desc);
1745 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1746 uint64_t f = *(uint64_t *)(vn + i);
1747 uint64_t b = *(uint64_t *)(vn + j);
1748 *(uint64_t *)(vd + i) = hswap64(b);
1749 *(uint64_t *)(vd + j) = hswap64(f);
1750 }
1751 }
1752
1753 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
1754 {
1755 intptr_t i, j, opr_sz = simd_oprsz(desc);
1756 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1757 uint64_t f = *(uint64_t *)(vn + i);
1758 uint64_t b = *(uint64_t *)(vn + j);
1759 *(uint64_t *)(vd + i) = rol64(b, 32);
1760 *(uint64_t *)(vd + j) = rol64(f, 32);
1761 }
1762 }
1763
1764 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
1765 {
1766 intptr_t i, j, opr_sz = simd_oprsz(desc);
1767 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1768 uint64_t f = *(uint64_t *)(vn + i);
1769 uint64_t b = *(uint64_t *)(vn + j);
1770 *(uint64_t *)(vd + i) = b;
1771 *(uint64_t *)(vd + j) = f;
1772 }
1773 }
1774
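/*
 * TBL: each element of Zm is an index selecting an element of Zn;
 * indices at or beyond the number of elements produce zero.
 */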
1775 #define DO_TBL(NAME, TYPE, H) \
1776 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1777 { \
1778 intptr_t i, opr_sz = simd_oprsz(desc); \
1779 uintptr_t elem = opr_sz / sizeof(TYPE); \
1780 TYPE *d = vd, *n = vn, *m = vm; \
1781 ARMVectorReg tmp; \
1782 if (unlikely(vd == vn)) { \
1783 n = memcpy(&tmp, vn, opr_sz); \
1784 } \
1785 for (i = 0; i < elem; i++) { \
1786 TYPE j = m[H(i)]; \
1787 d[H(i)] = j < elem ? n[H(j)] : 0; \
1788 } \
1789 }
1790
1791 DO_TBL(sve_tbl_b, uint8_t, H1)
1792 DO_TBL(sve_tbl_h, uint16_t, H2)
1793 DO_TBL(sve_tbl_s, uint32_t, H4)
1794 DO_TBL(sve_tbl_d, uint64_t, )
1795
1796 #undef DO_TBL
1797
1798 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
1799 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1800 { \
1801 intptr_t i, opr_sz = simd_oprsz(desc); \
1802 TYPED *d = vd; \
1803 TYPES *n = vn; \
1804 ARMVectorReg tmp; \
1805 if (unlikely(vn - vd < opr_sz)) { \
1806 n = memcpy(&tmp, n, opr_sz / 2); \
1807 } \
1808 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
1809 d[HD(i)] = n[HS(i)]; \
1810 } \
1811 }
1812
1813 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
1814 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
1815 DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)
1816
1817 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
1818 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
1819 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
1820
1821 #undef DO_UNPK
1822
1823 /* Mask of bits included in the even-numbered elements of a predicate
1824 * of element width esz. We also use this for expand_bits/compress_bits,
1825 * and so extend the same pattern out to 16-bit units.
1826 */
1827 static const uint64_t even_bit_esz_masks[5] = {
1828 0x5555555555555555ull,
1829 0x3333333333333333ull,
1830 0x0f0f0f0f0f0f0f0full,
1831 0x00ff00ff00ff00ffull,
1832 0x0000ffff0000ffffull,
1833 };
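/*
 * For example, index 0 (0x5555...) selects the even-numbered byte
 * elements, while index 3 (0x00ff00ff...) selects the even-numbered
 * doubleword elements, each of which spans eight predicate bits.
 */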
1834
1835 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
1836 * For N==0, this corresponds to the operation that in qemu/bitops.h
1837 * we call half_shuffle64; this algorithm is from Hacker's Delight,
1838 * section 7-2 Shuffling Bits.
1839 */
1840 static uint64_t expand_bits(uint64_t x, int n)
1841 {
1842 int i;
1843
1844 x &= 0xffffffffu;
1845 for (i = 4; i >= n; i--) {
1846 int sh = 1 << i;
1847 x = ((x << sh) | x) & even_bit_esz_masks[i];
1848 }
1849 return x;
1850 }
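/*
 * For example, expand_bits(0x0b, 0) == 0x45: input bits 0, 1 and 3
 * move to output bits 0, 2 and 6, with zeros interleaved between them.
 */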
1851
1852 /* Compress units of 2**(N+1) bits to units of 2**N bits.
1853 * For N==0, this corresponds to the operation that in qemu/bitops.h
1854 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
1855 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
1856 */
1857 static uint64_t compress_bits(uint64_t x, int n)
1858 {
1859 int i;
1860
1861 for (i = n; i <= 4; i++) {
1862 int sh = 1 << i;
1863 x &= even_bit_esz_masks[i];
1864 x = (x >> sh) | x;
1865 }
1866 return x & 0xffffffffu;
1867 }
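/*
 * This gathers the bits held in the even-numbered units; it inverts
 * expand_bits on its outputs, e.g. compress_bits(0x45, 0) == 0x0b.
 */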
1868
1869 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1870 {
1871 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
1872 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
1873 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
1874 uint64_t *d = vd;
1875 intptr_t i;
1876
1877 if (oprsz <= 8) {
1878 uint64_t nn = *(uint64_t *)vn;
1879 uint64_t mm = *(uint64_t *)vm;
1880 int half = 4 * oprsz;
1881
1882 nn = extract64(nn, high * half, half);
1883 mm = extract64(mm, high * half, half);
1884 nn = expand_bits(nn, esz);
1885 mm = expand_bits(mm, esz);
1886 d[0] = nn + (mm << (1 << esz));
1887 } else {
1888 ARMPredicateReg tmp_n, tmp_m;
1889
1890 /* We produce output faster than we consume input.
1891 Therefore we must be mindful of possible overlap. */
1892 if ((vn - vd) < (uintptr_t)oprsz) {
1893 vn = memcpy(&tmp_n, vn, oprsz);
1894 }
1895 if ((vm - vd) < (uintptr_t)oprsz) {
1896 vm = memcpy(&tmp_m, vm, oprsz);
1897 }
1898 if (high) {
1899 high = oprsz >> 1;
1900 }
1901
1902 if ((high & 3) == 0) {
1903 uint32_t *n = vn, *m = vm;
1904 high >>= 2;
1905
1906 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
1907 uint64_t nn = n[H4(high + i)];
1908 uint64_t mm = m[H4(high + i)];
1909
1910 nn = expand_bits(nn, esz);
1911 mm = expand_bits(mm, esz);
1912 d[i] = nn + (mm << (1 << esz));
1913 }
1914 } else {
1915 uint8_t *n = vn, *m = vm;
1916 uint16_t *d16 = vd;
1917
1918 for (i = 0; i < oprsz / 2; i++) {
1919 uint16_t nn = n[H1(high + i)];
1920 uint16_t mm = m[H1(high + i)];
1921
1922 nn = expand_bits(nn, esz);
1923 mm = expand_bits(mm, esz);
1924 d16[H2(i)] = nn + (mm << (1 << esz));
1925 }
1926 }
1927 }
1928 }
1929
1930 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1931 {
1932 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
1933 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
1934 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
1935 uint64_t *d = vd, *n = vn, *m = vm;
1936 uint64_t l, h;
1937 intptr_t i;
1938
1939 if (oprsz <= 8) {
1940 l = compress_bits(n[0] >> odd, esz);
1941 h = compress_bits(m[0] >> odd, esz);
1942 d[0] = l | (h << (4 * oprsz));
1943 } else {
1944 ARMPredicateReg tmp_m;
1945 intptr_t oprsz_16 = oprsz / 16;
1946
1947 if ((vm - vd) < (uintptr_t)oprsz) {
1948 m = memcpy(&tmp_m, vm, oprsz);
1949 }
1950
1951 for (i = 0; i < oprsz_16; i++) {
1952 l = n[2 * i + 0];
1953 h = n[2 * i + 1];
1954 l = compress_bits(l >> odd, esz);
1955 h = compress_bits(h >> odd, esz);
1956 d[i] = l | (h << 32);
1957 }
1958
1959 /*
1960 * For VL which is not a multiple of 512, the results from M do not
1961 * align nicely with the uint64_t for D. Put the aligned results
1962 * from M into TMP_M and then copy it into place afterward.
1963 */
1964 if (oprsz & 15) {
1965 int final_shift = (oprsz & 15) * 2;
1966
1967 l = n[2 * i + 0];
1968 h = n[2 * i + 1];
1969 l = compress_bits(l >> odd, esz);
1970 h = compress_bits(h >> odd, esz);
1971 d[i] = l | (h << final_shift);
1972
1973 for (i = 0; i < oprsz_16; i++) {
1974 l = m[2 * i + 0];
1975 h = m[2 * i + 1];
1976 l = compress_bits(l >> odd, esz);
1977 h = compress_bits(h >> odd, esz);
1978 tmp_m.p[i] = l | (h << 32);
1979 }
1980 l = m[2 * i + 0];
1981 h = m[2 * i + 1];
1982 l = compress_bits(l >> odd, esz);
1983 h = compress_bits(h >> odd, esz);
1984 tmp_m.p[i] = l | (h << final_shift);
1985
1986 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
1987 } else {
1988 for (i = 0; i < oprsz_16; i++) {
1989 l = m[2 * i + 0];
1990 h = m[2 * i + 1];
1991 l = compress_bits(l >> odd, esz);
1992 h = compress_bits(h >> odd, esz);
1993 d[oprsz_16 + i] = l | (h << 32);
1994 }
1995 }
1996 }
1997 }
1998
1999 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
2000 {
2001 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2002 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
2003 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
2004 uint64_t *d = vd, *n = vn, *m = vm;
2005 uint64_t mask;
2006 int shr, shl;
2007 intptr_t i;
2008
2009 shl = 1 << esz;
2010 shr = 0;
2011 mask = even_bit_esz_masks[esz];
2012 if (odd) {
2013 mask <<= shl;
2014 shr = shl;
2015 shl = 0;
2016 }
2017
2018 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2019 uint64_t nn = (n[i] & mask) >> shr;
2020 uint64_t mm = (m[i] & mask) << shl;
2021 d[i] = nn + mm;
2022 }
2023 }
2024
2025 /* Reverse units of 2**N bits. */
2026 static uint64_t reverse_bits_64(uint64_t x, int n)
2027 {
2028 int i, sh;
2029
2030 x = bswap64(x);
2031 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2032 uint64_t mask = even_bit_esz_masks[i];
2033 x = ((x & mask) << sh) | ((x >> sh) & mask);
2034 }
2035 return x;
2036 }
2037
2038 static uint8_t reverse_bits_8(uint8_t x, int n)
2039 {
2040 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
2041 int i, sh;
2042
2043 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2044 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
2045 }
2046 return x;
2047 }
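/*
 * For example, reverse_bits_8(0x01, 0) == 0x80 (a full bit reversal),
 * while N == 3 leaves the byte unchanged, the byte itself being the
 * only 8-bit unit.
 */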
2048
2049 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
2050 {
2051 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2052 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
2053 intptr_t i, oprsz_2 = oprsz / 2;
2054
2055 if (oprsz <= 8) {
2056 uint64_t l = *(uint64_t *)vn;
2057 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
2058 *(uint64_t *)vd = l;
2059 } else if ((oprsz & 15) == 0) {
2060 for (i = 0; i < oprsz_2; i += 8) {
2061 intptr_t ih = oprsz - 8 - i;
2062 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
2063 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
2064 *(uint64_t *)(vd + i) = h;
2065 *(uint64_t *)(vd + ih) = l;
2066 }
2067 } else {
2068 for (i = 0; i < oprsz_2; i += 1) {
2069 intptr_t il = H1(i);
2070 intptr_t ih = H1(oprsz - 1 - i);
2071 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
2072 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
2073 *(uint8_t *)(vd + il) = h;
2074 *(uint8_t *)(vd + ih) = l;
2075 }
2076 }
2077 }
2078
2079 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
2080 {
2081 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2082 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
2083 uint64_t *d = vd;
2084 intptr_t i;
2085
2086 if (oprsz <= 8) {
2087 uint64_t nn = *(uint64_t *)vn;
2088 int half = 4 * oprsz;
2089
2090 nn = extract64(nn, high * half, half);
2091 nn = expand_bits(nn, 0);
2092 d[0] = nn;
2093 } else {
2094 ARMPredicateReg tmp_n;
2095
2096 /* We produce output faster than we consume input.
2097 Therefore we must be mindful of possible overlap. */
2098 if ((vn - vd) < (uintptr_t)oprsz) {
2099 vn = memcpy(&tmp_n, vn, oprsz);
2100 }
2101 if (high) {
2102 high = oprsz >> 1;
2103 }
2104
2105 if ((high & 3) == 0) {
2106 uint32_t *n = vn;
2107 high >>= 2;
2108
2109 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2110 uint64_t nn = n[H4(high + i)];
2111 d[i] = expand_bits(nn, 0);
2112 }
2113 } else {
2114 uint16_t *d16 = vd;
2115 uint8_t *n = vn;
2116
2117 for (i = 0; i < oprsz / 2; i++) {
2118 uint16_t nn = n[H1(high + i)];
2119 d16[H2(i)] = expand_bits(nn, 0);
2120 }
2121 }
2122 }
2123 }
2124
2125 #define DO_ZIP(NAME, TYPE, H) \
2126 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2127 { \
2128 intptr_t oprsz = simd_oprsz(desc); \
2129 intptr_t i, oprsz_2 = oprsz / 2; \
2130 ARMVectorReg tmp_n, tmp_m; \
2131 /* We produce output faster than we consume input. \
2132 Therefore we must be mindful of possible overlap. */ \
2133 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
2134 vn = memcpy(&tmp_n, vn, oprsz_2); \
2135 } \
2136 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2137 vm = memcpy(&tmp_m, vm, oprsz_2); \
2138 } \
2139 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2140 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
2141 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
2142 } \
2143 }
2144
2145 DO_ZIP(sve_zip_b, uint8_t, H1)
2146 DO_ZIP(sve_zip_h, uint16_t, H1_2)
2147 DO_ZIP(sve_zip_s, uint32_t, H1_4)
2148 DO_ZIP(sve_zip_d, uint64_t, )
2149
2150 #define DO_UZP(NAME, TYPE, H) \
2151 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2152 { \
2153 intptr_t oprsz = simd_oprsz(desc); \
2154 intptr_t oprsz_2 = oprsz / 2; \
2155 intptr_t odd_ofs = simd_data(desc); \
2156 intptr_t i; \
2157 ARMVectorReg tmp_m; \
2158 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2159 vm = memcpy(&tmp_m, vm, oprsz); \
2160 } \
2161 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2162 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs)); \
2163 } \
2164 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2165 *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
2166 } \
2167 }
2168
2169 DO_UZP(sve_uzp_b, uint8_t, H1)
2170 DO_UZP(sve_uzp_h, uint16_t, H1_2)
2171 DO_UZP(sve_uzp_s, uint32_t, H1_4)
2172 DO_UZP(sve_uzp_d, uint64_t, )
2173
2174 #define DO_TRN(NAME, TYPE, H) \
2175 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2176 { \
2177 intptr_t oprsz = simd_oprsz(desc); \
2178 intptr_t odd_ofs = simd_data(desc); \
2179 intptr_t i; \
2180 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
2181 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
2182 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
2183 *(TYPE *)(vd + H(i + 0)) = ae; \
2184 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
2185 } \
2186 }
2187
2188 DO_TRN(sve_trn_b, uint8_t, H1)
2189 DO_TRN(sve_trn_h, uint16_t, H1_2)
2190 DO_TRN(sve_trn_s, uint32_t, H1_4)
2191 DO_TRN(sve_trn_d, uint64_t, )
2192
2193 #undef DO_ZIP
2194 #undef DO_UZP
2195 #undef DO_TRN
2196
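/*
 * COMPACT: copy the active elements of Zn to the low end of Zd and
 * zero the remaining elements.
 */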
2197 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
2198 {
2199 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
2200 uint32_t *d = vd, *n = vn;
2201 uint8_t *pg = vg;
2202
2203 for (i = j = 0; i < opr_sz; i++) {
2204 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
2205 d[H4(j)] = n[H4(i)];
2206 j++;
2207 }
2208 }
2209 for (; j < opr_sz; j++) {
2210 d[H4(j)] = 0;
2211 }
2212 }
2213
2214 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
2215 {
2216 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
2217 uint64_t *d = vd, *n = vn;
2218 uint8_t *pg = vg;
2219
2220 for (i = j = 0; i < opr_sz; i++) {
2221 if (pg[H1(i)] & 1) {
2222 d[j] = n[i];
2223 j++;
2224 }
2225 }
2226 for (; j < opr_sz; j++) {
2227 d[j] = 0;
2228 }
2229 }
2230
2231 /* Similar to the ARM LastActiveElement pseudocode function, except the
2232 * result is multiplied by the element size. This includes the not found
2233 * indication; e.g. not found for esz=3 is -8.
2234 */
2235 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
2236 {
2237 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2238 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2239
2240 return last_active_element(vg, DIV_ROUND_UP(oprsz, 8), esz);
2241 }
2242
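/*
 * SPLICE: copy the bytes of Zn spanning the first through last active
 * elements of Pg to the bottom of Zd, then fill the rest of Zd from
 * the start of Zm.
 */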
2243 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
2244 {
2245 intptr_t opr_sz = simd_oprsz(desc) / 8;
2246 int esz = simd_data(desc);
2247 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
2248 intptr_t i, first_i, last_i;
2249 ARMVectorReg tmp;
2250
2251 first_i = last_i = 0;
2252 first_g = last_g = 0;
2253
2254 /* Find the extent of the active elements within VG. */
2255 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
2256 pg = *(uint64_t *)(vg + i) & mask;
2257 if (pg) {
2258 if (last_g == 0) {
2259 last_g = pg;
2260 last_i = i;
2261 }
2262 first_g = pg;
2263 first_i = i;
2264 }
2265 }
2266
2267 len = 0;
2268 if (first_g != 0) {
2269 first_i = first_i * 8 + ctz64(first_g);
2270 last_i = last_i * 8 + 63 - clz64(last_g);
2271 len = last_i - first_i + (1 << esz);
2272 if (vd == vm) {
2273 vm = memcpy(&tmp, vm, opr_sz * 8);
2274 }
2275 swap_memmove(vd, vn + first_i, len);
2276 }
2277 swap_memmove(vd + len, vm, opr_sz * 8 - len);
2278 }
2279
2280 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
2281 void *vg, uint32_t desc)
2282 {
2283 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2284 uint64_t *d = vd, *n = vn, *m = vm;
2285 uint8_t *pg = vg;
2286
2287 for (i = 0; i < opr_sz; i += 1) {
2288 uint64_t nn = n[i], mm = m[i];
2289 uint64_t pp = expand_pred_b(pg[H1(i)]);
2290 d[i] = (nn & pp) | (mm & ~pp);
2291 }
2292 }
2293
2294 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
2295 void *vg, uint32_t desc)
2296 {
2297 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2298 uint64_t *d = vd, *n = vn, *m = vm;
2299 uint8_t *pg = vg;
2300
2301 for (i = 0; i < opr_sz; i += 1) {
2302 uint64_t nn = n[i], mm = m[i];
2303 uint64_t pp = expand_pred_h(pg[H1(i)]);
2304 d[i] = (nn & pp) | (mm & ~pp);
2305 }
2306 }
2307
2308 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
2309 void *vg, uint32_t desc)
2310 {
2311 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2312 uint64_t *d = vd, *n = vn, *m = vm;
2313 uint8_t *pg = vg;
2314
2315 for (i = 0; i < opr_sz; i += 1) {
2316 uint64_t nn = n[i], mm = m[i];
2317 uint64_t pp = expand_pred_s(pg[H1(i)]);
2318 d[i] = (nn & pp) | (mm & ~pp);
2319 }
2320 }
2321
2322 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
2323 void *vg, uint32_t desc)
2324 {
2325 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2326 uint64_t *d = vd, *n = vn, *m = vm;
2327 uint8_t *pg = vg;
2328
2329 for (i = 0; i < opr_sz; i += 1) {
2330 uint64_t nn = n[i], mm = m[i];
2331 d[i] = (pg[H1(i)] & 1 ? nn : mm);
2332 }
2333 }
2334
2335 /* Two operand comparison controlled by a predicate.
2336 * ??? It is very tempting to want to be able to expand this inline
2337 * with x86 instructions, e.g.
2338 *
2339 * vcmpeqw zm, zn, %ymm0
2340 * vpmovmskb %ymm0, %eax
2341 * and $0x5555, %eax
2342 * and pg, %eax
2343 *
2344 * or even aarch64, e.g.
2345 *
2346 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
2347 * cmeq v0.8h, zn, zm
2348 * and v0.8h, v0.8h, mask
2349 * addv h0, v0.8h
2350 * and v0.8b, pg
2351 *
2352 * However, coming up with an abstraction that allows vector inputs and
2353 * a scalar output, and also handles the byte-ordering of sub-uint64_t
2354 * scalar outputs, is tricky.
2355 */
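/*
 * Within each 64-byte block of input the inner loop below accumulates
 * one result bit per element into OUT, placed at the predicate bit of
 * the element's first byte; the other bits of each element-sized group
 * are left zero, and the whole word is then masked with PG.
 */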
2356 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
2357 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2358 { \
2359 intptr_t opr_sz = simd_oprsz(desc); \
2360 uint32_t flags = PREDTEST_INIT; \
2361 intptr_t i = opr_sz; \
2362 do { \
2363 uint64_t out = 0, pg; \
2364 do { \
2365 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2366 TYPE nn = *(TYPE *)(vn + H(i)); \
2367 TYPE mm = *(TYPE *)(vm + H(i)); \
2368 out |= nn OP mm; \
2369 } while (i & 63); \
2370 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2371 out &= pg; \
2372 *(uint64_t *)(vd + (i >> 3)) = out; \
2373 flags = iter_predtest_bwd(out, pg, flags); \
2374 } while (i > 0); \
2375 return flags; \
2376 }
2377
2378 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
2379 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2380 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
2381 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2382 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
2383 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2384 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
2385 DO_CMP_PPZZ(NAME, TYPE, OP, , 0x0101010101010101ull)
2386
2387 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
2388 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
2389 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
2390 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
2391
2392 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
2393 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
2394 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
2395 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
2396
2397 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
2398 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
2399 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
2400 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
2401
2402 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
2403 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
2404 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
2405 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
2406
2407 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
2408 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
2409 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
2410 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
2411
2412 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
2413 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
2414 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
2415 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
2416
2417 #undef DO_CMP_PPZZ_B
2418 #undef DO_CMP_PPZZ_H
2419 #undef DO_CMP_PPZZ_S
2420 #undef DO_CMP_PPZZ_D
2421 #undef DO_CMP_PPZZ
2422
2423 /* Similar, but the second source is "wide". */
2424 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
2425 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2426 { \
2427 intptr_t opr_sz = simd_oprsz(desc); \
2428 uint32_t flags = PREDTEST_INIT; \
2429 intptr_t i = opr_sz; \
2430 do { \
2431 uint64_t out = 0, pg; \
2432 do { \
2433 TYPEW mm = *(TYPEW *)(vm + i - 8); \
2434 do { \
2435 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2436 TYPE nn = *(TYPE *)(vn + H(i)); \
2437 out |= nn OP mm; \
2438 } while (i & 7); \
2439 } while (i & 63); \
2440 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2441 out &= pg; \
2442 *(uint64_t *)(vd + (i >> 3)) = out; \
2443 flags = iter_predtest_bwd(out, pg, flags); \
2444 } while (i > 0); \
2445 return flags; \
2446 }
2447
2448 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
2449 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
2450 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
2451 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
2452 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
2453 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
2454
2455 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
2456 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
2457 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
2458
2459 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
2460 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
2461 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
2462
2463 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
2464 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
2465 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
2466
2467 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
2468 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
2469 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
2470
2471 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
2472 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
2473 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
2474
2475 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
2476 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
2477 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
2478
2479 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
2480 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
2481 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
2482
2483 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
2484 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
2485 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
2486
2487 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
2488 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
2489 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
2490
2491 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
2492 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
2493 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
2494
2495 #undef DO_CMP_PPZW_B
2496 #undef DO_CMP_PPZW_H
2497 #undef DO_CMP_PPZW_S
2498 #undef DO_CMP_PPZW
2499
2500 /* Similar, but the second source is immediate. */
2501 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
2502 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2503 { \
2504 intptr_t opr_sz = simd_oprsz(desc); \
2505 uint32_t flags = PREDTEST_INIT; \
2506 TYPE mm = simd_data(desc); \
2507 intptr_t i = opr_sz; \
2508 do { \
2509 uint64_t out = 0, pg; \
2510 do { \
2511 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2512 TYPE nn = *(TYPE *)(vn + H(i)); \
2513 out |= nn OP mm; \
2514 } while (i & 63); \
2515 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2516 out &= pg; \
2517 *(uint64_t *)(vd + (i >> 3)) = out; \
2518 flags = iter_predtest_bwd(out, pg, flags); \
2519 } while (i > 0); \
2520 return flags; \
2521 }
2522
2523 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
2524 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2525 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
2526 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2527 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
2528 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2529 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
2530 DO_CMP_PPZI(NAME, TYPE, OP, , 0x0101010101010101ull)
2531
2532 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
2533 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
2534 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
2535 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
2536
2537 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
2538 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
2539 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
2540 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
2541
2542 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
2543 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
2544 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
2545 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
2546
2547 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
2548 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
2549 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
2550 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
2551
2552 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
2553 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
2554 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
2555 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
2556
2557 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
2558 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
2559 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
2560 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
2561
2562 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
2563 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
2564 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
2565 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
2566
2567 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
2568 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
2569 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
2570 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
2571
2572 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
2573 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
2574 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
2575 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
2576
2577 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
2578 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
2579 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
2580 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
2581
2582 #undef DO_CMP_PPZI_B
2583 #undef DO_CMP_PPZI_H
2584 #undef DO_CMP_PPZI_S
2585 #undef DO_CMP_PPZI_D
2586 #undef DO_CMP_PPZI
2587
2588 /* Similar to the ARM LastActive pseudocode function. */
2589 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
2590 {
2591 intptr_t i;
2592
2593 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
2594 uint64_t pg = *(uint64_t *)(vg + i);
2595 if (pg) {
2596 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
2597 }
2598 }
2599 return 0;
2600 }
2601
2602 /* Compute a mask into RETB that is true for all G, up to and including
2603 * (if after) or excluding (if !after) the first G & N.
2604 * Return true if BRK found.
2605 */
2606 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
2607 bool brk, bool after)
2608 {
2609 uint64_t b;
2610
2611 if (brk) {
2612 b = 0;
2613 } else if ((g & n) == 0) {
2614 /* For all G, no N are set; break not found. */
2615 b = g;
2616 } else {
2617 /* Break somewhere in N. Locate it. */
2618 b = g & n; /* guard true, pred true */
2619 b = b & -b; /* first such */
2620 if (after) {
2621 b = b | (b - 1); /* break after same */
2622 } else {
2623 b = b - 1; /* break before same */
2624 }
2625 brk = true;
2626 }
2627
2628 *retb = b;
2629 return brk;
2630 }
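/*
 * For example, with BRK clear, G = 0xff and N = 0x10, the first bit set
 * in both G and N is bit 4, so B becomes 0x1f for break-after and 0x0f
 * for break-before.
 */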
2631
2632 /* Compute a zeroing BRK. */
2633 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
2634 intptr_t oprsz, bool after)
2635 {
2636 bool brk = false;
2637 intptr_t i;
2638
2639 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2640 uint64_t this_b, this_g = g[i];
2641
2642 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2643 d[i] = this_b & this_g;
2644 }
2645 }
2646
2647 /* Likewise, but also compute flags. */
2648 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
2649 intptr_t oprsz, bool after)
2650 {
2651 uint32_t flags = PREDTEST_INIT;
2652 bool brk = false;
2653 intptr_t i;
2654
2655 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2656 uint64_t this_b, this_d, this_g = g[i];
2657
2658 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2659 d[i] = this_d = this_b & this_g;
2660 flags = iter_predtest_fwd(this_d, this_g, flags);
2661 }
2662 return flags;
2663 }
2664
2665 /* Compute a merging BRK. */
2666 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
2667 intptr_t oprsz, bool after)
2668 {
2669 bool brk = false;
2670 intptr_t i;
2671
2672 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2673 uint64_t this_b, this_g = g[i];
2674
2675 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2676 d[i] = (this_b & this_g) | (d[i] & ~this_g);
2677 }
2678 }
2679
2680 /* Likewise, but also compute flags. */
2681 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
2682 intptr_t oprsz, bool after)
2683 {
2684 uint32_t flags = PREDTEST_INIT;
2685 bool brk = false;
2686 intptr_t i;
2687
2688 for (i = 0; i < oprsz / 8; ++i) {
2689 uint64_t this_b, this_d = d[i], this_g = g[i];
2690
2691 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2692 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
2693 flags = iter_predtest_fwd(this_d, this_g, flags);
2694 }
2695 return flags;
2696 }
2697
2698 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
2699 {
2700 /* It is quicker to zero the whole predicate than loop on OPRSZ.
2701 * The compiler should turn this into 4 64-bit integer stores.
2702 */
2703 memset(d, 0, sizeof(ARMPredicateReg));
2704 return PREDTEST_INIT;
2705 }
2706
2707 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
2708 uint32_t pred_desc)
2709 {
2710 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2711 if (last_active_pred(vn, vg, oprsz)) {
2712 compute_brk_z(vd, vm, vg, oprsz, true);
2713 } else {
2714 do_zero(vd, oprsz);
2715 }
2716 }
2717
2718 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
2719 uint32_t pred_desc)
2720 {
2721 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2722 if (last_active_pred(vn, vg, oprsz)) {
2723 return compute_brks_z(vd, vm, vg, oprsz, true);
2724 } else {
2725 return do_zero(vd, oprsz);
2726 }
2727 }
2728
2729 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
2730 uint32_t pred_desc)
2731 {
2732 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2733 if (last_active_pred(vn, vg, oprsz)) {
2734 compute_brk_z(vd, vm, vg, oprsz, false);
2735 } else {
2736 do_zero(vd, oprsz);
2737 }
2738 }
2739
2740 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
2741 uint32_t pred_desc)
2742 {
2743 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2744 if (last_active_pred(vn, vg, oprsz)) {
2745 return compute_brks_z(vd, vm, vg, oprsz, false);
2746 } else {
2747 return do_zero(vd, oprsz);
2748 }
2749 }
2750
2751 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2752 {
2753 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2754 compute_brk_z(vd, vn, vg, oprsz, true);
2755 }
2756
2757 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2758 {
2759 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2760 return compute_brks_z(vd, vn, vg, oprsz, true);
2761 }
2762
2763 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2764 {
2765 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2766 compute_brk_z(vd, vn, vg, oprsz, false);
2767 }
2768
2769 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2770 {
2771 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2772 return compute_brks_z(vd, vn, vg, oprsz, false);
2773 }
2774
2775 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2776 {
2777 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2778 compute_brk_m(vd, vn, vg, oprsz, true);
2779 }
2780
2781 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2782 {
2783 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2784 return compute_brks_m(vd, vn, vg, oprsz, true);
2785 }
2786
2787 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2788 {
2789 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2790 compute_brk_m(vd, vn, vg, oprsz, false);
2791 }
2792
2793 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2794 {
2795 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2796 return compute_brks_m(vd, vn, vg, oprsz, false);
2797 }
2798
2799 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2800 {
2801 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2802
2803 if (!last_active_pred(vn, vg, oprsz)) {
2804 do_zero(vd, oprsz);
2805 }
2806 }
2807
2808 /* As if PredTest(Ones(PL), D, esz). */
2809 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
2810 uint64_t esz_mask)
2811 {
2812 uint32_t flags = PREDTEST_INIT;
2813 intptr_t i;
2814
2815 for (i = 0; i < oprsz / 8; i++) {
2816 flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
2817 }
2818 if (oprsz & 7) {
2819 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
2820 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
2821 }
2822 return flags;
2823 }
2824
2825 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2826 {
2827 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2828
2829 if (last_active_pred(vn, vg, oprsz)) {
2830 return predtest_ones(vd, oprsz, -1);
2831 } else {
2832 return do_zero(vd, oprsz);
2833 }
2834 }
2835
2836 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
2837 {
2838 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2839 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2840 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
2841 intptr_t i;
2842
2843 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2844 uint64_t t = n[i] & g[i] & mask;
2845 sum += ctpop64(t);
2846 }
2847 return sum;
2848 }
2849
2850 uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc)
2851 {
2852 uintptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2853 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2854 uint64_t esz_mask = pred_esz_masks[esz];
2855 ARMPredicateReg *d = vd;
2856 uint32_t flags;
2857 intptr_t i;
2858
2859 /* Begin with a zero predicate register. */
2860 flags = do_zero(d, oprsz);
2861 if (count == 0) {
2862 return flags;
2863 }
2864
2865 /* Set all of the requested bits. */
2866 for (i = 0; i < count / 64; ++i) {
2867 d->p[i] = esz_mask;
2868 }
2869 if (count & 63) {
2870 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
2871 }
2872
2873 return predtest_ones(d, oprsz, esz_mask);
2874 }
2875
2876 /* Recursive reduction of a vector by a binary operation;
2877 * cf. the ARM ARM function ReducePredicated.
2878 *
2879 * While it would be possible to write this without the DATA temporary,
2880 * it is much simpler to process the predicate register this way.
2881 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
2882 * little to gain with a more complex non-recursive form.
2883 */
2884 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
2885 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
2886 { \
2887 if (n == 1) { \
2888 return *data; \
2889 } else { \
2890 uintptr_t half = n / 2; \
2891 TYPE lo = NAME##_reduce(data, status, half); \
2892 TYPE hi = NAME##_reduce(data + half, status, half); \
2893 return TYPE##_##FUNC(lo, hi, status); \
2894 } \
2895 } \
2896 uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
2897 { \
2898 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_maxsz(desc); \
2899 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
2900 for (i = 0; i < oprsz; ) { \
2901 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2902 do { \
2903 TYPE nn = *(TYPE *)(vn + H(i)); \
2904 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
2905 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2906 } while (i & 15); \
2907 } \
2908 for (; i < maxsz; i += sizeof(TYPE)) { \
2909 *(TYPE *)((void *)data + i) = IDENT; \
2910 } \
2911 return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
2912 }
2913
2914 DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
2915 DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
2916 DO_REDUCE(sve_faddv_d, float64, , add, float64_zero)
2917
2918 /* Identity is floatN_default_nan, without the function call. */
2919 DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
2920 DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
2921 DO_REDUCE(sve_fminnmv_d, float64, , minnum, 0x7FF8000000000000ULL)
2922
2923 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
2924 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
2925 DO_REDUCE(sve_fmaxnmv_d, float64, , maxnum, 0x7FF8000000000000ULL)
2926
2927 DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
2928 DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
2929 DO_REDUCE(sve_fminv_d, float64, , min, float64_infinity)
2930
2931 DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
2932 DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
2933 DO_REDUCE(sve_fmaxv_d, float64, , max, float64_chs(float64_infinity))
2934
2935 #undef DO_REDUCE
2936
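/*
 * FADDA: strictly ordered accumulation of the active elements of Zm
 * into the scalar operand, from the lowest element upward.
 */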
2937 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
2938 void *status, uint32_t desc)
2939 {
2940 intptr_t i = 0, opr_sz = simd_oprsz(desc);
2941 float16 result = nn;
2942
2943 do {
2944 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2945 do {
2946 if (pg & 1) {
2947 float16 mm = *(float16 *)(vm + H1_2(i));
2948 result = float16_add(result, mm, status);
2949 }
2950 i += sizeof(float16), pg >>= sizeof(float16);
2951 } while (i & 15);
2952 } while (i < opr_sz);
2953
2954 return result;
2955 }
2956
2957 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
2958 void *status, uint32_t desc)
2959 {
2960 intptr_t i = 0, opr_sz = simd_oprsz(desc);
2961 float32 result = nn;
2962
2963 do {
2964 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2965 do {
2966 if (pg & 1) {
2967 float32 mm = *(float32 *)(vm + H1_2(i));
2968 result = float32_add(result, mm, status);
2969 }
2970 i += sizeof(float32), pg >>= sizeof(float32);
2971 } while (i & 15);
2972 } while (i < opr_sz);
2973
2974 return result;
2975 }
2976
2977 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
2978 void *status, uint32_t desc)
2979 {
2980 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
2981 uint64_t *m = vm;
2982 uint8_t *pg = vg;
2983
2984 for (i = 0; i < opr_sz; i++) {
2985 if (pg[H1(i)] & 1) {
2986 nn = float64_add(nn, m[i], status);
2987 }
2988 }
2989
2990 return nn;
2991 }
2992
2993 /* Fully general three-operand expander, controlled by a predicate,
2994 * with the extra float_status parameter.
2995 */
2996 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
2997 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
2998 void *status, uint32_t desc) \
2999 { \
3000 intptr_t i = simd_oprsz(desc); \
3001 uint64_t *g = vg; \
3002 do { \
3003 uint64_t pg = g[(i - 1) >> 6]; \
3004 do { \
3005 i -= sizeof(TYPE); \
3006 if (likely((pg >> (i & 63)) & 1)) { \
3007 TYPE nn = *(TYPE *)(vn + H(i)); \
3008 TYPE mm = *(TYPE *)(vm + H(i)); \
3009 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
3010 } \
3011 } while (i & 63); \
3012 } while (i != 0); \
3013 }
3014
3015 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
3016 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
3017 DO_ZPZZ_FP(sve_fadd_d, uint64_t, , float64_add)
3018
3019 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
3020 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
3021 DO_ZPZZ_FP(sve_fsub_d, uint64_t, , float64_sub)
3022
3023 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
3024 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
3025 DO_ZPZZ_FP(sve_fmul_d, uint64_t, , float64_mul)
3026
3027 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
3028 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
3029 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, , float64_div)
3030
3031 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
3032 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
3033 DO_ZPZZ_FP(sve_fmin_d, uint64_t, , float64_min)
3034
3035 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
3036 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
3037 DO_ZPZZ_FP(sve_fmax_d, uint64_t, , float64_max)
3038
3039 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
3040 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
3041 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, , float64_minnum)
3042
3043 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
3044 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
3045 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, , float64_maxnum)
3046
3047 static inline float16 abd_h(float16 a, float16 b, float_status *s)
3048 {
3049 return float16_abs(float16_sub(a, b, s));
3050 }
3051
3052 static inline float32 abd_s(float32 a, float32 b, float_status *s)
3053 {
3054 return float32_abs(float32_sub(a, b, s));
3055 }
3056
3057 static inline float64 abd_d(float64 a, float64 b, float_status *s)
3058 {
3059 return float64_abs(float64_sub(a, b, s));
3060 }
3061
3062 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
3063 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
3064 DO_ZPZZ_FP(sve_fabd_d, uint64_t, , abd_d)
3065
3066 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
3067 {
3068 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
3069 return float64_scalbn(a, b_int, s);
3070 }
3071
3072 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
3073 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
3074 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, , scalbn_d)
3075
3076 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
3077 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
3078 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, , helper_vfp_mulxd)
3079
3080 #undef DO_ZPZZ_FP
3081
3082 /* Three-operand expander, with one scalar operand, controlled by
3083 * a predicate, with the extra float_status parameter.
3084 */
3085 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
3086 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
3087 void *status, uint32_t desc) \
3088 { \
3089 intptr_t i = simd_oprsz(desc); \
3090 uint64_t *g = vg; \
3091 TYPE mm = scalar; \
3092 do { \
3093 uint64_t pg = g[(i - 1) >> 6]; \
3094 do { \
3095 i -= sizeof(TYPE); \
3096 if (likely((pg >> (i & 63)) & 1)) { \
3097 TYPE nn = *(TYPE *)(vn + H(i)); \
3098 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
3099 } \
3100 } while (i & 63); \
3101 } while (i != 0); \
3102 }
3103
3104 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
3105 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
3106 DO_ZPZS_FP(sve_fadds_d, float64, , float64_add)
3107
3108 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
3109 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
3110 DO_ZPZS_FP(sve_fsubs_d, float64, , float64_sub)
3111
3112 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
3113 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
3114 DO_ZPZS_FP(sve_fmuls_d, float64, , float64_mul)
3115
3116 static inline float16 subr_h(float16 a, float16 b, float_status *s)
3117 {
3118 return float16_sub(b, a, s);
3119 }
3120
3121 static inline float32 subr_s(float32 a, float32 b, float_status *s)
3122 {
3123 return float32_sub(b, a, s);
3124 }
3125
3126 static inline float64 subr_d(float64 a, float64 b, float_status *s)
3127 {
3128 return float64_sub(b, a, s);
3129 }
3130
3131 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
3132 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
3133 DO_ZPZS_FP(sve_fsubrs_d, float64, , subr_d)
3134
3135 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
3136 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
3137 DO_ZPZS_FP(sve_fmaxnms_d, float64, , float64_maxnum)
3138
3139 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
3140 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
3141 DO_ZPZS_FP(sve_fminnms_d, float64, , float64_minnum)
3142
3143 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
3144 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
3145 DO_ZPZS_FP(sve_fmaxs_d, float64, , float64_max)
3146
3147 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
3148 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
3149 DO_ZPZS_FP(sve_fmins_d, float64, , float64_min)
3150
3151 /* Fully general two-operand expander, controlled by a predicate,
3152 * with the extra float_status parameter.
3153 */
3154 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \
3155 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
3156 { \
3157 intptr_t i = simd_oprsz(desc); \
3158 uint64_t *g = vg; \
3159 do { \
3160 uint64_t pg = g[(i - 1) >> 6]; \
3161 do { \
3162 i -= sizeof(TYPE); \
3163 if (likely((pg >> (i & 63)) & 1)) { \
3164 TYPE nn = *(TYPE *)(vn + H(i)); \
3165 *(TYPE *)(vd + H(i)) = OP(nn, status); \
3166 } \
3167 } while (i & 63); \
3168 } while (i != 0); \
3169 }
3170
3171 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
3172 * FZ16. When converting from fp16, this affects flushing input denormals;
3173 * when converting to fp16, this affects flushing output denormals.
3174 */
3175 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
3176 {
3177 bool save = get_flush_inputs_to_zero(fpst);
3178 float32 ret;
3179
3180 set_flush_inputs_to_zero(false, fpst);
3181 ret = float16_to_float32(f, true, fpst);
3182 set_flush_inputs_to_zero(save, fpst);
3183 return ret;
3184 }
3185
3186 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
3187 {
3188 bool save = get_flush_inputs_to_zero(fpst);
3189 float64 ret;
3190
3191 set_flush_inputs_to_zero(false, fpst);
3192 ret = float16_to_float64(f, true, fpst);
3193 set_flush_inputs_to_zero(save, fpst);
3194 return ret;
3195 }
3196
3197 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
3198 {
3199 bool save = get_flush_to_zero(fpst);
3200 float16 ret;
3201
3202 set_flush_to_zero(false, fpst);
3203 ret = float32_to_float16(f, true, fpst);
3204 set_flush_to_zero(save, fpst);
3205 return ret;
3206 }
3207
3208 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
3209 {
3210 bool save = get_flush_to_zero(fpst);
3211 float16 ret;
3212
3213 set_flush_to_zero(false, fpst);
3214 ret = float64_to_float16(f, true, fpst);
3215 set_flush_to_zero(save, fpst);
3216 return ret;
3217 }
3218
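/*
 * Round-to-zero conversions to integer, returning 0 and raising Invalid
 * for NaN inputs as the Arm pseudocode requires.
 */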
3219 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
3220 {
3221 if (float16_is_any_nan(f)) {
3222 float_raise(float_flag_invalid, s);
3223 return 0;
3224 }
3225 return float16_to_int16_round_to_zero(f, s);
3226 }
3227
3228 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
3229 {
3230 if (float16_is_any_nan(f)) {
3231 float_raise(float_flag_invalid, s);
3232 return 0;
3233 }
3234 return float16_to_int64_round_to_zero(f, s);
3235 }
3236
3237 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
3238 {
3239 if (float32_is_any_nan(f)) {
3240 float_raise(float_flag_invalid, s);
3241 return 0;
3242 }
3243 return float32_to_int64_round_to_zero(f, s);
3244 }
3245
3246 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
3247 {
3248 if (float64_is_any_nan(f)) {
3249 float_raise(float_flag_invalid, s);
3250 return 0;
3251 }
3252 return float64_to_int64_round_to_zero(f, s);
3253 }
3254
3255 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
3256 {
3257 if (float16_is_any_nan(f)) {
3258 float_raise(float_flag_invalid, s);
3259 return 0;
3260 }
3261 return float16_to_uint16_round_to_zero(f, s);
3262 }
3263
3264 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
3265 {
3266 if (float16_is_any_nan(f)) {
3267 float_raise(float_flag_invalid, s);
3268 return 0;
3269 }
3270 return float16_to_uint64_round_to_zero(f, s);
3271 }
3272
3273 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
3274 {
3275 if (float32_is_any_nan(f)) {
3276 float_raise(float_flag_invalid, s);
3277 return 0;
3278 }
3279 return float32_to_uint64_round_to_zero(f, s);
3280 }
3281
3282 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
3283 {
3284 if (float64_is_any_nan(f)) {
3285 float_raise(float_flag_invalid, s);
3286 return 0;
3287 }
3288 return float64_to_uint64_round_to_zero(f, s);
3289 }
3290
3291 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
3292 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
3293 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, , sve_f64_to_f16)
3294 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, , sve_f16_to_f64)
3295 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, , float64_to_float32)
3296 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, , float32_to_float64)
3297
3298 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
3299 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
3300 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
3301 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, , vfp_float16_to_int64_rtz)
3302 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, , vfp_float32_to_int64_rtz)
3303 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, , helper_vfp_tosizd)
3304 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, , vfp_float64_to_int64_rtz)
3305
3306 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
3307 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
3308 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
3309 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, , vfp_float16_to_uint64_rtz)
3310 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, , vfp_float32_to_uint64_rtz)
3311 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, , helper_vfp_touizd)
3312 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, , vfp_float64_to_uint64_rtz)
3313
3314 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
3315 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
3316 DO_ZPZ_FP(sve_frint_d, uint64_t, , helper_rintd)
3317
3318 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
3319 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
3320 DO_ZPZ_FP(sve_frintx_d, uint64_t, , float64_round_to_int)
3321
3322 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
3323 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
3324 DO_ZPZ_FP(sve_frecpx_d, uint64_t, , helper_frecpx_f64)
3325
3326 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
3327 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
3328 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, , float64_sqrt)
3329
3330 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
3331 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
3332 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
3333 DO_ZPZ_FP(sve_scvt_sd, uint64_t, , int32_to_float64)
3334 DO_ZPZ_FP(sve_scvt_dh, uint64_t, , int64_to_float16)
3335 DO_ZPZ_FP(sve_scvt_ds, uint64_t, , int64_to_float32)
3336 DO_ZPZ_FP(sve_scvt_dd, uint64_t, , int64_to_float64)
3337
3338 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
3339 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
3340 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
3341 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, , uint32_to_float64)
3342 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, , uint64_to_float16)
3343 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, , uint64_to_float32)
3344 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, , uint64_to_float64)
3345
3346 #undef DO_ZPZ_FP
3347
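/*
 * NEG1 and NEG3 are XOR masks applied to the first and third operands.
 * Setting the sign bit in NEG1 negates the product and setting it in
 * NEG3 negates the addend, giving the FMLS, FNMLA and FNMLS variants.
 */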
3348 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
3349 float_status *status, uint32_t desc,
3350 uint16_t neg1, uint16_t neg3)
3351 {
3352 intptr_t i = simd_oprsz(desc);
3353 uint64_t *g = vg;
3354
3355 do {
3356 uint64_t pg = g[(i - 1) >> 6];
3357 do {
3358 i -= 2;
3359 if (likely((pg >> (i & 63)) & 1)) {
3360 float16 e1, e2, e3, r;
3361
3362 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
3363 e2 = *(uint16_t *)(vm + H1_2(i));
3364 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
3365 r = float16_muladd(e1, e2, e3, 0, status);
3366 *(uint16_t *)(vd + H1_2(i)) = r;
3367 }
3368 } while (i & 63);
3369 } while (i != 0);
3370 }
3371
3372 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3373 void *vg, void *status, uint32_t desc)
3374 {
3375 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
3376 }
3377
3378 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3379 void *vg, void *status, uint32_t desc)
3380 {
3381 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
3382 }
3383
3384 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3385 void *vg, void *status, uint32_t desc)
3386 {
3387 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
3388 }
3389
3390 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3391 void *vg, void *status, uint32_t desc)
3392 {
3393 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
3394 }
3395
3396 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
3397 float_status *status, uint32_t desc,
3398 uint32_t neg1, uint32_t neg3)
3399 {
3400 intptr_t i = simd_oprsz(desc);
3401 uint64_t *g = vg;
3402
3403 do {
3404 uint64_t pg = g[(i - 1) >> 6];
3405 do {
3406 i -= 4;
3407 if (likely((pg >> (i & 63)) & 1)) {
3408 float32 e1, e2, e3, r;
3409
3410 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
3411 e2 = *(uint32_t *)(vm + H1_4(i));
3412 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
3413 r = float32_muladd(e1, e2, e3, 0, status);
3414 *(uint32_t *)(vd + H1_4(i)) = r;
3415 }
3416 } while (i & 63);
3417 } while (i != 0);
3418 }
3419
3420 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3421 void *vg, void *status, uint32_t desc)
3422 {
3423 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
3424 }
3425
3426 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3427 void *vg, void *status, uint32_t desc)
3428 {
3429 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
3430 }
3431
3432 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3433 void *vg, void *status, uint32_t desc)
3434 {
3435 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
3436 }
3437
3438 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3439 void *vg, void *status, uint32_t desc)
3440 {
3441 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
3442 }
3443
3444 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
3445 float_status *status, uint32_t desc,
3446 uint64_t neg1, uint64_t neg3)
3447 {
3448 intptr_t i = simd_oprsz(desc);
3449 uint64_t *g = vg;
3450
3451 do {
3452 uint64_t pg = g[(i - 1) >> 6];
3453 do {
3454 i -= 8;
3455 if (likely((pg >> (i & 63)) & 1)) {
3456 float64 e1, e2, e3, r;
3457
3458 e1 = *(uint64_t *)(vn + i) ^ neg1;
3459 e2 = *(uint64_t *)(vm + i);
3460 e3 = *(uint64_t *)(va + i) ^ neg3;
3461 r = float64_muladd(e1, e2, e3, 0, status);
3462 *(uint64_t *)(vd + i) = r;
3463 }
3464 } while (i & 63);
3465 } while (i != 0);
3466 }
3467
3468 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3469 void *vg, void *status, uint32_t desc)
3470 {
3471 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
3472 }
3473
3474 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3475 void *vg, void *status, uint32_t desc)
3476 {
3477 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
3478 }
3479
3480 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3481 void *vg, void *status, uint32_t desc)
3482 {
3483 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
3484 }
3485
3486 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3487 void *vg, void *status, uint32_t desc)
3488 {
3489 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
3490 }
3491
3492 /* Two operand floating-point comparison controlled by a predicate.
3493 * Unlike the integer version, we are not allowed to optimistically
3494 * compare operands, since the comparison may have side effects on
3495 * the FPSR.
3496 */
3497 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
3498 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
3499 void *status, uint32_t desc) \
3500 { \
3501 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
3502 uint64_t *d = vd, *g = vg; \
3503 do { \
3504 uint64_t out = 0, pg = g[j]; \
3505 do { \
3506 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3507 if (likely((pg >> (i & 63)) & 1)) { \
3508 TYPE nn = *(TYPE *)(vn + H(i)); \
3509 TYPE mm = *(TYPE *)(vm + H(i)); \
3510 out |= OP(TYPE, nn, mm, status); \
3511 } \
3512 } while (i & 63); \
3513 d[j--] = out; \
3514 } while (i > 0); \
3515 }
3516
3517 #define DO_FPCMP_PPZZ_H(NAME, OP) \
3518 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
3519 #define DO_FPCMP_PPZZ_S(NAME, OP) \
3520 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
3521 #define DO_FPCMP_PPZZ_D(NAME, OP) \
3522 DO_FPCMP_PPZZ(NAME##_d, float64, , OP)
3523
3524 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
3525 DO_FPCMP_PPZZ_H(NAME, OP) \
3526 DO_FPCMP_PPZZ_S(NAME, OP) \
3527 DO_FPCMP_PPZZ_D(NAME, OP)
3528
3529 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
3530 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
3531 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
3532 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
3533 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
3534 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
3535 #define DO_FCMUO(TYPE, X, Y, ST) \
3536 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
3537 #define DO_FACGE(TYPE, X, Y, ST) \
3538 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
3539 #define DO_FACGT(TYPE, X, Y, ST) \
3540 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
3541
3542 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
3543 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
3544 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
3545 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
3546 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
3547 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
3548 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
3549
3550 #undef DO_FPCMP_PPZZ_ALL
3551 #undef DO_FPCMP_PPZZ_D
3552 #undef DO_FPCMP_PPZZ_S
3553 #undef DO_FPCMP_PPZZ_H
3554 #undef DO_FPCMP_PPZZ
3555
3556 /* One operand floating-point comparison against zero, controlled
3557 * by a predicate.
3558 */
3559 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
3560 void HELPER(NAME)(void *vd, void *vn, void *vg, \
3561 void *status, uint32_t desc) \
3562 { \
3563 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
3564 uint64_t *d = vd, *g = vg; \
3565 do { \
3566 uint64_t out = 0, pg = g[j]; \
3567 do { \
3568 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3569 if ((pg >> (i & 63)) & 1) { \
3570 TYPE nn = *(TYPE *)(vn + H(i)); \
3571 out |= OP(TYPE, nn, 0, status); \
3572 } \
3573 } while (i & 63); \
3574 d[j--] = out; \
3575 } while (i > 0); \
3576 }
3577
3578 #define DO_FPCMP_PPZ0_H(NAME, OP) \
3579 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
3580 #define DO_FPCMP_PPZ0_S(NAME, OP) \
3581 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
3582 #define DO_FPCMP_PPZ0_D(NAME, OP) \
3583 DO_FPCMP_PPZ0(NAME##_d, float64, , OP)
3584
3585 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
3586 DO_FPCMP_PPZ0_H(NAME, OP) \
3587 DO_FPCMP_PPZ0_S(NAME, OP) \
3588 DO_FPCMP_PPZ0_D(NAME, OP)
3589
3590 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
3591 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
3592 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
3593 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
3594 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
3595 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
3596
3597 /* FP Trig Multiply-Add. */
3598
3599 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3600 {
3601 static const float16 coeff[16] = {
3602 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3603 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3604 };
3605 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
3606 intptr_t x = simd_data(desc);
3607 float16 *d = vd, *n = vn, *m = vm;
3608 for (i = 0; i < opr_sz; i++) {
3609 float16 mm = m[i];
3610 intptr_t xx = x;
3611 if (float16_is_neg(mm)) {
3612 mm = float16_abs(mm);
3613 xx += 8;
3614 }
3615 d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
3616 }
3617 }
3618
3619 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3620 {
3621 static const float32 coeff[16] = {
3622 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
3623 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
3624 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
3625 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
3626 };
3627 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
3628 intptr_t x = simd_data(desc);
3629 float32 *d = vd, *n = vn, *m = vm;
3630 for (i = 0; i < opr_sz; i++) {
3631 float32 mm = m[i];
3632 intptr_t xx = x;
3633 if (float32_is_neg(mm)) {
3634 mm = float32_abs(mm);
3635 xx += 8;
3636 }
3637 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
3638 }
3639 }
3640
3641 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3642 {
3643 static const float64 coeff[16] = {
3644 0x3ff0000000000000ull, 0xbfc5555555555543ull,
3645 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
3646 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
3647 0x3de5d8408868552full, 0x0000000000000000ull,
3648 0x3ff0000000000000ull, 0xbfe0000000000000ull,
3649 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
3650 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
3651 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
3652 };
3653 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
3654 intptr_t x = simd_data(desc);
3655 float64 *d = vd, *n = vn, *m = vm;
3656 for (i = 0; i < opr_sz; i++) {
3657 float64 mm = m[i];
3658 intptr_t xx = x;
3659 if (float64_is_neg(mm)) {
3660 mm = float64_abs(mm);
3661 xx += 8;
3662 }
3663 d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
3664 }
3665 }
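/*
 * A note on the coefficient tables above: the first eight entries of each
 * table correspond closely to the odd polynomial coefficients +-1/(2k+1)!
 * used to approximate sine, and the last eight to the even coefficients
 * +-1/(2k)! used to approximate cosine, as specified for FTMAD.  A negative
 * multiplicand selects the cosine set (and is used by absolute value),
 * which lets a preceding FTSSEL steer the choice of series.
 */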
3666
3667 /*
3668 * FP Complex Add
3669 */
3670
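/*
 * For the three FCADD helpers below, simd_data(desc) holds the rotation
 * selector.  With the bit clear, neg_real carries the sign bit and
 * neg_imag does not, so the result is d.real = n.real - m.imag and
 * d.imag = n.imag + m.real (the #90 rotation); with the bit set the two
 * masks swap, giving d.real = n.real + m.imag and d.imag = n.imag - m.real
 * (the #270 rotation).
 */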
3671 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
3672 void *vs, uint32_t desc)
3673 {
3674 intptr_t j, i = simd_oprsz(desc);
3675 uint64_t *g = vg;
3676 float16 neg_imag = float16_set_sign(0, simd_data(desc));
3677 float16 neg_real = float16_chs(neg_imag);
3678
3679 do {
3680 uint64_t pg = g[(i - 1) >> 6];
3681 do {
3682 float16 e0, e1, e2, e3;
3683
3684 /* I holds the real index; J holds the imag index. */
3685 j = i - sizeof(float16);
3686 i -= 2 * sizeof(float16);
3687
3688 e0 = *(float16 *)(vn + H1_2(i));
3689 e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
3690 e2 = *(float16 *)(vn + H1_2(j));
3691 e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
3692
3693 if (likely((pg >> (i & 63)) & 1)) {
3694 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
3695 }
3696 if (likely((pg >> (j & 63)) & 1)) {
3697 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
3698 }
3699 } while (i & 63);
3700 } while (i != 0);
3701 }
3702
3703 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
3704 void *vs, uint32_t desc)
3705 {
3706 intptr_t j, i = simd_oprsz(desc);
3707 uint64_t *g = vg;
3708 float32 neg_imag = float32_set_sign(0, simd_data(desc));
3709 float32 neg_real = float32_chs(neg_imag);
3710
3711 do {
3712 uint64_t pg = g[(i - 1) >> 6];
3713 do {
3714 float32 e0, e1, e2, e3;
3715
3716 /* I holds the real index; J holds the imag index. */
3717 j = i - sizeof(float32);
3718 i -= 2 * sizeof(float32);
3719
3720 e0 = *(float32 *)(vn + H1_2(i));
3721 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
3722 e2 = *(float32 *)(vn + H1_2(j));
3723 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
3724
3725 if (likely((pg >> (i & 63)) & 1)) {
3726 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
3727 }
3728 if (likely((pg >> (j & 63)) & 1)) {
3729 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
3730 }
3731 } while (i & 63);
3732 } while (i != 0);
3733 }
3734
3735 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
3736 void *vs, uint32_t desc)
3737 {
3738 intptr_t j, i = simd_oprsz(desc);
3739 uint64_t *g = vg;
3740 float64 neg_imag = float64_set_sign(0, simd_data(desc));
3741 float64 neg_real = float64_chs(neg_imag);
3742
3743 do {
3744 uint64_t pg = g[(i - 1) >> 6];
3745 do {
3746 float64 e0, e1, e2, e3;
3747
3748 /* I holds the real index; J holds the imag index. */
3749 j = i - sizeof(float64);
3750 i -= 2 * sizeof(float64);
3751
3752 e0 = *(float64 *)(vn + H1_2(i));
3753 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
3754 e2 = *(float64 *)(vn + H1_2(j));
3755 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
3756
3757 if (likely((pg >> (i & 63)) & 1)) {
3758 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
3759 }
3760 if (likely((pg >> (j & 63)) & 1)) {
3761 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
3762 }
3763 } while (i & 63);
3764 } while (i != 0);
3765 }
3766
3767 /*
3768 * FP Complex Multiply
3769 */
3770
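/*
 * For the FCMLA helpers below, FLIP selects whether the real or imaginary
 * part of Zn feeds both products, and neg_real/neg_imag supply the sign
 * flips, so that the four rotation encodings compute:
 *
 *   rot 0 (#0):    d.real += n.real * m.real;  d.imag += n.real * m.imag
 *   rot 1 (#90):   d.real -= n.imag * m.imag;  d.imag += n.imag * m.real
 *   rot 2 (#180):  d.real -= n.real * m.real;  d.imag -= n.real * m.imag
 *   rot 3 (#270):  d.real += n.imag * m.imag;  d.imag -= n.imag * m.real
 */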
3771 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3772 void *vg, void *status, uint32_t desc)
3773 {
3774 intptr_t j, i = simd_oprsz(desc);
3775 unsigned rot = simd_data(desc);
3776 bool flip = rot & 1;
3777 float16 neg_imag, neg_real;
3778 uint64_t *g = vg;
3779
3780 neg_imag = float16_set_sign(0, (rot & 2) != 0);
3781 neg_real = float16_set_sign(0, rot == 1 || rot == 2);
3782
3783 do {
3784 uint64_t pg = g[(i - 1) >> 6];
3785 do {
3786 float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
3787
3788 /* I holds the real index; J holds the imag index. */
3789 j = i - sizeof(float16);
3790 i -= 2 * sizeof(float16);
3791
3792 nr = *(float16 *)(vn + H1_2(i));
3793 ni = *(float16 *)(vn + H1_2(j));
3794 mr = *(float16 *)(vm + H1_2(i));
3795 mi = *(float16 *)(vm + H1_2(j));
3796
3797 e2 = (flip ? ni : nr);
3798 e1 = (flip ? mi : mr) ^ neg_real;
3799 e4 = e2;
3800 e3 = (flip ? mr : mi) ^ neg_imag;
3801
3802 if (likely((pg >> (i & 63)) & 1)) {
3803 d = *(float16 *)(va + H1_2(i));
3804 d = float16_muladd(e2, e1, d, 0, status);
3805 *(float16 *)(vd + H1_2(i)) = d;
3806 }
3807 if (likely((pg >> (j & 63)) & 1)) {
3808 d = *(float16 *)(va + H1_2(j));
3809 d = float16_muladd(e4, e3, d, 0, status);
3810 *(float16 *)(vd + H1_2(j)) = d;
3811 }
3812 } while (i & 63);
3813 } while (i != 0);
3814 }
3815
3816 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3817 void *vg, void *status, uint32_t desc)
3818 {
3819 intptr_t j, i = simd_oprsz(desc);
3820 unsigned rot = simd_data(desc);
3821 bool flip = rot & 1;
3822 float32 neg_imag, neg_real;
3823 uint64_t *g = vg;
3824
3825 neg_imag = float32_set_sign(0, (rot & 2) != 0);
3826 neg_real = float32_set_sign(0, rot == 1 || rot == 2);
3827
3828 do {
3829 uint64_t pg = g[(i - 1) >> 6];
3830 do {
3831 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
3832
3833 /* I holds the real index; J holds the imag index. */
3834 j = i - sizeof(float32);
3835 i -= 2 * sizeof(float32);
3836
3837 nr = *(float32 *)(vn + H1_2(i));
3838 ni = *(float32 *)(vn + H1_2(j));
3839 mr = *(float32 *)(vm + H1_2(i));
3840 mi = *(float32 *)(vm + H1_2(j));
3841
3842 e2 = (flip ? ni : nr);
3843 e1 = (flip ? mi : mr) ^ neg_real;
3844 e4 = e2;
3845 e3 = (flip ? mr : mi) ^ neg_imag;
3846
3847 if (likely((pg >> (i & 63)) & 1)) {
3848 d = *(float32 *)(va + H1_2(i));
3849 d = float32_muladd(e2, e1, d, 0, status);
3850 *(float32 *)(vd + H1_2(i)) = d;
3851 }
3852 if (likely((pg >> (j & 63)) & 1)) {
3853 d = *(float32 *)(va + H1_2(j));
3854 d = float32_muladd(e4, e3, d, 0, status);
3855 *(float32 *)(vd + H1_2(j)) = d;
3856 }
3857 } while (i & 63);
3858 } while (i != 0);
3859 }
3860
3861 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3862 void *vg, void *status, uint32_t desc)
3863 {
3864 intptr_t j, i = simd_oprsz(desc);
3865 unsigned rot = simd_data(desc);
3866 bool flip = rot & 1;
3867 float64 neg_imag, neg_real;
3868 uint64_t *g = vg;
3869
3870 neg_imag = float64_set_sign(0, (rot & 2) != 0);
3871 neg_real = float64_set_sign(0, rot == 1 || rot == 2);
3872
3873 do {
3874 uint64_t pg = g[(i - 1) >> 6];
3875 do {
3876 float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
3877
3878 /* I holds the real index; J holds the imag index. */
3879 j = i - sizeof(float64);
3880 i -= 2 * sizeof(float64);
3881
3882 nr = *(float64 *)(vn + H1_2(i));
3883 ni = *(float64 *)(vn + H1_2(j));
3884 mr = *(float64 *)(vm + H1_2(i));
3885 mi = *(float64 *)(vm + H1_2(j));
3886
3887 e2 = (flip ? ni : nr);
3888 e1 = (flip ? mi : mr) ^ neg_real;
3889 e4 = e2;
3890 e3 = (flip ? mr : mi) ^ neg_imag;
3891
3892 if (likely((pg >> (i & 63)) & 1)) {
3893 d = *(float64 *)(va + H1_2(i));
3894 d = float64_muladd(e2, e1, d, 0, status);
3895 *(float64 *)(vd + H1_2(i)) = d;
3896 }
3897 if (likely((pg >> (j & 63)) & 1)) {
3898 d = *(float64 *)(va + H1_2(j));
3899 d = float64_muladd(e4, e3, d, 0, status);
3900 *(float64 *)(vd + H1_2(j)) = d;
3901 }
3902 } while (i & 63);
3903 } while (i != 0);
3904 }
3905
3906 /*
3907 * Load contiguous data, protected by a governing predicate.
3908 */
3909
3910 /*
3911 * Load one element into @vd + @reg_off from @host.
3912 * The controlling predicate is known to be true.
3913 */
3914 typedef void sve_ldst1_host_fn(void *vd, intptr_t reg_off, void *host);
3915
3916 /*
3917 * Load one element into @vd + @reg_off from (@env, @vaddr, @ra).
3918 * The controlling predicate is known to be true.
3919 */
3920 typedef void sve_ldst1_tlb_fn(CPUARMState *env, void *vd, intptr_t reg_off,
3921 target_ulong vaddr, uintptr_t retaddr);
3922
3923 /*
3924 * Generate the above primitives.
3925 */
3926
3927 #define DO_LD_HOST(NAME, H, TYPEE, TYPEM, HOST) \
3928 static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
3929 { \
3930 TYPEM val = HOST(host); \
3931 *(TYPEE *)(vd + H(reg_off)) = val; \
3932 }
3933
3934 #define DO_ST_HOST(NAME, H, TYPEE, TYPEM, HOST) \
3935 static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
3936 { HOST(host, (TYPEM)*(TYPEE *)(vd + H(reg_off))); }
3937
3938 #define DO_LD_TLB(NAME, H, TYPEE, TYPEM, TLB) \
3939 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
3940 target_ulong addr, uintptr_t ra) \
3941 { \
3942 *(TYPEE *)(vd + H(reg_off)) = \
3943 (TYPEM)TLB(env, useronly_clean_ptr(addr), ra); \
3944 }
3945
3946 #define DO_ST_TLB(NAME, H, TYPEE, TYPEM, TLB) \
3947 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
3948 target_ulong addr, uintptr_t ra) \
3949 { \
3950 TLB(env, useronly_clean_ptr(addr), \
3951 (TYPEM)*(TYPEE *)(vd + H(reg_off)), ra); \
3952 }
3953
3954 #define DO_LD_PRIM_1(NAME, H, TE, TM) \
3955 DO_LD_HOST(NAME, H, TE, TM, ldub_p) \
3956 DO_LD_TLB(NAME, H, TE, TM, cpu_ldub_data_ra)
3957
3958 DO_LD_PRIM_1(ld1bb, H1, uint8_t, uint8_t)
3959 DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t)
3960 DO_LD_PRIM_1(ld1bhs, H1_2, uint16_t, int8_t)
3961 DO_LD_PRIM_1(ld1bsu, H1_4, uint32_t, uint8_t)
3962 DO_LD_PRIM_1(ld1bss, H1_4, uint32_t, int8_t)
3963 DO_LD_PRIM_1(ld1bdu, , uint64_t, uint8_t)
3964 DO_LD_PRIM_1(ld1bds, , uint64_t, int8_t)
3965
3966 #define DO_ST_PRIM_1(NAME, H, TE, TM) \
3967 DO_ST_HOST(st1##NAME, H, TE, TM, stb_p) \
3968 DO_ST_TLB(st1##NAME, H, TE, TM, cpu_stb_data_ra)
3969
3970 DO_ST_PRIM_1(bb, H1, uint8_t, uint8_t)
3971 DO_ST_PRIM_1(bh, H1_2, uint16_t, uint8_t)
3972 DO_ST_PRIM_1(bs, H1_4, uint32_t, uint8_t)
3973 DO_ST_PRIM_1(bd, , uint64_t, uint8_t)
3974
3975 #define DO_LD_PRIM_2(NAME, H, TE, TM, LD) \
3976 DO_LD_HOST(ld1##NAME##_be, H, TE, TM, LD##_be_p) \
3977 DO_LD_HOST(ld1##NAME##_le, H, TE, TM, LD##_le_p) \
3978 DO_LD_TLB(ld1##NAME##_be, H, TE, TM, cpu_##LD##_be_data_ra) \
3979 DO_LD_TLB(ld1##NAME##_le, H, TE, TM, cpu_##LD##_le_data_ra)
3980
3981 #define DO_ST_PRIM_2(NAME, H, TE, TM, ST) \
3982 DO_ST_HOST(st1##NAME##_be, H, TE, TM, ST##_be_p) \
3983 DO_ST_HOST(st1##NAME##_le, H, TE, TM, ST##_le_p) \
3984 DO_ST_TLB(st1##NAME##_be, H, TE, TM, cpu_##ST##_be_data_ra) \
3985 DO_ST_TLB(st1##NAME##_le, H, TE, TM, cpu_##ST##_le_data_ra)
3986
3987 DO_LD_PRIM_2(hh, H1_2, uint16_t, uint16_t, lduw)
3988 DO_LD_PRIM_2(hsu, H1_4, uint32_t, uint16_t, lduw)
3989 DO_LD_PRIM_2(hss, H1_4, uint32_t, int16_t, lduw)
3990 DO_LD_PRIM_2(hdu, , uint64_t, uint16_t, lduw)
3991 DO_LD_PRIM_2(hds, , uint64_t, int16_t, lduw)
3992
3993 DO_ST_PRIM_2(hh, H1_2, uint16_t, uint16_t, stw)
3994 DO_ST_PRIM_2(hs, H1_4, uint32_t, uint16_t, stw)
3995 DO_ST_PRIM_2(hd, , uint64_t, uint16_t, stw)
3996
3997 DO_LD_PRIM_2(ss, H1_4, uint32_t, uint32_t, ldl)
3998 DO_LD_PRIM_2(sdu, , uint64_t, uint32_t, ldl)
3999 DO_LD_PRIM_2(sds, , uint64_t, int32_t, ldl)
4000
4001 DO_ST_PRIM_2(ss, H1_4, uint32_t, uint32_t, stl)
4002 DO_ST_PRIM_2(sd, , uint64_t, uint32_t, stl)
4003
4004 DO_LD_PRIM_2(dd, , uint64_t, uint64_t, ldq)
4005 DO_ST_PRIM_2(dd, , uint64_t, uint64_t, stq)
4006
4007 #undef DO_LD_TLB
4008 #undef DO_ST_TLB
4009 #undef DO_LD_HOST
4010 #undef DO_LD_PRIM_1
4011 #undef DO_ST_PRIM_1
4012 #undef DO_LD_PRIM_2
4013 #undef DO_ST_PRIM_2
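/*
 * As an illustration of the generated primitives, DO_LD_PRIM_1(ld1bhs,
 * H1_2, uint16_t, int8_t) produces a tlb accessor roughly equivalent to:
 *
 *   static void sve_ld1bhs_tlb(CPUARMState *env, void *vd, intptr_t reg_off,
 *                              target_ulong addr, uintptr_t ra)
 *   {
 *       *(uint16_t *)(vd + H1_2(reg_off)) =
 *           (int8_t)cpu_ldub_data_ra(env, useronly_clean_ptr(addr), ra);
 *   }
 *
 * i.e. one byte is loaded through the softmmu slow path, sign-extended,
 * and written to the 16-bit element at the host-endian-adjusted offset.
 */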
4014
4015 /*
4016 * Skip through a sequence of inactive elements in the guarding predicate @vg,
4017 * beginning at @reg_off bounded by @reg_max. Return the offset of the active
4018  * beginning at @reg_off, bounded by @reg_max.  Return the offset of the first
4019  * active element >= @reg_off, or @reg_max if there are no active elements at all.
4020 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
4021 intptr_t reg_max, int esz)
4022 {
4023 uint64_t pg_mask = pred_esz_masks[esz];
4024 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
4025
4026 /* In normal usage, the first element is active. */
4027 if (likely(pg & 1)) {
4028 return reg_off;
4029 }
4030
4031 if (pg == 0) {
4032 reg_off &= -64;
4033 do {
4034 reg_off += 64;
4035 if (unlikely(reg_off >= reg_max)) {
4036 /* The entire predicate was false. */
4037 return reg_max;
4038 }
4039 pg = vg[reg_off >> 6] & pg_mask;
4040 } while (pg == 0);
4041 }
4042 reg_off += ctz64(pg);
4043
4044 /* We should never see an out of range predicate bit set. */
4045 tcg_debug_assert(reg_off < reg_max);
4046 return reg_off;
4047 }
4048
4049 /*
4050 * Resolve the guest virtual address to info->host and info->flags.
4051 * If @nofault, return false if the page is invalid, otherwise
4052 * exit via page fault exception.
4053 */
4054
4055 typedef struct {
4056 void *host;
4057 int flags;
4058 MemTxAttrs attrs;
4059 } SVEHostPage;
4060
4061 static bool sve_probe_page(SVEHostPage *info, bool nofault,
4062 CPUARMState *env, target_ulong addr,
4063 int mem_off, MMUAccessType access_type,
4064 int mmu_idx, uintptr_t retaddr)
4065 {
4066 int flags;
4067
4068 addr += mem_off;
4069
4070 /*
4071  * User-only currently always issues memory accesses with TBI.  See the comment
4072 * above useronly_clean_ptr. Usually we clean this top byte away
4073 * during translation, but we can't do that for e.g. vector + imm
4074 * addressing modes.
4075 *
4076 * We currently always enable TBI for user-only, and do not provide
4077 * a way to turn it off. So clean the pointer unconditionally here,
4078 * rather than look it up here, or pass it down from above.
4079 */
4080 addr = useronly_clean_ptr(addr);
4081
4082 flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault,
4083 &info->host, retaddr);
4084 info->flags = flags;
4085
4086 if (flags & TLB_INVALID_MASK) {
4087 g_assert(nofault);
4088 return false;
4089 }
4090
4091 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
4092 info->host -= mem_off;
4093
4094 #ifdef CONFIG_USER_ONLY
4095 memset(&info->attrs, 0, sizeof(info->attrs));
4096 #else
4097 /*
4098 * Find the iotlbentry for addr and return the transaction attributes.
4099 * This *must* be present in the TLB because we just found the mapping.
4100 */
4101 {
4102 uintptr_t index = tlb_index(env, mmu_idx, addr);
4103
4104 # ifdef CONFIG_DEBUG_TCG
4105 CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
4106 target_ulong comparator = (access_type == MMU_DATA_LOAD
4107 ? entry->addr_read
4108 : tlb_addr_write(entry));
4109 g_assert(tlb_hit(comparator, addr));
4110 # endif
4111
4112 CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
4113 info->attrs = iotlbentry->attrs;
4114 }
4115 #endif
4116
4117 return true;
4118 }
4119
4120
4121 /*
4122 * Analyse contiguous data, protected by a governing predicate.
4123 */
4124
4125 typedef enum {
4126 FAULT_NO,
4127 FAULT_FIRST,
4128 FAULT_ALL,
4129 } SVEContFault;
4130
4131 typedef struct {
4132 /*
4133 * First and last element wholly contained within the two pages.
4134 * mem_off_first[0] and reg_off_first[0] are always set >= 0.
4135 * reg_off_last[0] may be < 0 if the first element crosses pages.
4136 * All of mem_off_first[1], reg_off_first[1] and reg_off_last[1]
4137 * are set >= 0 only if there are complete elements on a second page.
4138 *
4139 * The reg_off_* offsets are relative to the internal vector register.
4140 * The mem_off_first offset is relative to the memory address; the
4141 * two offsets are different when a load operation extends, a store
4142 * operation truncates, or for multi-register operations.
4143 */
4144 int16_t mem_off_first[2];
4145 int16_t reg_off_first[2];
4146 int16_t reg_off_last[2];
4147
4148 /*
4149 * One element that is misaligned and spans both pages,
4150 * or -1 if there is no such active element.
4151 */
4152 int16_t mem_off_split;
4153 int16_t reg_off_split;
4154
4155 /*
4156 * The byte offset at which the entire operation crosses a page boundary.
4157 * Set >= 0 if and only if the entire operation spans two pages.
4158 */
4159 int16_t page_split;
4160
4161 /* TLB data for the two pages. */
4162 SVEHostPage page[2];
4163 } SVEContLdSt;
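/*
 * Worked example: a 64-byte operation with 8-byte elements, all elements
 * active, starting 20 bytes before a page boundary, decomposes as
 *   reg_off_first[0] = mem_off_first[0] = 0,
 *   reg_off_last[0]  = 8                   (element 1, last wholly on page 0),
 *   reg_off_split    = mem_off_split = 16  (element 2 straddles the boundary),
 *   page_split       = 20,
 *   reg_off_first[1] = mem_off_first[1] = 24,
 *   reg_off_last[1]  = 56                  (elements 3..7 wholly on page 1).
 */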
4164
4165 /*
4166 * Find first active element on each page, and a loose bound for the
4167 * final element on each page. Identify any single element that spans
4168 * the page boundary. Return true if there are any active elements.
4169 */
4170 static bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr,
4171 uint64_t *vg, intptr_t reg_max,
4172 int esz, int msize)
4173 {
4174 const int esize = 1 << esz;
4175 const uint64_t pg_mask = pred_esz_masks[esz];
4176 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
4177 intptr_t mem_off_last, mem_off_split;
4178 intptr_t page_split, elt_split;
4179 intptr_t i;
4180
4181 /* Set all of the element indices to -1, and the TLB data to 0. */
4182 memset(info, -1, offsetof(SVEContLdSt, page));
4183 memset(info->page, 0, sizeof(info->page));
4184
4185 /* Gross scan over the entire predicate to find bounds. */
4186 i = 0;
4187 do {
4188 uint64_t pg = vg[i] & pg_mask;
4189 if (pg) {
4190 reg_off_last = i * 64 + 63 - clz64(pg);
4191 if (reg_off_first < 0) {
4192 reg_off_first = i * 64 + ctz64(pg);
4193 }
4194 }
4195 } while (++i * 64 < reg_max);
4196
4197 if (unlikely(reg_off_first < 0)) {
4198 /* No active elements, no pages touched. */
4199 return false;
4200 }
4201 tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
4202
4203 info->reg_off_first[0] = reg_off_first;
4204 info->mem_off_first[0] = (reg_off_first >> esz) * msize;
4205 mem_off_last = (reg_off_last >> esz) * msize;
4206
4207 page_split = -(addr | TARGET_PAGE_MASK);
4208 if (likely(mem_off_last + msize <= page_split)) {
4209 /* The entire operation fits within a single page. */
4210 info->reg_off_last[0] = reg_off_last;
4211 return true;
4212 }
4213
4214 info->page_split = page_split;
4215 elt_split = page_split / msize;
4216 reg_off_split = elt_split << esz;
4217 mem_off_split = elt_split * msize;
4218
4219 /*
4220 * This is the last full element on the first page, but it is not
4221 * necessarily active. If there is no full element, i.e. the first
4222 * active element is the one that's split, this value remains -1.
4223 * It is useful as iteration bounds.
4224  * It is useful as an iteration bound.
4225 if (elt_split != 0) {
4226 info->reg_off_last[0] = reg_off_split - esize;
4227 }
4228
4229 /* Determine if an unaligned element spans the pages. */
4230 if (page_split % msize != 0) {
4231 /* It is helpful to know if the split element is active. */
4232 if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
4233 info->reg_off_split = reg_off_split;
4234 info->mem_off_split = mem_off_split;
4235
4236 if (reg_off_split == reg_off_last) {
4237 /* The page crossing element is last. */
4238 return true;
4239 }
4240 }
4241 reg_off_split += esize;
4242 mem_off_split += msize;
4243 }
4244
4245 /*
4246 * We do want the first active element on the second page, because
4247 * this may affect the address reported in an exception.
4248 */
4249 reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
4250 tcg_debug_assert(reg_off_split <= reg_off_last);
4251 info->reg_off_first[1] = reg_off_split;
4252 info->mem_off_first[1] = (reg_off_split >> esz) * msize;
4253 info->reg_off_last[1] = reg_off_last;
4254 return true;
4255 }
4256
4257 /*
4258 * Resolve the guest virtual addresses to info->page[].
4259 * Control the generation of page faults with @fault. Return false if
4260 * there is no work to do, which can only happen with @fault == FAULT_NO.
4261 */
4262 static bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
4263 CPUARMState *env, target_ulong addr,
4264 MMUAccessType access_type, uintptr_t retaddr)
4265 {
4266 int mmu_idx = cpu_mmu_index(env, false);
4267 int mem_off = info->mem_off_first[0];
4268 bool nofault = fault == FAULT_NO;
4269 bool have_work = true;
4270
4271 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
4272 access_type, mmu_idx, retaddr)) {
4273 /* No work to be done. */
4274 return false;
4275 }
4276
4277 if (likely(info->page_split < 0)) {
4278 /* The entire operation was on the one page. */
4279 return true;
4280 }
4281
4282 /*
4283 * If the second page is invalid, then we want the fault address to be
4284 * the first byte on that page which is accessed.
4285 */
4286 if (info->mem_off_split >= 0) {
4287 /*
4288 * There is an element split across the pages. The fault address
4289 * should be the first byte of the second page.
4290 */
4291 mem_off = info->page_split;
4292 /*
4293 * If the split element is also the first active element
4294 * of the vector, then: For first-fault we should continue
4295 * to generate faults for the second page. For no-fault,
4296 * we have work only if the second page is valid.
4297 */
4298 if (info->mem_off_first[0] < info->mem_off_split) {
4299 nofault = FAULT_FIRST;
4300 have_work = false;
4301 }
4302 } else {
4303 /*
4304 * There is no element split across the pages. The fault address
4305 * should be the first active element on the second page.
4306 */
4307 mem_off = info->mem_off_first[1];
4308 /*
4309 * There must have been one active element on the first page,
4310 * so we're out of first-fault territory.
4311 */
4312 nofault = fault != FAULT_ALL;
4313 }
4314
4315 have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
4316 access_type, mmu_idx, retaddr);
4317 return have_work;
4318 }
4319
4320 static void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
4321 uint64_t *vg, target_ulong addr,
4322 int esize, int msize, int wp_access,
4323 uintptr_t retaddr)
4324 {
4325 #ifndef CONFIG_USER_ONLY
4326 intptr_t mem_off, reg_off, reg_last;
4327 int flags0 = info->page[0].flags;
4328 int flags1 = info->page[1].flags;
4329
4330 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
4331 return;
4332 }
4333
4334 /* Indicate that watchpoints are handled. */
4335 info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
4336 info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
4337
4338 if (flags0 & TLB_WATCHPOINT) {
4339 mem_off = info->mem_off_first[0];
4340 reg_off = info->reg_off_first[0];
4341 reg_last = info->reg_off_last[0];
4342
4343 while (reg_off <= reg_last) {
4344 uint64_t pg = vg[reg_off >> 6];
4345 do {
4346 if ((pg >> (reg_off & 63)) & 1) {
4347 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
4348 msize, info->page[0].attrs,
4349 wp_access, retaddr);
4350 }
4351 reg_off += esize;
4352 mem_off += msize;
4353 } while (reg_off <= reg_last && (reg_off & 63));
4354 }
4355 }
4356
4357 mem_off = info->mem_off_split;
4358 if (mem_off >= 0) {
4359 cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
4360 info->page[0].attrs, wp_access, retaddr);
4361 }
4362
4363 mem_off = info->mem_off_first[1];
4364 if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
4365 reg_off = info->reg_off_first[1];
4366 reg_last = info->reg_off_last[1];
4367
4368 do {
4369 uint64_t pg = vg[reg_off >> 6];
4370 do {
4371 if ((pg >> (reg_off & 63)) & 1) {
4372 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
4373 msize, info->page[1].attrs,
4374 wp_access, retaddr);
4375 }
4376 reg_off += esize;
4377 mem_off += msize;
4378 } while (reg_off & 63);
4379 } while (reg_off <= reg_last);
4380 }
4381 #endif
4382 }
4383
4384 typedef uint64_t mte_check_fn(CPUARMState *, uint32_t, uint64_t, uintptr_t);
4385
4386 static inline QEMU_ALWAYS_INLINE
4387 void sve_cont_ldst_mte_check_int(SVEContLdSt *info, CPUARMState *env,
4388 uint64_t *vg, target_ulong addr, int esize,
4389 int msize, uint32_t mtedesc, uintptr_t ra,
4390 mte_check_fn *check)
4391 {
4392 intptr_t mem_off, reg_off, reg_last;
4393
4394 /* Process the page only if MemAttr == Tagged. */
4395 if (arm_tlb_mte_tagged(&info->page[0].attrs)) {
4396 mem_off = info->mem_off_first[0];
4397 reg_off = info->reg_off_first[0];
4398 reg_last = info->reg_off_split;
4399 if (reg_last < 0) {
4400 reg_last = info->reg_off_last[0];
4401 }
4402
4403 do {
4404 uint64_t pg = vg[reg_off >> 6];
4405 do {
4406 if ((pg >> (reg_off & 63)) & 1) {
4407 check(env, mtedesc, addr, ra);
4408 }
4409 reg_off += esize;
4410 mem_off += msize;
4411 } while (reg_off <= reg_last && (reg_off & 63));
4412 } while (reg_off <= reg_last);
4413 }
4414
4415 mem_off = info->mem_off_first[1];
4416 if (mem_off >= 0 && arm_tlb_mte_tagged(&info->page[1].attrs)) {
4417 reg_off = info->reg_off_first[1];
4418 reg_last = info->reg_off_last[1];
4419
4420 do {
4421 uint64_t pg = vg[reg_off >> 6];
4422 do {
4423 if ((pg >> (reg_off & 63)) & 1) {
4424 check(env, mtedesc, addr, ra);
4425 }
4426 reg_off += esize;
4427 mem_off += msize;
4428 } while (reg_off & 63);
4429 } while (reg_off <= reg_last);
4430 }
4431 }
4432
4433 typedef void sve_cont_ldst_mte_check_fn(SVEContLdSt *info, CPUARMState *env,
4434 uint64_t *vg, target_ulong addr,
4435 int esize, int msize, uint32_t mtedesc,
4436 uintptr_t ra);
4437
4438 static void sve_cont_ldst_mte_check1(SVEContLdSt *info, CPUARMState *env,
4439 uint64_t *vg, target_ulong addr,
4440 int esize, int msize, uint32_t mtedesc,
4441 uintptr_t ra)
4442 {
4443 sve_cont_ldst_mte_check_int(info, env, vg, addr, esize, msize,
4444 mtedesc, ra, mte_check1);
4445 }
4446
4447 static void sve_cont_ldst_mte_checkN(SVEContLdSt *info, CPUARMState *env,
4448 uint64_t *vg, target_ulong addr,
4449 int esize, int msize, uint32_t mtedesc,
4450 uintptr_t ra)
4451 {
4452 sve_cont_ldst_mte_check_int(info, env, vg, addr, esize, msize,
4453 mtedesc, ra, mte_checkN);
4454 }
4455
4456
4457 /*
4458  * Common helper for all contiguous 1,2,3,4-register predicated loads.
4459 */
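/*
 * The flow is: (1) scan the predicate for the active element bounds;
 * (2) probe both pages, taking any required fault now (FAULT_ALL);
 * (3) report watchpoints and perform MTE checks for all active elements;
 * (4) if either page is MMIO, load via the tlb path into scratch registers
 * so that a bus fault leaves the destinations untouched; otherwise
 * (5) zero the destinations and copy from host memory, handling any
 * page-spanning element and the second page separately.
 */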
4460 static inline QEMU_ALWAYS_INLINE
4461 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
4462 uint32_t desc, const uintptr_t retaddr,
4463 const int esz, const int msz, const int N, uint32_t mtedesc,
4464 sve_ldst1_host_fn *host_fn,
4465 sve_ldst1_tlb_fn *tlb_fn,
4466 sve_cont_ldst_mte_check_fn *mte_check_fn)
4467 {
4468 const unsigned rd = simd_data(desc);
4469 const intptr_t reg_max = simd_oprsz(desc);
4470 intptr_t reg_off, reg_last, mem_off;
4471 SVEContLdSt info;
4472 void *host;
4473 int flags, i;
4474
4475 /* Find the active elements. */
4476 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
4477 /* The entire predicate was false; no load occurs. */
4478 for (i = 0; i < N; ++i) {
4479 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
4480 }
4481 return;
4482 }
4483
4484 /* Probe the page(s). Exit with exception for any invalid page. */
4485 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
4486
4487 /* Handle watchpoints for all active elements. */
4488 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
4489 BP_MEM_READ, retaddr);
4490
4491 /*
4492 * Handle mte checks for all active elements.
4493 * Since TBI must be set for MTE, !mtedesc => !mte_active.
4494 */
4495 if (mte_check_fn && mtedesc) {
4496 mte_check_fn(&info, env, vg, addr, 1 << esz, N << msz,
4497 mtedesc, retaddr);
4498 }
4499
4500 flags = info.page[0].flags | info.page[1].flags;
4501 if (unlikely(flags != 0)) {
4502 #ifdef CONFIG_USER_ONLY
4503 g_assert_not_reached();
4504 #else
4505 /*
4506 * At least one page includes MMIO.
4507 * Any bus operation can fail with cpu_transaction_failed,
4508 * which for ARM will raise SyncExternal. Perform the load
4509 * into scratch memory to preserve register state until the end.
4510 */
4511 ARMVectorReg scratch[4] = { };
4512
4513 mem_off = info.mem_off_first[0];
4514 reg_off = info.reg_off_first[0];
4515 reg_last = info.reg_off_last[1];
4516 if (reg_last < 0) {
4517 reg_last = info.reg_off_split;
4518 if (reg_last < 0) {
4519 reg_last = info.reg_off_last[0];
4520 }
4521 }
4522
4523 do {
4524 uint64_t pg = vg[reg_off >> 6];
4525 do {
4526 if ((pg >> (reg_off & 63)) & 1) {
4527 for (i = 0; i < N; ++i) {
4528 tlb_fn(env, &scratch[i], reg_off,
4529 addr + mem_off + (i << msz), retaddr);
4530 }
4531 }
4532 reg_off += 1 << esz;
4533 mem_off += N << msz;
4534 } while (reg_off & 63);
4535 } while (reg_off <= reg_last);
4536
4537 for (i = 0; i < N; ++i) {
4538 memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
4539 }
4540 return;
4541 #endif
4542 }
4543
4544 /* The entire operation is in RAM, on valid pages. */
4545
4546 for (i = 0; i < N; ++i) {
4547 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
4548 }
4549
4550 mem_off = info.mem_off_first[0];
4551 reg_off = info.reg_off_first[0];
4552 reg_last = info.reg_off_last[0];
4553 host = info.page[0].host;
4554
4555 while (reg_off <= reg_last) {
4556 uint64_t pg = vg[reg_off >> 6];
4557 do {
4558 if ((pg >> (reg_off & 63)) & 1) {
4559 for (i = 0; i < N; ++i) {
4560 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
4561 host + mem_off + (i << msz));
4562 }
4563 }
4564 reg_off += 1 << esz;
4565 mem_off += N << msz;
4566 } while (reg_off <= reg_last && (reg_off & 63));
4567 }
4568
4569 /*
4570 * Use the slow path to manage the cross-page misalignment.
4571 * But we know this is RAM and cannot trap.
4572 */
4573 mem_off = info.mem_off_split;
4574 if (unlikely(mem_off >= 0)) {
4575 reg_off = info.reg_off_split;
4576 for (i = 0; i < N; ++i) {
4577 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
4578 addr + mem_off + (i << msz), retaddr);
4579 }
4580 }
4581
4582 mem_off = info.mem_off_first[1];
4583 if (unlikely(mem_off >= 0)) {
4584 reg_off = info.reg_off_first[1];
4585 reg_last = info.reg_off_last[1];
4586 host = info.page[1].host;
4587
4588 do {
4589 uint64_t pg = vg[reg_off >> 6];
4590 do {
4591 if ((pg >> (reg_off & 63)) & 1) {
4592 for (i = 0; i < N; ++i) {
4593 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
4594 host + mem_off + (i << msz));
4595 }
4596 }
4597 reg_off += 1 << esz;
4598 mem_off += N << msz;
4599 } while (reg_off & 63);
4600 } while (reg_off <= reg_last);
4601 }
4602 }
4603
4604 static inline QEMU_ALWAYS_INLINE
4605 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
4606 uint32_t desc, const uintptr_t ra,
4607 const int esz, const int msz, const int N,
4608 sve_ldst1_host_fn *host_fn,
4609 sve_ldst1_tlb_fn *tlb_fn)
4610 {
4611 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
4612 int bit55 = extract64(addr, 55, 1);
4613
4614 /* Remove mtedesc from the normal sve descriptor. */
4615 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
4616
4617 /* Perform gross MTE suppression early. */
4618 if (!tbi_check(desc, bit55) ||
4619 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
4620 mtedesc = 0;
4621 }
4622
4623 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn,
4624 N == 1 ? sve_cont_ldst_mte_check1 : sve_cont_ldst_mte_checkN);
4625 }
4626
4627 #define DO_LD1_1(NAME, ESZ) \
4628 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \
4629 target_ulong addr, uint32_t desc) \
4630 { \
4631 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \
4632 sve_##NAME##_host, sve_##NAME##_tlb, NULL); \
4633 } \
4634 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \
4635 target_ulong addr, uint32_t desc) \
4636 { \
4637 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \
4638 sve_##NAME##_host, sve_##NAME##_tlb); \
4639 }
4640
4641 #define DO_LD1_2(NAME, ESZ, MSZ) \
4642 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \
4643 target_ulong addr, uint32_t desc) \
4644 { \
4645 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
4646 sve_##NAME##_le_host, sve_##NAME##_le_tlb, NULL); \
4647 } \
4648 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \
4649 target_ulong addr, uint32_t desc) \
4650 { \
4651 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
4652 sve_##NAME##_be_host, sve_##NAME##_be_tlb, NULL); \
4653 } \
4654 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
4655 target_ulong addr, uint32_t desc) \
4656 { \
4657 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
4658 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
4659 } \
4660 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
4661 target_ulong addr, uint32_t desc) \
4662 { \
4663 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
4664 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
4665 }
4666
4667 DO_LD1_1(ld1bb, MO_8)
4668 DO_LD1_1(ld1bhu, MO_16)
4669 DO_LD1_1(ld1bhs, MO_16)
4670 DO_LD1_1(ld1bsu, MO_32)
4671 DO_LD1_1(ld1bss, MO_32)
4672 DO_LD1_1(ld1bdu, MO_64)
4673 DO_LD1_1(ld1bds, MO_64)
4674
4675 DO_LD1_2(ld1hh, MO_16, MO_16)
4676 DO_LD1_2(ld1hsu, MO_32, MO_16)
4677 DO_LD1_2(ld1hss, MO_32, MO_16)
4678 DO_LD1_2(ld1hdu, MO_64, MO_16)
4679 DO_LD1_2(ld1hds, MO_64, MO_16)
4680
4681 DO_LD1_2(ld1ss, MO_32, MO_32)
4682 DO_LD1_2(ld1sdu, MO_64, MO_32)
4683 DO_LD1_2(ld1sds, MO_64, MO_32)
4684
4685 DO_LD1_2(ld1dd, MO_64, MO_64)
4686
4687 #undef DO_LD1_1
4688 #undef DO_LD1_2
4689
4690 #define DO_LDN_1(N) \
4691 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \
4692 target_ulong addr, uint32_t desc) \
4693 { \
4694 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \
4695 sve_ld1bb_host, sve_ld1bb_tlb, NULL); \
4696 } \
4697 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \
4698 target_ulong addr, uint32_t desc) \
4699 { \
4700 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \
4701 sve_ld1bb_host, sve_ld1bb_tlb); \
4702 }
4703
4704 #define DO_LDN_2(N, SUFF, ESZ) \
4705 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \
4706 target_ulong addr, uint32_t desc) \
4707 { \
4708 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
4709 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb, NULL); \
4710 } \
4711 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \
4712 target_ulong addr, uint32_t desc) \
4713 { \
4714 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
4715 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb, NULL); \
4716 } \
4717 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \
4718 target_ulong addr, uint32_t desc) \
4719 { \
4720 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
4721 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
4722 } \
4723 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \
4724 target_ulong addr, uint32_t desc) \
4725 { \
4726 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
4727 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
4728 }
4729
4730 DO_LDN_1(2)
4731 DO_LDN_1(3)
4732 DO_LDN_1(4)
4733
4734 DO_LDN_2(2, hh, MO_16)
4735 DO_LDN_2(3, hh, MO_16)
4736 DO_LDN_2(4, hh, MO_16)
4737
4738 DO_LDN_2(2, ss, MO_32)
4739 DO_LDN_2(3, ss, MO_32)
4740 DO_LDN_2(4, ss, MO_32)
4741
4742 DO_LDN_2(2, dd, MO_64)
4743 DO_LDN_2(3, dd, MO_64)
4744 DO_LDN_2(4, dd, MO_64)
4745
4746 #undef DO_LDN_1
4747 #undef DO_LDN_2
4748
4749 /*
4750 * Load contiguous data, first-fault and no-fault.
4751 *
4752 * For user-only, one could argue that we should hold the mmap_lock during
4753 * the operation so that there is no race between page_check_range and the
4754 * load operation. However, unmapping pages out from under a running thread
4755 * is extraordinarily unlikely. This theoretical race condition also affects
4756 * linux-user/ in its get_user/put_user macros.
4757 *
4758 * TODO: Construct some helpers, written in assembly, that interact with
4759 * handle_cpu_signal to produce memory ops which can properly report errors
4760 * without racing.
4761 */
4762
4763 /* Fault on byte I. All bits in FFR from I are cleared. The vector
4764 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
4765 * option, which leaves subsequent data unchanged.
4766 */
4767 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
4768 {
4769 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
4770
4771 if (i & 63) {
4772 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
4773 i = ROUND_UP(i, 64);
4774 }
4775 for (; i < oprsz; i += 64) {
4776 ffr[i / 64] = 0;
4777 }
4778 }
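/*
 * For example, with a 256-byte vector and a fault at byte 70,
 * record_fault(env, 70, 256) keeps only the low six bits of ffr[1]
 * (MAKE_64BIT_MASK(0, 6), i.e. FFR bits 64..69), zeroes ffr[2] and
 * ffr[3], and leaves ffr[0] untouched, so FFR bits 70 and above are
 * cleared.
 */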
4779
4780 /*
4781 * Common helper for all contiguous no-fault and first-fault loads.
4782 */
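/*
 * The shape of the code below: the first active element is handled
 * specially -- for first-fault it may trap normally (MTE check, MMIO,
 * cross-page via the tlb path); for no-fault any problem with it simply
 * records a fault.  Every subsequent element is MemSingleNF: an MMIO
 * page, watchpoint hit, MTE failure, page-crossing element, or element
 * on the second page does not trap, but clears FFR from that element
 * onward via record_fault() and stops the load.
 */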
4783 static inline QEMU_ALWAYS_INLINE
4784 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
4785 uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
4786 const int esz, const int msz, const SVEContFault fault,
4787 sve_ldst1_host_fn *host_fn,
4788 sve_ldst1_tlb_fn *tlb_fn)
4789 {
4790 const unsigned rd = simd_data(desc);
4791 void *vd = &env->vfp.zregs[rd];
4792 const intptr_t reg_max = simd_oprsz(desc);
4793 intptr_t reg_off, mem_off, reg_last;
4794 SVEContLdSt info;
4795 int flags;
4796 void *host;
4797
4798 /* Find the active elements. */
4799 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
4800 /* The entire predicate was false; no load occurs. */
4801 memset(vd, 0, reg_max);
4802 return;
4803 }
4804 reg_off = info.reg_off_first[0];
4805
4806 /* Probe the page(s). */
4807 if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
4808 /* Fault on first element. */
4809 tcg_debug_assert(fault == FAULT_NO);
4810 memset(vd, 0, reg_max);
4811 goto do_fault;
4812 }
4813
4814 mem_off = info.mem_off_first[0];
4815 flags = info.page[0].flags;
4816
4817 /*
4818 * Disable MTE checking if the Tagged bit is not set. Since TBI must
4819 * be set within MTEDESC for MTE, !mtedesc => !mte_active.
4820 */
4821 if (arm_tlb_mte_tagged(&info.page[0].attrs)) {
4822 mtedesc = 0;
4823 }
4824
4825 if (fault == FAULT_FIRST) {
4826 /* Trapping mte check for the first-fault element. */
4827 if (mtedesc) {
4828 mte_check1(env, mtedesc, addr + mem_off, retaddr);
4829 }
4830
4831 /*
4832 * Special handling of the first active element,
4833 * if it crosses a page boundary or is MMIO.
4834 */
4835 bool is_split = mem_off == info.mem_off_split;
4836 if (unlikely(flags != 0) || unlikely(is_split)) {
4837 /*
4838 * Use the slow path for cross-page handling.
4839 * Might trap for MMIO or watchpoints.
4840 */
4841 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
4842
4843 /* After any fault, zero the other elements. */
4844 swap_memzero(vd, reg_off);
4845 reg_off += 1 << esz;
4846 mem_off += 1 << msz;
4847 swap_memzero(vd + reg_off, reg_max - reg_off);
4848
4849 if (is_split) {
4850 goto second_page;
4851 }
4852 } else {
4853 memset(vd, 0, reg_max);
4854 }
4855 } else {
4856 memset(vd, 0, reg_max);
4857 if (unlikely(mem_off == info.mem_off_split)) {
4858 /* The first active element crosses a page boundary. */
4859 flags |= info.page[1].flags;
4860 if (unlikely(flags & TLB_MMIO)) {
4861 /* Some page is MMIO, see below. */
4862 goto do_fault;
4863 }
4864 if (unlikely(flags & TLB_WATCHPOINT) &&
4865 (cpu_watchpoint_address_matches
4866 (env_cpu(env), addr + mem_off, 1 << msz)
4867 & BP_MEM_READ)) {
4868 /* Watchpoint hit, see below. */
4869 goto do_fault;
4870 }
4871 if (mtedesc && !mte_probe1(env, mtedesc, addr + mem_off)) {
4872 goto do_fault;
4873 }
4874 /*
4875 * Use the slow path for cross-page handling.
4876 * This is RAM, without a watchpoint, and will not trap.
4877 */
4878 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
4879 goto second_page;
4880 }
4881 }
4882
4883 /*
4884 * From this point on, all memory operations are MemSingleNF.
4885 *
4886 * Per the MemSingleNF pseudocode, a no-fault load from Device memory
4887 * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
4888 *
4889  * Unfortunately we do not have access to the memory attributes from the
4890 * PTE to tell Device memory from Normal memory. So we make a mostly
4891 * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
4892 * This gives the right answer for the common cases of "Normal memory,
4893 * backed by host RAM" and "Device memory, backed by MMIO".
4894 * The architecture allows us to suppress an NF load and return
4895 * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
4896 * case of "Normal memory, backed by MMIO" is permitted. The case we
4897 * get wrong is "Device memory, backed by host RAM", for which we
4898  * should return (UNKNOWN, FAULT) but do not.
4899 *
4900 * Similarly, CPU_BP breakpoints would raise exceptions, and so
4901 * return (UNKNOWN, FAULT). For simplicity, we consider gdb and
4902 * architectural breakpoints the same.
4903 */
4904 if (unlikely(flags & TLB_MMIO)) {
4905 goto do_fault;
4906 }
4907
4908 reg_last = info.reg_off_last[0];
4909 host = info.page[0].host;
4910
4911 do {
4912 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
4913 do {
4914 if ((pg >> (reg_off & 63)) & 1) {
4915 if (unlikely(flags & TLB_WATCHPOINT) &&
4916 (cpu_watchpoint_address_matches
4917 (env_cpu(env), addr + mem_off, 1 << msz)
4918 & BP_MEM_READ)) {
4919 goto do_fault;
4920 }
4921 if (mtedesc && !mte_probe1(env, mtedesc, addr + mem_off)) {
4922 goto do_fault;
4923 }
4924 host_fn(vd, reg_off, host + mem_off);
4925 }
4926 reg_off += 1 << esz;
4927 mem_off += 1 << msz;
4928 } while (reg_off <= reg_last && (reg_off & 63));
4929 } while (reg_off <= reg_last);
4930
4931 /*
4932 * MemSingleNF is allowed to fail for any reason. We have special
4933 * code above to handle the first element crossing a page boundary.
4934 * As an implementation choice, decline to handle a cross-page element
4935 * in any other position.
4936 */
4937 reg_off = info.reg_off_split;
4938 if (reg_off >= 0) {
4939 goto do_fault;
4940 }
4941
4942 second_page:
4943 reg_off = info.reg_off_first[1];
4944 if (likely(reg_off < 0)) {
4945 /* No active elements on the second page. All done. */
4946 return;
4947 }
4948
4949 /*
4950 * MemSingleNF is allowed to fail for any reason. As an implementation
4951 * choice, decline to handle elements on the second page. This should
4952 * be low frequency as the guest walks through memory -- the next
4953 * iteration of the guest's loop should be aligned on the page boundary,
4954 * and then all following iterations will stay aligned.
4955 */
4956
4957 do_fault:
4958 record_fault(env, reg_off, reg_max);
4959 }
4960
4961 static inline QEMU_ALWAYS_INLINE
4962 void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
4963 uint32_t desc, const uintptr_t retaddr,
4964 const int esz, const int msz, const SVEContFault fault,
4965 sve_ldst1_host_fn *host_fn,
4966 sve_ldst1_tlb_fn *tlb_fn)
4967 {
4968 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
4969 int bit55 = extract64(addr, 55, 1);
4970
4971 /* Remove mtedesc from the normal sve descriptor. */
4972 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
4973
4974 /* Perform gross MTE suppression early. */
4975 if (!tbi_check(desc, bit55) ||
4976 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
4977 mtedesc = 0;
4978 }
4979
4980 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
4981 esz, msz, fault, host_fn, tlb_fn);
4982 }
4983
4984 #define DO_LDFF1_LDNF1_1(PART, ESZ) \
4985 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \
4986 target_ulong addr, uint32_t desc) \
4987 { \
4988 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
4989 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
4990 } \
4991 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \
4992 target_ulong addr, uint32_t desc) \
4993 { \
4994 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
4995 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
4996 } \
4997 void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \
4998 target_ulong addr, uint32_t desc) \
4999 { \
5000 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
5001 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
5002 } \
5003 void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \
5004 target_ulong addr, uint32_t desc) \
5005 { \
5006 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
5007 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
5008 }
5009
5010 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
5011 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \
5012 target_ulong addr, uint32_t desc) \
5013 { \
5014 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
5015 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
5016 } \
5017 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \
5018 target_ulong addr, uint32_t desc) \
5019 { \
5020 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
5021 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
5022 } \
5023 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \
5024 target_ulong addr, uint32_t desc) \
5025 { \
5026 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
5027 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
5028 } \
5029 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \
5030 target_ulong addr, uint32_t desc) \
5031 { \
5032 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
5033 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
5034 } \
5035 void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
5036 target_ulong addr, uint32_t desc) \
5037 { \
5038 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
5039 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
5040 } \
5041 void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
5042 target_ulong addr, uint32_t desc) \
5043 { \
5044 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
5045 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
5046 } \
5047 void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
5048 target_ulong addr, uint32_t desc) \
5049 { \
5050 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
5051 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
5052 } \
5053 void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
5054 target_ulong addr, uint32_t desc) \
5055 { \
5056 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
5057 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
5058 }
5059
5060 DO_LDFF1_LDNF1_1(bb, MO_8)
5061 DO_LDFF1_LDNF1_1(bhu, MO_16)
5062 DO_LDFF1_LDNF1_1(bhs, MO_16)
5063 DO_LDFF1_LDNF1_1(bsu, MO_32)
5064 DO_LDFF1_LDNF1_1(bss, MO_32)
5065 DO_LDFF1_LDNF1_1(bdu, MO_64)
5066 DO_LDFF1_LDNF1_1(bds, MO_64)
5067
5068 DO_LDFF1_LDNF1_2(hh, MO_16, MO_16)
5069 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
5070 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
5071 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
5072 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
5073
5074 DO_LDFF1_LDNF1_2(ss, MO_32, MO_32)
5075 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
5076 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
5077
5078 DO_LDFF1_LDNF1_2(dd, MO_64, MO_64)
5079
5080 #undef DO_LDFF1_LDNF1_1
5081 #undef DO_LDFF1_LDNF1_2
5082
5083 /*
5084 * Common helper for all contiguous 1,2,3,4-register predicated stores.
5085 */
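/*
 * This mirrors sve_ldN_r above, except that nothing is written back to
 * the register file, so no scratch buffer or destination zeroing is
 * needed; if an MMIO access faults, the store is left partially complete.
 */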
5086
5087 static inline QEMU_ALWAYS_INLINE
5088 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
5089 uint32_t desc, const uintptr_t retaddr,
5090 const int esz, const int msz, const int N, uint32_t mtedesc,
5091 sve_ldst1_host_fn *host_fn,
5092 sve_ldst1_tlb_fn *tlb_fn,
5093 sve_cont_ldst_mte_check_fn *mte_check_fn)
5094 {
5095 const unsigned rd = simd_data(desc);
5096 const intptr_t reg_max = simd_oprsz(desc);
5097 intptr_t reg_off, reg_last, mem_off;
5098 SVEContLdSt info;
5099 void *host;
5100 int i, flags;
5101
5102 /* Find the active elements. */
5103 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
5104 /* The entire predicate was false; no store occurs. */
5105 return;
5106 }
5107
5108 /* Probe the page(s). Exit with exception for any invalid page. */
5109 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
5110
5111 /* Handle watchpoints for all active elements. */
5112 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
5113 BP_MEM_WRITE, retaddr);
5114
5115 /*
5116 * Handle mte checks for all active elements.
5117 * Since TBI must be set for MTE, !mtedesc => !mte_active.
5118 */
5119 if (mte_check_fn && mtedesc) {
5120 mte_check_fn(&info, env, vg, addr, 1 << esz, N << msz,
5121 mtedesc, retaddr);
5122 }
5123
5124 flags = info.page[0].flags | info.page[1].flags;
5125 if (unlikely(flags != 0)) {
5126 #ifdef CONFIG_USER_ONLY
5127 g_assert_not_reached();
5128 #else
5129 /*
5130 * At least one page includes MMIO.
5131 * Any bus operation can fail with cpu_transaction_failed,
5132 * which for ARM will raise SyncExternal. We cannot avoid
5133 * this fault and will leave with the store incomplete.
5134 */
5135 mem_off = info.mem_off_first[0];
5136 reg_off = info.reg_off_first[0];
5137 reg_last = info.reg_off_last[1];
5138 if (reg_last < 0) {
5139 reg_last = info.reg_off_split;
5140 if (reg_last < 0) {
5141 reg_last = info.reg_off_last[0];
5142 }
5143 }
5144
5145 do {
5146 uint64_t pg = vg[reg_off >> 6];
5147 do {
5148 if ((pg >> (reg_off & 63)) & 1) {
5149 for (i = 0; i < N; ++i) {
5150 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5151 addr + mem_off + (i << msz), retaddr);
5152 }
5153 }
5154 reg_off += 1 << esz;
5155 mem_off += N << msz;
5156 } while (reg_off & 63);
5157 } while (reg_off <= reg_last);
5158 return;
5159 #endif
5160 }
5161
5162 mem_off = info.mem_off_first[0];
5163 reg_off = info.reg_off_first[0];
5164 reg_last = info.reg_off_last[0];
5165 host = info.page[0].host;
5166
5167 while (reg_off <= reg_last) {
5168 uint64_t pg = vg[reg_off >> 6];
5169 do {
5170 if ((pg >> (reg_off & 63)) & 1) {
5171 for (i = 0; i < N; ++i) {
5172 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5173 host + mem_off + (i << msz));
5174 }
5175 }
5176 reg_off += 1 << esz;
5177 mem_off += N << msz;
5178 } while (reg_off <= reg_last && (reg_off & 63));
5179 }
5180
5181 /*
5182 * Use the slow path to manage the cross-page misalignment.
5183 * But we know this is RAM and cannot trap.
5184 */
5185 mem_off = info.mem_off_split;
5186 if (unlikely(mem_off >= 0)) {
5187 reg_off = info.reg_off_split;
5188 for (i = 0; i < N; ++i) {
5189 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5190 addr + mem_off + (i << msz), retaddr);
5191 }
5192 }
5193
5194 mem_off = info.mem_off_first[1];
5195 if (unlikely(mem_off >= 0)) {
5196 reg_off = info.reg_off_first[1];
5197 reg_last = info.reg_off_last[1];
5198 host = info.page[1].host;
5199
5200 do {
5201 uint64_t pg = vg[reg_off >> 6];
5202 do {
5203 if ((pg >> (reg_off & 63)) & 1) {
5204 for (i = 0; i < N; ++i) {
5205 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5206 host + mem_off + (i << msz));
5207 }
5208 }
5209 reg_off += 1 << esz;
5210 mem_off += N << msz;
5211 } while (reg_off & 63);
5212 } while (reg_off <= reg_last);
5213 }
5214 }
5215
5216 static inline QEMU_ALWAYS_INLINE
5217 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
5218 uint32_t desc, const uintptr_t ra,
5219 const int esz, const int msz, const int N,
5220 sve_ldst1_host_fn *host_fn,
5221 sve_ldst1_tlb_fn *tlb_fn)
5222 {
5223 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5224 int bit55 = extract64(addr, 55, 1);
5225
5226 /* Remove mtedesc from the normal sve descriptor. */
5227 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5228
5229 /* Perform gross MTE suppression early. */
5230 if (!tbi_check(desc, bit55) ||
5231 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
5232 mtedesc = 0;
5233 }
5234
5235 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn,
5236 N == 1 ? sve_cont_ldst_mte_check1 : sve_cont_ldst_mte_checkN);
5237 }
5238
5239 #define DO_STN_1(N, NAME, ESZ) \
5240 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \
5241 target_ulong addr, uint32_t desc) \
5242 { \
5243 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \
5244 sve_st1##NAME##_host, sve_st1##NAME##_tlb, NULL); \
5245 } \
5246 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \
5247 target_ulong addr, uint32_t desc) \
5248 { \
5249 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \
5250 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
5251 }
5252
5253 #define DO_STN_2(N, NAME, ESZ, MSZ) \
5254 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \
5255 target_ulong addr, uint32_t desc) \
5256 { \
5257 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
5258 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb, NULL); \
5259 } \
5260 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \
5261 target_ulong addr, uint32_t desc) \
5262 { \
5263 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
5264 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb, NULL); \
5265 } \
5266 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
5267 target_ulong addr, uint32_t desc) \
5268 { \
5269 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
5270 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
5271 } \
5272 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
5273 target_ulong addr, uint32_t desc) \
5274 { \
5275 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
5276 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
5277 }
5278
5279 DO_STN_1(1, bb, MO_8)
5280 DO_STN_1(1, bh, MO_16)
5281 DO_STN_1(1, bs, MO_32)
5282 DO_STN_1(1, bd, MO_64)
5283 DO_STN_1(2, bb, MO_8)
5284 DO_STN_1(3, bb, MO_8)
5285 DO_STN_1(4, bb, MO_8)
5286
5287 DO_STN_2(1, hh, MO_16, MO_16)
5288 DO_STN_2(1, hs, MO_32, MO_16)
5289 DO_STN_2(1, hd, MO_64, MO_16)
5290 DO_STN_2(2, hh, MO_16, MO_16)
5291 DO_STN_2(3, hh, MO_16, MO_16)
5292 DO_STN_2(4, hh, MO_16, MO_16)
5293
5294 DO_STN_2(1, ss, MO_32, MO_32)
5295 DO_STN_2(1, sd, MO_64, MO_32)
5296 DO_STN_2(2, ss, MO_32, MO_32)
5297 DO_STN_2(3, ss, MO_32, MO_32)
5298 DO_STN_2(4, ss, MO_32, MO_32)
5299
5300 DO_STN_2(1, dd, MO_64, MO_64)
5301 DO_STN_2(2, dd, MO_64, MO_64)
5302 DO_STN_2(3, dd, MO_64, MO_64)
5303 DO_STN_2(4, dd, MO_64, MO_64)
5304
5305 #undef DO_STN_1
5306 #undef DO_STN_2
5307
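/*
 * For reference, each DO_STN_2 line above emits four helpers.  For example,
 * DO_STN_2(2, ss, MO_32, MO_32) expands (roughly, after preprocessing) to:
 *
 *   void helper_sve_st2ss_le_r(CPUARMState *env, void *vg,
 *                              target_ulong addr, uint32_t desc)
 *   {
 *       sve_stN_r(env, vg, addr, desc, GETPC(), MO_32, MO_32, 2, 0,
 *                 sve_st1ss_le_host, sve_st1ss_le_tlb, NULL);
 *   }
 *
 * plus the _be_r, _le_r_mte and _be_r_mte variants.  HELPER(x) resolves to
 * helper_x; the expansion is shown only as a reading aid for the tables.
 */
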
5308 /*
5309 * Loads with a vector index.
5310 */
5311
5312 /*
5313 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
5314 */
5315 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
5316
5317 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
5318 {
5319 return *(uint32_t *)(reg + H1_4(reg_ofs));
5320 }
5321
5322 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
5323 {
5324 return *(int32_t *)(reg + H1_4(reg_ofs));
5325 }
5326
5327 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
5328 {
5329 return (uint32_t)*(uint64_t *)(reg + reg_ofs);
5330 }
5331
5332 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
5333 {
5334 return (int32_t)*(uint64_t *)(reg + reg_ofs);
5335 }
5336
5337 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
5338 {
5339 return *(uint64_t *)(reg + reg_ofs);
5340 }
5341
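/*
 * The names above appear to follow the translate-sve.c convention:
 * "zsu" = 32-bit unsigned index, "zss" = 32-bit signed index,
 * "zd" = 64-bit index, while the trailing _s/_d selects whether the
 * index lives in a 32-bit or 64-bit vector element (hence the H1_4
 * host-endian fixup only in the _s variants).
 */
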
5342 static inline QEMU_ALWAYS_INLINE
5343 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5344 target_ulong base, uint32_t desc, uintptr_t retaddr,
5345 uint32_t mtedesc, int esize, int msize,
5346 zreg_off_fn *off_fn,
5347 sve_ldst1_host_fn *host_fn,
5348 sve_ldst1_tlb_fn *tlb_fn)
5349 {
5350 const int mmu_idx = cpu_mmu_index(env, false);
5351 const intptr_t reg_max = simd_oprsz(desc);
5352 const int scale = simd_data(desc);
5353 ARMVectorReg scratch;
5354 intptr_t reg_off;
5355 SVEHostPage info, info2;
5356
5357 memset(&scratch, 0, reg_max);
5358 reg_off = 0;
5359 do {
5360 uint64_t pg = vg[reg_off >> 6];
5361 do {
5362 if (likely(pg & 1)) {
5363 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
5364 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
5365
5366 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
5367 mmu_idx, retaddr);
5368
5369 if (likely(in_page >= msize)) {
5370 if (unlikely(info.flags & TLB_WATCHPOINT)) {
5371 cpu_check_watchpoint(env_cpu(env), addr, msize,
5372 info.attrs, BP_MEM_READ, retaddr);
5373 }
5374 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
5375 mte_check1(env, mtedesc, addr, retaddr);
5376 }
5377 host_fn(&scratch, reg_off, info.host);
5378 } else {
5379 /* Element crosses the page boundary. */
5380 sve_probe_page(&info2, false, env, addr + in_page, 0,
5381 MMU_DATA_LOAD, mmu_idx, retaddr);
5382 if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
5383 cpu_check_watchpoint(env_cpu(env), addr,
5384 msize, info.attrs,
5385 BP_MEM_READ, retaddr);
5386 }
5387 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
5388 mte_check1(env, mtedesc, addr, retaddr);
5389 }
5390 tlb_fn(env, &scratch, reg_off, addr, retaddr);
5391 }
5392 }
5393 reg_off += esize;
5394 pg >>= esize;
5395 } while (reg_off & 63);
5396 } while (reg_off < reg_max);
5397
5398 /* Wait until all exceptions have been raised to write back. */
5399 memcpy(vd, &scratch, reg_max);
5400 }
5401
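/*
 * A self-contained sketch of the gather shape used by sve_ld1_z, with the
 * QEMU plumbing (page probing, watchpoints, MTE) stripped out.  gather32()
 * and its parameters are hypothetical names; base here is a plain host
 * pointer rather than a guest address.  Guarded out, illustration only.
 */
#if 0
#include <stdint.h>
#include <string.h>

/*
 * dst[i] = *(uint32_t *)(base + (idx[i] << scale)) for every
 * predicated-true 32-bit element.  Results accumulate in a scratch buffer
 * and are copied out only at the end, mirroring the "wait until all
 * exceptions have been raised" writeback above.
 */
static void gather32(uint32_t *dst, const uint64_t *vg, const uint32_t *idx,
                     const uint8_t *base, int scale, intptr_t reg_max)
{
    uint32_t scratch[64] = { 0 };       /* assumes reg_max <= 256 bytes */
    intptr_t reg_off = 0;

    do {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                const uint8_t *p =
                    base + ((uint64_t)idx[reg_off >> 2] << scale);
                memcpy(&scratch[reg_off >> 2], p, sizeof(uint32_t));
            }
            reg_off += 4;               /* 32-bit elements */
        } while ((reg_off & 63) && reg_off < reg_max);
    } while (reg_off < reg_max);

    memcpy(dst, scratch, reg_max);
}
#endif
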
5402 static inline QEMU_ALWAYS_INLINE
5403 void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5404 target_ulong base, uint32_t desc, uintptr_t retaddr,
5405 int esize, int msize, zreg_off_fn *off_fn,
5406 sve_ldst1_host_fn *host_fn,
5407 sve_ldst1_tlb_fn *tlb_fn)
5408 {
5409 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5410 /* Remove mtedesc from the normal sve descriptor. */
5411 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5412
5413 /*
5414 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
5415 * move base entirely across the address space hole, so it cannot
5416 * change the pointer tag or the bit55 selector. We could therefore
5417 * examine TBI + TCMA here, as we do for sve_ldN_r_mte().
5418 */
5419 sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
5420 esize, msize, off_fn, host_fn, tlb_fn);
5421 }
5422
5423 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
5424 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
5425 void *vm, target_ulong base, uint32_t desc) \
5426 { \
5427 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
5428 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
5429 } \
5430 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
5431 void *vm, target_ulong base, uint32_t desc) \
5432 { \
5433 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
5434 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
5435 }
5436
5437 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
5438 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
5439 void *vm, target_ulong base, uint32_t desc) \
5440 { \
5441 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
5442 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
5443 } \
5444 void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
5445 void *vm, target_ulong base, uint32_t desc) \
5446 { \
5447 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
5448 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
5449 }
5450
5451 DO_LD1_ZPZ_S(bsu, zsu, MO_8)
5452 DO_LD1_ZPZ_S(bsu, zss, MO_8)
5453 DO_LD1_ZPZ_D(bdu, zsu, MO_8)
5454 DO_LD1_ZPZ_D(bdu, zss, MO_8)
5455 DO_LD1_ZPZ_D(bdu, zd, MO_8)
5456
5457 DO_LD1_ZPZ_S(bss, zsu, MO_8)
5458 DO_LD1_ZPZ_S(bss, zss, MO_8)
5459 DO_LD1_ZPZ_D(bds, zsu, MO_8)
5460 DO_LD1_ZPZ_D(bds, zss, MO_8)
5461 DO_LD1_ZPZ_D(bds, zd, MO_8)
5462
5463 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
5464 DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
5465 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
5466 DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
5467 DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
5468
5469 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
5470 DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
5471 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
5472 DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
5473 DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
5474
5475 DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
5476 DO_LD1_ZPZ_S(hss_le, zss, MO_16)
5477 DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
5478 DO_LD1_ZPZ_D(hds_le, zss, MO_16)
5479 DO_LD1_ZPZ_D(hds_le, zd, MO_16)
5480
5481 DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
5482 DO_LD1_ZPZ_S(hss_be, zss, MO_16)
5483 DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
5484 DO_LD1_ZPZ_D(hds_be, zss, MO_16)
5485 DO_LD1_ZPZ_D(hds_be, zd, MO_16)
5486
5487 DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
5488 DO_LD1_ZPZ_S(ss_le, zss, MO_32)
5489 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
5490 DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
5491 DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
5492
5493 DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
5494 DO_LD1_ZPZ_S(ss_be, zss, MO_32)
5495 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
5496 DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
5497 DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
5498
5499 DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
5500 DO_LD1_ZPZ_D(sds_le, zss, MO_32)
5501 DO_LD1_ZPZ_D(sds_le, zd, MO_32)
5502
5503 DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
5504 DO_LD1_ZPZ_D(sds_be, zss, MO_32)
5505 DO_LD1_ZPZ_D(sds_be, zd, MO_32)
5506
5507 DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
5508 DO_LD1_ZPZ_D(dd_le, zss, MO_64)
5509 DO_LD1_ZPZ_D(dd_le, zd, MO_64)
5510
5511 DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
5512 DO_LD1_ZPZ_D(dd_be, zss, MO_64)
5513 DO_LD1_ZPZ_D(dd_be, zd, MO_64)
5514
5515 #undef DO_LD1_ZPZ_S
5516 #undef DO_LD1_ZPZ_D
5517
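/*
 * In the tables above, MEM appears to encode <memory size><destination
 * size><extension>: e.g. "bsu" is a byte zero-extended into a 32-bit
 * element, "hds_le" a little-endian halfword sign-extended into a 64-bit
 * element, and "dd_be" a big-endian doubleword with no extension.  OFS
 * selects one of the index forms (zsu/zss/zd) defined earlier.
 */
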
5518 /* First fault loads with a vector index. */
5519
5520 /*
5521 * Common helpers for all gather first-faulting loads.
5522 */
5523
5524 static inline QEMU_ALWAYS_INLINE
5525 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5526 target_ulong base, uint32_t desc, uintptr_t retaddr,
5527 uint32_t mtedesc, const int esz, const int msz,
5528 zreg_off_fn *off_fn,
5529 sve_ldst1_host_fn *host_fn,
5530 sve_ldst1_tlb_fn *tlb_fn)
5531 {
5532 const int mmu_idx = cpu_mmu_index(env, false);
5533 const intptr_t reg_max = simd_oprsz(desc);
5534 const int scale = simd_data(desc);
5535 const int esize = 1 << esz;
5536 const int msize = 1 << msz;
5537 intptr_t reg_off;
5538 SVEHostPage info;
5539 target_ulong addr, in_page;
5540
5541 /* Skip to the first true predicate. */
5542 reg_off = find_next_active(vg, 0, reg_max, esz);
5543 if (unlikely(reg_off >= reg_max)) {
5544 /* The entire predicate was false; no load occurs. */
5545 memset(vd, 0, reg_max);
5546 return;
5547 }
5548
5549 /*
5550 * Probe the first element, allowing faults.
5551 */
5552 addr = base + (off_fn(vm, reg_off) << scale);
5553 if (mtedesc) {
5554 mte_check1(env, mtedesc, addr, retaddr);
5555 }
5556 tlb_fn(env, vd, reg_off, addr, retaddr);
5557
5558 /* After any fault, zero the other elements. */
5559 swap_memzero(vd, reg_off);
5560 reg_off += esize;
5561 swap_memzero(vd + reg_off, reg_max - reg_off);
5562
5563 /*
5564 * Probe the remaining elements, not allowing faults.
5565 */
5566 while (reg_off < reg_max) {
5567 uint64_t pg = vg[reg_off >> 6];
5568 do {
5569 if (likely((pg >> (reg_off & 63)) & 1)) {
5570 addr = base + (off_fn(vm, reg_off) << scale);
5571 in_page = -(addr | TARGET_PAGE_MASK);
5572
5573 if (unlikely(in_page < msize)) {
5574 /* Stop if the element crosses a page boundary. */
5575 goto fault;
5576 }
5577
5578 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
5579 mmu_idx, retaddr);
5580 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
5581 goto fault;
5582 }
5583 if (unlikely(info.flags & TLB_WATCHPOINT) &&
5584 (cpu_watchpoint_address_matches
5585 (env_cpu(env), addr, msize) & BP_MEM_READ)) {
5586 goto fault;
5587 }
5588 if (mtedesc &&
5589 arm_tlb_mte_tagged(&info.attrs) &&
5590 !mte_probe1(env, mtedesc, addr)) {
5591 goto fault;
5592 }
5593
5594 host_fn(vd, reg_off, info.host);
5595 }
5596 reg_off += esize;
5597 } while (reg_off & 63);
5598 }
5599 return;
5600
5601 fault:
5602 record_fault(env, reg_off, reg_max);
5603 }
5604
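/*
 * A self-contained sketch of the first-fault contract implemented by
 * sve_ldff1_z, with QEMU's non-faulting page probe replaced by a
 * caller-supplied can_access() predicate.  ldff32() and its parameters are
 * hypothetical names.  Guarded out, illustration only.
 */
#if 0
#include <stdint.h>
#include <stdbool.h>
#include <string.h>

typedef bool can_access_fn(const uint32_t *p);

/*
 * Load predicated 32-bit elements from per-element addresses.  The first
 * active element is accessed unconditionally (in the real helper a fault
 * may be taken there); afterwards, the first element that cannot be
 * accessed stops the loop, and its offset is returned so the caller can
 * clear the FFR from that point onward, as record_fault() does.
 */
static intptr_t ldff32(uint32_t *dst, const uint64_t *vg,
                       const uint32_t *const *addrs, intptr_t reg_max,
                       can_access_fn *can_access)
{
    intptr_t reg_off = 0;

    /* Skip to the first active element; if none, just zero the result. */
    while (reg_off < reg_max && !((vg[reg_off >> 6] >> (reg_off & 63)) & 1)) {
        reg_off += 4;
    }
    memset(dst, 0, reg_max);
    if (reg_off >= reg_max) {
        return reg_max;
    }
    dst[reg_off >> 2] = *addrs[reg_off >> 2];

    for (reg_off += 4; reg_off < reg_max; reg_off += 4) {
        if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
            if (!can_access(addrs[reg_off >> 2])) {
                return reg_off;         /* first fault: stop here */
            }
            dst[reg_off >> 2] = *addrs[reg_off >> 2];
        }
    }
    return reg_max;
}
#endif
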
5605 static inline QEMU_ALWAYS_INLINE
5606 void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5607 target_ulong base, uint32_t desc, uintptr_t retaddr,
5608 const int esz, const int msz,
5609 zreg_off_fn *off_fn,
5610 sve_ldst1_host_fn *host_fn,
5611 sve_ldst1_tlb_fn *tlb_fn)
5612 {
5613 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5614 /* Remove mtedesc from the normal sve descriptor. */
5615 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5616
5617 /*
5618 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
5619 * move base entirely across the address space hole, so it cannot
5620 * change the pointer tag or the bit55 selector. We could therefore
5621 * examine TBI + TCMA here, as we do for sve_ldN_r_mte().
5622 */
5623 sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
5624 esz, msz, off_fn, host_fn, tlb_fn);
5625 }
5626
5627 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \
5628 void HELPER(sve_ldff##MEM##_##OFS) \
5629 (CPUARMState *env, void *vd, void *vg, \
5630 void *vm, target_ulong base, uint32_t desc) \
5631 { \
5632 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \
5633 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
5634 } \
5635 void HELPER(sve_ldff##MEM##_##OFS##_mte) \
5636 (CPUARMState *env, void *vd, void *vg, \
5637 void *vm, target_ulong base, uint32_t desc) \
5638 { \
5639 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \
5640 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
5641 }
5642
5643 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \
5644 void HELPER(sve_ldff##MEM##_##OFS) \
5645 (CPUARMState *env, void *vd, void *vg, \
5646 void *vm, target_ulong base, uint32_t desc) \
5647 { \
5648 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \
5649 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
5650 } \
5651 void HELPER(sve_ldff##MEM##_##OFS##_mte) \
5652 (CPUARMState *env, void *vd, void *vg, \
5653 void *vm, target_ulong base, uint32_t desc) \
5654 { \
5655 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \
5656 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
5657 }
5658
5659 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
5660 DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
5661 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
5662 DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
5663 DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
5664
5665 DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
5666 DO_LDFF1_ZPZ_S(bss, zss, MO_8)
5667 DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
5668 DO_LDFF1_ZPZ_D(bds, zss, MO_8)
5669 DO_LDFF1_ZPZ_D(bds, zd, MO_8)
5670
5671 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
5672 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
5673 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
5674 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
5675 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
5676
5677 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
5678 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
5679 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
5680 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
5681 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
5682
5683 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
5684 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
5685 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
5686 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
5687 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
5688
5689 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
5690 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
5691 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
5692 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
5693 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
5694
5695 DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32)
5696 DO_LDFF1_ZPZ_S(ss_le, zss, MO_32)
5697 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
5698 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
5699 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
5700
5701 DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32)
5702 DO_LDFF1_ZPZ_S(ss_be, zss, MO_32)
5703 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
5704 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
5705 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
5706
5707 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
5708 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
5709 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
5710
5711 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
5712 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
5713 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
5714
5715 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
5716 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
5717 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
5718
5719 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
5720 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
5721 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
5722
5723 /* Stores with a vector index. */
5724
5725 static inline QEMU_ALWAYS_INLINE
5726 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5727 target_ulong base, uint32_t desc, uintptr_t retaddr,
5728 uint32_t mtedesc, int esize, int msize,
5729 zreg_off_fn *off_fn,
5730 sve_ldst1_host_fn *host_fn,
5731 sve_ldst1_tlb_fn *tlb_fn)
5732 {
5733 const int mmu_idx = cpu_mmu_index(env, false);
5734 const intptr_t reg_max = simd_oprsz(desc);
5735 const int scale = simd_data(desc);
5736 void *host[ARM_MAX_VQ * 4];
5737 intptr_t reg_off, i;
5738 SVEHostPage info, info2;
5739
5740 /*
5741 * Probe all of the elements for host addresses and flags.
5742 */
5743 i = reg_off = 0;
5744 do {
5745 uint64_t pg = vg[reg_off >> 6];
5746 do {
5747 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
5748 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
5749
5750 host[i] = NULL;
5751 if (likely((pg >> (reg_off & 63)) & 1)) {
5752 if (likely(in_page >= msize)) {
5753 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
5754 mmu_idx, retaddr);
5755 host[i] = info.host;
5756 } else {
5757 /*
5758 * Element crosses the page boundary.
5759 * Probe both pages, but do not record the host address,
5760 * so that we use the slow path.
5761 */
5762 sve_probe_page(&info, false, env, addr, 0,
5763 MMU_DATA_STORE, mmu_idx, retaddr);
5764 sve_probe_page(&info2, false, env, addr + in_page, 0,
5765 MMU_DATA_STORE, mmu_idx, retaddr);
5766 info.flags |= info2.flags;
5767 }
5768
5769 if (unlikely(info.flags & TLB_WATCHPOINT)) {
5770 cpu_check_watchpoint(env_cpu(env), addr, msize,
5771 info.attrs, BP_MEM_WRITE, retaddr);
5772 }
5773
5774 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
5775 mte_check1(env, mtedesc, addr, retaddr);
5776 }
5777 }
5778 i += 1;
5779 reg_off += esize;
5780 } while (reg_off & 63);
5781 } while (reg_off < reg_max);
5782
5783 /*
5784 * Now that we have recognized all exceptions except SyncExternal
5785 * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
5786 *
5787 * Note that for the common case of an element in RAM that does not cross
5788 * a page boundary, we have stored the host address in host[]. This doubles
5789 * as a first-level check against the predicate, since only enabled
5790 * elements have non-null host addresses.
5791 */
5792 i = reg_off = 0;
5793 do {
5794 void *h = host[i];
5795 if (likely(h != NULL)) {
5796 host_fn(vd, reg_off, h);
5797 } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
5798 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
5799 tlb_fn(env, vd, reg_off, addr, retaddr);
5800 }
5801 i += 1;
5802 reg_off += esize;
5803 } while (reg_off < reg_max);
5804 }
5805
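/*
 * A self-contained sketch of the two-pass scatter used by sve_st1_z: pass
 * one resolves and checks every active element, pass two performs the
 * stores, so nothing is written if any check would have faulted.
 * scatter32() and resolve_fn are hypothetical names; in the real helper a
 * failed resolution longjmps out of the probe instead of returning.
 * Guarded out, illustration only.
 */
#if 0
#include <stdint.h>
#include <stddef.h>

typedef uint32_t *resolve_fn(uint64_t addr);    /* "faults" rather than failing */

static void scatter32(const uint32_t *src, const uint64_t *vg,
                      const uint32_t *idx, uint64_t base, int scale,
                      intptr_t reg_max, resolve_fn *resolve)
{
    uint32_t *host[64];                 /* assumes reg_max <= 256 bytes */
    intptr_t reg_off, i;

    /* Pass 1: translate (and thereby check) all active elements. */
    for (i = 0, reg_off = 0; reg_off < reg_max; i++, reg_off += 4) {
        host[i] = NULL;
        if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
            host[i] = resolve(base + ((uint64_t)idx[i] << scale));
        }
    }

    /* Pass 2: nothing can fail now, so the stores appear all-or-nothing. */
    for (i = 0, reg_off = 0; reg_off < reg_max; i++, reg_off += 4) {
        if (host[i]) {
            *host[i] = src[i];
        }
    }
}
#endif
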
5806 static inline QEMU_ALWAYS_INLINE
5807 void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5808 target_ulong base, uint32_t desc, uintptr_t retaddr,
5809 int esize, int msize, zreg_off_fn *off_fn,
5810 sve_ldst1_host_fn *host_fn,
5811 sve_ldst1_tlb_fn *tlb_fn)
5812 {
5813 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5814 /* Remove mtedesc from the normal sve descriptor. */
5815 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5816
5817 /*
5818 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
5819 * move base entirely across the address space hole, so it cannot
5820 * change the pointer tag or the bit55 selector. We could therefore
5821 * examine TBI + TCMA here, as we do for sve_ldN_r_mte().
5822 */
5823 sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
5824 esize, msize, off_fn, host_fn, tlb_fn);
5825 }
5826
5827 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \
5828 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
5829 void *vm, target_ulong base, uint32_t desc) \
5830 { \
5831 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
5832 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
5833 } \
5834 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
5835 void *vm, target_ulong base, uint32_t desc) \
5836 { \
5837 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
5838 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
5839 }
5840
5841 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \
5842 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
5843 void *vm, target_ulong base, uint32_t desc) \
5844 { \
5845 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
5846 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
5847 } \
5848 void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
5849 void *vm, target_ulong base, uint32_t desc) \
5850 { \
5851 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
5852 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
5853 }
5854
5855 DO_ST1_ZPZ_S(bs, zsu, MO_8)
5856 DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
5857 DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
5858 DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
5859 DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
5860
5861 DO_ST1_ZPZ_S(bs, zss, MO_8)
5862 DO_ST1_ZPZ_S(hs_le, zss, MO_16)
5863 DO_ST1_ZPZ_S(hs_be, zss, MO_16)
5864 DO_ST1_ZPZ_S(ss_le, zss, MO_32)
5865 DO_ST1_ZPZ_S(ss_be, zss, MO_32)
5866
5867 DO_ST1_ZPZ_D(bd, zsu, MO_8)
5868 DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
5869 DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
5870 DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
5871 DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
5872 DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
5873 DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
5874
5875 DO_ST1_ZPZ_D(bd, zss, MO_8)
5876 DO_ST1_ZPZ_D(hd_le, zss, MO_16)
5877 DO_ST1_ZPZ_D(hd_be, zss, MO_16)
5878 DO_ST1_ZPZ_D(sd_le, zss, MO_32)
5879 DO_ST1_ZPZ_D(sd_be, zss, MO_32)
5880 DO_ST1_ZPZ_D(dd_le, zss, MO_64)
5881 DO_ST1_ZPZ_D(dd_be, zss, MO_64)
5882
5883 DO_ST1_ZPZ_D(bd, zd, MO_8)
5884 DO_ST1_ZPZ_D(hd_le, zd, MO_16)
5885 DO_ST1_ZPZ_D(hd_be, zd, MO_16)
5886 DO_ST1_ZPZ_D(sd_le, zd, MO_32)
5887 DO_ST1_ZPZ_D(sd_be, zd, MO_32)
5888 DO_ST1_ZPZ_D(dd_le, zd, MO_64)
5889 DO_ST1_ZPZ_D(dd_be, zd, MO_64)
5890
5891 #undef DO_ST1_ZPZ_S
5892 #undef DO_ST1_ZPZ_D
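
/*
 * As with the contiguous stores, each DO_ST1_ZPZ_* line emits a plain and
 * an _mte helper.  For example, DO_ST1_ZPZ_D(dd_le, zd, MO_64) expands
 * (roughly) to helper_sve_stdd_le_zd and helper_sve_stdd_le_zd_mte, both
 * forwarding to sve_st1_z / sve_st1_z_mte with 64-bit elements (esize 8,
 * msize 8), the off_zd_d index extraction, and the
 * sve_st1dd_le_{host,tlb} accessors.
 */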