1 /*
2 * ARM SVE Operations
3 *
4 * Copyright (c) 2018 Linaro, Ltd.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "qemu/osdep.h"
21 #include "cpu.h"
22 #include "internals.h"
23 #include "exec/exec-all.h"
24 #include "exec/cpu_ldst.h"
25 #include "exec/helper-proto.h"
26 #include "tcg/tcg-gvec-desc.h"
27 #include "fpu/softfloat.h"
28 #include "tcg/tcg.h"
29
30
31 /* Note that vector data is stored in host-endian 64-bit chunks,
32 so addressing units smaller than that need a host-endian fixup. */
33 #ifdef HOST_WORDS_BIGENDIAN
34 #define H1(x) ((x) ^ 7)
35 #define H1_2(x) ((x) ^ 6)
36 #define H1_4(x) ((x) ^ 4)
37 #define H2(x) ((x) ^ 3)
38 #define H4(x) ((x) ^ 1)
39 #else
40 #define H1(x) (x)
41 #define H1_2(x) (x)
42 #define H1_4(x) (x)
43 #define H2(x) (x)
44 #define H4(x) (x)
45 #endif
46
47 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
48 *
49 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
50 * and bit 0 set if C is set. Compare the definitions of these variables
51 * within CPUARMState.
52 */
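/* For example, a return value of 0x80000003 encodes N set (bit 31),
 * Z clear (bit 1 set) and C set (bit 0).
 */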
53
54 /* For no G bits set, NZCV = C. */
55 #define PREDTEST_INIT 1
56
57 /* This is an iterative function, called for each Pd and Pg word
58 * moving forward.
59 */
60 static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
61 {
62 if (likely(g)) {
63 /* Compute N from first D & G.
64 Use bit 2 to signal first G bit seen. */
65 if (!(flags & 4)) {
66 flags |= ((d & (g & -g)) != 0) << 31;
67 flags |= 4;
68 }
69
70 /* Accumulate Z from each D & G. */
71 flags |= ((d & g) != 0) << 1;
72
73 /* Compute C from last !(D & G). Replace previous. */
74 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
75 }
76 return flags;
77 }
78
79 /* This is an iterative function, called for each Pd and Pg word
80 * moving backward.
81 */
82 static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
83 {
84 if (likely(g)) {
85 /* Compute C from first (i.e last) !(D & G).
86 Use bit 2 to signal first G bit seen. */
87 if (!(flags & 4)) {
88 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
89 flags |= (d & pow2floor(g)) == 0;
90 }
91
92 /* Accumulate Z from each D & G. */
93 flags |= ((d & g) != 0) << 1;
94
95 /* Compute N from last (i.e first) D & G. Replace previous. */
96 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
97 }
98 return flags;
99 }
100
101 /* The same for a single word predicate. */
102 uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
103 {
104 return iter_predtest_fwd(d, g, PREDTEST_INIT);
105 }
106
107 /* The same for a multi-word predicate. */
108 uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
109 {
110 uint32_t flags = PREDTEST_INIT;
111 uint64_t *d = vd, *g = vg;
112 uintptr_t i = 0;
113
114 do {
115 flags = iter_predtest_fwd(d[i], g[i], flags);
116 } while (++i < words);
117
118 return flags;
119 }
120
121 /* Expand active predicate bits to bytes, for byte elements.
122 * for (i = 0; i < 256; ++i) {
123 * unsigned long m = 0;
124 * for (j = 0; j < 8; j++) {
125 * if ((i >> j) & 1) {
126 * m |= 0xfful << (j << 3);
127 * }
128 * }
129 * printf("0x%016lx,\n", m);
130 * }
131 */
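/* For example, expand_pred_b(0x05), i.e. byte elements 0 and 2 active,
 * yields the mask 0x0000000000ff00ff.
 */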
132 static inline uint64_t expand_pred_b(uint8_t byte)
133 {
134 static const uint64_t word[256] = {
135 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
136 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
137 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
138 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
139 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
140 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
141 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
142 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
143 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
144 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
145 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
146 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
147 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
148 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
149 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
150 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
151 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
152 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
153 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
154 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
155 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
156 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
157 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
158 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
159 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
160 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
161 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
162 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
163 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
164 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
165 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
166 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
167 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
168 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
169 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
170 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
171 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
172 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
173 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
174 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
175 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
176 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
177 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
178 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
179 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
180 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
181 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
182 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
183 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
184 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
185 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
186 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
187 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
188 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
189 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
190 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
191 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
192 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
193 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
194 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
195 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
196 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
197 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
198 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
199 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
200 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
201 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
202 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
203 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
204 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
205 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
206 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
207 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
208 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
209 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
210 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
211 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
212 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
213 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
214 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
215 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
216 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
217 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
218 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
219 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
220 0xffffffffffffffff,
221 };
222 return word[byte];
223 }
224
225 /* Similarly for half-word elements.
226 * for (i = 0; i < 256; ++i) {
227 * unsigned long m = 0;
228 * if (i & 0xaa) {
229 * continue;
230 * }
231 * for (j = 0; j < 8; j += 2) {
232 * if ((i >> j) & 1) {
233 * m |= 0xfffful << (j << 3);
234 * }
235 * }
236 * printf("[0x%x] = 0x%016lx,\n", i, m);
237 * }
238 */
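/* For example, expand_pred_h(0x11), i.e. halfword elements 0 and 2 active,
 * yields the mask 0x0000ffff0000ffff; the odd predicate bits are ignored.
 */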
239 static inline uint64_t expand_pred_h(uint8_t byte)
240 {
241 static const uint64_t word[] = {
242 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
243 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
244 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
245 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
246 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
247 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
248 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
249 [0x55] = 0xffffffffffffffff,
250 };
251 return word[byte & 0x55];
252 }
253
254 /* Similarly for single word elements. */
255 static inline uint64_t expand_pred_s(uint8_t byte)
256 {
257 static const uint64_t word[] = {
258 [0x01] = 0x00000000ffffffffull,
259 [0x10] = 0xffffffff00000000ull,
260 [0x11] = 0xffffffffffffffffull,
261 };
262 return word[byte & 0x11];
263 }
264
265 /* Swap 16-bit words within a 32-bit word. */
266 static inline uint32_t hswap32(uint32_t h)
267 {
268 return rol32(h, 16);
269 }
270
271 /* Swap 16-bit words within a 64-bit word. */
272 static inline uint64_t hswap64(uint64_t h)
273 {
274 uint64_t m = 0x0000ffff0000ffffull;
275 h = rol64(h, 32);
276 return ((h & m) << 16) | ((h >> 16) & m);
277 }
278
279 /* Swap 32-bit words within a 64-bit word. */
280 static inline uint64_t wswap64(uint64_t h)
281 {
282 return rol64(h, 32);
283 }
284
285 #define LOGICAL_PPPP(NAME, FUNC) \
286 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
287 { \
288 uintptr_t opr_sz = simd_oprsz(desc); \
289 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
290 uintptr_t i; \
291 for (i = 0; i < opr_sz / 8; ++i) { \
292 d[i] = FUNC(n[i], m[i], g[i]); \
293 } \
294 }
295
296 #define DO_AND(N, M, G) (((N) & (M)) & (G))
297 #define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
298 #define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
299 #define DO_ORR(N, M, G) (((N) | (M)) & (G))
300 #define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
301 #define DO_NOR(N, M, G) (~((N) | (M)) & (G))
302 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
303 #define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
304
305 LOGICAL_PPPP(sve_and_pppp, DO_AND)
306 LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
307 LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
308 LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
309 LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
310 LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
311 LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
312 LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
313
314 #undef DO_AND
315 #undef DO_BIC
316 #undef DO_EOR
317 #undef DO_ORR
318 #undef DO_ORN
319 #undef DO_NOR
320 #undef DO_NAND
321 #undef DO_SEL
322 #undef LOGICAL_PPPP
323
324 /* Fully general three-operand expander, controlled by a predicate.
325 * This is complicated by the host-endian storage of the register file.
326 */
327 /* ??? I don't expect the compiler could ever vectorize this itself.
328 * With some tables we can convert bit masks to byte masks, and with
329 * extra care wrt byte/word ordering we could use gcc generic vectors
330 * and do 16 bytes at a time.
331 */
332 #define DO_ZPZZ(NAME, TYPE, H, OP) \
333 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
334 { \
335 intptr_t i, opr_sz = simd_oprsz(desc); \
336 for (i = 0; i < opr_sz; ) { \
337 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
338 do { \
339 if (pg & 1) { \
340 TYPE nn = *(TYPE *)(vn + H(i)); \
341 TYPE mm = *(TYPE *)(vm + H(i)); \
342 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
343 } \
344 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
345 } while (i & 15); \
346 } \
347 }
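/* Note that one predicate bit corresponds to each byte of vector data,
 * so an element of sizeof(TYPE) bytes consumes sizeof(TYPE) predicate
 * bits, of which only the lowest is significant.
 */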
348
349 /* Similarly, specialized for 64-bit operands. */
350 #define DO_ZPZZ_D(NAME, TYPE, OP) \
351 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
352 { \
353 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
354 TYPE *d = vd, *n = vn, *m = vm; \
355 uint8_t *pg = vg; \
356 for (i = 0; i < opr_sz; i += 1) { \
357 if (pg[H1(i)] & 1) { \
358 TYPE nn = n[i], mm = m[i]; \
359 d[i] = OP(nn, mm); \
360 } \
361 } \
362 }
363
364 #define DO_AND(N, M) (N & M)
365 #define DO_EOR(N, M) (N ^ M)
366 #define DO_ORR(N, M) (N | M)
367 #define DO_BIC(N, M) (N & ~M)
368 #define DO_ADD(N, M) (N + M)
369 #define DO_SUB(N, M) (N - M)
370 #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
371 #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
372 #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
373 #define DO_MUL(N, M) (N * M)
374
375
376 /*
377 * We must avoid the C undefined behaviour cases: division by
378 * zero and signed division of INT_MIN by -1. Both of these
379 * have architecturally defined required results for Arm.
380 * We special case all signed divisions by -1 to avoid having
381 * to deduce the minimum integer for the type involved.
382 */
383 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
384 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
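/* Thus division by zero yields 0, and dividing the minimum signed value
 * by -1 yields that same minimum value, matching the architected results.
 */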
385
386 DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
387 DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
388 DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
389 DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
390
391 DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
392 DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
393 DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
394 DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
395
396 DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
397 DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
398 DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
399 DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
400
401 DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
402 DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
403 DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
404 DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
405
406 DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
407 DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
408 DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
409 DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
410
411 DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
412 DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
413 DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
414 DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
415
416 DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
417 DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
418 DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
419 DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
420
421 DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
422 DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
423 DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
424 DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
425
426 DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
427 DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
428 DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
429 DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
430
431 DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
432 DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
433 DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
434 DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
435
436 DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
437 DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
438 DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
439 DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
440
441 DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
442 DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
443 DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
444 DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
445
446 /* Because the computation type is at least twice as large as required,
447 these work for both signed and unsigned source types. */
448 static inline uint8_t do_mulh_b(int32_t n, int32_t m)
449 {
450 return (n * m) >> 8;
451 }
452
453 static inline uint16_t do_mulh_h(int32_t n, int32_t m)
454 {
455 return (n * m) >> 16;
456 }
457
458 static inline uint32_t do_mulh_s(int64_t n, int64_t m)
459 {
460 return (n * m) >> 32;
461 }
462
463 static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
464 {
465 uint64_t lo, hi;
466 muls64(&lo, &hi, n, m);
467 return hi;
468 }
469
470 static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
471 {
472 uint64_t lo, hi;
473 mulu64(&lo, &hi, n, m);
474 return hi;
475 }
476
477 DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
478 DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
479 DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
480 DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
481
482 DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
483 DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
484 DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
485 DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
486
487 DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
488 DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
489 DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
490 DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
491
492 DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
493 DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
494
495 DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
496 DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
497
498 /* Note that all bits of the shift are significant
499 and not modulo the element size. */
500 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
501 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
502 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
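/* E.g. an LSR or LSL of a 32-bit element by 32 or more produces 0, while
 * an ASR clamps the shift count to 31 and so replicates the sign bit.
 */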
503
504 DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
505 DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
506 DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)
507
508 DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
509 DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
510 DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)
511
512 DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
513 DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
514 DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
515
516 DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
517 DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
518 DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
519
520 #undef DO_ZPZZ
521 #undef DO_ZPZZ_D
522
523 /* Three-operand expander, controlled by a predicate, in which the
524 * third operand is "wide". That is, for D = N op M, the same 64-bit
525 * value of M is used with all of the narrower values of N.
526 */
527 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
528 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
529 { \
530 intptr_t i, opr_sz = simd_oprsz(desc); \
531 for (i = 0; i < opr_sz; ) { \
532 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
533 TYPEW mm = *(TYPEW *)(vm + i); \
534 do { \
535 if (pg & 1) { \
536 TYPE nn = *(TYPE *)(vn + H(i)); \
537 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
538 } \
539 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
540 } while (i & 7); \
541 } \
542 }
543
544 DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
545 DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
546 DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
547
548 DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
549 DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
550 DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
551
552 DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
553 DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
554 DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
555
556 #undef DO_ZPZW
557
558 /* Fully general two-operand expander, controlled by a predicate.
559 */
560 #define DO_ZPZ(NAME, TYPE, H, OP) \
561 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
562 { \
563 intptr_t i, opr_sz = simd_oprsz(desc); \
564 for (i = 0; i < opr_sz; ) { \
565 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
566 do { \
567 if (pg & 1) { \
568 TYPE nn = *(TYPE *)(vn + H(i)); \
569 *(TYPE *)(vd + H(i)) = OP(nn); \
570 } \
571 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
572 } while (i & 15); \
573 } \
574 }
575
576 /* Similarly, specialized for 64-bit operands. */
577 #define DO_ZPZ_D(NAME, TYPE, OP) \
578 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
579 { \
580 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
581 TYPE *d = vd, *n = vn; \
582 uint8_t *pg = vg; \
583 for (i = 0; i < opr_sz; i += 1) { \
584 if (pg[H1(i)] & 1) { \
585 TYPE nn = n[i]; \
586 d[i] = OP(nn); \
587 } \
588 } \
589 }
590
591 #define DO_CLS_B(N) (clrsb32(N) - 24)
592 #define DO_CLS_H(N) (clrsb32(N) - 16)
593
594 DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
595 DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
596 DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
597 DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
598
599 #define DO_CLZ_B(N) (clz32(N) - 24)
600 #define DO_CLZ_H(N) (clz32(N) - 16)
601
602 DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
603 DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
604 DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
605 DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
606
607 DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
608 DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
609 DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
610 DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
611
612 #define DO_CNOT(N) (N == 0)
613
614 DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
615 DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
616 DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
617 DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
618
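/* ((__typeof(N))-1 >> 1) is the element with every bit except the sign
 * bit set, so FABS clears the sign bit and FNEG (below) flips it.
 */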
619 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
620
621 DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
622 DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
623 DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
624
625 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
626
627 DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
628 DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
629 DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
630
631 #define DO_NOT(N) (~N)
632
633 DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
634 DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
635 DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
636 DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
637
638 #define DO_SXTB(N) ((int8_t)N)
639 #define DO_SXTH(N) ((int16_t)N)
640 #define DO_SXTS(N) ((int32_t)N)
641 #define DO_UXTB(N) ((uint8_t)N)
642 #define DO_UXTH(N) ((uint16_t)N)
643 #define DO_UXTS(N) ((uint32_t)N)
644
645 DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
646 DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
647 DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
648 DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
649 DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
650 DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
651
652 DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
653 DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
654 DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
655 DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
656 DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
657 DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
658
659 #define DO_ABS(N) (N < 0 ? -N : N)
660
661 DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
662 DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
663 DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
664 DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
665
666 #define DO_NEG(N) (-N)
667
668 DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
669 DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
670 DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
671 DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
672
673 DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
674 DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
675 DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
676
677 DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
678 DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
679
680 DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
681
682 DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
683 DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
684 DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
685 DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
686
687 /* Three-operand expander, unpredicated, in which the third operand is "wide".
688 */
689 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
690 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
691 { \
692 intptr_t i, opr_sz = simd_oprsz(desc); \
693 for (i = 0; i < opr_sz; ) { \
694 TYPEW mm = *(TYPEW *)(vm + i); \
695 do { \
696 TYPE nn = *(TYPE *)(vn + H(i)); \
697 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
698 i += sizeof(TYPE); \
699 } while (i & 7); \
700 } \
701 }
702
703 DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
704 DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
705 DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
706
707 DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
708 DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
709 DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
710
711 DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
712 DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
713 DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
714
715 #undef DO_ZZW
716
717 #undef DO_CLS_B
718 #undef DO_CLS_H
719 #undef DO_CLZ_B
720 #undef DO_CLZ_H
721 #undef DO_CNOT
722 #undef DO_FABS
723 #undef DO_FNEG
724 #undef DO_ABS
725 #undef DO_NEG
726 #undef DO_ZPZ
727 #undef DO_ZPZ_D
728
729 /* Two-operand reduction expander, controlled by a predicate.
730 * The difference between TYPERED and TYPERET has to do with
731 * sign-extension. E.g. for SMAX, TYPERED must be signed,
732 * but TYPERET must be unsigned so that e.g. a 32-bit value
733 * is not sign-extended to the ABI uint64_t return type.
734 */
735 /* ??? If we were to vectorize this by hand the reduction ordering
736 * would change. For integer operands, this is perfectly fine.
737 */
738 #define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
739 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
740 { \
741 intptr_t i, opr_sz = simd_oprsz(desc); \
742 TYPERED ret = INIT; \
743 for (i = 0; i < opr_sz; ) { \
744 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
745 do { \
746 if (pg & 1) { \
747 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
748 ret = OP(ret, nn); \
749 } \
750 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
751 } while (i & 15); \
752 } \
753 return (TYPERET)ret; \
754 }
755
756 #define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
757 uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
758 { \
759 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
760 TYPEE *n = vn; \
761 uint8_t *pg = vg; \
762 TYPER ret = INIT; \
763 for (i = 0; i < opr_sz; i += 1) { \
764 if (pg[H1(i)] & 1) { \
765 TYPEE nn = n[i]; \
766 ret = OP(ret, nn); \
767 } \
768 } \
769 return ret; \
770 }
771
772 DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
773 DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
774 DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
775 DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
776
777 DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
778 DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
779 DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
780 DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
781
782 DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
783 DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
784 DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
785 DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
786
787 DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
788 DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
789 DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
790
791 DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
792 DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
793 DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
794 DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
795
796 DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
797 DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
798 DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
799 DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
800
801 DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
802 DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
803 DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
804 DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
805
806 DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
807 DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
808 DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
809 DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
810
811 DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
812 DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
813 DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
814 DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
815
816 #undef DO_VPZ
817 #undef DO_VPZ_D
818
819 /* Two vector operand, one scalar operand, unpredicated. */
820 #define DO_ZZI(NAME, TYPE, OP) \
821 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
822 { \
823 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
824 TYPE s = s64, *d = vd, *n = vn; \
825 for (i = 0; i < opr_sz; ++i) { \
826 d[i] = OP(n[i], s); \
827 } \
828 }
829
830 #define DO_SUBR(X, Y) (Y - X)
831
832 DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
833 DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
834 DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
835 DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
836
837 DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
838 DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
839 DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
840 DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
841
842 DO_ZZI(sve_smini_b, int8_t, DO_MIN)
843 DO_ZZI(sve_smini_h, int16_t, DO_MIN)
844 DO_ZZI(sve_smini_s, int32_t, DO_MIN)
845 DO_ZZI(sve_smini_d, int64_t, DO_MIN)
846
847 DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
848 DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
849 DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
850 DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
851
852 DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
853 DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
854 DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
855 DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
856
857 #undef DO_ZZI
858
859 #undef DO_AND
860 #undef DO_ORR
861 #undef DO_EOR
862 #undef DO_BIC
863 #undef DO_ADD
864 #undef DO_SUB
865 #undef DO_MAX
866 #undef DO_MIN
867 #undef DO_ABD
868 #undef DO_MUL
869 #undef DO_DIV
870 #undef DO_ASR
871 #undef DO_LSR
872 #undef DO_LSL
873 #undef DO_SUBR
874
875 /* Similar to the ARM LastActiveElement pseudocode function, except the
876 result is multiplied by the element size. This includes the not found
877 indication; e.g. not found for esz=3 is -8. */
878 static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
879 {
880 uint64_t mask = pred_esz_masks[esz];
881 intptr_t i = words;
882
883 do {
884 uint64_t this_g = g[--i] & mask;
885 if (this_g) {
886 return i * 64 + (63 - clz64(this_g));
887 }
888 } while (i > 0);
889 return (intptr_t)-1 << esz;
890 }
891
892 uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t words)
893 {
894 uint32_t flags = PREDTEST_INIT;
895 uint64_t *d = vd, *g = vg;
896 intptr_t i = 0;
897
898 do {
899 uint64_t this_d = d[i];
900 uint64_t this_g = g[i];
901
902 if (this_g) {
903 if (!(flags & 4)) {
904 /* Set in D the first bit of G. */
905 this_d |= this_g & -this_g;
906 d[i] = this_d;
907 }
908 flags = iter_predtest_fwd(this_d, this_g, flags);
909 }
910 } while (++i < words);
911
912 return flags;
913 }
914
915 uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
916 {
917 intptr_t words = extract32(pred_desc, 0, SIMD_OPRSZ_BITS);
918 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
919 uint32_t flags = PREDTEST_INIT;
920 uint64_t *d = vd, *g = vg, esz_mask;
921 intptr_t i, next;
922
923 next = last_active_element(vd, words, esz) + (1 << esz);
924 esz_mask = pred_esz_masks[esz];
925
926 /* Similar to the pseudocode for pnext, but scaled by ESZ
927 so that we find the correct bit. */
928 if (next < words * 64) {
929 uint64_t mask = -1;
930
931 if (next & 63) {
932 mask = ~((1ull << (next & 63)) - 1);
933 next &= -64;
934 }
935 do {
936 uint64_t this_g = g[next / 64] & esz_mask & mask;
937 if (this_g != 0) {
938 next = (next & -64) + ctz64(this_g);
939 break;
940 }
941 next += 64;
942 mask = -1;
943 } while (next < words * 64);
944 }
945
946 i = 0;
947 do {
948 uint64_t this_d = 0;
949 if (i == next / 64) {
950 this_d = 1ull << (next & 63);
951 }
952 d[i] = this_d;
953 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
954 } while (++i < words);
955
956 return flags;
957 }
958
959 /* Store zero into every active element of Zd. We will use this for two-
960 * and three-operand predicated instructions for which logic dictates a
961 * zero result. In particular, logical shift by element size, which is
962 * otherwise undefined on the host.
963 *
964 * For element sizes smaller than uint64_t, we use tables to expand
965 * the N bits of the controlling predicate to a byte mask, and clear
966 * those bytes.
967 */
968 void HELPER(sve_clr_b)(void *vd, void *vg, uint32_t desc)
969 {
970 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
971 uint64_t *d = vd;
972 uint8_t *pg = vg;
973 for (i = 0; i < opr_sz; i += 1) {
974 d[i] &= ~expand_pred_b(pg[H1(i)]);
975 }
976 }
977
978 void HELPER(sve_clr_h)(void *vd, void *vg, uint32_t desc)
979 {
980 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
981 uint64_t *d = vd;
982 uint8_t *pg = vg;
983 for (i = 0; i < opr_sz; i += 1) {
984 d[i] &= ~expand_pred_h(pg[H1(i)]);
985 }
986 }
987
988 void HELPER(sve_clr_s)(void *vd, void *vg, uint32_t desc)
989 {
990 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
991 uint64_t *d = vd;
992 uint8_t *pg = vg;
993 for (i = 0; i < opr_sz; i += 1) {
994 d[i] &= ~expand_pred_s(pg[H1(i)]);
995 }
996 }
997
998 void HELPER(sve_clr_d)(void *vd, void *vg, uint32_t desc)
999 {
1000 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1001 uint64_t *d = vd;
1002 uint8_t *pg = vg;
1003 for (i = 0; i < opr_sz; i += 1) {
1004 if (pg[H1(i)] & 1) {
1005 d[i] = 0;
1006 }
1007 }
1008 }
1009
1010 /* Copy Zn into Zd, and store zero into inactive elements. */
1011 void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1012 {
1013 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1014 uint64_t *d = vd, *n = vn;
1015 uint8_t *pg = vg;
1016 for (i = 0; i < opr_sz; i += 1) {
1017 d[i] = n[i] & expand_pred_b(pg[H1(i)]);
1018 }
1019 }
1020
1021 void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1022 {
1023 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1024 uint64_t *d = vd, *n = vn;
1025 uint8_t *pg = vg;
1026 for (i = 0; i < opr_sz; i += 1) {
1027 d[i] = n[i] & expand_pred_h(pg[H1(i)]);
1028 }
1029 }
1030
1031 void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
1032 {
1033 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1034 uint64_t *d = vd, *n = vn;
1035 uint8_t *pg = vg;
1036 for (i = 0; i < opr_sz; i += 1) {
1037 d[i] = n[i] & expand_pred_s(pg[H1(i)]);
1038 }
1039 }
1040
1041 void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1042 {
1043 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1044 uint64_t *d = vd, *n = vn;
1045 uint8_t *pg = vg;
1046 for (i = 0; i < opr_sz; i += 1) {
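/* -(pg & 1) is all-ones for an active element and 0 otherwise. */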
1047 d[i] = n[i] & -(uint64_t)(pg[H1(i)] & 1);
1048 }
1049 }
1050
1051 /* Three-operand expander, immediate operand, controlled by a predicate.
1052 */
1053 #define DO_ZPZI(NAME, TYPE, H, OP) \
1054 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1055 { \
1056 intptr_t i, opr_sz = simd_oprsz(desc); \
1057 TYPE imm = simd_data(desc); \
1058 for (i = 0; i < opr_sz; ) { \
1059 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1060 do { \
1061 if (pg & 1) { \
1062 TYPE nn = *(TYPE *)(vn + H(i)); \
1063 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
1064 } \
1065 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1066 } while (i & 15); \
1067 } \
1068 }
1069
1070 /* Similarly, specialized for 64-bit operands. */
1071 #define DO_ZPZI_D(NAME, TYPE, OP) \
1072 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1073 { \
1074 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1075 TYPE *d = vd, *n = vn; \
1076 TYPE imm = simd_data(desc); \
1077 uint8_t *pg = vg; \
1078 for (i = 0; i < opr_sz; i += 1) { \
1079 if (pg[H1(i)] & 1) { \
1080 TYPE nn = n[i]; \
1081 d[i] = OP(nn, imm); \
1082 } \
1083 } \
1084 }
1085
1086 #define DO_SHR(N, M) (N >> M)
1087 #define DO_SHL(N, M) (N << M)
1088
1089 /* Arithmetic shift right for division. This rounds negative numbers
1090 toward zero as per signed division. Therefore before shifting,
1091 when N is negative, add 2**M-1. */
1092 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
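/* For example, DO_ASRD(-7, 2) computes (-7 + 3) >> 2 = -1, i.e. -7 / 4
 * truncated toward zero, where a plain arithmetic shift would yield -2.
 */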
1093
1094 DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
1095 DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
1096 DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
1097 DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
1098
1099 DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
1100 DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
1101 DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
1102 DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
1103
1104 DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
1105 DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
1106 DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
1107 DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
1108
1109 DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
1110 DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
1111 DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
1112 DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
1113
1114 #undef DO_SHR
1115 #undef DO_SHL
1116 #undef DO_ASRD
1117 #undef DO_ZPZI
1118 #undef DO_ZPZI_D
1119
1120 /* Fully general four-operand expander, controlled by a predicate.
1121 */
1122 #define DO_ZPZZZ(NAME, TYPE, H, OP) \
1123 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1124 void *vg, uint32_t desc) \
1125 { \
1126 intptr_t i, opr_sz = simd_oprsz(desc); \
1127 for (i = 0; i < opr_sz; ) { \
1128 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1129 do { \
1130 if (pg & 1) { \
1131 TYPE nn = *(TYPE *)(vn + H(i)); \
1132 TYPE mm = *(TYPE *)(vm + H(i)); \
1133 TYPE aa = *(TYPE *)(va + H(i)); \
1134 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
1135 } \
1136 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1137 } while (i & 15); \
1138 } \
1139 }
1140
1141 /* Similarly, specialized for 64-bit operands. */
1142 #define DO_ZPZZZ_D(NAME, TYPE, OP) \
1143 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1144 void *vg, uint32_t desc) \
1145 { \
1146 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1147 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
1148 uint8_t *pg = vg; \
1149 for (i = 0; i < opr_sz; i += 1) { \
1150 if (pg[H1(i)] & 1) { \
1151 TYPE aa = a[i], nn = n[i], mm = m[i]; \
1152 d[i] = OP(aa, nn, mm); \
1153 } \
1154 } \
1155 }
1156
1157 #define DO_MLA(A, N, M) (A + N * M)
1158 #define DO_MLS(A, N, M) (A - N * M)
1159
1160 DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
1161 DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
1162
1163 DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
1164 DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
1165
1166 DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
1167 DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
1168
1169 DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
1170 DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
1171
1172 #undef DO_MLA
1173 #undef DO_MLS
1174 #undef DO_ZPZZZ
1175 #undef DO_ZPZZZ_D
1176
1177 void HELPER(sve_index_b)(void *vd, uint32_t start,
1178 uint32_t incr, uint32_t desc)
1179 {
1180 intptr_t i, opr_sz = simd_oprsz(desc);
1181 uint8_t *d = vd;
1182 for (i = 0; i < opr_sz; i += 1) {
1183 d[H1(i)] = start + i * incr;
1184 }
1185 }
1186
1187 void HELPER(sve_index_h)(void *vd, uint32_t start,
1188 uint32_t incr, uint32_t desc)
1189 {
1190 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1191 uint16_t *d = vd;
1192 for (i = 0; i < opr_sz; i += 1) {
1193 d[H2(i)] = start + i * incr;
1194 }
1195 }
1196
1197 void HELPER(sve_index_s)(void *vd, uint32_t start,
1198 uint32_t incr, uint32_t desc)
1199 {
1200 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1201 uint32_t *d = vd;
1202 for (i = 0; i < opr_sz; i += 1) {
1203 d[H4(i)] = start + i * incr;
1204 }
1205 }
1206
1207 void HELPER(sve_index_d)(void *vd, uint64_t start,
1208 uint64_t incr, uint32_t desc)
1209 {
1210 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1211 uint64_t *d = vd;
1212 for (i = 0; i < opr_sz; i += 1) {
1213 d[i] = start + i * incr;
1214 }
1215 }
1216
1217 void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
1218 {
1219 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1220 uint32_t sh = simd_data(desc);
1221 uint32_t *d = vd, *n = vn, *m = vm;
1222 for (i = 0; i < opr_sz; i += 1) {
1223 d[i] = n[i] + (m[i] << sh);
1224 }
1225 }
1226
1227 void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
1228 {
1229 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1230 uint64_t sh = simd_data(desc);
1231 uint64_t *d = vd, *n = vn, *m = vm;
1232 for (i = 0; i < opr_sz; i += 1) {
1233 d[i] = n[i] + (m[i] << sh);
1234 }
1235 }
1236
1237 void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
1238 {
1239 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1240 uint64_t sh = simd_data(desc);
1241 uint64_t *d = vd, *n = vn, *m = vm;
1242 for (i = 0; i < opr_sz; i += 1) {
1243 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
1244 }
1245 }
1246
1247 void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
1248 {
1249 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1250 uint64_t sh = simd_data(desc);
1251 uint64_t *d = vd, *n = vn, *m = vm;
1252 for (i = 0; i < opr_sz; i += 1) {
1253 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
1254 }
1255 }
1256
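/* FEXPA: each coefficient table below holds the fraction field of
 * 2^(i/32) (half precision, 5-bit index) or 2^(i/64) (single and double
 * precision, 6-bit index); the helper concatenates that fraction with an
 * exponent taken from the next bits of the input element.
 */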
1257 void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
1258 {
1259 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1260 static const uint16_t coeff[] = {
1261 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
1262 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
1263 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
1264 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
1265 };
1266 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1267 uint16_t *d = vd, *n = vn;
1268
1269 for (i = 0; i < opr_sz; i++) {
1270 uint16_t nn = n[i];
1271 intptr_t idx = extract32(nn, 0, 5);
1272 uint16_t exp = extract32(nn, 5, 5);
1273 d[i] = coeff[idx] | (exp << 10);
1274 }
1275 }
1276
1277 void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
1278 {
1279 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1280 static const uint32_t coeff[] = {
1281 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
1282 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
1283 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
1284 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
1285 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
1286 0x1ef532, 0x20b051, 0x227043, 0x243516,
1287 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
1288 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
1289 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
1290 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
1291 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
1292 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
1293 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
1294 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
1295 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
1296 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
1297 };
1298 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1299 uint32_t *d = vd, *n = vn;
1300
1301 for (i = 0; i < opr_sz; i++) {
1302 uint32_t nn = n[i];
1303 intptr_t idx = extract32(nn, 0, 6);
1304 uint32_t exp = extract32(nn, 6, 8);
1305 d[i] = coeff[idx] | (exp << 23);
1306 }
1307 }
1308
1309 void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
1310 {
1311 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1312 static const uint64_t coeff[] = {
1313 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
1314 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
1315 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
1316 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
1317 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
1318 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
1319 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
1320 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
1321 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
1322 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
1323 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
1324 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
1325 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
1326 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
1327 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
1328 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
1329 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
1330 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
1331 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
1332 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
1333 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
1334 0xFA7C1819E90D8ull,
1335 };
1336 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1337 uint64_t *d = vd, *n = vn;
1338
1339 for (i = 0; i < opr_sz; i++) {
1340 uint64_t nn = n[i];
1341 intptr_t idx = extract32(nn, 0, 6);
1342 uint64_t exp = extract32(nn, 6, 11);
1343 d[i] = coeff[idx] | (exp << 52);
1344 }
1345 }
1346
1347 void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
1348 {
1349 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1350 uint16_t *d = vd, *n = vn, *m = vm;
1351 for (i = 0; i < opr_sz; i += 1) {
1352 uint16_t nn = n[i];
1353 uint16_t mm = m[i];
1354 if (mm & 1) {
1355 nn = float16_one;
1356 }
1357 d[i] = nn ^ (mm & 2) << 14;
1358 }
1359 }
1360
1361 void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
1362 {
1363 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1364 uint32_t *d = vd, *n = vn, *m = vm;
1365 for (i = 0; i < opr_sz; i += 1) {
1366 uint32_t nn = n[i];
1367 uint32_t mm = m[i];
1368 if (mm & 1) {
1369 nn = float32_one;
1370 }
1371 d[i] = nn ^ (mm & 2) << 30;
1372 }
1373 }
1374
1375 void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
1376 {
1377 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1378 uint64_t *d = vd, *n = vn, *m = vm;
1379 for (i = 0; i < opr_sz; i += 1) {
1380 uint64_t nn = n[i];
1381 uint64_t mm = m[i];
1382 if (mm & 1) {
1383 nn = float64_one;
1384 }
1385 d[i] = nn ^ (mm & 2) << 62;
1386 }
1387 }
1388
1389 /*
1390 * Signed saturating addition with scalar operand.
1391 */
1392
1393 void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1394 {
1395 intptr_t i, oprsz = simd_oprsz(desc);
1396
1397 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
1398 int r = *(int8_t *)(a + i) + b;
1399 if (r > INT8_MAX) {
1400 r = INT8_MAX;
1401 } else if (r < INT8_MIN) {
1402 r = INT8_MIN;
1403 }
1404 *(int8_t *)(d + i) = r;
1405 }
1406 }
1407
1408 void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1409 {
1410 intptr_t i, oprsz = simd_oprsz(desc);
1411
1412 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
1413 int r = *(int16_t *)(a + i) + b;
1414 if (r > INT16_MAX) {
1415 r = INT16_MAX;
1416 } else if (r < INT16_MIN) {
1417 r = INT16_MIN;
1418 }
1419 *(int16_t *)(d + i) = r;
1420 }
1421 }
1422
1423 void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1424 {
1425 intptr_t i, oprsz = simd_oprsz(desc);
1426
1427 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
1428 int64_t r = *(int32_t *)(a + i) + b;
1429 if (r > INT32_MAX) {
1430 r = INT32_MAX;
1431 } else if (r < INT32_MIN) {
1432 r = INT32_MIN;
1433 }
1434 *(int32_t *)(d + i) = r;
1435 }
1436 }
1437
1438 void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
1439 {
1440 intptr_t i, oprsz = simd_oprsz(desc);
1441
1442 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
1443 int64_t ai = *(int64_t *)(a + i);
1444 int64_t r = ai + b;
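/* Overflow occurred iff ai and b have the same sign and r differs. */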
1445 if (((r ^ ai) & ~(ai ^ b)) < 0) {
1446 /* Signed overflow. */
1447 r = (r < 0 ? INT64_MAX : INT64_MIN);
1448 }
1449 *(int64_t *)(d + i) = r;
1450 }
1451 }
1452
1453 /*
1454 * Unsigned saturating addition with scalar operand.
1455 */
1456
1457 void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1458 {
1459 intptr_t i, oprsz = simd_oprsz(desc);
1460
1461 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
1462 int r = *(uint8_t *)(a + i) + b;
1463 if (r > UINT8_MAX) {
1464 r = UINT8_MAX;
1465 } else if (r < 0) {
1466 r = 0;
1467 }
1468 *(uint8_t *)(d + i) = r;
1469 }
1470 }
1471
1472 void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1473 {
1474 intptr_t i, oprsz = simd_oprsz(desc);
1475
1476 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
1477 int r = *(uint16_t *)(a + i) + b;
1478 if (r > UINT16_MAX) {
1479 r = UINT16_MAX;
1480 } else if (r < 0) {
1481 r = 0;
1482 }
1483 *(uint16_t *)(d + i) = r;
1484 }
1485 }
1486
1487 void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1488 {
1489 intptr_t i, oprsz = simd_oprsz(desc);
1490
1491 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
1492 int64_t r = *(uint32_t *)(a + i) + b;
1493 if (r > UINT32_MAX) {
1494 r = UINT32_MAX;
1495 } else if (r < 0) {
1496 r = 0;
1497 }
1498 *(uint32_t *)(d + i) = r;
1499 }
1500 }
1501
1502 void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1503 {
1504 intptr_t i, oprsz = simd_oprsz(desc);
1505
1506 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1507 uint64_t r = *(uint64_t *)(a + i) + b;
1508 if (r < b) {
1509 r = UINT64_MAX;
1510 }
1511 *(uint64_t *)(d + i) = r;
1512 }
1513 }
1514
1515 void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1516 {
1517 intptr_t i, oprsz = simd_oprsz(desc);
1518
1519 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1520 uint64_t ai = *(uint64_t *)(a + i);
1521 *(uint64_t *)(d + i) = (ai < b ? 0 : ai - b);
1522 }
1523 }
1524
1525 /* Two operand predicated copy immediate with merge. All valid immediates
1526 * can fit within 17 signed bits in the simd_data field.
1527 */
1528 void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
1529 uint64_t mm, uint32_t desc)
1530 {
1531 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1532 uint64_t *d = vd, *n = vn;
1533 uint8_t *pg = vg;
1534
1535 mm = dup_const(MO_8, mm);
1536 for (i = 0; i < opr_sz; i += 1) {
1537 uint64_t nn = n[i];
1538 uint64_t pp = expand_pred_b(pg[H1(i)]);
1539 d[i] = (mm & pp) | (nn & ~pp);
1540 }
1541 }
1542
1543 void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
1544 uint64_t mm, uint32_t desc)
1545 {
1546 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1547 uint64_t *d = vd, *n = vn;
1548 uint8_t *pg = vg;
1549
1550 mm = dup_const(MO_16, mm);
1551 for (i = 0; i < opr_sz; i += 1) {
1552 uint64_t nn = n[i];
1553 uint64_t pp = expand_pred_h(pg[H1(i)]);
1554 d[i] = (mm & pp) | (nn & ~pp);
1555 }
1556 }
1557
1558 void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
1559 uint64_t mm, uint32_t desc)
1560 {
1561 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1562 uint64_t *d = vd, *n = vn;
1563 uint8_t *pg = vg;
1564
1565 mm = dup_const(MO_32, mm);
1566 for (i = 0; i < opr_sz; i += 1) {
1567 uint64_t nn = n[i];
1568 uint64_t pp = expand_pred_s(pg[H1(i)]);
1569 d[i] = (mm & pp) | (nn & ~pp);
1570 }
1571 }
1572
1573 void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
1574 uint64_t mm, uint32_t desc)
1575 {
1576 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1577 uint64_t *d = vd, *n = vn;
1578 uint8_t *pg = vg;
1579
1580 for (i = 0; i < opr_sz; i += 1) {
1581 uint64_t nn = n[i];
1582 d[i] = (pg[H1(i)] & 1 ? mm : nn);
1583 }
1584 }
1585
1586 void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
1587 {
1588 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1589 uint64_t *d = vd;
1590 uint8_t *pg = vg;
1591
1592 val = dup_const(MO_8, val);
1593 for (i = 0; i < opr_sz; i += 1) {
1594 d[i] = val & expand_pred_b(pg[H1(i)]);
1595 }
1596 }
1597
1598 void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
1599 {
1600 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1601 uint64_t *d = vd;
1602 uint8_t *pg = vg;
1603
1604 val = dup_const(MO_16, val);
1605 for (i = 0; i < opr_sz; i += 1) {
1606 d[i] = val & expand_pred_h(pg[H1(i)]);
1607 }
1608 }
1609
1610 void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
1611 {
1612 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1613 uint64_t *d = vd;
1614 uint8_t *pg = vg;
1615
1616 val = dup_const(MO_32, val);
1617 for (i = 0; i < opr_sz; i += 1) {
1618 d[i] = val & expand_pred_s(pg[H1(i)]);
1619 }
1620 }
1621
1622 void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
1623 {
1624 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1625 uint64_t *d = vd;
1626 uint8_t *pg = vg;
1627
1628 for (i = 0; i < opr_sz; i += 1) {
1629 d[i] = (pg[H1(i)] & 1 ? val : 0);
1630 }
1631 }
1632
1633 /* Big-endian hosts need to frob the byte indices. If the copy
1634 * happens to be 8-byte aligned, then no frobbing necessary.
1635 */
1636 static void swap_memmove(void *vd, void *vs, size_t n)
1637 {
1638 uintptr_t d = (uintptr_t)vd;
1639 uintptr_t s = (uintptr_t)vs;
1640 uintptr_t o = (d | s | n) & 7;
1641 size_t i;
1642
1643 #ifndef HOST_WORDS_BIGENDIAN
1644 o = 0;
1645 #endif
1646 switch (o) {
1647 case 0:
1648 memmove(vd, vs, n);
1649 break;
1650
1651 case 4:
1652 if (d < s || d >= s + n) {
1653 for (i = 0; i < n; i += 4) {
1654 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1655 }
1656 } else {
1657 for (i = n; i > 0; ) {
1658 i -= 4;
1659 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1660 }
1661 }
1662 break;
1663
1664 case 2:
1665 case 6:
1666 if (d < s || d >= s + n) {
1667 for (i = 0; i < n; i += 2) {
1668 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1669 }
1670 } else {
1671 for (i = n; i > 0; ) {
1672 i -= 2;
1673 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1674 }
1675 }
1676 break;
1677
1678 default:
1679 if (d < s || d >= s + n) {
1680 for (i = 0; i < n; i++) {
1681 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1682 }
1683 } else {
1684 for (i = n; i > 0; ) {
1685 i -= 1;
1686 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1687 }
1688 }
1689 break;
1690 }
1691 }
1692
1693 /* Similarly for memset of 0. */
1694 static void swap_memzero(void *vd, size_t n)
1695 {
1696 uintptr_t d = (uintptr_t)vd;
1697 uintptr_t o = (d | n) & 7;
1698 size_t i;
1699
1700 /* Usually, the first bit of a predicate is set, so N is 0. */
1701 if (likely(n == 0)) {
1702 return;
1703 }
1704
1705 #ifndef HOST_WORDS_BIGENDIAN
1706 o = 0;
1707 #endif
1708 switch (o) {
1709 case 0:
1710 memset(vd, 0, n);
1711 break;
1712
1713 case 4:
1714 for (i = 0; i < n; i += 4) {
1715 *(uint32_t *)H1_4(d + i) = 0;
1716 }
1717 break;
1718
1719 case 2:
1720 case 6:
1721 for (i = 0; i < n; i += 2) {
1722 *(uint16_t *)H1_2(d + i) = 0;
1723 }
1724 break;
1725
1726 default:
1727 for (i = 0; i < n; i++) {
1728 *(uint8_t *)H1(d + i) = 0;
1729 }
1730 break;
1731 }
1732 }
1733
1734 void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
1735 {
1736 intptr_t opr_sz = simd_oprsz(desc);
1737 size_t n_ofs = simd_data(desc);
1738 size_t n_siz = opr_sz - n_ofs;
1739
1740 if (vd != vm) {
1741 swap_memmove(vd, vn + n_ofs, n_siz);
1742 swap_memmove(vd + n_siz, vm, n_ofs);
1743 } else if (vd != vn) {
1744 swap_memmove(vd + n_siz, vd, n_ofs);
1745 swap_memmove(vd, vn + n_ofs, n_siz);
1746 } else {
1747 /* vd == vn == vm. Need temp space. */
1748 ARMVectorReg tmp;
1749 swap_memmove(&tmp, vm, n_ofs);
1750 swap_memmove(vd, vd + n_ofs, n_siz);
1751 memcpy(vd + n_siz, &tmp, n_ofs);
1752 }
1753 }
1754
1755 #define DO_INSR(NAME, TYPE, H) \
1756 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
1757 { \
1758 intptr_t opr_sz = simd_oprsz(desc); \
1759 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
1760 *(TYPE *)(vd + H(0)) = val; \
1761 }
1762
1763 DO_INSR(sve_insr_b, uint8_t, H1)
1764 DO_INSR(sve_insr_h, uint16_t, H1_2)
1765 DO_INSR(sve_insr_s, uint32_t, H1_4)
1766 DO_INSR(sve_insr_d, uint64_t, )
1767
1768 #undef DO_INSR
1769
1770 void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
1771 {
1772 intptr_t i, j, opr_sz = simd_oprsz(desc);
1773 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1774 uint64_t f = *(uint64_t *)(vn + i);
1775 uint64_t b = *(uint64_t *)(vn + j);
1776 *(uint64_t *)(vd + i) = bswap64(b);
1777 *(uint64_t *)(vd + j) = bswap64(f);
1778 }
1779 }
1780
1781 void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
1782 {
1783 intptr_t i, j, opr_sz = simd_oprsz(desc);
1784 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1785 uint64_t f = *(uint64_t *)(vn + i);
1786 uint64_t b = *(uint64_t *)(vn + j);
1787 *(uint64_t *)(vd + i) = hswap64(b);
1788 *(uint64_t *)(vd + j) = hswap64(f);
1789 }
1790 }
1791
1792 void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
1793 {
1794 intptr_t i, j, opr_sz = simd_oprsz(desc);
1795 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1796 uint64_t f = *(uint64_t *)(vn + i);
1797 uint64_t b = *(uint64_t *)(vn + j);
1798 *(uint64_t *)(vd + i) = rol64(b, 32);
1799 *(uint64_t *)(vd + j) = rol64(f, 32);
1800 }
1801 }
1802
1803 void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
1804 {
1805 intptr_t i, j, opr_sz = simd_oprsz(desc);
1806 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1807 uint64_t f = *(uint64_t *)(vn + i);
1808 uint64_t b = *(uint64_t *)(vn + j);
1809 *(uint64_t *)(vd + i) = b;
1810 *(uint64_t *)(vd + j) = f;
1811 }
1812 }
1813
1814 #define DO_TBL(NAME, TYPE, H) \
1815 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1816 { \
1817 intptr_t i, opr_sz = simd_oprsz(desc); \
1818 uintptr_t elem = opr_sz / sizeof(TYPE); \
1819 TYPE *d = vd, *n = vn, *m = vm; \
1820 ARMVectorReg tmp; \
1821 if (unlikely(vd == vn)) { \
1822 n = memcpy(&tmp, vn, opr_sz); \
1823 } \
1824 for (i = 0; i < elem; i++) { \
1825 TYPE j = m[H(i)]; \
1826 d[H(i)] = j < elem ? n[H(j)] : 0; \
1827 } \
1828 }
1829
1830 DO_TBL(sve_tbl_b, uint8_t, H1)
1831 DO_TBL(sve_tbl_h, uint16_t, H2)
1832 DO_TBL(sve_tbl_s, uint32_t, H4)
1833 DO_TBL(sve_tbl_d, uint64_t, )
1834
1835 #undef DO_TBL
1836
1837 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
1838 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1839 { \
1840 intptr_t i, opr_sz = simd_oprsz(desc); \
1841 TYPED *d = vd; \
1842 TYPES *n = vn; \
1843 ARMVectorReg tmp; \
1844 if (unlikely(vn - vd < opr_sz)) { \
1845 n = memcpy(&tmp, n, opr_sz / 2); \
1846 } \
1847 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
1848 d[HD(i)] = n[HS(i)]; \
1849 } \
1850 }
1851
1852 DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
1853 DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
1854 DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)
1855
1856 DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
1857 DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
1858 DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
1859
1860 #undef DO_UNPK
1861
1862 /* Mask of bits included in the even numbered predicates of width esz.
1863 * We also use this for expand_bits/compress_bits, and so extend the
1864 * same pattern out to 16-bit units.
1865 */
1866 static const uint64_t even_bit_esz_masks[5] = {
1867 0x5555555555555555ull,
1868 0x3333333333333333ull,
1869 0x0f0f0f0f0f0f0f0full,
1870 0x00ff00ff00ff00ffull,
1871 0x0000ffff0000ffffull,
1872 };
1873
1874 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
1875 * For N==0, this corresponds to the operation that in qemu/bitops.h
1876 * we call half_shuffle64; this algorithm is from Hacker's Delight,
1877 * section 7-2 Shuffling Bits.
1878 */
1879 static uint64_t expand_bits(uint64_t x, int n)
1880 {
1881 int i;
1882
1883 x &= 0xffffffffu;
1884 for (i = 4; i >= n; i--) {
1885 int sh = 1 << i;
1886 x = ((x << sh) | x) & even_bit_esz_masks[i];
1887 }
1888 return x;
1889 }
1890
1891 /* Compress units of 2**(N+1) bits to units of 2**N bits.
1892 * For N==0, this corresponds to the operation that in qemu/bitops.h
1893 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
1894 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
1895 */
1896 static uint64_t compress_bits(uint64_t x, int n)
1897 {
1898 int i;
1899
1900 for (i = n; i <= 4; i++) {
1901 int sh = 1 << i;
1902 x &= even_bit_esz_masks[i];
1903 x = (x >> sh) | x;
1904 }
1905 return x & 0xffffffffu;
1906 }
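
/* Rough worked example for the two routines above: with N == 0,
 * expand_bits(0x0f, 0) interleaves zero bits to give 0x55, and
 * compress_bits(0x55, 0) removes them again, giving back 0x0f.
 * Larger N values stop the shuffle earlier, moving 2**N-bit groups.
 */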
1907
1908 void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1909 {
1910 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1911 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1912 intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1913 uint64_t *d = vd;
1914 intptr_t i;
1915
1916 if (oprsz <= 8) {
1917 uint64_t nn = *(uint64_t *)vn;
1918 uint64_t mm = *(uint64_t *)vm;
1919 int half = 4 * oprsz;
1920
1921 nn = extract64(nn, high * half, half);
1922 mm = extract64(mm, high * half, half);
1923 nn = expand_bits(nn, esz);
1924 mm = expand_bits(mm, esz);
1925 d[0] = nn + (mm << (1 << esz));
1926 } else {
1927 ARMPredicateReg tmp_n, tmp_m;
1928
1929 /* We produce output faster than we consume input.
1930 Therefore we must be mindful of possible overlap. */
1931 if ((vn - vd) < (uintptr_t)oprsz) {
1932 vn = memcpy(&tmp_n, vn, oprsz);
1933 }
1934 if ((vm - vd) < (uintptr_t)oprsz) {
1935 vm = memcpy(&tmp_m, vm, oprsz);
1936 }
1937 if (high) {
1938 high = oprsz >> 1;
1939 }
1940
1941 if ((high & 3) == 0) {
1942 uint32_t *n = vn, *m = vm;
1943 high >>= 2;
1944
1945 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
1946 uint64_t nn = n[H4(high + i)];
1947 uint64_t mm = m[H4(high + i)];
1948
1949 nn = expand_bits(nn, esz);
1950 mm = expand_bits(mm, esz);
1951 d[i] = nn + (mm << (1 << esz));
1952 }
1953 } else {
1954 uint8_t *n = vn, *m = vm;
1955 uint16_t *d16 = vd;
1956
1957 for (i = 0; i < oprsz / 2; i++) {
1958 uint16_t nn = n[H1(high + i)];
1959 uint16_t mm = m[H1(high + i)];
1960
1961 nn = expand_bits(nn, esz);
1962 mm = expand_bits(mm, esz);
1963 d16[H2(i)] = nn + (mm << (1 << esz));
1964 }
1965 }
1966 }
1967 }
1968
1969 void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1970 {
1971 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1972 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1973 int odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1) << esz;
1974 uint64_t *d = vd, *n = vn, *m = vm;
1975 uint64_t l, h;
1976 intptr_t i;
1977
1978 if (oprsz <= 8) {
1979 l = compress_bits(n[0] >> odd, esz);
1980 h = compress_bits(m[0] >> odd, esz);
1981 d[0] = extract64(l + (h << (4 * oprsz)), 0, 8 * oprsz);
1982 } else {
1983 ARMPredicateReg tmp_m;
1984 intptr_t oprsz_16 = oprsz / 16;
1985
1986 if ((vm - vd) < (uintptr_t)oprsz) {
1987 m = memcpy(&tmp_m, vm, oprsz);
1988 }
1989
1990 for (i = 0; i < oprsz_16; i++) {
1991 l = n[2 * i + 0];
1992 h = n[2 * i + 1];
1993 l = compress_bits(l >> odd, esz);
1994 h = compress_bits(h >> odd, esz);
1995 d[i] = l + (h << 32);
1996 }
1997
1998 /* When OPRSZ is not a multiple of 16, the results from M do not
1999 start on a uint64_t boundary within D. Put the results from M
2000 into TMP_M and then copy them into place afterward. */
2001 if (oprsz & 15) {
2002 d[i] = compress_bits(n[2 * i] >> odd, esz);
2003
2004 for (i = 0; i < oprsz_16; i++) {
2005 l = m[2 * i + 0];
2006 h = m[2 * i + 1];
2007 l = compress_bits(l >> odd, esz);
2008 h = compress_bits(h >> odd, esz);
2009 tmp_m.p[i] = l + (h << 32);
2010 }
2011 tmp_m.p[i] = compress_bits(m[2 * i] >> odd, esz);
2012
2013 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
2014 } else {
2015 for (i = 0; i < oprsz_16; i++) {
2016 l = m[2 * i + 0];
2017 h = m[2 * i + 1];
2018 l = compress_bits(l >> odd, esz);
2019 h = compress_bits(h >> odd, esz);
2020 d[oprsz_16 + i] = l + (h << 32);
2021 }
2022 }
2023 }
2024 }
2025
2026 void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
2027 {
2028 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2029 uintptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2030 bool odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
2031 uint64_t *d = vd, *n = vn, *m = vm;
2032 uint64_t mask;
2033 int shr, shl;
2034 intptr_t i;
2035
2036 shl = 1 << esz;
2037 shr = 0;
2038 mask = even_bit_esz_masks[esz];
2039 if (odd) {
2040 mask <<= shl;
2041 shr = shl;
2042 shl = 0;
2043 }
2044
2045 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2046 uint64_t nn = (n[i] & mask) >> shr;
2047 uint64_t mm = (m[i] & mask) << shl;
2048 d[i] = nn + mm;
2049 }
2050 }
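
/* E.g. with esz == 0 and ODD clear: mask == 0x5555... and shl == 1, so
 * each output word interleaves the even-numbered predicate bits of N
 * with the even-numbered bits of M shifted up by one, as TRN1 requires.
 */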
2051
2052 /* Reverse units of 2**N bits. */
2053 static uint64_t reverse_bits_64(uint64_t x, int n)
2054 {
2055 int i, sh;
2056
2057 x = bswap64(x);
2058 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2059 uint64_t mask = even_bit_esz_masks[i];
2060 x = ((x & mask) << sh) | ((x >> sh) & mask);
2061 }
2062 return x;
2063 }
2064
2065 static uint8_t reverse_bits_8(uint8_t x, int n)
2066 {
2067 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
2068 int i, sh;
2069
2070 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2071 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
2072 }
2073 return x;
2074 }
2075
2076 void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
2077 {
2078 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2079 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2080 intptr_t i, oprsz_2 = oprsz / 2;
2081
2082 if (oprsz <= 8) {
2083 uint64_t l = *(uint64_t *)vn;
2084 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
2085 *(uint64_t *)vd = l;
2086 } else if ((oprsz & 15) == 0) {
2087 for (i = 0; i < oprsz_2; i += 8) {
2088 intptr_t ih = oprsz - 8 - i;
2089 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
2090 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
2091 *(uint64_t *)(vd + i) = h;
2092 *(uint64_t *)(vd + ih) = l;
2093 }
2094 } else {
2095 for (i = 0; i < oprsz_2; i += 1) {
2096 intptr_t il = H1(i);
2097 intptr_t ih = H1(oprsz - 1 - i);
2098 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
2099 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
2100 *(uint8_t *)(vd + il) = h;
2101 *(uint8_t *)(vd + ih) = l;
2102 }
2103 }
2104 }
2105
2106 void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
2107 {
2108 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2109 intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
2110 uint64_t *d = vd;
2111 intptr_t i;
2112
2113 if (oprsz <= 8) {
2114 uint64_t nn = *(uint64_t *)vn;
2115 int half = 4 * oprsz;
2116
2117 nn = extract64(nn, high * half, half);
2118 nn = expand_bits(nn, 0);
2119 d[0] = nn;
2120 } else {
2121 ARMPredicateReg tmp_n;
2122
2123 /* We produce output faster than we consume input.
2124 Therefore we must be mindful of possible overlap. */
2125 if ((vn - vd) < (uintptr_t)oprsz) {
2126 vn = memcpy(&tmp_n, vn, oprsz);
2127 }
2128 if (high) {
2129 high = oprsz >> 1;
2130 }
2131
2132 if ((high & 3) == 0) {
2133 uint32_t *n = vn;
2134 high >>= 2;
2135
2136 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2137 uint64_t nn = n[H4(high + i)];
2138 d[i] = expand_bits(nn, 0);
2139 }
2140 } else {
2141 uint16_t *d16 = vd;
2142 uint8_t *n = vn;
2143
2144 for (i = 0; i < oprsz / 2; i++) {
2145 uint16_t nn = n[H1(high + i)];
2146 d16[H2(i)] = expand_bits(nn, 0);
2147 }
2148 }
2149 }
2150 }
2151
2152 #define DO_ZIP(NAME, TYPE, H) \
2153 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2154 { \
2155 intptr_t oprsz = simd_oprsz(desc); \
2156 intptr_t i, oprsz_2 = oprsz / 2; \
2157 ARMVectorReg tmp_n, tmp_m; \
2158 /* We produce output faster than we consume input. \
2159 Therefore we must be mindful of possible overlap. */ \
2160 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
2161 vn = memcpy(&tmp_n, vn, oprsz_2); \
2162 } \
2163 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2164 vm = memcpy(&tmp_m, vm, oprsz_2); \
2165 } \
2166 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2167 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
2168 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
2169 } \
2170 }
2171
2172 DO_ZIP(sve_zip_b, uint8_t, H1)
2173 DO_ZIP(sve_zip_h, uint16_t, H1_2)
2174 DO_ZIP(sve_zip_s, uint32_t, H1_4)
2175 DO_ZIP(sve_zip_d, uint64_t, )
2176
2177 #define DO_UZP(NAME, TYPE, H) \
2178 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2179 { \
2180 intptr_t oprsz = simd_oprsz(desc); \
2181 intptr_t oprsz_2 = oprsz / 2; \
2182 intptr_t odd_ofs = simd_data(desc); \
2183 intptr_t i; \
2184 ARMVectorReg tmp_m; \
2185 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2186 vm = memcpy(&tmp_m, vm, oprsz); \
2187 } \
2188 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2189 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs)); \
2190 } \
2191 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2192 *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
2193 } \
2194 }
2195
2196 DO_UZP(sve_uzp_b, uint8_t, H1)
2197 DO_UZP(sve_uzp_h, uint16_t, H1_2)
2198 DO_UZP(sve_uzp_s, uint32_t, H1_4)
2199 DO_UZP(sve_uzp_d, uint64_t, )
2200
2201 #define DO_TRN(NAME, TYPE, H) \
2202 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2203 { \
2204 intptr_t oprsz = simd_oprsz(desc); \
2205 intptr_t odd_ofs = simd_data(desc); \
2206 intptr_t i; \
2207 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
2208 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
2209 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
2210 *(TYPE *)(vd + H(i + 0)) = ae; \
2211 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
2212 } \
2213 }
2214
2215 DO_TRN(sve_trn_b, uint8_t, H1)
2216 DO_TRN(sve_trn_h, uint16_t, H1_2)
2217 DO_TRN(sve_trn_s, uint32_t, H1_4)
2218 DO_TRN(sve_trn_d, uint64_t, )
2219
2220 #undef DO_ZIP
2221 #undef DO_UZP
2222 #undef DO_TRN
2223
2224 void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
2225 {
2226 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
2227 uint32_t *d = vd, *n = vn;
2228 uint8_t *pg = vg;
2229
2230 for (i = j = 0; i < opr_sz; i++) {
2231 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
2232 d[H4(j)] = n[H4(i)];
2233 j++;
2234 }
2235 }
2236 for (; j < opr_sz; j++) {
2237 d[H4(j)] = 0;
2238 }
2239 }
2240
2241 void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
2242 {
2243 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
2244 uint64_t *d = vd, *n = vn;
2245 uint8_t *pg = vg;
2246
2247 for (i = j = 0; i < opr_sz; i++) {
2248 if (pg[H1(i)] & 1) {
2249 d[j] = n[i];
2250 j++;
2251 }
2252 }
2253 for (; j < opr_sz; j++) {
2254 d[j] = 0;
2255 }
2256 }
2257
2258 /* Similar to the ARM LastActiveElement pseudocode function, except the
2259 * result is multiplied by the element size. This includes the not found
2260 * indication; e.g. not found for esz=3 is -8.
2261 */
2262 int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
2263 {
2264 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2265 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2266
2267 return last_active_element(vg, DIV_ROUND_UP(oprsz, 8), esz);
2268 }
2269
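/* SPLICE: copy the elements of N from the first active element through
 * the last active element (per VG) to the low portion of the result,
 * then fill the remainder with elements taken from the start of M.
 */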
2270 void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
2271 {
2272 intptr_t opr_sz = simd_oprsz(desc) / 8;
2273 int esz = simd_data(desc);
2274 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
2275 intptr_t i, first_i, last_i;
2276 ARMVectorReg tmp;
2277
2278 first_i = last_i = 0;
2279 first_g = last_g = 0;
2280
2281 /* Find the extent of the active elements within VG. */
2282 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
2283 pg = *(uint64_t *)(vg + i) & mask;
2284 if (pg) {
2285 if (last_g == 0) {
2286 last_g = pg;
2287 last_i = i;
2288 }
2289 first_g = pg;
2290 first_i = i;
2291 }
2292 }
2293
2294 len = 0;
2295 if (first_g != 0) {
2296 first_i = first_i * 8 + ctz64(first_g);
2297 last_i = last_i * 8 + 63 - clz64(last_g);
2298 len = last_i - first_i + (1 << esz);
2299 if (vd == vm) {
2300 vm = memcpy(&tmp, vm, opr_sz * 8);
2301 }
2302 swap_memmove(vd, vn + first_i, len);
2303 }
2304 swap_memmove(vd + len, vm, opr_sz * 8 - len);
2305 }
2306
2307 void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
2308 void *vg, uint32_t desc)
2309 {
2310 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2311 uint64_t *d = vd, *n = vn, *m = vm;
2312 uint8_t *pg = vg;
2313
2314 for (i = 0; i < opr_sz; i += 1) {
2315 uint64_t nn = n[i], mm = m[i];
2316 uint64_t pp = expand_pred_b(pg[H1(i)]);
2317 d[i] = (nn & pp) | (mm & ~pp);
2318 }
2319 }
2320
2321 void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
2322 void *vg, uint32_t desc)
2323 {
2324 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2325 uint64_t *d = vd, *n = vn, *m = vm;
2326 uint8_t *pg = vg;
2327
2328 for (i = 0; i < opr_sz; i += 1) {
2329 uint64_t nn = n[i], mm = m[i];
2330 uint64_t pp = expand_pred_h(pg[H1(i)]);
2331 d[i] = (nn & pp) | (mm & ~pp);
2332 }
2333 }
2334
2335 void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
2336 void *vg, uint32_t desc)
2337 {
2338 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2339 uint64_t *d = vd, *n = vn, *m = vm;
2340 uint8_t *pg = vg;
2341
2342 for (i = 0; i < opr_sz; i += 1) {
2343 uint64_t nn = n[i], mm = m[i];
2344 uint64_t pp = expand_pred_s(pg[H1(i)]);
2345 d[i] = (nn & pp) | (mm & ~pp);
2346 }
2347 }
2348
2349 void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
2350 void *vg, uint32_t desc)
2351 {
2352 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2353 uint64_t *d = vd, *n = vn, *m = vm;
2354 uint8_t *pg = vg;
2355
2356 for (i = 0; i < opr_sz; i += 1) {
2357 uint64_t nn = n[i], mm = m[i];
2358 d[i] = (pg[H1(i)] & 1 ? nn : mm);
2359 }
2360 }
2361
2362 /* Two operand comparison controlled by a predicate.
2363 * ??? It is very tempting to want to be able to expand this inline
2364 * with x86 instructions, e.g.
2365 *
2366 * vcmpeqw zm, zn, %ymm0
2367 * vpmovmskb %ymm0, %eax
2368 * and $0x5555, %eax
2369 * and pg, %eax
2370 *
2371 * or even aarch64, e.g.
2372 *
2373 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
2374 * cmeq v0.8h, zn, zm
2375 * and v0.8h, v0.8h, mask
2376 * addv h0, v0.8h
2377 * and v0.8b, pg
2378 *
2379 * However, coming up with an abstraction that allows vector inputs and
2380 * a scalar output, and also handles the byte-ordering of sub-uint64_t
2381 * scalar outputs, is tricky.
2382 */
2383 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
2384 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2385 { \
2386 intptr_t opr_sz = simd_oprsz(desc); \
2387 uint32_t flags = PREDTEST_INIT; \
2388 intptr_t i = opr_sz; \
2389 do { \
2390 uint64_t out = 0, pg; \
2391 do { \
2392 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2393 TYPE nn = *(TYPE *)(vn + H(i)); \
2394 TYPE mm = *(TYPE *)(vm + H(i)); \
2395 out |= nn OP mm; \
2396 } while (i & 63); \
2397 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2398 out &= pg; \
2399 *(uint64_t *)(vd + (i >> 3)) = out; \
2400 flags = iter_predtest_bwd(out, pg, flags); \
2401 } while (i > 0); \
2402 return flags; \
2403 }
2404
2405 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
2406 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2407 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
2408 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2409 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
2410 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2411 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
2412 DO_CMP_PPZZ(NAME, TYPE, OP, , 0x0101010101010101ull)
2413
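/* The MASK argument selects the one predicate bit per element: the
 * predicate provides one bit per byte, so bytes use every bit,
 * halfwords every 2nd bit, words every 4th, and doublewords every 8th.
 */
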
2414 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
2415 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
2416 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
2417 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
2418
2419 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
2420 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
2421 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
2422 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
2423
2424 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
2425 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
2426 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
2427 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
2428
2429 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
2430 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
2431 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
2432 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
2433
2434 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
2435 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
2436 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
2437 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
2438
2439 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
2440 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
2441 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
2442 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
2443
2444 #undef DO_CMP_PPZZ_B
2445 #undef DO_CMP_PPZZ_H
2446 #undef DO_CMP_PPZZ_S
2447 #undef DO_CMP_PPZZ_D
2448 #undef DO_CMP_PPZZ
2449
2450 /* Similar, but the second source is "wide". */
2451 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
2452 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2453 { \
2454 intptr_t opr_sz = simd_oprsz(desc); \
2455 uint32_t flags = PREDTEST_INIT; \
2456 intptr_t i = opr_sz; \
2457 do { \
2458 uint64_t out = 0, pg; \
2459 do { \
2460 TYPEW mm = *(TYPEW *)(vm + i - 8); \
2461 do { \
2462 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2463 TYPE nn = *(TYPE *)(vn + H(i)); \
2464 out |= nn OP mm; \
2465 } while (i & 7); \
2466 } while (i & 63); \
2467 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2468 out &= pg; \
2469 *(uint64_t *)(vd + (i >> 3)) = out; \
2470 flags = iter_predtest_bwd(out, pg, flags); \
2471 } while (i > 0); \
2472 return flags; \
2473 }
2474
2475 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
2476 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
2477 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
2478 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
2479 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
2480 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
2481
2482 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
2483 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
2484 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
2485
2486 DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
2487 DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
2488 DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
2489
2490 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
2491 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
2492 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
2493
2494 DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
2495 DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
2496 DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
2497
2498 DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
2499 DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
2500 DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
2501
2502 DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
2503 DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
2504 DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
2505
2506 DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
2507 DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
2508 DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
2509
2510 DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
2511 DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
2512 DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
2513
2514 DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
2515 DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
2516 DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
2517
2518 DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
2519 DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
2520 DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
2521
2522 #undef DO_CMP_PPZW_B
2523 #undef DO_CMP_PPZW_H
2524 #undef DO_CMP_PPZW_S
2525 #undef DO_CMP_PPZW
2526
2527 /* Similar, but the second source is immediate. */
2528 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
2529 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2530 { \
2531 intptr_t opr_sz = simd_oprsz(desc); \
2532 uint32_t flags = PREDTEST_INIT; \
2533 TYPE mm = simd_data(desc); \
2534 intptr_t i = opr_sz; \
2535 do { \
2536 uint64_t out = 0, pg; \
2537 do { \
2538 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2539 TYPE nn = *(TYPE *)(vn + H(i)); \
2540 out |= nn OP mm; \
2541 } while (i & 63); \
2542 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2543 out &= pg; \
2544 *(uint64_t *)(vd + (i >> 3)) = out; \
2545 flags = iter_predtest_bwd(out, pg, flags); \
2546 } while (i > 0); \
2547 return flags; \
2548 }
2549
2550 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
2551 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2552 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
2553 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2554 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
2555 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2556 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
2557 DO_CMP_PPZI(NAME, TYPE, OP, , 0x0101010101010101ull)
2558
2559 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
2560 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
2561 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
2562 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
2563
2564 DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
2565 DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
2566 DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
2567 DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
2568
2569 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
2570 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
2571 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
2572 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
2573
2574 DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
2575 DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
2576 DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
2577 DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
2578
2579 DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
2580 DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
2581 DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
2582 DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
2583
2584 DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
2585 DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
2586 DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
2587 DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
2588
2589 DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
2590 DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
2591 DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
2592 DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
2593
2594 DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
2595 DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
2596 DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
2597 DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
2598
2599 DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
2600 DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
2601 DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
2602 DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
2603
2604 DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
2605 DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
2606 DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
2607 DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
2608
2609 #undef DO_CMP_PPZI_B
2610 #undef DO_CMP_PPZI_H
2611 #undef DO_CMP_PPZI_S
2612 #undef DO_CMP_PPZI_D
2613 #undef DO_CMP_PPZI
2614
2615 /* Similar to the ARM LastActive pseudocode function. */
2616 static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
2617 {
2618 intptr_t i;
2619
2620 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
2621 uint64_t pg = *(uint64_t *)(vg + i);
2622 if (pg) {
2623 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
2624 }
2625 }
2626 return 0;
2627 }
2628
2629 /* Compute a mask into RETB that is true for all G, up to and including
2630 * (if after) or excluding (if !after) the first G & N.
2631 * Return true if BRK found.
2632 */
2633 static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
2634 bool brk, bool after)
2635 {
2636 uint64_t b;
2637
2638 if (brk) {
2639 b = 0;
2640 } else if ((g & n) == 0) {
2641 /* For all G, no N are set; break not found. */
2642 b = g;
2643 } else {
2644 /* Break somewhere in N. Locate it. */
2645 b = g & n; /* guard true, pred true */
2646 b = b & -b; /* first such */
2647 if (after) {
2648 b = b | (b - 1); /* break after same */
2649 } else {
2650 b = b - 1; /* break before same */
2651 }
2652 brk = true;
2653 }
2654
2655 *retb = b;
2656 return brk;
2657 }
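
/* Worked example: with g == 0xff and n == 0x10, the first true element
 * under the guard is bit 4, so compute_brk stores 0x1f when AFTER is
 * set (break after that element), or 0x0f when AFTER is clear (break
 * before it), and returns true; the callers then AND this with G.
 */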
2658
2659 /* Compute a zeroing BRK. */
2660 static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
2661 intptr_t oprsz, bool after)
2662 {
2663 bool brk = false;
2664 intptr_t i;
2665
2666 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2667 uint64_t this_b, this_g = g[i];
2668
2669 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2670 d[i] = this_b & this_g;
2671 }
2672 }
2673
2674 /* Likewise, but also compute flags. */
2675 static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
2676 intptr_t oprsz, bool after)
2677 {
2678 uint32_t flags = PREDTEST_INIT;
2679 bool brk = false;
2680 intptr_t i;
2681
2682 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2683 uint64_t this_b, this_d, this_g = g[i];
2684
2685 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2686 d[i] = this_d = this_b & this_g;
2687 flags = iter_predtest_fwd(this_d, this_g, flags);
2688 }
2689 return flags;
2690 }
2691
2692 /* Compute a merging BRK. */
2693 static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
2694 intptr_t oprsz, bool after)
2695 {
2696 bool brk = false;
2697 intptr_t i;
2698
2699 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2700 uint64_t this_b, this_g = g[i];
2701
2702 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2703 d[i] = (this_b & this_g) | (d[i] & ~this_g);
2704 }
2705 }
2706
2707 /* Likewise, but also compute flags. */
2708 static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
2709 intptr_t oprsz, bool after)
2710 {
2711 uint32_t flags = PREDTEST_INIT;
2712 bool brk = false;
2713 intptr_t i;
2714
2715 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2716 uint64_t this_b, this_d = d[i], this_g = g[i];
2717
2718 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2719 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
2720 flags = iter_predtest_fwd(this_d, this_g, flags);
2721 }
2722 return flags;
2723 }
2724
2725 static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
2726 {
2727 /* It is quicker to zero the whole predicate than loop on OPRSZ.
2728 * The compiler should turn this into 4 64-bit integer stores.
2729 */
2730 memset(d, 0, sizeof(ARMPredicateReg));
2731 return PREDTEST_INIT;
2732 }
2733
2734 void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
2735 uint32_t pred_desc)
2736 {
2737 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2738 if (last_active_pred(vn, vg, oprsz)) {
2739 compute_brk_z(vd, vm, vg, oprsz, true);
2740 } else {
2741 do_zero(vd, oprsz);
2742 }
2743 }
2744
2745 uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
2746 uint32_t pred_desc)
2747 {
2748 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2749 if (last_active_pred(vn, vg, oprsz)) {
2750 return compute_brks_z(vd, vm, vg, oprsz, true);
2751 } else {
2752 return do_zero(vd, oprsz);
2753 }
2754 }
2755
2756 void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
2757 uint32_t pred_desc)
2758 {
2759 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2760 if (last_active_pred(vn, vg, oprsz)) {
2761 compute_brk_z(vd, vm, vg, oprsz, false);
2762 } else {
2763 do_zero(vd, oprsz);
2764 }
2765 }
2766
2767 uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
2768 uint32_t pred_desc)
2769 {
2770 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2771 if (last_active_pred(vn, vg, oprsz)) {
2772 return compute_brks_z(vd, vm, vg, oprsz, false);
2773 } else {
2774 return do_zero(vd, oprsz);
2775 }
2776 }
2777
2778 void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2779 {
2780 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2781 compute_brk_z(vd, vn, vg, oprsz, true);
2782 }
2783
2784 uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2785 {
2786 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2787 return compute_brks_z(vd, vn, vg, oprsz, true);
2788 }
2789
2790 void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2791 {
2792 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2793 compute_brk_z(vd, vn, vg, oprsz, false);
2794 }
2795
2796 uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2797 {
2798 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2799 return compute_brks_z(vd, vn, vg, oprsz, false);
2800 }
2801
2802 void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2803 {
2804 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2805 compute_brk_m(vd, vn, vg, oprsz, true);
2806 }
2807
2808 uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2809 {
2810 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2811 return compute_brks_m(vd, vn, vg, oprsz, true);
2812 }
2813
2814 void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2815 {
2816 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2817 compute_brk_m(vd, vn, vg, oprsz, false);
2818 }
2819
2820 uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2821 {
2822 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2823 return compute_brks_m(vd, vn, vg, oprsz, false);
2824 }
2825
2826 void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2827 {
2828 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2829
2830 if (!last_active_pred(vn, vg, oprsz)) {
2831 do_zero(vd, oprsz);
2832 }
2833 }
2834
2835 /* As if PredTest(Ones(PL), D, esz). */
2836 static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
2837 uint64_t esz_mask)
2838 {
2839 uint32_t flags = PREDTEST_INIT;
2840 intptr_t i;
2841
2842 for (i = 0; i < oprsz / 8; i++) {
2843 flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
2844 }
2845 if (oprsz & 7) {
2846 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
2847 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
2848 }
2849 return flags;
2850 }
2851
2852 uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2853 {
2854 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2855
2856 if (last_active_pred(vn, vg, oprsz)) {
2857 return predtest_ones(vd, oprsz, -1);
2858 } else {
2859 return do_zero(vd, oprsz);
2860 }
2861 }
2862
2863 uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
2864 {
2865 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2866 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2867 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
2868 intptr_t i;
2869
2870 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2871 uint64_t t = n[i] & g[i] & mask;
2872 sum += ctpop64(t);
2873 }
2874 return sum;
2875 }
2876
2877 uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc)
2878 {
2879 uintptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2880 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2881 uint64_t esz_mask = pred_esz_masks[esz];
2882 ARMPredicateReg *d = vd;
2883 uint32_t flags;
2884 intptr_t i;
2885
2886 /* Begin with a zero predicate register. */
2887 flags = do_zero(d, oprsz);
2888 if (count == 0) {
2889 return flags;
2890 }
2891
2892 /* Set all of the requested bits. */
2893 for (i = 0; i < count / 64; ++i) {
2894 d->p[i] = esz_mask;
2895 }
2896 if (count & 63) {
2897 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
2898 }
2899
2900 return predtest_ones(d, oprsz, esz_mask);
2901 }
2902
2903 /* Recursive reduction using the binary function FUNC;
2904 * cf. the ARM ARM function ReducePredicated.
2905 *
2906 * While it would be possible to write this without the DATA temporary,
2907 * it is much simpler to process the predicate register this way.
2908 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
2909 * little to gain with a more complex non-recursive form.
2910 */
2911 #define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
2912 static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
2913 { \
2914 if (n == 1) { \
2915 return *data; \
2916 } else { \
2917 uintptr_t half = n / 2; \
2918 TYPE lo = NAME##_reduce(data, status, half); \
2919 TYPE hi = NAME##_reduce(data + half, status, half); \
2920 return TYPE##_##FUNC(lo, hi, status); \
2921 } \
2922 } \
2923 uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
2924 { \
2925 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_maxsz(desc); \
2926 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
2927 for (i = 0; i < oprsz; ) { \
2928 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2929 do { \
2930 TYPE nn = *(TYPE *)(vn + H(i)); \
2931 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
2932 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2933 } while (i & 15); \
2934 } \
2935 for (; i < maxsz; i += sizeof(TYPE)) { \
2936 *(TYPE *)((void *)data + i) = IDENT; \
2937 } \
2938 return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
2939 }
2940
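/* For example, with a four-element DATA array the recursion evaluates
 * FUNC(FUNC(d0, d1), FUNC(d2, d3)), a balanced pairwise tree rather
 * than a left-to-right fold; inactive and trailing lanes hold IDENT.
 */
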
2941 DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
2942 DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
2943 DO_REDUCE(sve_faddv_d, float64, , add, float64_zero)
2944
2945 /* Identity is floatN_default_nan, without the function call. */
2946 DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
2947 DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
2948 DO_REDUCE(sve_fminnmv_d, float64, , minnum, 0x7FF8000000000000ULL)
2949
2950 DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
2951 DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
2952 DO_REDUCE(sve_fmaxnmv_d, float64, , maxnum, 0x7FF8000000000000ULL)
2953
2954 DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
2955 DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
2956 DO_REDUCE(sve_fminv_d, float64, , min, float64_infinity)
2957
2958 DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
2959 DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
2960 DO_REDUCE(sve_fmaxv_d, float64, , max, float64_chs(float64_infinity))
2961
2962 #undef DO_REDUCE
2963
2964 uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
2965 void *status, uint32_t desc)
2966 {
2967 intptr_t i = 0, opr_sz = simd_oprsz(desc);
2968 float16 result = nn;
2969
2970 do {
2971 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2972 do {
2973 if (pg & 1) {
2974 float16 mm = *(float16 *)(vm + H1_2(i));
2975 result = float16_add(result, mm, status);
2976 }
2977 i += sizeof(float16), pg >>= sizeof(float16);
2978 } while (i & 15);
2979 } while (i < opr_sz);
2980
2981 return result;
2982 }
2983
2984 uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
2985 void *status, uint32_t desc)
2986 {
2987 intptr_t i = 0, opr_sz = simd_oprsz(desc);
2988 float32 result = nn;
2989
2990 do {
2991 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2992 do {
2993 if (pg & 1) {
2994 float32 mm = *(float32 *)(vm + H1_2(i));
2995 result = float32_add(result, mm, status);
2996 }
2997 i += sizeof(float32), pg >>= sizeof(float32);
2998 } while (i & 15);
2999 } while (i < opr_sz);
3000
3001 return result;
3002 }
3003
3004 uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
3005 void *status, uint32_t desc)
3006 {
3007 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
3008 uint64_t *m = vm;
3009 uint8_t *pg = vg;
3010
3011 for (i = 0; i < opr_sz; i++) {
3012 if (pg[H1(i)] & 1) {
3013 nn = float64_add(nn, m[i], status);
3014 }
3015 }
3016
3017 return nn;
3018 }
3019
3020 /* Fully general three-operand expander, controlled by a predicate,
3021 * with the extra float_status parameter.
3022 */
3023 #define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
3024 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
3025 void *status, uint32_t desc) \
3026 { \
3027 intptr_t i = simd_oprsz(desc); \
3028 uint64_t *g = vg; \
3029 do { \
3030 uint64_t pg = g[(i - 1) >> 6]; \
3031 do { \
3032 i -= sizeof(TYPE); \
3033 if (likely((pg >> (i & 63)) & 1)) { \
3034 TYPE nn = *(TYPE *)(vn + H(i)); \
3035 TYPE mm = *(TYPE *)(vm + H(i)); \
3036 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
3037 } \
3038 } while (i & 63); \
3039 } while (i != 0); \
3040 }
3041
3042 DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
3043 DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
3044 DO_ZPZZ_FP(sve_fadd_d, uint64_t, , float64_add)
3045
3046 DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
3047 DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
3048 DO_ZPZZ_FP(sve_fsub_d, uint64_t, , float64_sub)
3049
3050 DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
3051 DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
3052 DO_ZPZZ_FP(sve_fmul_d, uint64_t, , float64_mul)
3053
3054 DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
3055 DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
3056 DO_ZPZZ_FP(sve_fdiv_d, uint64_t, , float64_div)
3057
3058 DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
3059 DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
3060 DO_ZPZZ_FP(sve_fmin_d, uint64_t, , float64_min)
3061
3062 DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
3063 DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
3064 DO_ZPZZ_FP(sve_fmax_d, uint64_t, , float64_max)
3065
3066 DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
3067 DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
3068 DO_ZPZZ_FP(sve_fminnum_d, uint64_t, , float64_minnum)
3069
3070 DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
3071 DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
3072 DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, , float64_maxnum)
3073
3074 static inline float16 abd_h(float16 a, float16 b, float_status *s)
3075 {
3076 return float16_abs(float16_sub(a, b, s));
3077 }
3078
3079 static inline float32 abd_s(float32 a, float32 b, float_status *s)
3080 {
3081 return float32_abs(float32_sub(a, b, s));
3082 }
3083
3084 static inline float64 abd_d(float64 a, float64 b, float_status *s)
3085 {
3086 return float64_abs(float64_sub(a, b, s));
3087 }
3088
3089 DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
3090 DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
3091 DO_ZPZZ_FP(sve_fabd_d, uint64_t, , abd_d)
3092
3093 static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
3094 {
3095 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
3096 return float64_scalbn(a, b_int, s);
3097 }
3098
3099 DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
3100 DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
3101 DO_ZPZZ_FP(sve_fscalbn_d, int64_t, , scalbn_d)
3102
3103 DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
3104 DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
3105 DO_ZPZZ_FP(sve_fmulx_d, uint64_t, , helper_vfp_mulxd)
3106
3107 #undef DO_ZPZZ_FP
3108
3109 /* Three-operand expander, with one scalar operand, controlled by
3110 * a predicate, with the extra float_status parameter.
3111 */
3112 #define DO_ZPZS_FP(NAME, TYPE, H, OP) \
3113 void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
3114 void *status, uint32_t desc) \
3115 { \
3116 intptr_t i = simd_oprsz(desc); \
3117 uint64_t *g = vg; \
3118 TYPE mm = scalar; \
3119 do { \
3120 uint64_t pg = g[(i - 1) >> 6]; \
3121 do { \
3122 i -= sizeof(TYPE); \
3123 if (likely((pg >> (i & 63)) & 1)) { \
3124 TYPE nn = *(TYPE *)(vn + H(i)); \
3125 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
3126 } \
3127 } while (i & 63); \
3128 } while (i != 0); \
3129 }
3130
3131 DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
3132 DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
3133 DO_ZPZS_FP(sve_fadds_d, float64, , float64_add)
3134
3135 DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
3136 DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
3137 DO_ZPZS_FP(sve_fsubs_d, float64, , float64_sub)
3138
3139 DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
3140 DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
3141 DO_ZPZS_FP(sve_fmuls_d, float64, , float64_mul)
3142
3143 static inline float16 subr_h(float16 a, float16 b, float_status *s)
3144 {
3145 return float16_sub(b, a, s);
3146 }
3147
3148 static inline float32 subr_s(float32 a, float32 b, float_status *s)
3149 {
3150 return float32_sub(b, a, s);
3151 }
3152
3153 static inline float64 subr_d(float64 a, float64 b, float_status *s)
3154 {
3155 return float64_sub(b, a, s);
3156 }
3157
3158 DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
3159 DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
3160 DO_ZPZS_FP(sve_fsubrs_d, float64, , subr_d)
3161
3162 DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
3163 DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
3164 DO_ZPZS_FP(sve_fmaxnms_d, float64, , float64_maxnum)
3165
3166 DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
3167 DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
3168 DO_ZPZS_FP(sve_fminnms_d, float64, , float64_minnum)
3169
3170 DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
3171 DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
3172 DO_ZPZS_FP(sve_fmaxs_d, float64, , float64_max)
3173
3174 DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
3175 DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
3176 DO_ZPZS_FP(sve_fmins_d, float64, , float64_min)
3177
3178 /* Fully general two-operand expander, controlled by a predicate,
3179 * with the extra float_status parameter.
3180 */
3181 #define DO_ZPZ_FP(NAME, TYPE, H, OP) \
3182 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
3183 { \
3184 intptr_t i = simd_oprsz(desc); \
3185 uint64_t *g = vg; \
3186 do { \
3187 uint64_t pg = g[(i - 1) >> 6]; \
3188 do { \
3189 i -= sizeof(TYPE); \
3190 if (likely((pg >> (i & 63)) & 1)) { \
3191 TYPE nn = *(TYPE *)(vn + H(i)); \
3192 *(TYPE *)(vd + H(i)) = OP(nn, status); \
3193 } \
3194 } while (i & 63); \
3195 } while (i != 0); \
3196 }
3197
3198 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
3199 * FZ16. When converting from fp16, this affects flushing input denormals;
3200 * when converting to fp16, this affects flushing output denormals.
3201 */
3202 static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
3203 {
3204 bool save = get_flush_inputs_to_zero(fpst);
3205 float32 ret;
3206
3207 set_flush_inputs_to_zero(false, fpst);
3208 ret = float16_to_float32(f, true, fpst);
3209 set_flush_inputs_to_zero(save, fpst);
3210 return ret;
3211 }
3212
3213 static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
3214 {
3215 bool save = get_flush_inputs_to_zero(fpst);
3216 float64 ret;
3217
3218 set_flush_inputs_to_zero(false, fpst);
3219 ret = float16_to_float64(f, true, fpst);
3220 set_flush_inputs_to_zero(save, fpst);
3221 return ret;
3222 }
3223
3224 static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
3225 {
3226 bool save = get_flush_to_zero(fpst);
3227 float16 ret;
3228
3229 set_flush_to_zero(false, fpst);
3230 ret = float32_to_float16(f, true, fpst);
3231 set_flush_to_zero(save, fpst);
3232 return ret;
3233 }
3234
3235 static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
3236 {
3237 bool save = get_flush_to_zero(fpst);
3238 float16 ret;
3239
3240 set_flush_to_zero(false, fpst);
3241 ret = float64_to_float16(f, true, fpst);
3242 set_flush_to_zero(save, fpst);
3243 return ret;
3244 }
3245
3246 static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
3247 {
3248 if (float16_is_any_nan(f)) {
3249 float_raise(float_flag_invalid, s);
3250 return 0;
3251 }
3252 return float16_to_int16_round_to_zero(f, s);
3253 }
3254
3255 static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
3256 {
3257 if (float16_is_any_nan(f)) {
3258 float_raise(float_flag_invalid, s);
3259 return 0;
3260 }
3261 return float16_to_int64_round_to_zero(f, s);
3262 }
3263
3264 static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
3265 {
3266 if (float32_is_any_nan(f)) {
3267 float_raise(float_flag_invalid, s);
3268 return 0;
3269 }
3270 return float32_to_int64_round_to_zero(f, s);
3271 }
3272
3273 static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
3274 {
3275 if (float64_is_any_nan(f)) {
3276 float_raise(float_flag_invalid, s);
3277 return 0;
3278 }
3279 return float64_to_int64_round_to_zero(f, s);
3280 }
3281
3282 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
3283 {
3284 if (float16_is_any_nan(f)) {
3285 float_raise(float_flag_invalid, s);
3286 return 0;
3287 }
3288 return float16_to_uint16_round_to_zero(f, s);
3289 }
3290
3291 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
3292 {
3293 if (float16_is_any_nan(f)) {
3294 float_raise(float_flag_invalid, s);
3295 return 0;
3296 }
3297 return float16_to_uint64_round_to_zero(f, s);
3298 }
3299
3300 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
3301 {
3302 if (float32_is_any_nan(f)) {
3303 float_raise(float_flag_invalid, s);
3304 return 0;
3305 }
3306 return float32_to_uint64_round_to_zero(f, s);
3307 }
3308
3309 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
3310 {
3311 if (float64_is_any_nan(f)) {
3312 float_raise(float_flag_invalid, s);
3313 return 0;
3314 }
3315 return float64_to_uint64_round_to_zero(f, s);
3316 }
3317
3318 DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
3319 DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
3320 DO_ZPZ_FP(sve_fcvt_dh, uint64_t, , sve_f64_to_f16)
3321 DO_ZPZ_FP(sve_fcvt_hd, uint64_t, , sve_f16_to_f64)
3322 DO_ZPZ_FP(sve_fcvt_ds, uint64_t, , float64_to_float32)
3323 DO_ZPZ_FP(sve_fcvt_sd, uint64_t, , float32_to_float64)
3324
3325 DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
3326 DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
3327 DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
3328 DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, , vfp_float16_to_int64_rtz)
3329 DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, , vfp_float32_to_int64_rtz)
3330 DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, , helper_vfp_tosizd)
3331 DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, , vfp_float64_to_int64_rtz)
3332
3333 DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
3334 DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
3335 DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
3336 DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, , vfp_float16_to_uint64_rtz)
3337 DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, , vfp_float32_to_uint64_rtz)
3338 DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, , helper_vfp_touizd)
3339 DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, , vfp_float64_to_uint64_rtz)
3340
3341 DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
3342 DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
3343 DO_ZPZ_FP(sve_frint_d, uint64_t, , helper_rintd)
3344
3345 DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
3346 DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
3347 DO_ZPZ_FP(sve_frintx_d, uint64_t, , float64_round_to_int)
3348
3349 DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
3350 DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
3351 DO_ZPZ_FP(sve_frecpx_d, uint64_t, , helper_frecpx_f64)
3352
3353 DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
3354 DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
3355 DO_ZPZ_FP(sve_fsqrt_d, uint64_t, , float64_sqrt)
3356
3357 DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
3358 DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
3359 DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
3360 DO_ZPZ_FP(sve_scvt_sd, uint64_t, , int32_to_float64)
3361 DO_ZPZ_FP(sve_scvt_dh, uint64_t, , int64_to_float16)
3362 DO_ZPZ_FP(sve_scvt_ds, uint64_t, , int64_to_float32)
3363 DO_ZPZ_FP(sve_scvt_dd, uint64_t, , int64_to_float64)
3364
3365 DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
3366 DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
3367 DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
3368 DO_ZPZ_FP(sve_ucvt_sd, uint64_t, , uint32_to_float64)
3369 DO_ZPZ_FP(sve_ucvt_dh, uint64_t, , uint64_to_float16)
3370 DO_ZPZ_FP(sve_ucvt_ds, uint64_t, , uint64_to_float32)
3371 DO_ZPZ_FP(sve_ucvt_dd, uint64_t, , uint64_to_float64)
3372
3373 #undef DO_ZPZ_FP
3374
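/* NEG1 and NEG3 below are XOR masks applied to the first and third
 * operands; passing the sign bit of the format (e.g. 0x8000 for fp16)
 * negates that operand, so a single routine per element size can
 * implement FMLA, FMLS, FNMLA and FNMLS.
 */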
3375 static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
3376 float_status *status, uint32_t desc,
3377 uint16_t neg1, uint16_t neg3)
3378 {
3379 intptr_t i = simd_oprsz(desc);
3380 uint64_t *g = vg;
3381
3382 do {
3383 uint64_t pg = g[(i - 1) >> 6];
3384 do {
3385 i -= 2;
3386 if (likely((pg >> (i & 63)) & 1)) {
3387 float16 e1, e2, e3, r;
3388
3389 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
3390 e2 = *(uint16_t *)(vm + H1_2(i));
3391 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
3392 r = float16_muladd(e1, e2, e3, 0, status);
3393 *(uint16_t *)(vd + H1_2(i)) = r;
3394 }
3395 } while (i & 63);
3396 } while (i != 0);
3397 }
3398
3399 void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3400 void *vg, void *status, uint32_t desc)
3401 {
3402 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
3403 }
3404
3405 void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3406 void *vg, void *status, uint32_t desc)
3407 {
3408 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
3409 }
3410
3411 void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3412 void *vg, void *status, uint32_t desc)
3413 {
3414 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
3415 }
3416
3417 void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3418 void *vg, void *status, uint32_t desc)
3419 {
3420 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
3421 }
3422
3423 static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
3424 float_status *status, uint32_t desc,
3425 uint32_t neg1, uint32_t neg3)
3426 {
3427 intptr_t i = simd_oprsz(desc);
3428 uint64_t *g = vg;
3429
3430 do {
3431 uint64_t pg = g[(i - 1) >> 6];
3432 do {
3433 i -= 4;
3434 if (likely((pg >> (i & 63)) & 1)) {
3435 float32 e1, e2, e3, r;
3436
3437 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
3438 e2 = *(uint32_t *)(vm + H1_4(i));
3439 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
3440 r = float32_muladd(e1, e2, e3, 0, status);
3441 *(uint32_t *)(vd + H1_4(i)) = r;
3442 }
3443 } while (i & 63);
3444 } while (i != 0);
3445 }
3446
3447 void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3448 void *vg, void *status, uint32_t desc)
3449 {
3450 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
3451 }
3452
3453 void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3454 void *vg, void *status, uint32_t desc)
3455 {
3456 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
3457 }
3458
3459 void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3460 void *vg, void *status, uint32_t desc)
3461 {
3462 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
3463 }
3464
3465 void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3466 void *vg, void *status, uint32_t desc)
3467 {
3468 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
3469 }
3470
3471 static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
3472 float_status *status, uint32_t desc,
3473 uint64_t neg1, uint64_t neg3)
3474 {
3475 intptr_t i = simd_oprsz(desc);
3476 uint64_t *g = vg;
3477
3478 do {
3479 uint64_t pg = g[(i - 1) >> 6];
3480 do {
3481 i -= 8;
3482 if (likely((pg >> (i & 63)) & 1)) {
3483 float64 e1, e2, e3, r;
3484
3485 e1 = *(uint64_t *)(vn + i) ^ neg1;
3486 e2 = *(uint64_t *)(vm + i);
3487 e3 = *(uint64_t *)(va + i) ^ neg3;
3488 r = float64_muladd(e1, e2, e3, 0, status);
3489 *(uint64_t *)(vd + i) = r;
3490 }
3491 } while (i & 63);
3492 } while (i != 0);
3493 }
3494
3495 void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3496 void *vg, void *status, uint32_t desc)
3497 {
3498 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
3499 }
3500
3501 void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3502 void *vg, void *status, uint32_t desc)
3503 {
3504 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
3505 }
3506
3507 void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3508 void *vg, void *status, uint32_t desc)
3509 {
3510 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
3511 }
3512
3513 void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3514 void *vg, void *status, uint32_t desc)
3515 {
3516 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
3517 }
3518
3519 /* Two operand floating-point comparison controlled by a predicate.
3520 * Unlike the integer version, we are not allowed to optimistically
3521 * compare operands, since the comparison may have side effects wrt
3522 * the FPSR.
3523 */
3524 #define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
3525 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
3526 void *status, uint32_t desc) \
3527 { \
3528 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
3529 uint64_t *d = vd, *g = vg; \
3530 do { \
3531 uint64_t out = 0, pg = g[j]; \
3532 do { \
3533 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3534 if (likely((pg >> (i & 63)) & 1)) { \
3535 TYPE nn = *(TYPE *)(vn + H(i)); \
3536 TYPE mm = *(TYPE *)(vm + H(i)); \
3537 out |= OP(TYPE, nn, mm, status); \
3538 } \
3539 } while (i & 63); \
3540 d[j--] = out; \
3541 } while (i > 0); \
3542 }
3543
3544 #define DO_FPCMP_PPZZ_H(NAME, OP) \
3545 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
3546 #define DO_FPCMP_PPZZ_S(NAME, OP) \
3547 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
3548 #define DO_FPCMP_PPZZ_D(NAME, OP) \
3549 DO_FPCMP_PPZZ(NAME##_d, float64, , OP)
3550
3551 #define DO_FPCMP_PPZZ_ALL(NAME, OP) \
3552 DO_FPCMP_PPZZ_H(NAME, OP) \
3553 DO_FPCMP_PPZZ_S(NAME, OP) \
3554 DO_FPCMP_PPZZ_D(NAME, OP)
3555
3556 #define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
3557 #define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
3558 #define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
3559 #define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
3560 #define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
3561 #define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
3562 #define DO_FCMUO(TYPE, X, Y, ST) \
3563 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
3564 #define DO_FACGE(TYPE, X, Y, ST) \
3565 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
3566 #define DO_FACGT(TYPE, X, Y, ST) \
3567 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
3568
3569 DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
3570 DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
3571 DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
3572 DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
3573 DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
3574 DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
3575 DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
3576
3577 #undef DO_FPCMP_PPZZ_ALL
3578 #undef DO_FPCMP_PPZZ_D
3579 #undef DO_FPCMP_PPZZ_S
3580 #undef DO_FPCMP_PPZZ_H
3581 #undef DO_FPCMP_PPZZ
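/*
 * Illustrative sketch (not part of the build): expanding, for example,
 * DO_FPCMP_PPZZ_S(sve_fcmge, DO_FCMGE) by hand yields roughly
 *
 *     void HELPER(sve_fcmge_s)(void *vd, void *vn, void *vm, void *vg,
 *                              void *status, uint32_t desc)
 *     {
 *         ...
 *         float32 nn = *(float32 *)(vn + H1_4(i));
 *         float32 mm = *(float32 *)(vm + H1_4(i));
 *         out |= float32_compare(mm, nn, status) <= 0;   // i.e. nn >= mm
 *         ...
 *     }
 *
 * Each element contributes a single 0/1 result; the "out <<= sizeof(TYPE)"
 * step places that bit at the predicate position of the element's first
 * byte, matching the SVE predicate layout.  The _quiet compares used by
 * FCMEQ/FCMNE/FCMUO do not signal on quiet NaNs, while the ordering
 * compares (FCMGE/FCMGT/FACGE/FACGT) use the signalling variant.
 */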
3582
3583 /* One operand floating-point comparison against zero, controlled
3584 * by a predicate.
3585 */
3586 #define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
3587 void HELPER(NAME)(void *vd, void *vn, void *vg, \
3588 void *status, uint32_t desc) \
3589 { \
3590 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
3591 uint64_t *d = vd, *g = vg; \
3592 do { \
3593 uint64_t out = 0, pg = g[j]; \
3594 do { \
3595 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3596 if ((pg >> (i & 63)) & 1) { \
3597 TYPE nn = *(TYPE *)(vn + H(i)); \
3598 out |= OP(TYPE, nn, 0, status); \
3599 } \
3600 } while (i & 63); \
3601 d[j--] = out; \
3602 } while (i > 0); \
3603 }
3604
3605 #define DO_FPCMP_PPZ0_H(NAME, OP) \
3606 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
3607 #define DO_FPCMP_PPZ0_S(NAME, OP) \
3608 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
3609 #define DO_FPCMP_PPZ0_D(NAME, OP) \
3610 DO_FPCMP_PPZ0(NAME##_d, float64, , OP)
3611
3612 #define DO_FPCMP_PPZ0_ALL(NAME, OP) \
3613 DO_FPCMP_PPZ0_H(NAME, OP) \
3614 DO_FPCMP_PPZ0_S(NAME, OP) \
3615 DO_FPCMP_PPZ0_D(NAME, OP)
3616
3617 DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
3618 DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
3619 DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
3620 DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
3621 DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
3622 DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
3623
3624 /* FP Trig Multiply-Add. */
3625
3626 void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3627 {
3628 static const float16 coeff[16] = {
3629 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3630 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3631 };
3632 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
3633 intptr_t x = simd_data(desc);
3634 float16 *d = vd, *n = vn, *m = vm;
3635 for (i = 0; i < opr_sz; i++) {
3636 float16 mm = m[i];
3637 intptr_t xx = x;
3638 if (float16_is_neg(mm)) {
3639 mm = float16_abs(mm);
3640 xx += 8;
3641 }
3642 d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
3643 }
3644 }
3645
3646 void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3647 {
3648 static const float32 coeff[16] = {
3649 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
3650 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
3651 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
3652 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
3653 };
3654 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
3655 intptr_t x = simd_data(desc);
3656 float32 *d = vd, *n = vn, *m = vm;
3657 for (i = 0; i < opr_sz; i++) {
3658 float32 mm = m[i];
3659 intptr_t xx = x;
3660 if (float32_is_neg(mm)) {
3661 mm = float32_abs(mm);
3662 xx += 8;
3663 }
3664 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
3665 }
3666 }
3667
3668 void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3669 {
3670 static const float64 coeff[16] = {
3671 0x3ff0000000000000ull, 0xbfc5555555555543ull,
3672 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
3673 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
3674 0x3de5d8408868552full, 0x0000000000000000ull,
3675 0x3ff0000000000000ull, 0xbfe0000000000000ull,
3676 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
3677 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
3678 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
3679 };
3680 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
3681 intptr_t x = simd_data(desc);
3682 float64 *d = vd, *n = vn, *m = vm;
3683 for (i = 0; i < opr_sz; i++) {
3684 float64 mm = m[i];
3685 intptr_t xx = x;
3686 if (float64_is_neg(mm)) {
3687 mm = float64_abs(mm);
3688 xx += 8;
3689 }
3690 d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
3691 }
3692 }
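/*
 * A note on the coefficient tables above (an observation, not taken from
 * the architecture text): the first eight entries are the series
 * coefficients for sine (1, -1/3!, 1/5!, -1/7!, ...) and the last eight
 * those for cosine (1, -1/2!, 1/4!, -1/6!, ...).  Each FTMAD step computes
 *
 *     d[i] = n[i] * |m[i]| + coeff[imm + (sign(m[i]) ? 8 : 0)]
 *
 * so a chain of FTMADs with decreasing immediates can evaluate the
 * polynomial Horner-style, the sign bit of m selecting the sine or
 * cosine half of the table.
 */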
3693
3694 /*
3695 * FP Complex Add
3696 */
3697
3698 void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
3699 void *vs, uint32_t desc)
3700 {
3701 intptr_t j, i = simd_oprsz(desc);
3702 uint64_t *g = vg;
3703 float16 neg_imag = float16_set_sign(0, simd_data(desc));
3704 float16 neg_real = float16_chs(neg_imag);
3705
3706 do {
3707 uint64_t pg = g[(i - 1) >> 6];
3708 do {
3709 float16 e0, e1, e2, e3;
3710
3711 /* I holds the real index; J holds the imag index. */
3712 j = i - sizeof(float16);
3713 i -= 2 * sizeof(float16);
3714
3715 e0 = *(float16 *)(vn + H1_2(i));
3716 e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
3717 e2 = *(float16 *)(vn + H1_2(j));
3718 e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
3719
3720 if (likely((pg >> (i & 63)) & 1)) {
3721 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
3722 }
3723 if (likely((pg >> (j & 63)) & 1)) {
3724 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
3725 }
3726 } while (i & 63);
3727 } while (i != 0);
3728 }
3729
3730 void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
3731 void *vs, uint32_t desc)
3732 {
3733 intptr_t j, i = simd_oprsz(desc);
3734 uint64_t *g = vg;
3735 float32 neg_imag = float32_set_sign(0, simd_data(desc));
3736 float32 neg_real = float32_chs(neg_imag);
3737
3738 do {
3739 uint64_t pg = g[(i - 1) >> 6];
3740 do {
3741 float32 e0, e1, e2, e3;
3742
3743 /* I holds the real index; J holds the imag index. */
3744 j = i - sizeof(float32);
3745 i -= 2 * sizeof(float32);
3746
3747 e0 = *(float32 *)(vn + H1_2(i));
3748 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
3749 e2 = *(float32 *)(vn + H1_2(j));
3750 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
3751
3752 if (likely((pg >> (i & 63)) & 1)) {
3753 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
3754 }
3755 if (likely((pg >> (j & 63)) & 1)) {
3756 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
3757 }
3758 } while (i & 63);
3759 } while (i != 0);
3760 }
3761
3762 void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
3763 void *vs, uint32_t desc)
3764 {
3765 intptr_t j, i = simd_oprsz(desc);
3766 uint64_t *g = vg;
3767 float64 neg_imag = float64_set_sign(0, simd_data(desc));
3768 float64 neg_real = float64_chs(neg_imag);
3769
3770 do {
3771 uint64_t pg = g[(i - 1) >> 6];
3772 do {
3773 float64 e0, e1, e2, e3;
3774
3775 /* I holds the real index; J holds the imag index. */
3776 j = i - sizeof(float64);
3777 i -= 2 * sizeof(float64);
3778
3779 e0 = *(float64 *)(vn + H1_2(i));
3780 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
3781 e2 = *(float64 *)(vn + H1_2(j));
3782 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
3783
3784 if (likely((pg >> (i & 63)) & 1)) {
3785 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
3786 }
3787 if (likely((pg >> (j & 63)) & 1)) {
3788 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
3789 }
3790 } while (i & 63);
3791 } while (i != 0);
3792 }
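/*
 * Worked example for the three helpers above (a sketch; assumes the
 * translator passes simd_data == 0 for FCADD #90 and 1 for FCADD #270).
 * With simd_data == 0, neg_imag is +0.0 and neg_real is -0.0, so the
 * XORs flip only the sign of the imaginary input feeding the real lane:
 *
 *     d[re] = n[re] + (-m[im])      // real part of n + i*m
 *     d[im] = n[im] +   m[re]       // imag part of n + i*m
 *
 * i.e. m is rotated by +90 degrees before the add.  With simd_data == 1
 * the two signs swap roles and the result is n plus m rotated by +270
 * degrees.
 */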
3793
3794 /*
3795 * FP Complex Multiply
3796 */
3797
3798 void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3799 void *vg, void *status, uint32_t desc)
3800 {
3801 intptr_t j, i = simd_oprsz(desc);
3802 unsigned rot = simd_data(desc);
3803 bool flip = rot & 1;
3804 float16 neg_imag, neg_real;
3805 uint64_t *g = vg;
3806
3807 neg_imag = float16_set_sign(0, (rot & 2) != 0);
3808 neg_real = float16_set_sign(0, rot == 1 || rot == 2);
3809
3810 do {
3811 uint64_t pg = g[(i - 1) >> 6];
3812 do {
3813 float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
3814
3815 /* I holds the real index; J holds the imag index. */
3816 j = i - sizeof(float16);
3817 i -= 2 * sizeof(float16);
3818
3819 nr = *(float16 *)(vn + H1_2(i));
3820 ni = *(float16 *)(vn + H1_2(j));
3821 mr = *(float16 *)(vm + H1_2(i));
3822 mi = *(float16 *)(vm + H1_2(j));
3823
3824 e2 = (flip ? ni : nr);
3825 e1 = (flip ? mi : mr) ^ neg_real;
3826 e4 = e2;
3827 e3 = (flip ? mr : mi) ^ neg_imag;
3828
3829 if (likely((pg >> (i & 63)) & 1)) {
3830 d = *(float16 *)(va + H1_2(i));
3831 d = float16_muladd(e2, e1, d, 0, status);
3832 *(float16 *)(vd + H1_2(i)) = d;
3833 }
3834 if (likely((pg >> (j & 63)) & 1)) {
3835 d = *(float16 *)(va + H1_2(j));
3836 d = float16_muladd(e4, e3, d, 0, status);
3837 *(float16 *)(vd + H1_2(j)) = d;
3838 }
3839 } while (i & 63);
3840 } while (i != 0);
3841 }
3842
3843 void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3844 void *vg, void *status, uint32_t desc)
3845 {
3846 intptr_t j, i = simd_oprsz(desc);
3847 unsigned rot = simd_data(desc);
3848 bool flip = rot & 1;
3849 float32 neg_imag, neg_real;
3850 uint64_t *g = vg;
3851
3852 neg_imag = float32_set_sign(0, (rot & 2) != 0);
3853 neg_real = float32_set_sign(0, rot == 1 || rot == 2);
3854
3855 do {
3856 uint64_t pg = g[(i - 1) >> 6];
3857 do {
3858 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
3859
3860 /* I holds the real index; J holds the imag index. */
3861 j = i - sizeof(float32);
3862 i -= 2 * sizeof(float32);
3863
3864 nr = *(float32 *)(vn + H1_2(i));
3865 ni = *(float32 *)(vn + H1_2(j));
3866 mr = *(float32 *)(vm + H1_2(i));
3867 mi = *(float32 *)(vm + H1_2(j));
3868
3869 e2 = (flip ? ni : nr);
3870 e1 = (flip ? mi : mr) ^ neg_real;
3871 e4 = e2;
3872 e3 = (flip ? mr : mi) ^ neg_imag;
3873
3874 if (likely((pg >> (i & 63)) & 1)) {
3875 d = *(float32 *)(va + H1_2(i));
3876 d = float32_muladd(e2, e1, d, 0, status);
3877 *(float32 *)(vd + H1_2(i)) = d;
3878 }
3879 if (likely((pg >> (j & 63)) & 1)) {
3880 d = *(float32 *)(va + H1_2(j));
3881 d = float32_muladd(e4, e3, d, 0, status);
3882 *(float32 *)(vd + H1_2(j)) = d;
3883 }
3884 } while (i & 63);
3885 } while (i != 0);
3886 }
3887
3888 void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3889 void *vg, void *status, uint32_t desc)
3890 {
3891 intptr_t j, i = simd_oprsz(desc);
3892 unsigned rot = simd_data(desc);
3893 bool flip = rot & 1;
3894 float64 neg_imag, neg_real;
3895 uint64_t *g = vg;
3896
3897 neg_imag = float64_set_sign(0, (rot & 2) != 0);
3898 neg_real = float64_set_sign(0, rot == 1 || rot == 2);
3899
3900 do {
3901 uint64_t pg = g[(i - 1) >> 6];
3902 do {
3903 float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
3904
3905 /* I holds the real index; J holds the imag index. */
3906 j = i - sizeof(float64);
3907 i -= 2 * sizeof(float64);
3908
3909 nr = *(float64 *)(vn + H1_2(i));
3910 ni = *(float64 *)(vn + H1_2(j));
3911 mr = *(float64 *)(vm + H1_2(i));
3912 mi = *(float64 *)(vm + H1_2(j));
3913
3914 e2 = (flip ? ni : nr);
3915 e1 = (flip ? mi : mr) ^ neg_real;
3916 e4 = e2;
3917 e3 = (flip ? mr : mi) ^ neg_imag;
3918
3919 if (likely((pg >> (i & 63)) & 1)) {
3920 d = *(float64 *)(va + H1_2(i));
3921 d = float64_muladd(e2, e1, d, 0, status);
3922 *(float64 *)(vd + H1_2(i)) = d;
3923 }
3924 if (likely((pg >> (j & 63)) & 1)) {
3925 d = *(float64 *)(va + H1_2(j));
3926 d = float64_muladd(e4, e3, d, 0, status);
3927 *(float64 *)(vd + H1_2(j)) = d;
3928 }
3929 } while (i & 63);
3930 } while (i != 0);
3931 }
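/*
 * Decoding of the rot field used above (a sketch; assumes rot encodes
 * the FCMLA rotation in multiples of 90 degrees, rot = 0..3 for
 * #0/#90/#180/#270).  The (flip, neg_real, neg_imag) triple then selects
 * the operands and signs of the two fused multiply-adds:
 *
 *     rot  flip  neg_real  neg_imag    d[re] +=        d[im] +=
 *      0     0      0         0         n[re]*m[re]     n[re]*m[im]
 *      1     1      1         0        -n[im]*m[im]     n[im]*m[re]
 *      2     0      1         1        -n[re]*m[re]    -n[re]*m[im]
 *      3     1      0         1         n[im]*m[im]    -n[im]*m[re]
 *
 * which are the four partial products of a complex multiply-accumulate.
 */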
3932
3933 /*
3934 * Load contiguous data, protected by a governing predicate.
3935 */
3936
3937 /*
3938 * Load one element into @vd + @reg_off from @host.
3939 * The controlling predicate is known to be true.
3940 */
3941 typedef void sve_ldst1_host_fn(void *vd, intptr_t reg_off, void *host);
3942
3943 /*
3944 * Load one element into @vd + @reg_off from (@env, @vaddr, @ra).
3945 * The controlling predicate is known to be true.
3946 */
3947 typedef void sve_ldst1_tlb_fn(CPUARMState *env, void *vd, intptr_t reg_off,
3948 target_ulong vaddr, uintptr_t retaddr);
3949
3950 /*
3951 * Generate the above primitives.
3952 */
3953
3954 #define DO_LD_HOST(NAME, H, TYPEE, TYPEM, HOST) \
3955 static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
3956 { \
3957 TYPEM val = HOST(host); \
3958 *(TYPEE *)(vd + H(reg_off)) = val; \
3959 }
3960
3961 #define DO_ST_HOST(NAME, H, TYPEE, TYPEM, HOST) \
3962 static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
3963 { HOST(host, (TYPEM)*(TYPEE *)(vd + H(reg_off))); }
3964
3965 #define DO_LD_TLB(NAME, H, TYPEE, TYPEM, TLB) \
3966 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
3967 target_ulong addr, uintptr_t ra) \
3968 { \
3969 *(TYPEE *)(vd + H(reg_off)) = (TYPEM)TLB(env, addr, ra); \
3970 }
3971
3972 #define DO_ST_TLB(NAME, H, TYPEE, TYPEM, TLB) \
3973 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
3974 target_ulong addr, uintptr_t ra) \
3975 { \
3976 TLB(env, addr, (TYPEM)*(TYPEE *)(vd + H(reg_off)), ra); \
3977 }
3978
3979 #define DO_LD_PRIM_1(NAME, H, TE, TM) \
3980 DO_LD_HOST(NAME, H, TE, TM, ldub_p) \
3981 DO_LD_TLB(NAME, H, TE, TM, cpu_ldub_data_ra)
3982
3983 DO_LD_PRIM_1(ld1bb, H1, uint8_t, uint8_t)
3984 DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t)
3985 DO_LD_PRIM_1(ld1bhs, H1_2, uint16_t, int8_t)
3986 DO_LD_PRIM_1(ld1bsu, H1_4, uint32_t, uint8_t)
3987 DO_LD_PRIM_1(ld1bss, H1_4, uint32_t, int8_t)
3988 DO_LD_PRIM_1(ld1bdu, , uint64_t, uint8_t)
3989 DO_LD_PRIM_1(ld1bds, , uint64_t, int8_t)
3990
3991 #define DO_ST_PRIM_1(NAME, H, TE, TM) \
3992 DO_ST_HOST(st1##NAME, H, TE, TM, stb_p) \
3993 DO_ST_TLB(st1##NAME, H, TE, TM, cpu_stb_data_ra)
3994
3995 DO_ST_PRIM_1(bb, H1, uint8_t, uint8_t)
3996 DO_ST_PRIM_1(bh, H1_2, uint16_t, uint8_t)
3997 DO_ST_PRIM_1(bs, H1_4, uint32_t, uint8_t)
3998 DO_ST_PRIM_1(bd, , uint64_t, uint8_t)
3999
4000 #define DO_LD_PRIM_2(NAME, H, TE, TM, LD) \
4001 DO_LD_HOST(ld1##NAME##_be, H, TE, TM, LD##_be_p) \
4002 DO_LD_HOST(ld1##NAME##_le, H, TE, TM, LD##_le_p) \
4003 DO_LD_TLB(ld1##NAME##_be, H, TE, TM, cpu_##LD##_be_data_ra) \
4004 DO_LD_TLB(ld1##NAME##_le, H, TE, TM, cpu_##LD##_le_data_ra)
4005
4006 #define DO_ST_PRIM_2(NAME, H, TE, TM, ST) \
4007 DO_ST_HOST(st1##NAME##_be, H, TE, TM, ST##_be_p) \
4008 DO_ST_HOST(st1##NAME##_le, H, TE, TM, ST##_le_p) \
4009 DO_ST_TLB(st1##NAME##_be, H, TE, TM, cpu_##ST##_be_data_ra) \
4010 DO_ST_TLB(st1##NAME##_le, H, TE, TM, cpu_##ST##_le_data_ra)
4011
4012 DO_LD_PRIM_2(hh, H1_2, uint16_t, uint16_t, lduw)
4013 DO_LD_PRIM_2(hsu, H1_4, uint32_t, uint16_t, lduw)
4014 DO_LD_PRIM_2(hss, H1_4, uint32_t, int16_t, lduw)
4015 DO_LD_PRIM_2(hdu, , uint64_t, uint16_t, lduw)
4016 DO_LD_PRIM_2(hds, , uint64_t, int16_t, lduw)
4017
4018 DO_ST_PRIM_2(hh, H1_2, uint16_t, uint16_t, stw)
4019 DO_ST_PRIM_2(hs, H1_4, uint32_t, uint16_t, stw)
4020 DO_ST_PRIM_2(hd, , uint64_t, uint16_t, stw)
4021
4022 DO_LD_PRIM_2(ss, H1_4, uint32_t, uint32_t, ldl)
4023 DO_LD_PRIM_2(sdu, , uint64_t, uint32_t, ldl)
4024 DO_LD_PRIM_2(sds, , uint64_t, int32_t, ldl)
4025
4026 DO_ST_PRIM_2(ss, H1_4, uint32_t, uint32_t, stl)
4027 DO_ST_PRIM_2(sd, , uint64_t, uint32_t, stl)
4028
4029 DO_LD_PRIM_2(dd, , uint64_t, uint64_t, ldq)
4030 DO_ST_PRIM_2(dd, , uint64_t, uint64_t, stq)
4031
4032 #undef DO_LD_TLB
4033 #undef DO_ST_TLB
4034 #undef DO_LD_HOST
4035 #undef DO_LD_PRIM_1
4036 #undef DO_ST_PRIM_1
4037 #undef DO_LD_PRIM_2
4038 #undef DO_ST_PRIM_2
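/*
 * A note on the primitive naming above (an observation, nothing new is
 * defined here): the suffix spells out memory size, element size and
 * extension, e.g. "ld1bdu" is a one-byte load zero-extended into a
 * 64-bit (d) element, and "ld1sds" a 32-bit load sign-extended into a
 * 64-bit element.  Expanding one of them by hand,
 * DO_LD_PRIM_1(ld1bdu, , uint64_t, uint8_t) produces roughly
 *
 *     static void sve_ld1bdu_host(void *vd, intptr_t reg_off, void *host)
 *     {
 *         *(uint64_t *)(vd + reg_off) = ldub_p(host);
 *     }
 *     static void sve_ld1bdu_tlb(CPUARMState *env, void *vd,
 *                                intptr_t reg_off, target_ulong addr,
 *                                uintptr_t ra)
 *     {
 *         *(uint64_t *)(vd + reg_off) = cpu_ldub_data_ra(env, addr, ra);
 *     }
 */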
4039
4040 /*
4041 * Skip through a sequence of inactive elements in the guarding predicate @vg,
4042 * beginning at @reg_off bounded by @reg_max. Return the offset of the first
4043 * active element >= @reg_off, or @reg_max if there are no active elements at all.
4044 */
4045 static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
4046 intptr_t reg_max, int esz)
4047 {
4048 uint64_t pg_mask = pred_esz_masks[esz];
4049 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
4050
4051 /* In normal usage, the first element is active. */
4052 if (likely(pg & 1)) {
4053 return reg_off;
4054 }
4055
4056 if (pg == 0) {
4057 reg_off &= -64;
4058 do {
4059 reg_off += 64;
4060 if (unlikely(reg_off >= reg_max)) {
4061 /* The entire predicate was false. */
4062 return reg_max;
4063 }
4064 pg = vg[reg_off >> 6] & pg_mask;
4065 } while (pg == 0);
4066 }
4067 reg_off += ctz64(pg);
4068
4069 /* We should never see an out of range predicate bit set. */
4070 tcg_debug_assert(reg_off < reg_max);
4071 return reg_off;
4072 }
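/*
 * Worked example (a sketch, assuming the usual pred_esz_masks values of
 * 0xffff..., 0x5555..., 0x1111... and 0x0101... for esz 0..3): with
 * esz = 2 (32-bit elements) only every fourth predicate bit is
 * significant.  If vg[0] == 0x1000 (element 3 is the first active one)
 * and reg_off == 0, then pg == 0x1000, the pg & 1 fast path fails, and
 * reg_off + ctz64(pg) returns byte offset 12 -- the start of element 3.
 */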
4073
4074 /*
4075 * Resolve the guest virtual address to info->host and info->flags.
4076 * If @nofault, return false if the page is invalid, otherwise
4077 * exit via page fault exception.
4078 */
4079
4080 typedef struct {
4081 void *host;
4082 int flags;
4083 MemTxAttrs attrs;
4084 } SVEHostPage;
4085
4086 static bool sve_probe_page(SVEHostPage *info, bool nofault,
4087 CPUARMState *env, target_ulong addr,
4088 int mem_off, MMUAccessType access_type,
4089 int mmu_idx, uintptr_t retaddr)
4090 {
4091 int flags;
4092
4093 addr += mem_off;
4094 flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault,
4095 &info->host, retaddr);
4096 info->flags = flags;
4097
4098 if (flags & TLB_INVALID_MASK) {
4099 g_assert(nofault);
4100 return false;
4101 }
4102
4103 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
4104 info->host -= mem_off;
4105
4106 #ifdef CONFIG_USER_ONLY
4107 memset(&info->attrs, 0, sizeof(info->attrs));
4108 #else
4109 /*
4110 * Find the iotlbentry for addr and return the transaction attributes.
4111 * This *must* be present in the TLB because we just found the mapping.
4112 */
4113 {
4114 uintptr_t index = tlb_index(env, mmu_idx, addr);
4115
4116 # ifdef CONFIG_DEBUG_TCG
4117 CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
4118 target_ulong comparator = (access_type == MMU_DATA_LOAD
4119 ? entry->addr_read
4120 : tlb_addr_write(entry));
4121 g_assert(tlb_hit(comparator, addr));
4122 # endif
4123
4124 CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
4125 info->attrs = iotlbentry->attrs;
4126 }
4127 #endif
4128
4129 return true;
4130 }
4131
4132
4133 /*
4134 * Analyse contiguous data, protected by a governing predicate.
4135 */
4136
4137 typedef enum {
4138 FAULT_NO,
4139 FAULT_FIRST,
4140 FAULT_ALL,
4141 } SVEContFault;
4142
4143 typedef struct {
4144 /*
4145 * First and last element wholly contained within the two pages.
4146 * mem_off_first[0] and reg_off_first[0] are always set >= 0.
4147 * reg_off_last[0] may be < 0 if the first element crosses pages.
4148 * All of mem_off_first[1], reg_off_first[1] and reg_off_last[1]
4149 * are set >= 0 only if there are complete elements on a second page.
4150 *
4151 * The reg_off_* offsets are relative to the internal vector register.
4152 * The mem_off_first offset is relative to the memory address; the
4153 * two offsets are different when a load operation extends, a store
4154 * operation truncates, or for multi-register operations.
4155 */
4156 int16_t mem_off_first[2];
4157 int16_t reg_off_first[2];
4158 int16_t reg_off_last[2];
4159
4160 /*
4161 * One element that is misaligned and spans both pages,
4162 * or -1 if there is no such active element.
4163 */
4164 int16_t mem_off_split;
4165 int16_t reg_off_split;
4166
4167 /*
4168 * The byte offset at which the entire operation crosses a page boundary.
4169 * Set >= 0 if and only if the entire operation spans two pages.
4170 */
4171 int16_t page_split;
4172
4173 /* TLB data for the two pages. */
4174 SVEHostPage page[2];
4175 } SVEContLdSt;
4176
4177 /*
4178 * Find first active element on each page, and a loose bound for the
4179 * final element on each page. Identify any single element that spans
4180 * the page boundary. Return true if there are any active elements.
4181 */
4182 static bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr,
4183 uint64_t *vg, intptr_t reg_max,
4184 int esz, int msize)
4185 {
4186 const int esize = 1 << esz;
4187 const uint64_t pg_mask = pred_esz_masks[esz];
4188 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
4189 intptr_t mem_off_last, mem_off_split;
4190 intptr_t page_split, elt_split;
4191 intptr_t i;
4192
4193 /* Set all of the element indices to -1, and the TLB data to 0. */
4194 memset(info, -1, offsetof(SVEContLdSt, page));
4195 memset(info->page, 0, sizeof(info->page));
4196
4197 /* Gross scan over the entire predicate to find bounds. */
4198 i = 0;
4199 do {
4200 uint64_t pg = vg[i] & pg_mask;
4201 if (pg) {
4202 reg_off_last = i * 64 + 63 - clz64(pg);
4203 if (reg_off_first < 0) {
4204 reg_off_first = i * 64 + ctz64(pg);
4205 }
4206 }
4207 } while (++i * 64 < reg_max);
4208
4209 if (unlikely(reg_off_first < 0)) {
4210 /* No active elements, no pages touched. */
4211 return false;
4212 }
4213 tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
4214
4215 info->reg_off_first[0] = reg_off_first;
4216 info->mem_off_first[0] = (reg_off_first >> esz) * msize;
4217 mem_off_last = (reg_off_last >> esz) * msize;
4218
4219 page_split = -(addr | TARGET_PAGE_MASK);
4220 if (likely(mem_off_last + msize <= page_split)) {
4221 /* The entire operation fits within a single page. */
4222 info->reg_off_last[0] = reg_off_last;
4223 return true;
4224 }
4225
4226 info->page_split = page_split;
4227 elt_split = page_split / msize;
4228 reg_off_split = elt_split << esz;
4229 mem_off_split = elt_split * msize;
4230
4231 /*
4232 * This is the last full element on the first page, but it is not
4233 * necessarily active. If there is no full element, i.e. the first
4234 * active element is the one that's split, this value remains -1.
4235 * This value is useful as an iteration bound.
4236 */
4237 if (elt_split != 0) {
4238 info->reg_off_last[0] = reg_off_split - esize;
4239 }
4240
4241 /* Determine if an unaligned element spans the pages. */
4242 if (page_split % msize != 0) {
4243 /* It is helpful to know if the split element is active. */
4244 if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
4245 info->reg_off_split = reg_off_split;
4246 info->mem_off_split = mem_off_split;
4247
4248 if (reg_off_split == reg_off_last) {
4249 /* The page crossing element is last. */
4250 return true;
4251 }
4252 }
4253 reg_off_split += esize;
4254 mem_off_split += msize;
4255 }
4256
4257 /*
4258 * We do want the first active element on the second page, because
4259 * this may affect the address reported in an exception.
4260 */
4261 reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
4262 tcg_debug_assert(reg_off_split <= reg_off_last);
4263 info->reg_off_first[1] = reg_off_split;
4264 info->mem_off_first[1] = (reg_off_split >> esz) * msize;
4265 info->reg_off_last[1] = reg_off_last;
4266 return true;
4267 }
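/*
 * Worked example for the bookkeeping above (an illustration only):
 * consider a 64-byte vector of doublewords (esz = 3, msize = 8) with
 * every element active, and a base address that leaves only 20 bytes
 * before the next page.  Then:
 *
 *     page_split       = 20
 *     elt_split        = 2    -> reg_off_last[0] = 8  (last whole element)
 *     reg_off_split    = 16, mem_off_split = 16       (element 2 straddles)
 *     reg_off_first[1] = 24, mem_off_first[1] = 24
 *     reg_off_last[1]  = 56                           (the final element)
 *
 * If the base address were instead 8-byte aligned, page_split % msize
 * would be zero, no element would straddle the boundary, and the *_split
 * fields would keep their initial value of -1.
 */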
4268
4269 /*
4270 * Resolve the guest virtual addresses to info->page[].
4271 * Control the generation of page faults with @fault. Return false if
4272 * there is no work to do, which can only happen with @fault == FAULT_NO.
4273 */
4274 static bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
4275 CPUARMState *env, target_ulong addr,
4276 MMUAccessType access_type, uintptr_t retaddr)
4277 {
4278 int mmu_idx = cpu_mmu_index(env, false);
4279 int mem_off = info->mem_off_first[0];
4280 bool nofault = fault == FAULT_NO;
4281 bool have_work = true;
4282
4283 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
4284 access_type, mmu_idx, retaddr)) {
4285 /* No work to be done. */
4286 return false;
4287 }
4288
4289 if (likely(info->page_split < 0)) {
4290 /* The entire operation was on the one page. */
4291 return true;
4292 }
4293
4294 /*
4295 * If the second page is invalid, then we want the fault address to be
4296 * the first byte on that page which is accessed.
4297 */
4298 if (info->mem_off_split >= 0) {
4299 /*
4300 * There is an element split across the pages. The fault address
4301 * should be the first byte of the second page.
4302 */
4303 mem_off = info->page_split;
4304 /*
4305 * If the split element is also the first active element
4306 * of the vector, then: For first-fault we should continue
4307 * to generate faults for the second page. For no-fault,
4308 * we have work only if the second page is valid.
4309 */
4310 if (info->mem_off_first[0] < info->mem_off_split) {
4311 nofault = FAULT_FIRST;
4312 have_work = false;
4313 }
4314 } else {
4315 /*
4316 * There is no element split across the pages. The fault address
4317 * should be the first active element on the second page.
4318 */
4319 mem_off = info->mem_off_first[1];
4320 /*
4321 * There must have been one active element on the first page,
4322 * so we're out of first-fault territory.
4323 */
4324 nofault = fault != FAULT_ALL;
4325 }
4326
4327 have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
4328 access_type, mmu_idx, retaddr);
4329 return have_work;
4330 }
4331
4332 static void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
4333 uint64_t *vg, target_ulong addr,
4334 int esize, int msize, int wp_access,
4335 uintptr_t retaddr)
4336 {
4337 #ifndef CONFIG_USER_ONLY
4338 intptr_t mem_off, reg_off, reg_last;
4339 int flags0 = info->page[0].flags;
4340 int flags1 = info->page[1].flags;
4341
4342 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
4343 return;
4344 }
4345
4346 /* Indicate that watchpoints are handled. */
4347 info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
4348 info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
4349
4350 if (flags0 & TLB_WATCHPOINT) {
4351 mem_off = info->mem_off_first[0];
4352 reg_off = info->reg_off_first[0];
4353 reg_last = info->reg_off_last[0];
4354
4355 while (reg_off <= reg_last) {
4356 uint64_t pg = vg[reg_off >> 6];
4357 do {
4358 if ((pg >> (reg_off & 63)) & 1) {
4359 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
4360 msize, info->page[0].attrs,
4361 wp_access, retaddr);
4362 }
4363 reg_off += esize;
4364 mem_off += msize;
4365 } while (reg_off <= reg_last && (reg_off & 63));
4366 }
4367 }
4368
4369 mem_off = info->mem_off_split;
4370 if (mem_off >= 0) {
4371 cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
4372 info->page[0].attrs, wp_access, retaddr);
4373 }
4374
4375 mem_off = info->mem_off_first[1];
4376 if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
4377 reg_off = info->reg_off_first[1];
4378 reg_last = info->reg_off_last[1];
4379
4380 do {
4381 uint64_t pg = vg[reg_off >> 6];
4382 do {
4383 if ((pg >> (reg_off & 63)) & 1) {
4384 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
4385 msize, info->page[1].attrs,
4386 wp_access, retaddr);
4387 }
4388 reg_off += esize;
4389 mem_off += msize;
4390 } while (reg_off & 63);
4391 } while (reg_off <= reg_last);
4392 }
4393 #endif
4394 }
4395
4396 typedef uint64_t mte_check_fn(CPUARMState *, uint32_t, uint64_t, uintptr_t);
4397
4398 static inline QEMU_ALWAYS_INLINE
4399 void sve_cont_ldst_mte_check_int(SVEContLdSt *info, CPUARMState *env,
4400 uint64_t *vg, target_ulong addr, int esize,
4401 int msize, uint32_t mtedesc, uintptr_t ra,
4402 mte_check_fn *check)
4403 {
4404 intptr_t mem_off, reg_off, reg_last;
4405
4406 /* Process the page only if MemAttr == Tagged. */
4407 if (arm_tlb_mte_tagged(&info->page[0].attrs)) {
4408 mem_off = info->mem_off_first[0];
4409 reg_off = info->reg_off_first[0];
4410 reg_last = info->reg_off_split;
4411 if (reg_last < 0) {
4412 reg_last = info->reg_off_last[0];
4413 }
4414
4415 do {
4416 uint64_t pg = vg[reg_off >> 6];
4417 do {
4418 if ((pg >> (reg_off & 63)) & 1) {
4419 check(env, mtedesc, addr, ra);
4420 }
4421 reg_off += esize;
4422 mem_off += msize;
4423 } while (reg_off <= reg_last && (reg_off & 63));
4424 } while (reg_off <= reg_last);
4425 }
4426
4427 mem_off = info->mem_off_first[1];
4428 if (mem_off >= 0 && arm_tlb_mte_tagged(&info->page[1].attrs)) {
4429 reg_off = info->reg_off_first[1];
4430 reg_last = info->reg_off_last[1];
4431
4432 do {
4433 uint64_t pg = vg[reg_off >> 6];
4434 do {
4435 if ((pg >> (reg_off & 63)) & 1) {
4436 check(env, mtedesc, addr, ra);
4437 }
4438 reg_off += esize;
4439 mem_off += msize;
4440 } while (reg_off & 63);
4441 } while (reg_off <= reg_last);
4442 }
4443 }
4444
4445 typedef void sve_cont_ldst_mte_check_fn(SVEContLdSt *info, CPUARMState *env,
4446 uint64_t *vg, target_ulong addr,
4447 int esize, int msize, uint32_t mtedesc,
4448 uintptr_t ra);
4449
4450 static void sve_cont_ldst_mte_check1(SVEContLdSt *info, CPUARMState *env,
4451 uint64_t *vg, target_ulong addr,
4452 int esize, int msize, uint32_t mtedesc,
4453 uintptr_t ra)
4454 {
4455 sve_cont_ldst_mte_check_int(info, env, vg, addr, esize, msize,
4456 mtedesc, ra, mte_check1);
4457 }
4458
4459 static void sve_cont_ldst_mte_checkN(SVEContLdSt *info, CPUARMState *env,
4460 uint64_t *vg, target_ulong addr,
4461 int esize, int msize, uint32_t mtedesc,
4462 uintptr_t ra)
4463 {
4464 sve_cont_ldst_mte_check_int(info, env, vg, addr, esize, msize,
4465 mtedesc, ra, mte_checkN);
4466 }
4467
4468
4469 /*
4470 * Common helper for all contiguous 1,2,3,4-register predicated loads.
4471 */
4472 static inline QEMU_ALWAYS_INLINE
4473 void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
4474 uint32_t desc, const uintptr_t retaddr,
4475 const int esz, const int msz, const int N, uint32_t mtedesc,
4476 sve_ldst1_host_fn *host_fn,
4477 sve_ldst1_tlb_fn *tlb_fn,
4478 sve_cont_ldst_mte_check_fn *mte_check_fn)
4479 {
4480 const unsigned rd = simd_data(desc);
4481 const intptr_t reg_max = simd_oprsz(desc);
4482 intptr_t reg_off, reg_last, mem_off;
4483 SVEContLdSt info;
4484 void *host;
4485 int flags, i;
4486
4487 /* Find the active elements. */
4488 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
4489 /* The entire predicate was false; no load occurs. */
4490 for (i = 0; i < N; ++i) {
4491 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
4492 }
4493 return;
4494 }
4495
4496 /* Probe the page(s). Exit with exception for any invalid page. */
4497 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
4498
4499 /* Handle watchpoints for all active elements. */
4500 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
4501 BP_MEM_READ, retaddr);
4502
4503 /*
4504 * Handle mte checks for all active elements.
4505 * Since TBI must be set for MTE, !mtedesc => !mte_active.
4506 */
4507 if (mte_check_fn && mtedesc) {
4508 mte_check_fn(&info, env, vg, addr, 1 << esz, N << msz,
4509 mtedesc, retaddr);
4510 }
4511
4512 flags = info.page[0].flags | info.page[1].flags;
4513 if (unlikely(flags != 0)) {
4514 #ifdef CONFIG_USER_ONLY
4515 g_assert_not_reached();
4516 #else
4517 /*
4518 * At least one page includes MMIO.
4519 * Any bus operation can fail with cpu_transaction_failed,
4520 * which for ARM will raise SyncExternal. Perform the load
4521 * into scratch memory to preserve register state until the end.
4522 */
4523 ARMVectorReg scratch[4] = { };
4524
4525 mem_off = info.mem_off_first[0];
4526 reg_off = info.reg_off_first[0];
4527 reg_last = info.reg_off_last[1];
4528 if (reg_last < 0) {
4529 reg_last = info.reg_off_split;
4530 if (reg_last < 0) {
4531 reg_last = info.reg_off_last[0];
4532 }
4533 }
4534
4535 do {
4536 uint64_t pg = vg[reg_off >> 6];
4537 do {
4538 if ((pg >> (reg_off & 63)) & 1) {
4539 for (i = 0; i < N; ++i) {
4540 tlb_fn(env, &scratch[i], reg_off,
4541 addr + mem_off + (i << msz), retaddr);
4542 }
4543 }
4544 reg_off += 1 << esz;
4545 mem_off += N << msz;
4546 } while (reg_off & 63);
4547 } while (reg_off <= reg_last);
4548
4549 for (i = 0; i < N; ++i) {
4550 memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
4551 }
4552 return;
4553 #endif
4554 }
4555
4556 /* The entire operation is in RAM, on valid pages. */
4557
4558 for (i = 0; i < N; ++i) {
4559 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
4560 }
4561
4562 mem_off = info.mem_off_first[0];
4563 reg_off = info.reg_off_first[0];
4564 reg_last = info.reg_off_last[0];
4565 host = info.page[0].host;
4566
4567 while (reg_off <= reg_last) {
4568 uint64_t pg = vg[reg_off >> 6];
4569 do {
4570 if ((pg >> (reg_off & 63)) & 1) {
4571 for (i = 0; i < N; ++i) {
4572 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
4573 host + mem_off + (i << msz));
4574 }
4575 }
4576 reg_off += 1 << esz;
4577 mem_off += N << msz;
4578 } while (reg_off <= reg_last && (reg_off & 63));
4579 }
4580
4581 /*
4582 * Use the slow path to manage the cross-page misalignment.
4583 * But we know this is RAM and cannot trap.
4584 */
4585 mem_off = info.mem_off_split;
4586 if (unlikely(mem_off >= 0)) {
4587 reg_off = info.reg_off_split;
4588 for (i = 0; i < N; ++i) {
4589 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
4590 addr + mem_off + (i << msz), retaddr);
4591 }
4592 }
4593
4594 mem_off = info.mem_off_first[1];
4595 if (unlikely(mem_off >= 0)) {
4596 reg_off = info.reg_off_first[1];
4597 reg_last = info.reg_off_last[1];
4598 host = info.page[1].host;
4599
4600 do {
4601 uint64_t pg = vg[reg_off >> 6];
4602 do {
4603 if ((pg >> (reg_off & 63)) & 1) {
4604 for (i = 0; i < N; ++i) {
4605 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
4606 host + mem_off + (i << msz));
4607 }
4608 }
4609 reg_off += 1 << esz;
4610 mem_off += N << msz;
4611 } while (reg_off & 63);
4612 } while (reg_off <= reg_last);
4613 }
4614 }
4615
4616 static inline QEMU_ALWAYS_INLINE
4617 void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
4618 uint32_t desc, const uintptr_t ra,
4619 const int esz, const int msz, const int N,
4620 sve_ldst1_host_fn *host_fn,
4621 sve_ldst1_tlb_fn *tlb_fn)
4622 {
4623 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
4624 int bit55 = extract64(addr, 55, 1);
4625
4626 /* Remove mtedesc from the normal sve descriptor. */
4627 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
4628
4629 /* Perform gross MTE suppression early. */
4630 if (!tbi_check(desc, bit55) ||
4631 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
4632 mtedesc = 0;
4633 }
4634
4635 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn,
4636 N == 1 ? sve_cont_ldst_mte_check1 : sve_cont_ldst_mte_checkN);
4637 }
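/*
 * Packing assumed by the split above (a sketch of the translator side,
 * not a definition): the caller is expected to build the descriptor as
 *
 *     desc = normal_sve_desc
 *          | (mtedesc << (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT));
 *
 * so extract32() recovers the ordinary SVE descriptor and the shift
 * recovers mtedesc.  When TBI is disabled or the TCMA check matches,
 * mtedesc is forced to zero, and the "mte_check_fn && mtedesc" test in
 * sve_ldN_r then skips the per-element tag checks entirely.
 */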
4638
4639 #define DO_LD1_1(NAME, ESZ) \
4640 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \
4641 target_ulong addr, uint32_t desc) \
4642 { \
4643 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \
4644 sve_##NAME##_host, sve_##NAME##_tlb, NULL); \
4645 } \
4646 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \
4647 target_ulong addr, uint32_t desc) \
4648 { \
4649 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \
4650 sve_##NAME##_host, sve_##NAME##_tlb); \
4651 }
4652
4653 #define DO_LD1_2(NAME, ESZ, MSZ) \
4654 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \
4655 target_ulong addr, uint32_t desc) \
4656 { \
4657 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
4658 sve_##NAME##_le_host, sve_##NAME##_le_tlb, NULL); \
4659 } \
4660 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \
4661 target_ulong addr, uint32_t desc) \
4662 { \
4663 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
4664 sve_##NAME##_be_host, sve_##NAME##_be_tlb, NULL); \
4665 } \
4666 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
4667 target_ulong addr, uint32_t desc) \
4668 { \
4669 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
4670 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
4671 } \
4672 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
4673 target_ulong addr, uint32_t desc) \
4674 { \
4675 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
4676 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
4677 }
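/*
 * For reference (an observation about the expansion; nothing new is
 * defined here): one instantiation below, e.g. DO_LD1_2(ld1hh, MO_16,
 * MO_16), emits four entry points -- sve_ld1hh_le_r, sve_ld1hh_be_r and
 * their _mte variants -- each a thin wrapper such as
 *
 *     void HELPER(sve_ld1hh_le_r)(CPUARMState *env, void *vg,
 *                                 target_ulong addr, uint32_t desc)
 *     {
 *         sve_ldN_r(env, vg, addr, desc, GETPC(), MO_16, MO_16, 1, 0,
 *                   sve_ld1hh_le_host, sve_ld1hh_le_tlb, NULL);
 *     }
 *
 * wired to the sve_ld1hh_{le,be}_{host,tlb} primitives generated earlier
 * by DO_LD_PRIM_2(hh, ...).
 */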
4678
4679 DO_LD1_1(ld1bb, MO_8)
4680 DO_LD1_1(ld1bhu, MO_16)
4681 DO_LD1_1(ld1bhs, MO_16)
4682 DO_LD1_1(ld1bsu, MO_32)
4683 DO_LD1_1(ld1bss, MO_32)
4684 DO_LD1_1(ld1bdu, MO_64)
4685 DO_LD1_1(ld1bds, MO_64)
4686
4687 DO_LD1_2(ld1hh, MO_16, MO_16)
4688 DO_LD1_2(ld1hsu, MO_32, MO_16)
4689 DO_LD1_2(ld1hss, MO_32, MO_16)
4690 DO_LD1_2(ld1hdu, MO_64, MO_16)
4691 DO_LD1_2(ld1hds, MO_64, MO_16)
4692
4693 DO_LD1_2(ld1ss, MO_32, MO_32)
4694 DO_LD1_2(ld1sdu, MO_64, MO_32)
4695 DO_LD1_2(ld1sds, MO_64, MO_32)
4696
4697 DO_LD1_2(ld1dd, MO_64, MO_64)
4698
4699 #undef DO_LD1_1
4700 #undef DO_LD1_2
4701
4702 #define DO_LDN_1(N) \
4703 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \
4704 target_ulong addr, uint32_t desc) \
4705 { \
4706 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \
4707 sve_ld1bb_host, sve_ld1bb_tlb, NULL); \
4708 } \
4709 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \
4710 target_ulong addr, uint32_t desc) \
4711 { \
4712 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \
4713 sve_ld1bb_host, sve_ld1bb_tlb); \
4714 }
4715
4716 #define DO_LDN_2(N, SUFF, ESZ) \
4717 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \
4718 target_ulong addr, uint32_t desc) \
4719 { \
4720 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
4721 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb, NULL); \
4722 } \
4723 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \
4724 target_ulong addr, uint32_t desc) \
4725 { \
4726 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
4727 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb, NULL); \
4728 } \
4729 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \
4730 target_ulong addr, uint32_t desc) \
4731 { \
4732 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
4733 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
4734 } \
4735 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \
4736 target_ulong addr, uint32_t desc) \
4737 { \
4738 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
4739 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
4740 }
4741
4742 DO_LDN_1(2)
4743 DO_LDN_1(3)
4744 DO_LDN_1(4)
4745
4746 DO_LDN_2(2, hh, MO_16)
4747 DO_LDN_2(3, hh, MO_16)
4748 DO_LDN_2(4, hh, MO_16)
4749
4750 DO_LDN_2(2, ss, MO_32)
4751 DO_LDN_2(3, ss, MO_32)
4752 DO_LDN_2(4, ss, MO_32)
4753
4754 DO_LDN_2(2, dd, MO_64)
4755 DO_LDN_2(3, dd, MO_64)
4756 DO_LDN_2(4, dd, MO_64)
4757
4758 #undef DO_LDN_1
4759 #undef DO_LDN_2
4760
4761 /*
4762 * Load contiguous data, first-fault and no-fault.
4763 *
4764 * For user-only, one could argue that we should hold the mmap_lock during
4765 * the operation so that there is no race between page_check_range and the
4766 * load operation. However, unmapping pages out from under a running thread
4767 * is extraordinarily unlikely. This theoretical race condition also affects
4768 * linux-user/ in its get_user/put_user macros.
4769 *
4770 * TODO: Construct some helpers, written in assembly, that interact with
4771 * handle_cpu_signal to produce memory ops which can properly report errors
4772 * without racing.
4773 */
4774
4775 /* Fault on byte I. All bits in FFR from I are cleared. The vector
4776 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
4777 * option, which leaves subsequent data unchanged.
4778 */
4779 static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
4780 {
4781 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
4782
4783 if (i & 63) {
4784 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
4785 i = ROUND_UP(i, 64);
4786 }
4787 for (; i < oprsz; i += 64) {
4788 ffr[i / 64] = 0;
4789 }
4790 }
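/*
 * Worked example (illustrative only): with a 256-bit vector (oprsz = 32
 * bytes) and a fault recorded at byte offset i = 12,
 *
 *     ffr[0] &= MAKE_64BIT_MASK(0, 12);   // keep bits 0..11, clear 12..63
 *
 * then ROUND_UP(12, 64) == 64 >= oprsz, so the clearing loop never runs.
 * Predicate bits for bytes 0..11 -- the elements already loaded -- stay
 * valid, and everything from the faulting element onward reads as false.
 */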
4791
4792 /*
4793 * Common helper for all contiguous no-fault and first-fault loads.
4794 */
4795 static inline QEMU_ALWAYS_INLINE
4796 void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
4797 uint32_t desc, const uintptr_t retaddr,
4798 const int esz, const int msz, const SVEContFault fault,
4799 sve_ldst1_host_fn *host_fn,
4800 sve_ldst1_tlb_fn *tlb_fn)
4801 {
4802 const unsigned rd = simd_data(desc);
4803 void *vd = &env->vfp.zregs[rd];
4804 const intptr_t reg_max = simd_oprsz(desc);
4805 intptr_t reg_off, mem_off, reg_last;
4806 SVEContLdSt info;
4807 int flags;
4808 void *host;
4809
4810 /* Find the active elements. */
4811 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
4812 /* The entire predicate was false; no load occurs. */
4813 memset(vd, 0, reg_max);
4814 return;
4815 }
4816 reg_off = info.reg_off_first[0];
4817
4818 /* Probe the page(s). */
4819 if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
4820 /* Fault on first element. */
4821 tcg_debug_assert(fault == FAULT_NO);
4822 memset(vd, 0, reg_max);
4823 goto do_fault;
4824 }
4825
4826 mem_off = info.mem_off_first[0];
4827 flags = info.page[0].flags;
4828
4829 if (fault == FAULT_FIRST) {
4830 /*
4831 * Special handling of the first active element,
4832 * if it crosses a page boundary or is MMIO.
4833 */
4834 bool is_split = mem_off == info.mem_off_split;
4835 /* TODO: MTE check. */
4836 if (unlikely(flags != 0) || unlikely(is_split)) {
4837 /*
4838 * Use the slow path for cross-page handling.
4839 * Might trap for MMIO or watchpoints.
4840 */
4841 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
4842
4843 /* After any fault, zero the other elements. */
4844 swap_memzero(vd, reg_off);
4845 reg_off += 1 << esz;
4846 mem_off += 1 << msz;
4847 swap_memzero(vd + reg_off, reg_max - reg_off);
4848
4849 if (is_split) {
4850 goto second_page;
4851 }
4852 } else {
4853 memset(vd, 0, reg_max);
4854 }
4855 } else {
4856 memset(vd, 0, reg_max);
4857 if (unlikely(mem_off == info.mem_off_split)) {
4858 /* The first active element crosses a page boundary. */
4859 flags |= info.page[1].flags;
4860 if (unlikely(flags & TLB_MMIO)) {
4861 /* Some page is MMIO, see below. */
4862 goto do_fault;
4863 }
4864 if (unlikely(flags & TLB_WATCHPOINT) &&
4865 (cpu_watchpoint_address_matches
4866 (env_cpu(env), addr + mem_off, 1 << msz)
4867 & BP_MEM_READ)) {
4868 /* Watchpoint hit, see below. */
4869 goto do_fault;
4870 }
4871 /* TODO: MTE check. */
4872 /*
4873 * Use the slow path for cross-page handling.
4874 * This is RAM, without a watchpoint, and will not trap.
4875 */
4876 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
4877 goto second_page;
4878 }
4879 }
4880
4881 /*
4882 * From this point on, all memory operations are MemSingleNF.
4883 *
4884 * Per the MemSingleNF pseudocode, a no-fault load from Device memory
4885 * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
4886 *
4887 * Unfortunately we do not have access to the memory attributes from the
4888 * PTE to tell Device memory from Normal memory. So we make a mostly
4889 * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
4890 * This gives the right answer for the common cases of "Normal memory,
4891 * backed by host RAM" and "Device memory, backed by MMIO".
4892 * The architecture allows us to suppress an NF load and return
4893 * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
4894 * case of "Normal memory, backed by MMIO" is permitted. The case we
4895 * get wrong is "Device memory, backed by host RAM", for which we
4896 * should return (UNKNOWN, FAULT) but do not.
4897 *
4898 * Similarly, CPU_BP breakpoints would raise exceptions, and so
4899 * return (UNKNOWN, FAULT). For simplicity, we consider gdb and
4900 * architectural breakpoints the same.
4901 */
4902 if (unlikely(flags & TLB_MMIO)) {
4903 goto do_fault;
4904 }
4905
4906 reg_last = info.reg_off_last[0];
4907 host = info.page[0].host;
4908
4909 do {
4910 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
4911 do {
4912 if ((pg >> (reg_off & 63)) & 1) {
4913 if (unlikely(flags & TLB_WATCHPOINT) &&
4914 (cpu_watchpoint_address_matches
4915 (env_cpu(env), addr + mem_off, 1 << msz)
4916 & BP_MEM_READ)) {
4917 goto do_fault;
4918 }
4919 /* TODO: MTE check. */
4920 host_fn(vd, reg_off, host + mem_off);
4921 }
4922 reg_off += 1 << esz;
4923 mem_off += 1 << msz;
4924 } while (reg_off <= reg_last && (reg_off & 63));
4925 } while (reg_off <= reg_last);
4926
4927 /*
4928 * MemSingleNF is allowed to fail for any reason. We have special
4929 * code above to handle the first element crossing a page boundary.
4930 * As an implementation choice, decline to handle a cross-page element
4931 * in any other position.
4932 */
4933 reg_off = info.reg_off_split;
4934 if (reg_off >= 0) {
4935 goto do_fault;
4936 }
4937
4938 second_page:
4939 reg_off = info.reg_off_first[1];
4940 if (likely(reg_off < 0)) {
4941 /* No active elements on the second page. All done. */
4942 return;
4943 }
4944
4945 /*
4946 * MemSingleNF is allowed to fail for any reason. As an implementation
4947 * choice, decline to handle elements on the second page. This should
4948 * be low frequency as the guest walks through memory -- the next
4949 * iteration of the guest's loop should be aligned on the page boundary,
4950 * and then all following iterations will stay aligned.
4951 */
4952
4953 do_fault:
4954 record_fault(env, reg_off, reg_max);
4955 }
4956
4957 #define DO_LDFF1_LDNF1_1(PART, ESZ) \
4958 void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \
4959 target_ulong addr, uint32_t desc) \
4960 { \
4961 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
4962 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
4963 } \
4964 void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \
4965 target_ulong addr, uint32_t desc) \
4966 { \
4967 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
4968 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
4969 }
4970
4971 #define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
4972 void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \
4973 target_ulong addr, uint32_t desc) \
4974 { \
4975 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
4976 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
4977 } \
4978 void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \
4979 target_ulong addr, uint32_t desc) \
4980 { \
4981 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
4982 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
4983 } \
4984 void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \
4985 target_ulong addr, uint32_t desc) \
4986 { \
4987 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
4988 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
4989 } \
4990 void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \
4991 target_ulong addr, uint32_t desc) \
4992 { \
4993 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
4994 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
4995 }
4996
4997 DO_LDFF1_LDNF1_1(bb, MO_8)
4998 DO_LDFF1_LDNF1_1(bhu, MO_16)
4999 DO_LDFF1_LDNF1_1(bhs, MO_16)
5000 DO_LDFF1_LDNF1_1(bsu, MO_32)
5001 DO_LDFF1_LDNF1_1(bss, MO_32)
5002 DO_LDFF1_LDNF1_1(bdu, MO_64)
5003 DO_LDFF1_LDNF1_1(bds, MO_64)
5004
5005 DO_LDFF1_LDNF1_2(hh, MO_16, MO_16)
5006 DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
5007 DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
5008 DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
5009 DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
5010
5011 DO_LDFF1_LDNF1_2(ss, MO_32, MO_32)
5012 DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
5013 DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
5014
5015 DO_LDFF1_LDNF1_2(dd, MO_64, MO_64)
5016
5017 #undef DO_LDFF1_LDNF1_1
5018 #undef DO_LDFF1_LDNF1_2
5019
5020 /*
5021 * Common helper for all contiguous 1,2,3,4-register predicated stores.
5022 */
5023
5024 static inline QEMU_ALWAYS_INLINE
5025 void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
5026 uint32_t desc, const uintptr_t retaddr,
5027 const int esz, const int msz, const int N, uint32_t mtedesc,
5028 sve_ldst1_host_fn *host_fn,
5029 sve_ldst1_tlb_fn *tlb_fn,
5030 sve_cont_ldst_mte_check_fn *mte_check_fn)
5031 {
5032 const unsigned rd = simd_data(desc);
5033 const intptr_t reg_max = simd_oprsz(desc);
5034 intptr_t reg_off, reg_last, mem_off;
5035 SVEContLdSt info;
5036 void *host;
5037 int i, flags;
5038
5039 /* Find the active elements. */
5040 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
5041 /* The entire predicate was false; no store occurs. */
5042 return;
5043 }
5044
5045 /* Probe the page(s). Exit with exception for any invalid page. */
5046 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
5047
5048 /* Handle watchpoints for all active elements. */
5049 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
5050 BP_MEM_WRITE, retaddr);
5051
5052 /*
5053 * Handle mte checks for all active elements.
5054 * Since TBI must be set for MTE, !mtedesc => !mte_active.
5055 */
5056 if (mte_check_fn && mtedesc) {
5057 mte_check_fn(&info, env, vg, addr, 1 << esz, N << msz,
5058 mtedesc, retaddr);
5059 }
5060
5061 flags = info.page[0].flags | info.page[1].flags;
5062 if (unlikely(flags != 0)) {
5063 #ifdef CONFIG_USER_ONLY
5064 g_assert_not_reached();
5065 #else
5066 /*
5067 * At least one page includes MMIO.
5068 * Any bus operation can fail with cpu_transaction_failed,
5069 * which for ARM will raise SyncExternal. We cannot avoid
5070 * this fault and will leave with the store incomplete.
5071 */
5072 mem_off = info.mem_off_first[0];
5073 reg_off = info.reg_off_first[0];
5074 reg_last = info.reg_off_last[1];
5075 if (reg_last < 0) {
5076 reg_last = info.reg_off_split;
5077 if (reg_last < 0) {
5078 reg_last = info.reg_off_last[0];
5079 }
5080 }
5081
5082 do {
5083 uint64_t pg = vg[reg_off >> 6];
5084 do {
5085 if ((pg >> (reg_off & 63)) & 1) {
5086 for (i = 0; i < N; ++i) {
5087 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5088 addr + mem_off + (i << msz), retaddr);
5089 }
5090 }
5091 reg_off += 1 << esz;
5092 mem_off += N << msz;
5093 } while (reg_off & 63);
5094 } while (reg_off <= reg_last);
5095 return;
5096 #endif
5097 }
5098
5099 mem_off = info.mem_off_first[0];
5100 reg_off = info.reg_off_first[0];
5101 reg_last = info.reg_off_last[0];
5102 host = info.page[0].host;
5103
5104 while (reg_off <= reg_last) {
5105 uint64_t pg = vg[reg_off >> 6];
5106 do {
5107 if ((pg >> (reg_off & 63)) & 1) {
5108 for (i = 0; i < N; ++i) {
5109 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5110 host + mem_off + (i << msz));
5111 }
5112 }
5113 reg_off += 1 << esz;
5114 mem_off += N << msz;
5115 } while (reg_off <= reg_last && (reg_off & 63));
5116 }
5117
5118 /*
5119 * Use the slow path to manage the cross-page misalignment.
5120 * But we know this is RAM and cannot trap.
5121 */
5122 mem_off = info.mem_off_split;
5123 if (unlikely(mem_off >= 0)) {
5124 reg_off = info.reg_off_split;
5125 for (i = 0; i < N; ++i) {
5126 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5127 addr + mem_off + (i << msz), retaddr);
5128 }
5129 }
5130
5131 mem_off = info.mem_off_first[1];
5132 if (unlikely(mem_off >= 0)) {
5133 reg_off = info.reg_off_first[1];
5134 reg_last = info.reg_off_last[1];
5135 host = info.page[1].host;
5136
5137 do {
5138 uint64_t pg = vg[reg_off >> 6];
5139 do {
5140 if ((pg >> (reg_off & 63)) & 1) {
5141 for (i = 0; i < N; ++i) {
5142 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5143 host + mem_off + (i << msz));
5144 }
5145 }
5146 reg_off += 1 << esz;
5147 mem_off += N << msz;
5148 } while (reg_off & 63);
5149 } while (reg_off <= reg_last);
5150 }
5151 }
5152
5153 static inline QEMU_ALWAYS_INLINE
5154 void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
5155 uint32_t desc, const uintptr_t ra,
5156 const int esz, const int msz, const int N,
5157 sve_ldst1_host_fn *host_fn,
5158 sve_ldst1_tlb_fn *tlb_fn)
5159 {
5160 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5161 int bit55 = extract64(addr, 55, 1);
5162
5163 /* Remove mtedesc from the normal sve descriptor. */
5164 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5165
5166 /* Perform gross MTE suppression early. */
5167 if (!tbi_check(desc, bit55) ||
5168 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
5169 mtedesc = 0;
5170 }
5171
5172 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn,
5173 N == 1 ? sve_cont_ldst_mte_check1 : sve_cont_ldst_mte_checkN);
5174 }
5175
5176 #define DO_STN_1(N, NAME, ESZ) \
5177 void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \
5178 target_ulong addr, uint32_t desc) \
5179 { \
5180 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \
5181 sve_st1##NAME##_host, sve_st1##NAME##_tlb, NULL); \
5182 } \
5183 void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \
5184 target_ulong addr, uint32_t desc) \
5185 { \
5186 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \
5187 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
5188 }
5189
5190 #define DO_STN_2(N, NAME, ESZ, MSZ) \
5191 void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \
5192 target_ulong addr, uint32_t desc) \
5193 { \
5194 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
5195 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb, NULL); \
5196 } \
5197 void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \
5198 target_ulong addr, uint32_t desc) \
5199 { \
5200 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
5201 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb, NULL); \
5202 } \
5203 void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
5204 target_ulong addr, uint32_t desc) \
5205 { \
5206 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
5207 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
5208 } \
5209 void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
5210 target_ulong addr, uint32_t desc) \
5211 { \
5212 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
5213 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
5214 }
5215
5216 DO_STN_1(1, bb, MO_8)
5217 DO_STN_1(1, bh, MO_16)
5218 DO_STN_1(1, bs, MO_32)
5219 DO_STN_1(1, bd, MO_64)
5220 DO_STN_1(2, bb, MO_8)
5221 DO_STN_1(3, bb, MO_8)
5222 DO_STN_1(4, bb, MO_8)
5223
5224 DO_STN_2(1, hh, MO_16, MO_16)
5225 DO_STN_2(1, hs, MO_32, MO_16)
5226 DO_STN_2(1, hd, MO_64, MO_16)
5227 DO_STN_2(2, hh, MO_16, MO_16)
5228 DO_STN_2(3, hh, MO_16, MO_16)
5229 DO_STN_2(4, hh, MO_16, MO_16)
5230
5231 DO_STN_2(1, ss, MO_32, MO_32)
5232 DO_STN_2(1, sd, MO_64, MO_32)
5233 DO_STN_2(2, ss, MO_32, MO_32)
5234 DO_STN_2(3, ss, MO_32, MO_32)
5235 DO_STN_2(4, ss, MO_32, MO_32)
5236
5237 DO_STN_2(1, dd, MO_64, MO_64)
5238 DO_STN_2(2, dd, MO_64, MO_64)
5239 DO_STN_2(3, dd, MO_64, MO_64)
5240 DO_STN_2(4, dd, MO_64, MO_64)
5241
5242 #undef DO_STN_1
5243 #undef DO_STN_2
5244
5245 /*
5246 * Loads with a vector index.
5247 */
5248
5249 /*
5250 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
5251 */
5252 typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
5253
5254 static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
5255 {
5256 return *(uint32_t *)(reg + H1_4(reg_ofs));
5257 }
5258
5259 static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
5260 {
5261 return *(int32_t *)(reg + H1_4(reg_ofs));
5262 }
5263
5264 static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
5265 {
5266 return (uint32_t)*(uint64_t *)(reg + reg_ofs);
5267 }
5268
5269 static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
5270 {
5271 return (int32_t)*(uint64_t *)(reg + reg_ofs);
5272 }
5273
5274 static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
5275 {
5276 return *(uint64_t *)(reg + reg_ofs);
5277 }
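/*
 * These off_* functions implement the gather/scatter offset forms: _zsu_*
 * yields an unsigned 32-bit offset, _zss_* a sign-extended 32-bit offset,
 * and _zd_d a full 64-bit offset; the _s/_d suffix says whether the
 * offsets are held in 32-bit or 64-bit elements of Zm.  Each element
 * address is then formed below as
 *
 *   addr = base + (off_fn(vm, reg_off) << scale);
 *
 * so that, for instance, an LD1D (scalar plus vector, 64-bit scaled
 * offsets) element would use base + (Zm.D[i] << 3).
 */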
5278
5279 static inline QEMU_ALWAYS_INLINE
5280 void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5281 target_ulong base, uint32_t desc, uintptr_t retaddr,
5282 int esize, int msize, zreg_off_fn *off_fn,
5283 sve_ldst1_host_fn *host_fn,
5284 sve_ldst1_tlb_fn *tlb_fn)
5285 {
5286 const int mmu_idx = cpu_mmu_index(env, false);
5287 const intptr_t reg_max = simd_oprsz(desc);
5288 const int scale = simd_data(desc);
5289 ARMVectorReg scratch;
5290 intptr_t reg_off;
5291 SVEHostPage info, info2;
5292
5293 memset(&scratch, 0, reg_max);
5294 reg_off = 0;
5295 do {
5296 uint64_t pg = vg[reg_off >> 6];
5297 do {
5298 if (likely(pg & 1)) {
5299 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
5300 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
5301
5302 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
5303 mmu_idx, retaddr);
5304
5305 if (likely(in_page >= msize)) {
5306 if (unlikely(info.flags & TLB_WATCHPOINT)) {
5307 cpu_check_watchpoint(env_cpu(env), addr, msize,
5308 info.attrs, BP_MEM_READ, retaddr);
5309 }
5310 /* TODO: MTE check. */
5311 host_fn(&scratch, reg_off, info.host);
5312 } else {
5313 /* Element crosses the page boundary. */
5314 sve_probe_page(&info2, false, env, addr + in_page, 0,
5315 MMU_DATA_LOAD, mmu_idx, retaddr);
5316 if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
5317 cpu_check_watchpoint(env_cpu(env), addr,
5318 msize, info.attrs,
5319 BP_MEM_READ, retaddr);
5320 }
5321 /* TODO: MTE check. */
5322 tlb_fn(env, &scratch, reg_off, addr, retaddr);
5323 }
5324 }
5325 reg_off += esize;
5326 pg >>= esize;
5327 } while (reg_off & 63);
5328 } while (reg_off < reg_max);
5329
5330 /* Wait until all exceptions have been raised to write back. */
5331 memcpy(vd, &scratch, reg_max);
5332 }
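/*
 * Two details of sve_ld1_z worth noting.  First, all loaded data is staged
 * in the scratch vector and copied into @vd only after every active
 * element has been probed, so a fault on a later element leaves the
 * destination register (which may also be the index register) unmodified.
 * Second, the nested predicate walk above is just a cache-friendly form of
 * the flat loop
 *
 *   for (reg_off = 0; reg_off < reg_max; reg_off += esize) {
 *       if (vg[reg_off >> 6] & (1ull << (reg_off & 63))) {
 *           ... handle the element at byte offset reg_off ...
 *       }
 *   }
 *
 * split per 64-bit predicate word so that vg[] is reloaded only once per
 * word.
 */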
5333
5334 #define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
5335 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
5336 void *vm, target_ulong base, uint32_t desc) \
5337 { \
5338 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
5339 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
5340 }
5341
5342 #define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
5343 void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
5344 void *vm, target_ulong base, uint32_t desc) \
5345 { \
5346 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
5347 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
5348 }
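/*
 * For example, DO_LD1_ZPZ_D(dd_le, zd, MO_64) below expands to roughly:
 *
 *   void HELPER(sve_lddd_le_zd)(CPUARMState *env, void *vd, void *vg,
 *                               void *vm, target_ulong base, uint32_t desc)
 *   {
 *       sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 8, 8,
 *                 off_zd_d, sve_ld1dd_le_host, sve_ld1dd_le_tlb);
 *   }
 *
 * i.e. a gather of little-endian doublewords using 64-bit offsets, with
 * any scaling taken from the scale encoded in desc.
 */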
5349
5350 DO_LD1_ZPZ_S(bsu, zsu, MO_8)
5351 DO_LD1_ZPZ_S(bsu, zss, MO_8)
5352 DO_LD1_ZPZ_D(bdu, zsu, MO_8)
5353 DO_LD1_ZPZ_D(bdu, zss, MO_8)
5354 DO_LD1_ZPZ_D(bdu, zd, MO_8)
5355
5356 DO_LD1_ZPZ_S(bss, zsu, MO_8)
5357 DO_LD1_ZPZ_S(bss, zss, MO_8)
5358 DO_LD1_ZPZ_D(bds, zsu, MO_8)
5359 DO_LD1_ZPZ_D(bds, zss, MO_8)
5360 DO_LD1_ZPZ_D(bds, zd, MO_8)
5361
5362 DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
5363 DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
5364 DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
5365 DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
5366 DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
5367
5368 DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
5369 DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
5370 DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
5371 DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
5372 DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
5373
5374 DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
5375 DO_LD1_ZPZ_S(hss_le, zss, MO_16)
5376 DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
5377 DO_LD1_ZPZ_D(hds_le, zss, MO_16)
5378 DO_LD1_ZPZ_D(hds_le, zd, MO_16)
5379
5380 DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
5381 DO_LD1_ZPZ_S(hss_be, zss, MO_16)
5382 DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
5383 DO_LD1_ZPZ_D(hds_be, zss, MO_16)
5384 DO_LD1_ZPZ_D(hds_be, zd, MO_16)
5385
5386 DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
5387 DO_LD1_ZPZ_S(ss_le, zss, MO_32)
5388 DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
5389 DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
5390 DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
5391
5392 DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
5393 DO_LD1_ZPZ_S(ss_be, zss, MO_32)
5394 DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
5395 DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
5396 DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
5397
5398 DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
5399 DO_LD1_ZPZ_D(sds_le, zss, MO_32)
5400 DO_LD1_ZPZ_D(sds_le, zd, MO_32)
5401
5402 DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
5403 DO_LD1_ZPZ_D(sds_be, zss, MO_32)
5404 DO_LD1_ZPZ_D(sds_be, zd, MO_32)
5405
5406 DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
5407 DO_LD1_ZPZ_D(dd_le, zss, MO_64)
5408 DO_LD1_ZPZ_D(dd_le, zd, MO_64)
5409
5410 DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
5411 DO_LD1_ZPZ_D(dd_be, zss, MO_64)
5412 DO_LD1_ZPZ_D(dd_be, zd, MO_64)
5413
5414 #undef DO_LD1_ZPZ_S
5415 #undef DO_LD1_ZPZ_D
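/*
 * Naming for the gather loads mirrors the contiguous forms: in the MEM
 * part the first letter is the memory element size and the second the
 * register element size, a trailing 'u' or 's' selects zero- or
 * sign-extension, and _le/_be the memory byte order; the OFS part (zsu,
 * zss, zd) names the offset form handled by the off_* functions above.
 * So sve_ldbds_zss is an LD1SB into .D elements using sign-extended
 * 32-bit offsets.
 */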
5416
5417 /* First-fault loads with a vector index. */
5418
5419 /*
5420 * Common helpers for all gather first-faulting loads.
5421 */
5422
5423 static inline QEMU_ALWAYS_INLINE
5424 void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5425 target_ulong base, uint32_t desc, uintptr_t retaddr,
5426 const int esz, const int msz, zreg_off_fn *off_fn,
5427 sve_ldst1_host_fn *host_fn,
5428 sve_ldst1_tlb_fn *tlb_fn)
5429 {
5430 const int mmu_idx = cpu_mmu_index(env, false);
5431 const intptr_t reg_max = simd_oprsz(desc);
5432 const int scale = simd_data(desc);
5433 const int esize = 1 << esz;
5434 const int msize = 1 << msz;
5435 intptr_t reg_off;
5436 SVEHostPage info;
5437 target_ulong addr, in_page;
5438
5439 /* Skip to the first true predicate. */
5440 reg_off = find_next_active(vg, 0, reg_max, esz);
5441 if (unlikely(reg_off >= reg_max)) {
5442 /* The entire predicate was false; no load occurs. */
5443 memset(vd, 0, reg_max);
5444 return;
5445 }
5446
5447 /*
5448 * Probe the first element, allowing faults.
5449 */
5450 addr = base + (off_fn(vm, reg_off) << scale);
5451 tlb_fn(env, vd, reg_off, addr, retaddr);
5452
5453 /* After any fault, zero the other elements. */
5454 swap_memzero(vd, reg_off);
5455 reg_off += esize;
5456 swap_memzero(vd + reg_off, reg_max - reg_off);
5457
5458 /*
5459 * Probe the remaining elements, not allowing faults.
5460 */
5461 while (reg_off < reg_max) {
5462 uint64_t pg = vg[reg_off >> 6];
5463 do {
5464 if (likely((pg >> (reg_off & 63)) & 1)) {
5465 addr = base + (off_fn(vm, reg_off) << scale);
5466 in_page = -(addr | TARGET_PAGE_MASK);
5467
5468 if (unlikely(in_page < msize)) {
5469 /* Stop if the element crosses a page boundary. */
5470 goto fault;
5471 }
5472
5473 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
5474 mmu_idx, retaddr);
5475 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
5476 goto fault;
5477 }
5478 if (unlikely(info.flags & TLB_WATCHPOINT) &&
5479 (cpu_watchpoint_address_matches
5480 (env_cpu(env), addr, msize) & BP_MEM_READ)) {
5481 goto fault;
5482 }
5483 /* TODO: MTE check. */
5484
5485 host_fn(vd, reg_off, info.host);
5486 }
5487 reg_off += esize;
5488 } while (reg_off & 63);
5489 }
5490 return;
5491
5492 fault:
5493 record_fault(env, reg_off, reg_max);
5494 }
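/*
 * The first-fault contract implemented above, informally: only the first
 * active element may take a synchronous fault; every other active element
 * is loaded only if it can be accessed safely, i.e. its page can be probed
 * without faulting, is not MMIO, the element does not cross a page
 * boundary, and no read watchpoint matches.  Otherwise record_fault()
 * (defined earlier in this file) clears the FFR bits for that element and
 * everything after it and the load stops without raising an exception,
 * leaving the earlier results in Zd and the remaining elements zeroed, so
 * the guest can retry from that point.
 */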
5495
5496 #define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \
5497 void HELPER(sve_ldff##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
5498 void *vm, target_ulong base, uint32_t desc) \
5499 { \
5500 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \
5501 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
5502 }
5503
5504 #define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \
5505 void HELPER(sve_ldff##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
5506 void *vm, target_ulong base, uint32_t desc) \
5507 { \
5508 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \
5509 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
5510 }
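/*
 * Unlike DO_LD1_ZPZ_S/_D, these pass the log2 sizes (MO_32/MO_64 and MSZ)
 * rather than byte counts, since sve_ldff1_z also needs esz for
 * find_next_active().  DO_LDFF1_ZPZ_S(bsu, zsu, MO_8) below, for example,
 * expands to roughly:
 *
 *   void HELPER(sve_ldffbsu_zsu)(CPUARMState *env, void *vd, void *vg,
 *                                void *vm, target_ulong base, uint32_t desc)
 *   {
 *       sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), MO_32, MO_8,
 *                   off_zsu_s, sve_ld1bsu_host, sve_ld1bsu_tlb);
 *   }
 */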
5511
5512 DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
5513 DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
5514 DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
5515 DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
5516 DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
5517
5518 DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
5519 DO_LDFF1_ZPZ_S(bss, zss, MO_8)
5520 DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
5521 DO_LDFF1_ZPZ_D(bds, zss, MO_8)
5522 DO_LDFF1_ZPZ_D(bds, zd, MO_8)
5523
5524 DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
5525 DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
5526 DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
5527 DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
5528 DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
5529
5530 DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
5531 DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
5532 DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
5533 DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
5534 DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
5535
5536 DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
5537 DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
5538 DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
5539 DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
5540 DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
5541
5542 DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
5543 DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
5544 DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
5545 DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
5546 DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
5547
5548 DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32)
5549 DO_LDFF1_ZPZ_S(ss_le, zss, MO_32)
5550 DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
5551 DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
5552 DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
5553
5554 DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32)
5555 DO_LDFF1_ZPZ_S(ss_be, zss, MO_32)
5556 DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
5557 DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
5558 DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
5559
5560 DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
5561 DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
5562 DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
5563
5564 DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
5565 DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
5566 DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
5567
5568 DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
5569 DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
5570 DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
5571
5572 DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
5573 DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
5574 DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
5575
5576 /* Stores with a vector index. */
5577
5578 static inline QEMU_ALWAYS_INLINE
5579 void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5580 target_ulong base, uint32_t desc, uintptr_t retaddr,
5581 int esize, int msize, zreg_off_fn *off_fn,
5582 sve_ldst1_host_fn *host_fn,
5583 sve_ldst1_tlb_fn *tlb_fn)
5584 {
5585 const int mmu_idx = cpu_mmu_index(env, false);
5586 const intptr_t reg_max = simd_oprsz(desc);
5587 const int scale = simd_data(desc);
5588 void *host[ARM_MAX_VQ * 4];
5589 intptr_t reg_off, i;
5590 SVEHostPage info, info2;
5591
5592 /*
5593 * Probe all of the elements for host addresses and flags.
5594 */
5595 i = reg_off = 0;
5596 do {
5597 uint64_t pg = vg[reg_off >> 6];
5598 do {
5599 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
5600 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
5601
5602 host[i] = NULL;
5603 if (likely((pg >> (reg_off & 63)) & 1)) {
5604 if (likely(in_page >= msize)) {
5605 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
5606 mmu_idx, retaddr);
5607 host[i] = info.host;
5608 } else {
5609 /*
5610 * Element crosses the page boundary.
5611 * Probe both pages, but do not record the host address,
5612 * so that we use the slow path.
5613 */
5614 sve_probe_page(&info, false, env, addr, 0,
5615 MMU_DATA_STORE, mmu_idx, retaddr);
5616 sve_probe_page(&info2, false, env, addr + in_page, 0,
5617 MMU_DATA_STORE, mmu_idx, retaddr);
5618 info.flags |= info2.flags;
5619 }
5620
5621 if (unlikely(info.flags & TLB_WATCHPOINT)) {
5622 cpu_check_watchpoint(env_cpu(env), addr, msize,
5623 info.attrs, BP_MEM_WRITE, retaddr);
5624 }
5625 /* TODO: MTE check. */
5626 }
5627 i += 1;
5628 reg_off += esize;
5629 } while (reg_off & 63);
5630 } while (reg_off < reg_max);
5631
5632 /*
5633 * Now that we have recognized all exceptions except SyncExternal
5634 * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
5635 *
5636 * Note for the common case of an element in RAM, not crossing a page
5637 * boundary, we have stored the host address in host[]. This doubles
5638 * as a first-level check against the predicate, since only enabled
5639 * elements have non-null host addresses.
5640 */
5641 i = reg_off = 0;
5642 do {
5643 void *h = host[i];
5644 if (likely(h != NULL)) {
5645 host_fn(vd, reg_off, h);
5646 } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
5647 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
5648 tlb_fn(env, vd, reg_off, addr, retaddr);
5649 }
5650 i += 1;
5651 reg_off += esize;
5652 } while (reg_off < reg_max);
5653 }
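/*
 * Notes on sve_st1_z: host[] needs at most ARM_MAX_VQ * 4 entries because
 * the smallest element size used here is 4 bytes (the S-form scatters), so
 * reg_max / esize <= (ARM_MAX_VQ * 16) / 4.  Splitting the work into a
 * probe pass and a store pass means that any exception other than the
 * unavoidable SyncExternal from MMIO is raised before the first byte is
 * written, keeping the scatter all-or-nothing from the guest's point of
 * view.
 */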
5654
5655 #define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \
5656 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
5657 void *vm, target_ulong base, uint32_t desc) \
5658 { \
5659 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
5660 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
5661 }
5662
5663 #define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \
5664 void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
5665 void *vm, target_ulong base, uint32_t desc) \
5666 { \
5667 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
5668 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
5669 }
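/*
 * For example, DO_ST1_ZPZ_D(dd_be, zd, MO_64) below expands to roughly:
 *
 *   void HELPER(sve_stdd_be_zd)(CPUARMState *env, void *vd, void *vg,
 *                               void *vm, target_ulong base, uint32_t desc)
 *   {
 *       sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 8, 8,
 *                 off_zd_d, sve_st1dd_be_host, sve_st1dd_be_tlb);
 *   }
 */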
5670
5671 DO_ST1_ZPZ_S(bs, zsu, MO_8)
5672 DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
5673 DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
5674 DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
5675 DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
5676
5677 DO_ST1_ZPZ_S(bs, zss, MO_8)
5678 DO_ST1_ZPZ_S(hs_le, zss, MO_16)
5679 DO_ST1_ZPZ_S(hs_be, zss, MO_16)
5680 DO_ST1_ZPZ_S(ss_le, zss, MO_32)
5681 DO_ST1_ZPZ_S(ss_be, zss, MO_32)
5682
5683 DO_ST1_ZPZ_D(bd, zsu, MO_8)
5684 DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
5685 DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
5686 DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
5687 DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
5688 DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
5689 DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
5690
5691 DO_ST1_ZPZ_D(bd, zss, MO_8)
5692 DO_ST1_ZPZ_D(hd_le, zss, MO_16)
5693 DO_ST1_ZPZ_D(hd_be, zss, MO_16)
5694 DO_ST1_ZPZ_D(sd_le, zss, MO_32)
5695 DO_ST1_ZPZ_D(sd_be, zss, MO_32)
5696 DO_ST1_ZPZ_D(dd_le, zss, MO_64)
5697 DO_ST1_ZPZ_D(dd_be, zss, MO_64)
5698
5699 DO_ST1_ZPZ_D(bd, zd, MO_8)
5700 DO_ST1_ZPZ_D(hd_le, zd, MO_16)
5701 DO_ST1_ZPZ_D(hd_be, zd, MO_16)
5702 DO_ST1_ZPZ_D(sd_le, zd, MO_32)
5703 DO_ST1_ZPZ_D(sd_be, zd, MO_32)
5704 DO_ST1_ZPZ_D(dd_le, zd, MO_64)
5705 DO_ST1_ZPZ_D(dd_be, zd, MO_64)
5706
5707 #undef DO_ST1_ZPZ_S
5708 #undef DO_ST1_ZPZ_D