/*
 * ARM SVE Operations
 *
 * Copyright (c) 2018 Linaro, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "internals.h"
#include "exec/exec-all.h"
#include "exec/cpu_ldst.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "tcg/tcg.h"

/* Note that vector data is stored in host-endian 64-bit chunks,
   so addressing units smaller than that need a host-endian fixup. */
#ifdef HOST_WORDS_BIGENDIAN
#define H1(x)   ((x) ^ 7)
#define H1_2(x) ((x) ^ 6)
#define H1_4(x) ((x) ^ 4)
#define H2(x)   ((x) ^ 3)
#define H4(x)   ((x) ^ 1)
#else
#define H1(x)   (x)
#define H1_2(x) (x)
#define H1_4(x) (x)
#define H2(x)   (x)
#define H4(x)   (x)
#endif

/* Return a value for NZCV as per the ARM PredTest pseudofunction.
 *
 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
 * and bit 0 set if C is set. Compare the definitions of these variables
 * within CPUARMState.
 */

/* For no G bits set, NZCV = C. */
#define PREDTEST_INIT  1

/* This is an iterative function, called for each Pd and Pg word
 * moving forward.
 */
static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute N from first D & G.
           Use bit 2 to signal first G bit seen. */
        if (!(flags & 4)) {
            flags |= ((d & (g & -g)) != 0) << 31;
            flags |= 4;
        }

        /* Accumulate Z from each D & G. */
        flags |= ((d & g) != 0) << 1;

        /* Compute C from last !(D & G). Replace previous. */
        flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
    }
    return flags;
}

/* This is an iterative function, called for each Pd and Pg word
 * moving backward.
 */
static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute C from first (i.e last) !(D & G).
           Use bit 2 to signal first G bit seen. */
        if (!(flags & 4)) {
            flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
            flags |= (d & pow2floor(g)) == 0;
        }

        /* Accumulate Z from each D & G. */
        flags |= ((d & g) != 0) << 1;

        /* Compute N from last (i.e first) D & G. Replace previous. */
        flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
    }
    return flags;
}

/* The same for a single word predicate. */
uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
{
    return iter_predtest_fwd(d, g, PREDTEST_INIT);
}

/* The same for a multi-word predicate. */
uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
{
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg;
    uintptr_t i = 0;

    do {
        flags = iter_predtest_fwd(d[i], g[i], flags);
    } while (++i < words);

    return flags;
}
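
/*
 * Illustrative sketch (not part of the upstream file): how the flags word
 * produced by the PredTest helpers above maps back onto the architectural
 * bits.  E.g. with d = 0x6 and g = 0xe (elements 1..3 governed, elements
 * 1 and 2 true): the first active element is true so N = 1, some active
 * element is true so Z is clear (bit 1 set), and the last active element
 * (bit 3 of d) is false so C = 1; the returned word is 0x80000007, where
 * bit 2 is only the internal "first G bit seen" marker.  The helper below
 * is hypothetical and only documents the encoding.
 */
static inline void predtest_decode_example(uint32_t flags,
                                           bool *n, bool *z, bool *c)
{
    *n = flags >> 31;   /* bit 31: N */
    *z = !(flags & 2);  /* bit 1 set means Z is clear */
    *c = flags & 1;     /* bit 0: C */
}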

/* Expand active predicate bits to bytes, for byte elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 */
static inline uint64_t expand_pred_b(uint8_t byte)
{
    static const uint64_t word[256] = {
        0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
        0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
        0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
        0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
        0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
        0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
        0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
        0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
        0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
        0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
        0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
        0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
        0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
        0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
        0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
        0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
        0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
        0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
        0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
        0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
        0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
        0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
        0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
        0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
        0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
        0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
        0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
        0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
        0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
        0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
        0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
        0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
        0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
        0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
        0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
        0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
        0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
        0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
        0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
        0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
        0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
        0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
        0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
        0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
        0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
        0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
        0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
        0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
        0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
        0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
        0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
        0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
        0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
        0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
        0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
        0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
        0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
        0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
        0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
        0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
        0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
        0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
        0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
        0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
        0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
        0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
        0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
        0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
        0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
        0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
        0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
        0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
        0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
        0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
        0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
        0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
        0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
        0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
        0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
        0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
        0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
        0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
        0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
        0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
        0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
        0xffffffffffffffff,
    };
    return word[byte];
}

/* Similarly for half-word elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      if (i & 0xaa) {
 *          continue;
 *      }
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 *  }
 */
static inline uint64_t expand_pred_h(uint8_t byte)
{
    static const uint64_t word[] = {
        [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
        [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
        [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
        [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
        [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
        [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
        [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
        [0x55] = 0xffffffffffffffff,
    };
    return word[byte & 0x55];
}

/* Similarly for single word elements. */
static inline uint64_t expand_pred_s(uint8_t byte)
{
    static const uint64_t word[] = {
        [0x01] = 0x00000000ffffffffull,
        [0x10] = 0xffffffff00000000ull,
        [0x11] = 0xffffffffffffffffull,
    };
    return word[byte & 0x11];
}
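
/*
 * Illustrative sketch (not part of the upstream file): the expanders above
 * turn one predicate byte into a 64-bit lane mask.  For byte elements every
 * predicate bit is significant, e.g. expand_pred_b(0x05) == 0x0000000000ff00ff,
 * while for half-word and word elements only every 2nd/4th bit is used, e.g.
 * expand_pred_h(0x05) == 0x00000000ffffffff and
 * expand_pred_s(0x11) == 0xffffffffffffffff.  A predicated bitwise merge of
 * one 64-bit chunk of byte elements then looks like the hypothetical helper
 * below.
 */
static inline uint64_t merge_chunk_example(uint64_t n, uint64_t m, uint8_t pg)
{
    uint64_t mask = expand_pred_b(pg);  /* active byte lanes */
    return (n & mask) | (m & ~mask);    /* take N where active, else M */
}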

/* Swap 16-bit words within a 32-bit word. */
static inline uint32_t hswap32(uint32_t h)
{
    return rol32(h, 16);
}

/* Swap 16-bit words within a 64-bit word. */
static inline uint64_t hswap64(uint64_t h)
{
    uint64_t m = 0x0000ffff0000ffffull;
    h = rol64(h, 32);
    return ((h & m) << 16) | ((h >> 16) & m);
}

/* Swap 32-bit words within a 64-bit word. */
static inline uint64_t wswap64(uint64_t h)
{
    return rol64(h, 32);
}

#define LOGICAL_PPPP(NAME, FUNC) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    uintptr_t opr_sz = simd_oprsz(desc); \
    uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
    uintptr_t i; \
    for (i = 0; i < opr_sz / 8; ++i) { \
        d[i] = FUNC(n[i], m[i], g[i]); \
    } \
}

#define DO_AND(N, M, G)  (((N) & (M)) & (G))
#define DO_BIC(N, M, G)  (((N) & ~(M)) & (G))
#define DO_EOR(N, M, G)  (((N) ^ (M)) & (G))
#define DO_ORR(N, M, G)  (((N) | (M)) & (G))
#define DO_ORN(N, M, G)  (((N) | ~(M)) & (G))
#define DO_NOR(N, M, G)  (~((N) | (M)) & (G))
#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
#define DO_SEL(N, M, G)  (((N) & (G)) | ((M) & ~(G)))

LOGICAL_PPPP(sve_and_pppp, DO_AND)
LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
LOGICAL_PPPP(sve_nand_pppp, DO_NAND)

#undef DO_AND
#undef DO_BIC
#undef DO_EOR
#undef DO_ORR
#undef DO_ORN
#undef DO_NOR
#undef DO_NAND
#undef DO_SEL
#undef LOGICAL_PPPP

/* Fully general three-operand expander, controlled by a predicate.
 * This is complicated by the host-endian storage of the register file.
 */
/* ??? I don't expect the compiler could ever vectorize this itself.
 * With some tables we can convert bit masks to byte masks, and with
 * extra care wrt byte/word ordering we could use gcc generic vectors
 * and do 16 bytes at a time.
 */
#define DO_ZPZZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                TYPE mm = *(TYPE *)(vm + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, mm); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}

/* Similarly, specialized for 64-bit operands. */
#define DO_ZPZZ_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *n = vn, *m = vm; \
    uint8_t *pg = vg; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPE nn = n[i], mm = m[i]; \
            d[i] = OP(nn, mm); \
        } \
    } \
}

#define DO_AND(N, M)  (N & M)
#define DO_EOR(N, M)  (N ^ M)
#define DO_ORR(N, M)  (N | M)
#define DO_BIC(N, M)  (N & ~M)
#define DO_ADD(N, M)  (N + M)
#define DO_SUB(N, M)  (N - M)
#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
#define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
#define DO_MUL(N, M)  (N * M)

/*
 * We must avoid the C undefined behaviour cases: division by
 * zero and signed division of INT_MIN by -1. Both of these
 * have architecturally defined required results for Arm.
 * We special case all signed divisions by -1 to avoid having
 * to deduce the minimum integer for the type involved.
 */
#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)

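/*
 * Illustrative sketch (not part of the upstream file): a hand-expanded
 * equivalent of DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD) below, showing
 * how one uint16_t of predicate bits governs 16 byte elements per outer
 * iteration and how the H() macro fixes up sub-64-bit addressing.  The
 * function name is hypothetical and exists only for illustration.
 */
static inline void sve_add_zpzz_b_expanded_example(void *vd, void *vn,
                                                   void *vm, void *vg,
                                                   uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    for (i = 0; i < opr_sz; ) {
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
        do {
            if (pg & 1) {
                uint8_t nn = *(uint8_t *)(vn + H1(i));
                uint8_t mm = *(uint8_t *)(vm + H1(i));
                *(uint8_t *)(vd + H1(i)) = nn + mm;
            }
            i += sizeof(uint8_t), pg >>= sizeof(uint8_t);
        } while (i & 15);
    }
}
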
DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)

DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)

DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)

DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)

DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)

DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)

DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)

DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)

/* Because the computation type is at least twice as large as required,
   these work for both signed and unsigned source types. */
static inline uint8_t do_mulh_b(int32_t n, int32_t m)
{
    return (n * m) >> 8;
}

static inline uint16_t do_mulh_h(int32_t n, int32_t m)
{
    return (n * m) >> 16;
}

static inline uint32_t do_mulh_s(int64_t n, int64_t m)
{
    return (n * m) >> 32;
}

static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    muls64(&lo, &hi, n, m);
    return hi;
}

static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    mulu64(&lo, &hi, n, m);
    return hi;
}

DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)

DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)

DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)

DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)

DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)

/* Note that all bits of the shift are significant
   and not modulo the element size. */
#define DO_ASR(N, M)  (N >> MIN(M, sizeof(N) * 8 - 1))
#define DO_LSR(N, M)  (M < sizeof(N) * 8 ? N >> M : 0)
#define DO_LSL(N, M)  (M < sizeof(N) * 8 ? N << M : 0)

DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)

DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)

DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)

DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)

#undef DO_ZPZZ
#undef DO_ZPZZ_D

/* Three-operand expander, controlled by a predicate, in which the
 * third operand is "wide". That is, for D = N op M, the same 64-bit
 * value of M is used with all of the narrower values of N.
 */
#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
        TYPEW mm = *(TYPEW *)(vm + i); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, mm); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 7); \
    } \
}

DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZPZW

/* Fully general two-operand expander, controlled by a predicate.
 */
#define DO_ZPZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}

/* Similarly, specialized for 64-bit operands. */
#define DO_ZPZ_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *n = vn; \
    uint8_t *pg = vg; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPE nn = n[i]; \
            d[i] = OP(nn); \
        } \
    } \
}

#define DO_CLS_B(N)   (clrsb32(N) - 24)
#define DO_CLS_H(N)   (clrsb32(N) - 16)

DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)

#define DO_CLZ_B(N)   (clz32(N) - 24)
#define DO_CLZ_H(N)   (clz32(N) - 16)

DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
DO_ZPZ_D(sve_clz_d, uint64_t, clz64)

DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)

#define DO_CNOT(N)    (N == 0)

DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)

#define DO_FABS(N)    (N & ((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)

#define DO_FNEG(N)    (N ^ ~((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)

#define DO_NOT(N)    (~N)

DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)

#define DO_SXTB(N)    ((int8_t)N)
#define DO_SXTH(N)    ((int16_t)N)
#define DO_SXTS(N)    ((int32_t)N)
#define DO_UXTB(N)    ((uint8_t)N)
#define DO_UXTH(N)    ((uint16_t)N)
#define DO_UXTS(N)    ((uint32_t)N)

DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)

DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)

#define DO_ABS(N)    (N < 0 ? -N : N)

DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)

#define DO_NEG(N)    (-N)

DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)

DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)

DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)

DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)

DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)

/* Three-operand expander, unpredicated, in which the third operand is "wide".
 */
#define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        TYPEW mm = *(TYPEW *)(vm + i); \
        do { \
            TYPE nn = *(TYPE *)(vn + H(i)); \
            *(TYPE *)(vd + H(i)) = OP(nn, mm); \
            i += sizeof(TYPE); \
        } while (i & 7); \
    } \
}

DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZZW

#undef DO_CLS_B
#undef DO_CLS_H
#undef DO_CLZ_B
#undef DO_CLZ_H
#undef DO_CNOT
#undef DO_FABS
#undef DO_FNEG
#undef DO_ABS
#undef DO_NEG
#undef DO_ZPZ
#undef DO_ZPZ_D

/* Two-operand reduction expander, controlled by a predicate.
 * The difference between TYPERED and TYPERET has to do with
 * sign-extension. E.g. for SMAX, TYPERED must be signed,
 * but TYPERET must be unsigned so that e.g. a 32-bit value
 * is not sign-extended to the ABI uint64_t return type.
 */
/* ??? If we were to vectorize this by hand the reduction ordering
 * would change. For integer operands, this is perfectly fine.
 */
#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    TYPERED ret = INIT; \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
                ret = OP(ret, nn); \
            } \
            i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
        } while (i & 15); \
    } \
    return (TYPERET)ret; \
}

#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPEE *n = vn; \
    uint8_t *pg = vg; \
    TYPER ret = INIT; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPEE nn = n[i]; \
            ret = OP(ret, nn); \
        } \
    } \
    return ret; \
}

DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)

DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)

DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)

DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)

DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)

DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)

DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)

DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)

DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)

#undef DO_VPZ
#undef DO_VPZ_D

/* Two vector operand, one scalar operand, unpredicated. */
#define DO_ZZI(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
    TYPE s = s64, *d = vd, *n = vn; \
    for (i = 0; i < opr_sz; ++i) { \
        d[i] = OP(n[i], s); \
    } \
}

#define DO_SUBR(X, Y)   (Y - X)

DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)

DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)

DO_ZZI(sve_smini_b, int8_t, DO_MIN)
DO_ZZI(sve_smini_h, int16_t, DO_MIN)
DO_ZZI(sve_smini_s, int32_t, DO_MIN)
DO_ZZI(sve_smini_d, int64_t, DO_MIN)

DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)

DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
DO_ZZI(sve_umini_d, uint64_t, DO_MIN)

#undef DO_ZZI

#undef DO_AND
#undef DO_ORR
#undef DO_EOR
#undef DO_BIC
#undef DO_ADD
#undef DO_SUB
#undef DO_MAX
#undef DO_MIN
#undef DO_ABD
#undef DO_MUL
#undef DO_SDIV
#undef DO_UDIV
#undef DO_ASR
#undef DO_LSR
#undef DO_LSL
#undef DO_SUBR

/* Similar to the ARM LastActiveElement pseudocode function, except the
   result is multiplied by the element size. This includes the not found
   indication; e.g. not found for esz=3 is -8. */
static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
{
    uint64_t mask = pred_esz_masks[esz];
    intptr_t i = words;

    do {
        uint64_t this_g = g[--i] & mask;
        if (this_g) {
            return i * 64 + (63 - clz64(this_g));
        }
    } while (i > 0);
    return (intptr_t)-1 << esz;
}
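
/*
 * Illustrative worked example (not part of the upstream file): with esz = 2
 * (word elements) the predicate bit for element k lives at bit 4 * k, so for
 * a single predicate word g[0] = 0x0000000000000011 (elements 0 and 1 active)
 * last_active_element(g, 1, 2) returns 63 - clz64(0x11) = 4, i.e. the bit
 * index of the last active element, already scaled by the element size in
 * bytes.  With no bits set it returns -(1 << 2) = -4, the "not found" value.
 */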

uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
{
    intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg;
    intptr_t i = 0;

    do {
        uint64_t this_d = d[i];
        uint64_t this_g = g[i];

        if (this_g) {
            if (!(flags & 4)) {
                /* Set in D the first bit of G. */
                this_d |= this_g & -this_g;
                d[i] = this_d;
            }
            flags = iter_predtest_fwd(this_d, this_g, flags);
        }
    } while (++i < words);

    return flags;
}

uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
{
    intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
    intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg, esz_mask;
    intptr_t i, next;

    next = last_active_element(vd, words, esz) + (1 << esz);
    esz_mask = pred_esz_masks[esz];

    /* Similar to the pseudocode for pnext, but scaled by ESZ
       so that we find the correct bit. */
    if (next < words * 64) {
        uint64_t mask = -1;

        if (next & 63) {
            mask = ~((1ull << (next & 63)) - 1);
            next &= -64;
        }
        do {
            uint64_t this_g = g[next / 64] & esz_mask & mask;
            if (this_g != 0) {
                next = (next & -64) + ctz64(this_g);
                break;
            }
            next += 64;
            mask = -1;
        } while (next < words * 64);
    }

    i = 0;
    do {
        uint64_t this_d = 0;
        if (i == next / 64) {
            this_d = 1ull << (next & 63);
        }
        d[i] = this_d;
        flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
    } while (++i < words);

    return flags;
}

/*
 * Copy Zn into Zd, and store zero into inactive elements.
 * If inv, store zeros into the active elements.
 */
void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
    }
}

void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
    }
}

void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
    }
}

void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;
    uint8_t inv = simd_data(desc);

    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
    }
}

/* Three-operand expander, immediate operand, controlled by a predicate.
 */
#define DO_ZPZI(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    TYPE imm = simd_data(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, imm); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}

/* Similarly, specialized for 64-bit operands. */
#define DO_ZPZI_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *n = vn; \
    TYPE imm = simd_data(desc); \
    uint8_t *pg = vg; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPE nn = n[i]; \
            d[i] = OP(nn, imm); \
        } \
    } \
}

#define DO_SHR(N, M)  (N >> M)
#define DO_SHL(N, M)  (N << M)

/* Arithmetic shift right for division. This rounds negative numbers
   toward zero as per signed division. Therefore before shifting,
   when N is negative, add 2**M-1. */
#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)

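/*
 * Illustrative worked example (not part of the upstream file) of the rounding
 * adjustment in DO_ASRD above: dividing by 2**M must round toward zero, but an
 * arithmetic shift of a negative value rounds toward minus infinity.  With
 * N = -7 and M = 2:
 *     plain shift:   -7 >> 2                  == -2  (rounds down)
 *     DO_ASRD:      (-7 + (4 - 1)) >> 2 == -4 >> 2 == -1  (rounds toward 0)
 * which matches -7 / 4 in C.
 */
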
DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)

DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)

DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)

DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)

#undef DO_SHR
#undef DO_SHL
#undef DO_ASRD
#undef DO_ZPZI
#undef DO_ZPZI_D

/* Fully general four-operand expander, controlled by a predicate.
 */
#define DO_ZPZZZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
                  void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                TYPE mm = *(TYPE *)(vm + H(i)); \
                TYPE aa = *(TYPE *)(va + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}

/* Similarly, specialized for 64-bit operands. */
#define DO_ZPZZZ_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
                  void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *a = va, *n = vn, *m = vm; \
    uint8_t *pg = vg; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPE aa = a[i], nn = n[i], mm = m[i]; \
            d[i] = OP(aa, nn, mm); \
        } \
    } \
}

#define DO_MLA(A, N, M)  (A + N * M)
#define DO_MLS(A, N, M)  (A - N * M)

DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)

DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)

DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)

DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)

#undef DO_MLA
#undef DO_MLS
#undef DO_ZPZZZ
#undef DO_ZPZZZ_D

void HELPER(sve_index_b)(void *vd, uint32_t start,
                         uint32_t incr, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd;
    for (i = 0; i < opr_sz; i += 1) {
        d[H1(i)] = start + i * incr;
    }
}

void HELPER(sve_index_h)(void *vd, uint32_t start,
                         uint32_t incr, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 2;
    uint16_t *d = vd;
    for (i = 0; i < opr_sz; i += 1) {
        d[H2(i)] = start + i * incr;
    }
}

void HELPER(sve_index_s)(void *vd, uint32_t start,
                         uint32_t incr, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    uint32_t *d = vd;
    for (i = 0; i < opr_sz; i += 1) {
        d[H4(i)] = start + i * incr;
    }
}

void HELPER(sve_index_d)(void *vd, uint64_t start,
                         uint64_t incr, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = start + i * incr;
    }
}

void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    uint32_t sh = simd_data(desc);
    uint32_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] + (m[i] << sh);
    }
}

void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t sh = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] + (m[i] << sh);
    }
}

void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t sh = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
    }
}

void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t sh = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
    }
}

void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
{
    /* These constants are cut-and-paste directly from the ARM pseudocode. */
    static const uint16_t coeff[] = {
        0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
        0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
        0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
        0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / 2;
    uint16_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; i++) {
        uint16_t nn = n[i];
        intptr_t idx = extract32(nn, 0, 5);
        uint16_t exp = extract32(nn, 5, 5);
        d[i] = coeff[idx] | (exp << 10);
    }
}

void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
{
    /* These constants are cut-and-paste directly from the ARM pseudocode. */
    static const uint32_t coeff[] = {
        0x000000, 0x0164d2, 0x02cd87, 0x043a29,
        0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
        0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
        0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
        0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
        0x1ef532, 0x20b051, 0x227043, 0x243516,
        0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
        0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
        0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
        0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
        0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
        0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
        0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
        0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
        0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
        0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    uint32_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; i++) {
        uint32_t nn = n[i];
        intptr_t idx = extract32(nn, 0, 6);
        uint32_t exp = extract32(nn, 6, 8);
        d[i] = coeff[idx] | (exp << 23);
    }
}

void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
{
    /* These constants are cut-and-paste directly from the ARM pseudocode. */
    static const uint64_t coeff[] = {
        0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
        0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
        0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
        0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
        0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
        0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
        0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
        0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
        0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
        0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
        0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
        0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
        0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
        0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
        0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
        0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
        0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
        0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
        0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
        0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
        0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
        0xFA7C1819E90D8ull,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; i++) {
        uint64_t nn = n[i];
        intptr_t idx = extract32(nn, 0, 6);
        uint64_t exp = extract32(nn, 6, 11);
        d[i] = coeff[idx] | (exp << 52);
    }
}

void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 2;
    uint16_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        uint16_t nn = n[i];
        uint16_t mm = m[i];
        if (mm & 1) {
            nn = float16_one;
        }
        d[i] = nn ^ (mm & 2) << 14;
    }
}

void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    uint32_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        uint32_t nn = n[i];
        uint32_t mm = m[i];
        if (mm & 1) {
            nn = float32_one;
        }
        d[i] = nn ^ (mm & 2) << 30;
    }
}

void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i];
        uint64_t mm = m[i];
        if (mm & 1) {
            nn = float64_one;
        }
        d[i] = nn ^ (mm & 2) << 62;
    }
}

/*
 * Signed saturating addition with scalar operand.
 */

void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(int8_t)) {
        int r = *(int8_t *)(a + i) + b;
        if (r > INT8_MAX) {
            r = INT8_MAX;
        } else if (r < INT8_MIN) {
            r = INT8_MIN;
        }
        *(int8_t *)(d + i) = r;
    }
}

void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(int16_t)) {
        int r = *(int16_t *)(a + i) + b;
        if (r > INT16_MAX) {
            r = INT16_MAX;
        } else if (r < INT16_MIN) {
            r = INT16_MIN;
        }
        *(int16_t *)(d + i) = r;
    }
}

void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(int32_t)) {
        int64_t r = *(int32_t *)(a + i) + b;
        if (r > INT32_MAX) {
            r = INT32_MAX;
        } else if (r < INT32_MIN) {
            r = INT32_MIN;
        }
        *(int32_t *)(d + i) = r;
    }
}

void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(int64_t)) {
        int64_t ai = *(int64_t *)(a + i);
        int64_t r = ai + b;
        if (((r ^ ai) & ~(ai ^ b)) < 0) {
            /* Signed overflow. */
            r = (r < 0 ? INT64_MAX : INT64_MIN);
        }
        *(int64_t *)(d + i) = r;
    }
}
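
/*
 * Illustrative note (not part of the upstream file): the expression
 * ((r ^ ai) & ~(ai ^ b)) < 0 used in sve_sqaddi_d above is the classic
 * branch-free signed-overflow test: overflow occurred iff the operands have
 * the same sign (ai ^ b has the sign bit clear) and the result's sign differs
 * from them (r ^ ai has the sign bit set).  E.g. for ai = INT64_MAX and b = 1,
 * r wraps to INT64_MIN, both terms have the sign bit set, and the result
 * saturates to INT64_MAX because r < 0.
 */
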
/*
 * Unsigned saturating addition with scalar operand.
 */

void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
        int r = *(uint8_t *)(a + i) + b;
        if (r > UINT8_MAX) {
            r = UINT8_MAX;
        } else if (r < 0) {
            r = 0;
        }
        *(uint8_t *)(d + i) = r;
    }
}

void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
        int r = *(uint16_t *)(a + i) + b;
        if (r > UINT16_MAX) {
            r = UINT16_MAX;
        } else if (r < 0) {
            r = 0;
        }
        *(uint16_t *)(d + i) = r;
    }
}

void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
        int64_t r = *(uint32_t *)(a + i) + b;
        if (r > UINT32_MAX) {
            r = UINT32_MAX;
        } else if (r < 0) {
            r = 0;
        }
        *(uint32_t *)(d + i) = r;
    }
}

void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
        uint64_t r = *(uint64_t *)(a + i) + b;
        if (r < b) {
            r = UINT64_MAX;
        }
        *(uint64_t *)(d + i) = r;
    }
}

void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
        uint64_t ai = *(uint64_t *)(a + i);
        *(uint64_t *)(d + i) = (ai < b ? 0 : ai - b);
    }
}

/* Two operand predicated copy immediate with merge. All valid immediates
 * can fit within 17 signed bits in the simd_data field.
 */
void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
                         uint64_t mm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    mm = dup_const(MO_8, mm);
    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i];
        uint64_t pp = expand_pred_b(pg[H1(i)]);
        d[i] = (mm & pp) | (nn & ~pp);
    }
}

void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
                         uint64_t mm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    mm = dup_const(MO_16, mm);
    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i];
        uint64_t pp = expand_pred_h(pg[H1(i)]);
        d[i] = (mm & pp) | (nn & ~pp);
    }
}

void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
                         uint64_t mm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    mm = dup_const(MO_32, mm);
    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i];
        uint64_t pp = expand_pred_s(pg[H1(i)]);
        d[i] = (mm & pp) | (nn & ~pp);
    }
}

void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
                         uint64_t mm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i];
        d[i] = (pg[H1(i)] & 1 ? mm : nn);
    }
}

void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;

    val = dup_const(MO_8, val);
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = val & expand_pred_b(pg[H1(i)]);
    }
}

void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;

    val = dup_const(MO_16, val);
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = val & expand_pred_h(pg[H1(i)]);
    }
}

void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;

    val = dup_const(MO_32, val);
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = val & expand_pred_s(pg[H1(i)]);
    }
}

void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        d[i] = (pg[H1(i)] & 1 ? val : 0);
    }
}

/* Big-endian hosts need to frob the byte indices. If the copy
 * happens to be 8-byte aligned, then no frobbing necessary.
 */
static void swap_memmove(void *vd, void *vs, size_t n)
{
    uintptr_t d = (uintptr_t)vd;
    uintptr_t s = (uintptr_t)vs;
    uintptr_t o = (d | s | n) & 7;
    size_t i;

#ifndef HOST_WORDS_BIGENDIAN
    o = 0;
#endif
    switch (o) {
    case 0:
        memmove(vd, vs, n);
        break;

    case 4:
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i += 4) {
                *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 4;
                *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
            }
        }
        break;

    case 2:
    case 6:
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i += 2) {
                *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 2;
                *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
            }
        }
        break;

    default:
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i++) {
                *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 1;
                *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
            }
        }
        break;
    }
}

/* Similarly for memset of 0. */
static void swap_memzero(void *vd, size_t n)
{
    uintptr_t d = (uintptr_t)vd;
    uintptr_t o = (d | n) & 7;
    size_t i;

    /* Usually, the first bit of a predicate is set, so N is 0. */
    if (likely(n == 0)) {
        return;
    }

#ifndef HOST_WORDS_BIGENDIAN
    o = 0;
#endif
    switch (o) {
    case 0:
        memset(vd, 0, n);
        break;

    case 4:
        for (i = 0; i < n; i += 4) {
            *(uint32_t *)H1_4(d + i) = 0;
        }
        break;

    case 2:
    case 6:
        for (i = 0; i < n; i += 2) {
            *(uint16_t *)H1_2(d + i) = 0;
        }
        break;

    default:
        for (i = 0; i < n; i++) {
            *(uint8_t *)H1(d + i) = 0;
        }
        break;
    }
}

void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t opr_sz = simd_oprsz(desc);
    size_t n_ofs = simd_data(desc);
    size_t n_siz = opr_sz - n_ofs;

    if (vd != vm) {
        swap_memmove(vd, vn + n_ofs, n_siz);
        swap_memmove(vd + n_siz, vm, n_ofs);
    } else if (vd != vn) {
        swap_memmove(vd + n_siz, vd, n_ofs);
        swap_memmove(vd, vn + n_ofs, n_siz);
    } else {
        /* vd == vn == vm.  Need temp space. */
        ARMVectorReg tmp;
        swap_memmove(&tmp, vm, n_ofs);
        swap_memmove(vd, vd + n_ofs, n_siz);
        memcpy(vd + n_siz, &tmp, n_ofs);
    }
}

#define DO_INSR(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
{ \
    intptr_t opr_sz = simd_oprsz(desc); \
    swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
    *(TYPE *)(vd + H(0)) = val; \
}

DO_INSR(sve_insr_b, uint8_t, H1)
DO_INSR(sve_insr_h, uint16_t, H1_2)
DO_INSR(sve_insr_s, uint32_t, H1_4)
DO_INSR(sve_insr_d, uint64_t, )

#undef DO_INSR

void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
        uint64_t f = *(uint64_t *)(vn + i);
        uint64_t b = *(uint64_t *)(vn + j);
        *(uint64_t *)(vd + i) = bswap64(b);
        *(uint64_t *)(vd + j) = bswap64(f);
    }
}

void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
        uint64_t f = *(uint64_t *)(vn + i);
        uint64_t b = *(uint64_t *)(vn + j);
        *(uint64_t *)(vd + i) = hswap64(b);
        *(uint64_t *)(vd + j) = hswap64(f);
    }
}

void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
        uint64_t f = *(uint64_t *)(vn + i);
        uint64_t b = *(uint64_t *)(vn + j);
        *(uint64_t *)(vd + i) = rol64(b, 32);
        *(uint64_t *)(vd + j) = rol64(f, 32);
    }
}

void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
        uint64_t f = *(uint64_t *)(vn + i);
        uint64_t b = *(uint64_t *)(vn + j);
        *(uint64_t *)(vd + i) = b;
        *(uint64_t *)(vd + j) = f;
    }
}

#define DO_TBL(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    uintptr_t elem = opr_sz / sizeof(TYPE); \
    TYPE *d = vd, *n = vn, *m = vm; \
    ARMVectorReg tmp; \
    if (unlikely(vd == vn)) { \
        n = memcpy(&tmp, vn, opr_sz); \
    } \
    for (i = 0; i < elem; i++) { \
        TYPE j = m[H(i)]; \
        d[H(i)] = j < elem ? n[H(j)] : 0; \
    } \
}

DO_TBL(sve_tbl_b, uint8_t, H1)
DO_TBL(sve_tbl_h, uint16_t, H2)
DO_TBL(sve_tbl_s, uint32_t, H4)
DO_TBL(sve_tbl_d, uint64_t, )

1796#undef DO_TBL
1797
1798#define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
1799void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1800{ \
1801 intptr_t i, opr_sz = simd_oprsz(desc); \
1802 TYPED *d = vd; \
1803 TYPES *n = vn; \
1804 ARMVectorReg tmp; \
1805 if (unlikely(vn - vd < opr_sz)) { \
1806 n = memcpy(&tmp, n, opr_sz / 2); \
1807 } \
1808 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
1809 d[HD(i)] = n[HS(i)]; \
1810 } \
1811}
1812
1813DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
1814DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
1815DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)
1816
1817DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
1818DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
1819DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
1820
1821#undef DO_UNPK
d731d8cb
RH
1822
1823/* Mask of the predicate bits of the even-numbered elements of size esz.
1824 * We also use this for expand_bits/compress_bits, and so extend the
1825 * same pattern out to 16-bit units.
1826 */
1827static const uint64_t even_bit_esz_masks[5] = {
1828 0x5555555555555555ull,
1829 0x3333333333333333ull,
1830 0x0f0f0f0f0f0f0f0full,
1831 0x00ff00ff00ff00ffull,
1832 0x0000ffff0000ffffull,
1833};
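
/* E.g. for esz == 1 the mask 0x3333...ull keeps the even-numbered
 * 2-bit units (bits 0-1, 4-5, 8-9, ...).
 */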
1834
1835/* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
1836 * For N==0, this corresponds to the operation that in qemu/bitops.h
1837 * we call half_shuffle64; this algorithm is from Hacker's Delight,
1838 * section 7-2 Shuffling Bits.
1839 */
1840static uint64_t expand_bits(uint64_t x, int n)
1841{
1842 int i;
1843
1844 x &= 0xffffffffu;
1845 for (i = 4; i >= n; i--) {
1846 int sh = 1 << i;
1847 x = ((x << sh) | x) & even_bit_esz_masks[i];
1848 }
1849 return x;
1850}
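
/* E.g. expand_bits(0b1111, 0) == 0b01010101 (each bit gains a zero
 * above it) and expand_bits(0b1111, 1) == 0b00110011 (each 2-bit unit
 * widens to 4 bits).
 */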
1851
1852/* Compress units of 2**(N+1) bits to units of 2**N bits.
1853 * For N==0, this corresponds to the operation that in qemu/bitops.h
1854 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
1855 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
1856 */
1857static uint64_t compress_bits(uint64_t x, int n)
1858{
1859 int i;
1860
1861 for (i = n; i <= 4; i++) {
1862 int sh = 1 << i;
1863 x &= even_bit_esz_masks[i];
1864 x = (x >> sh) | x;
1865 }
1866 return x & 0xffffffffu;
1867}
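
/* compress_bits is the inverse: compress_bits(0b01010101, 0) == 0b1111
 * and compress_bits(0b00110011, 1) == 0b1111, discarding the
 * odd-numbered units.
 */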
1868
1869void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1870{
f9b0fcce
RH
1871 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
1872 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
1873 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
8e7fefed 1874 int esize = 1 << esz;
d731d8cb
RH
1875 uint64_t *d = vd;
1876 intptr_t i;
1877
1878 if (oprsz <= 8) {
1879 uint64_t nn = *(uint64_t *)vn;
1880 uint64_t mm = *(uint64_t *)vm;
1881 int half = 4 * oprsz;
1882
1883 nn = extract64(nn, high * half, half);
1884 mm = extract64(mm, high * half, half);
1885 nn = expand_bits(nn, esz);
1886 mm = expand_bits(mm, esz);
8e7fefed 1887 d[0] = nn | (mm << esize);
d731d8cb 1888 } else {
8e7fefed 1889 ARMPredicateReg tmp;
d731d8cb
RH
1890
1891 /* We produce output faster than we consume input.
1892 Therefore we must be mindful of possible overlap. */
8e7fefed
RH
1893 if (vd == vn) {
1894 vn = memcpy(&tmp, vn, oprsz);
1895 if (vd == vm) {
1896 vm = vn;
1897 }
1898 } else if (vd == vm) {
1899 vm = memcpy(&tmp, vm, oprsz);
d731d8cb
RH
1900 }
1901 if (high) {
1902 high = oprsz >> 1;
1903 }
1904
8e7fefed 1905 if ((oprsz & 7) == 0) {
d731d8cb
RH
1906 uint32_t *n = vn, *m = vm;
1907 high >>= 2;
1908
8e7fefed 1909 for (i = 0; i < oprsz / 8; i++) {
d731d8cb
RH
1910 uint64_t nn = n[H4(high + i)];
1911 uint64_t mm = m[H4(high + i)];
1912
1913 nn = expand_bits(nn, esz);
1914 mm = expand_bits(mm, esz);
8e7fefed 1915 d[i] = nn | (mm << esize);
d731d8cb
RH
1916 }
1917 } else {
1918 uint8_t *n = vn, *m = vm;
1919 uint16_t *d16 = vd;
1920
1921 for (i = 0; i < oprsz / 2; i++) {
1922 uint16_t nn = n[H1(high + i)];
1923 uint16_t mm = m[H1(high + i)];
1924
1925 nn = expand_bits(nn, esz);
1926 mm = expand_bits(mm, esz);
8e7fefed 1927 d16[H2(i)] = nn | (mm << esize);
d731d8cb
RH
1928 }
1929 }
1930 }
1931}
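
/* E.g. taking (for illustration) 4-bit halves nn == 0b1010 and
 * mm == 0b0110 with esz == 0:
 *   d[0] = expand_bits(nn, 0) | (expand_bits(mm, 0) << 1)
 *        = 0b01000100 | 0b00101000 = 0b01101100,
 * i.e. the element pairs (n0,m0), (n1,m1), ... interleaved.
 */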
1932
1933void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1934{
f9b0fcce
RH
1935 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
1936 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
1937 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
d731d8cb
RH
1938 uint64_t *d = vd, *n = vn, *m = vm;
1939 uint64_t l, h;
1940 intptr_t i;
1941
1942 if (oprsz <= 8) {
1943 l = compress_bits(n[0] >> odd, esz);
1944 h = compress_bits(m[0] >> odd, esz);
226e6c04 1945 d[0] = l | (h << (4 * oprsz));
d731d8cb
RH
1946 } else {
1947 ARMPredicateReg tmp_m;
1948 intptr_t oprsz_16 = oprsz / 16;
1949
1950 if ((vm - vd) < (uintptr_t)oprsz) {
1951 m = memcpy(&tmp_m, vm, oprsz);
1952 }
1953
1954 for (i = 0; i < oprsz_16; i++) {
1955 l = n[2 * i + 0];
1956 h = n[2 * i + 1];
1957 l = compress_bits(l >> odd, esz);
1958 h = compress_bits(h >> odd, esz);
226e6c04 1959 d[i] = l | (h << 32);
d731d8cb
RH
1960 }
1961
226e6c04
RH
1962 /*
1963 * For VL which is not a multiple of 512, the results from M do not
1964 * align nicely with the uint64_t for D. Put the aligned results
1965 * from M into TMP_M and then copy it into place afterward.
1966 */
d731d8cb 1967 if (oprsz & 15) {
226e6c04
RH
1968 int final_shift = (oprsz & 15) * 2;
1969
1970 l = n[2 * i + 0];
1971 h = n[2 * i + 1];
1972 l = compress_bits(l >> odd, esz);
1973 h = compress_bits(h >> odd, esz);
1974 d[i] = l | (h << final_shift);
d731d8cb
RH
1975
1976 for (i = 0; i < oprsz_16; i++) {
1977 l = m[2 * i + 0];
1978 h = m[2 * i + 1];
1979 l = compress_bits(l >> odd, esz);
1980 h = compress_bits(h >> odd, esz);
226e6c04 1981 tmp_m.p[i] = l | (h << 32);
d731d8cb 1982 }
226e6c04
RH
1983 l = m[2 * i + 0];
1984 h = m[2 * i + 1];
1985 l = compress_bits(l >> odd, esz);
1986 h = compress_bits(h >> odd, esz);
1987 tmp_m.p[i] = l | (h << final_shift);
d731d8cb
RH
1988
1989 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
1990 } else {
1991 for (i = 0; i < oprsz_16; i++) {
1992 l = m[2 * i + 0];
1993 h = m[2 * i + 1];
1994 l = compress_bits(l >> odd, esz);
1995 h = compress_bits(h >> odd, esz);
226e6c04 1996 d[oprsz_16 + i] = l | (h << 32);
d731d8cb
RH
1997 }
1998 }
1999 }
2000}
2001
2002void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
2003{
f9b0fcce
RH
2004 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2005 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
2006 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
d731d8cb
RH
2007 uint64_t *d = vd, *n = vn, *m = vm;
2008 uint64_t mask;
2009 int shr, shl;
2010 intptr_t i;
2011
2012 shl = 1 << esz;
2013 shr = 0;
2014 mask = even_bit_esz_masks[esz];
2015 if (odd) {
2016 mask <<= shl;
2017 shr = shl;
2018 shl = 0;
2019 }
2020
2021 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2022 uint64_t nn = (n[i] & mask) >> shr;
2023 uint64_t mm = (m[i] & mask) << shl;
2024 d[i] = nn + mm;
2025 }
2026}
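
/* E.g. for esz == 0 and odd == 0, nn == 0b0101 and mm == 0b0011 give
 *   d = (nn & 0x55...) + ((mm & 0x55...) << 1) = 0b0101 + 0b0010 = 0b0111,
 * i.e. the even-numbered elements of N and M interleaved.
 */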
2027
2028/* Reverse units of 2**N bits. */
2029static uint64_t reverse_bits_64(uint64_t x, int n)
2030{
2031 int i, sh;
2032
2033 x = bswap64(x);
2034 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2035 uint64_t mask = even_bit_esz_masks[i];
2036 x = ((x & mask) << sh) | ((x >> sh) & mask);
2037 }
2038 return x;
2039}
2040
2041static uint8_t reverse_bits_8(uint8_t x, int n)
2042{
2043 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
2044 int i, sh;
2045
2046 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2047 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
2048 }
2049 return x;
2050}
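
/* E.g. reverse_bits_8(0x01, 0) == 0x80 (full bit reversal), while
 * reverse_bits_8(0x12, 2) == 0x21 only swaps the two 4-bit units.
 */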
2051
2052void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
2053{
70acaafe
RH
2054 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2055 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
d731d8cb
RH
2056 intptr_t i, oprsz_2 = oprsz / 2;
2057
2058 if (oprsz <= 8) {
2059 uint64_t l = *(uint64_t *)vn;
2060 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
2061 *(uint64_t *)vd = l;
2062 } else if ((oprsz & 15) == 0) {
2063 for (i = 0; i < oprsz_2; i += 8) {
2064 intptr_t ih = oprsz - 8 - i;
2065 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
2066 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
2067 *(uint64_t *)(vd + i) = h;
2068 *(uint64_t *)(vd + ih) = l;
2069 }
2070 } else {
2071 for (i = 0; i < oprsz_2; i += 1) {
2072 intptr_t il = H1(i);
2073 intptr_t ih = H1(oprsz - 1 - i);
2074 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
2075 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
2076 *(uint8_t *)(vd + il) = h;
2077 *(uint8_t *)(vd + ih) = l;
2078 }
2079 }
2080}
2081
2082void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
2083{
70acaafe
RH
2084 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2085 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
d731d8cb
RH
2086 uint64_t *d = vd;
2087 intptr_t i;
2088
2089 if (oprsz <= 8) {
2090 uint64_t nn = *(uint64_t *)vn;
2091 int half = 4 * oprsz;
2092
2093 nn = extract64(nn, high * half, half);
2094 nn = expand_bits(nn, 0);
2095 d[0] = nn;
2096 } else {
2097 ARMPredicateReg tmp_n;
2098
2099 /* We produce output faster than we consume input.
2100 Therefore we must be mindful of possible overlap. */
2101 if ((vn - vd) < (uintptr_t)oprsz) {
2102 vn = memcpy(&tmp_n, vn, oprsz);
2103 }
2104 if (high) {
2105 high = oprsz >> 1;
2106 }
2107
fd911a21 2108 if ((oprsz & 7) == 0) {
d731d8cb
RH
2109 uint32_t *n = vn;
2110 high >>= 2;
2111
fd911a21 2112 for (i = 0; i < oprsz / 8; i++) {
d731d8cb
RH
2113 uint64_t nn = n[H4(high + i)];
2114 d[i] = expand_bits(nn, 0);
2115 }
2116 } else {
2117 uint16_t *d16 = vd;
2118 uint8_t *n = vn;
2119
2120 for (i = 0; i < oprsz / 2; i++) {
2121 uint16_t nn = n[H1(high + i)];
2122 d16[H2(i)] = expand_bits(nn, 0);
2123 }
2124 }
2125 }
2126}
234b48e9
RH
2127
2128#define DO_ZIP(NAME, TYPE, H) \
2129void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2130{ \
2131 intptr_t oprsz = simd_oprsz(desc); \
2132 intptr_t i, oprsz_2 = oprsz / 2; \
2133 ARMVectorReg tmp_n, tmp_m; \
2134 /* We produce output faster than we consume input. \
2135 Therefore we must be mindful of possible overlap. */ \
2136 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
2137 vn = memcpy(&tmp_n, vn, oprsz_2); \
2138 } \
2139 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2140 vm = memcpy(&tmp_m, vm, oprsz_2); \
2141 } \
2142 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2143 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
2144 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
2145 } \
2146}
2147
2148DO_ZIP(sve_zip_b, uint8_t, H1)
2149DO_ZIP(sve_zip_h, uint16_t, H1_2)
2150DO_ZIP(sve_zip_s, uint32_t, H1_4)
2151DO_ZIP(sve_zip_d, uint64_t, )
2152
2153#define DO_UZP(NAME, TYPE, H) \
2154void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2155{ \
2156 intptr_t oprsz = simd_oprsz(desc); \
2157 intptr_t oprsz_2 = oprsz / 2; \
2158 intptr_t odd_ofs = simd_data(desc); \
2159 intptr_t i; \
2160 ARMVectorReg tmp_m; \
2161 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2162 vm = memcpy(&tmp_m, vm, oprsz); \
2163 } \
2164 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2165 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs)); \
2166 } \
2167 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2168 *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
2169 } \
2170}
2171
2172DO_UZP(sve_uzp_b, uint8_t, H1)
2173DO_UZP(sve_uzp_h, uint16_t, H1_2)
2174DO_UZP(sve_uzp_s, uint32_t, H1_4)
2175DO_UZP(sve_uzp_d, uint64_t, )
2176
2177#define DO_TRN(NAME, TYPE, H) \
2178void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2179{ \
2180 intptr_t oprsz = simd_oprsz(desc); \
2181 intptr_t odd_ofs = simd_data(desc); \
2182 intptr_t i; \
2183 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
2184 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
2185 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
2186 *(TYPE *)(vd + H(i + 0)) = ae; \
2187 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
2188 } \
2189}
2190
2191DO_TRN(sve_trn_b, uint8_t, H1)
2192DO_TRN(sve_trn_h, uint16_t, H1_2)
2193DO_TRN(sve_trn_s, uint32_t, H1_4)
2194DO_TRN(sve_trn_d, uint64_t, )
2195
2196#undef DO_ZIP
2197#undef DO_UZP
2198#undef DO_TRN
3ca879ae
RH
2199
2200void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
2201{
2202 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
2203 uint32_t *d = vd, *n = vn;
2204 uint8_t *pg = vg;
2205
2206 for (i = j = 0; i < opr_sz; i++) {
2207 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
2208 d[H4(j)] = n[H4(i)];
2209 j++;
2210 }
2211 }
2212 for (; j < opr_sz; j++) {
2213 d[H4(j)] = 0;
2214 }
2215}
2216
2217void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
2218{
2219 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
2220 uint64_t *d = vd, *n = vn;
2221 uint8_t *pg = vg;
2222
2223 for (i = j = 0; i < opr_sz; i++) {
2224 if (pg[H1(i)] & 1) {
2225 d[j] = n[i];
2226 j++;
2227 }
2228 }
2229 for (; j < opr_sz; j++) {
2230 d[j] = 0;
2231 }
2232}
ef23cb72
RH
2233
2234/* Similar to the ARM LastActiveElement pseudocode function, except the
2235 * result is multiplied by the element size. This includes the not found
2236 * indication; e.g. not found for esz=3 is -8.
2237 */
2238int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
2239{
2240 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2241 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2242
2243 return last_active_element(vg, DIV_ROUND_UP(oprsz, 8), esz);
2244}
b48ff240
RH
2245
2246void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
2247{
2248 intptr_t opr_sz = simd_oprsz(desc) / 8;
2249 int esz = simd_data(desc);
2250 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
2251 intptr_t i, first_i, last_i;
2252 ARMVectorReg tmp;
2253
2254 first_i = last_i = 0;
2255 first_g = last_g = 0;
2256
2257 /* Find the extent of the active elements within VG. */
2258 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
2259 pg = *(uint64_t *)(vg + i) & mask;
2260 if (pg) {
2261 if (last_g == 0) {
2262 last_g = pg;
2263 last_i = i;
2264 }
2265 first_g = pg;
2266 first_i = i;
2267 }
2268 }
2269
2270 len = 0;
2271 if (first_g != 0) {
2272 first_i = first_i * 8 + ctz64(first_g);
2273 last_i = last_i * 8 + 63 - clz64(last_g);
2274 len = last_i - first_i + (1 << esz);
2275 if (vd == vm) {
2276 vm = memcpy(&tmp, vm, opr_sz * 8);
2277 }
2278 swap_memmove(vd, vn + first_i, len);
2279 }
2280 swap_memmove(vd + len, vm, opr_sz * 8 - len);
2281}
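
/* E.g. with 8-byte elements and only elements 1..2 active in VG, the
 * 16 active bytes of N are moved to the start of D and the remaining
 * bytes are filled from the start of M.
 */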
d3fe4a29
RH
2282
2283void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
2284 void *vg, uint32_t desc)
2285{
2286 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2287 uint64_t *d = vd, *n = vn, *m = vm;
2288 uint8_t *pg = vg;
2289
2290 for (i = 0; i < opr_sz; i += 1) {
2291 uint64_t nn = n[i], mm = m[i];
2292 uint64_t pp = expand_pred_b(pg[H1(i)]);
2293 d[i] = (nn & pp) | (mm & ~pp);
2294 }
2295}
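
/* expand_pred_b(), defined earlier in this file, widens each predicate
 * bit to a full byte mask, e.g. 0x05 -> 0x0000000000ff00ff, so eight
 * byte elements are selected per 64-bit chunk.
 */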
2296
2297void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
2298 void *vg, uint32_t desc)
2299{
2300 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2301 uint64_t *d = vd, *n = vn, *m = vm;
2302 uint8_t *pg = vg;
2303
2304 for (i = 0; i < opr_sz; i += 1) {
2305 uint64_t nn = n[i], mm = m[i];
2306 uint64_t pp = expand_pred_h(pg[H1(i)]);
2307 d[i] = (nn & pp) | (mm & ~pp);
2308 }
2309}
2310
2311void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
2312 void *vg, uint32_t desc)
2313{
2314 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2315 uint64_t *d = vd, *n = vn, *m = vm;
2316 uint8_t *pg = vg;
2317
2318 for (i = 0; i < opr_sz; i += 1) {
2319 uint64_t nn = n[i], mm = m[i];
2320 uint64_t pp = expand_pred_s(pg[H1(i)]);
2321 d[i] = (nn & pp) | (mm & ~pp);
2322 }
2323}
2324
2325void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
2326 void *vg, uint32_t desc)
2327{
2328 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2329 uint64_t *d = vd, *n = vn, *m = vm;
2330 uint8_t *pg = vg;
2331
2332 for (i = 0; i < opr_sz; i += 1) {
2333 uint64_t nn = n[i], mm = m[i];
2334 d[i] = (pg[H1(i)] & 1 ? nn : mm);
2335 }
2336}
757f9cff
RH
2337
2338/* Two operand comparison controlled by a predicate.
2339 * ??? It is very tempting to expand this inline
2340 * with x86 instructions, e.g.
2341 *
2342 * vcmpeqw zm, zn, %ymm0
2343 * vpmovmskb %ymm0, %eax
2344 * and $0x5555, %eax
2345 * and pg, %eax
2346 *
2347 * or even aarch64, e.g.
2348 *
2349 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
2350 * cmeq v0.8h, zn, zm
2351 * and v0.8h, v0.8h, mask
2352 * addv h0, v0.8h
2353 * and v0.8b, pg
2354 *
2355 * However, coming up with an abstraction that allows vector inputs and
2356 * a scalar output, and also handles the byte-ordering of sub-uint64_t
2357 * scalar outputs, is tricky.
2358 */
2359#define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
2360uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2361{ \
2362 intptr_t opr_sz = simd_oprsz(desc); \
2363 uint32_t flags = PREDTEST_INIT; \
2364 intptr_t i = opr_sz; \
2365 do { \
2366 uint64_t out = 0, pg; \
2367 do { \
2368 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2369 TYPE nn = *(TYPE *)(vn + H(i)); \
2370 TYPE mm = *(TYPE *)(vm + H(i)); \
2371 out |= nn OP mm; \
2372 } while (i & 63); \
2373 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2374 out &= pg; \
2375 *(uint64_t *)(vd + (i >> 3)) = out; \
2376 flags = iter_predtest_bwd(out, pg, flags); \
2377 } while (i > 0); \
2378 return flags; \
2379}
2380
2381#define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
2382 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2383#define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
2384 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2385#define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
2386 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2387#define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
2388 DO_CMP_PPZZ(NAME, TYPE, OP, , 0x0101010101010101ull)
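
/* Each element occupies sizeof(TYPE) bits of OUT, with its boolean
 * result in the lowest bit of the group, matching the SVE layout of
 * one predicate bit per vector byte; MASK keeps only those positions
 * (e.g. 0x1111...ull for 4-byte elements) when reading the governing
 * predicate.
 */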
2389
2390DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
2391DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
2392DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
2393DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
2394
2395DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
2396DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
2397DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
2398DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
2399
2400DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
2401DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
2402DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
2403DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
2404
2405DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
2406DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
2407DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
2408DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
2409
2410DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
2411DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
2412DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
2413DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
2414
2415DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
2416DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
2417DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
2418DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
2419
2420#undef DO_CMP_PPZZ_B
2421#undef DO_CMP_PPZZ_H
2422#undef DO_CMP_PPZZ_S
2423#undef DO_CMP_PPZZ_D
2424#undef DO_CMP_PPZZ
2425
2426/* Similar, but the second source is "wide". */
2427#define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
2428uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2429{ \
2430 intptr_t opr_sz = simd_oprsz(desc); \
2431 uint32_t flags = PREDTEST_INIT; \
2432 intptr_t i = opr_sz; \
2433 do { \
2434 uint64_t out = 0, pg; \
2435 do { \
2436 TYPEW mm = *(TYPEW *)(vm + i - 8); \
2437 do { \
2438 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2439 TYPE nn = *(TYPE *)(vn + H(i)); \
2440 out |= nn OP mm; \
2441 } while (i & 7); \
2442 } while (i & 63); \
2443 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2444 out &= pg; \
2445 *(uint64_t *)(vd + (i >> 3)) = out; \
2446 flags = iter_predtest_bwd(out, pg, flags); \
2447 } while (i > 0); \
2448 return flags; \
2449}
2450
2451#define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
2452 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
2453#define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
2454 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
2455#define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
2456 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
2457
df4e0010
RH
2458DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
2459DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
2460DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
757f9cff 2461
df4e0010
RH
2462DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
2463DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
2464DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
757f9cff
RH
2465
2466DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
2467DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
2468DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
2469
2470DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
2471DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
2472DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
2473
2474DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
2475DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
2476DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
2477
2478DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
2479DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
2480DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
2481
2482DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
2483DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
2484DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
2485
2486DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
2487DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
2488DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
2489
2490DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
2491DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
2492DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
2493
2494DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
2495DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
2496DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
2497
2498#undef DO_CMP_PPZW_B
2499#undef DO_CMP_PPZW_H
2500#undef DO_CMP_PPZW_S
2501#undef DO_CMP_PPZW
38cadeba
RH
2502
2503/* Similar, but the second source is immediate. */
2504#define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
2505uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2506{ \
2507 intptr_t opr_sz = simd_oprsz(desc); \
2508 uint32_t flags = PREDTEST_INIT; \
2509 TYPE mm = simd_data(desc); \
2510 intptr_t i = opr_sz; \
2511 do { \
2512 uint64_t out = 0, pg; \
2513 do { \
2514 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2515 TYPE nn = *(TYPE *)(vn + H(i)); \
2516 out |= nn OP mm; \
2517 } while (i & 63); \
2518 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2519 out &= pg; \
2520 *(uint64_t *)(vd + (i >> 3)) = out; \
2521 flags = iter_predtest_bwd(out, pg, flags); \
2522 } while (i > 0); \
2523 return flags; \
2524}
2525
2526#define DO_CMP_PPZI_B(NAME, TYPE, OP) \
2527 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2528#define DO_CMP_PPZI_H(NAME, TYPE, OP) \
2529 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2530#define DO_CMP_PPZI_S(NAME, TYPE, OP) \
2531 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2532#define DO_CMP_PPZI_D(NAME, TYPE, OP) \
2533 DO_CMP_PPZI(NAME, TYPE, OP, , 0x0101010101010101ull)
2534
2535DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
2536DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
2537DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
2538DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
2539
2540DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
2541DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
2542DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
2543DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
2544
2545DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
2546DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
2547DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
2548DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
2549
2550DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
2551DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
2552DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
2553DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
2554
2555DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
2556DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
2557DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
2558DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
2559
2560DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
2561DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
2562DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
2563DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
2564
2565DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
2566DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
2567DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
2568DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
2569
2570DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
2571DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
2572DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
2573DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
2574
2575DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
2576DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
2577DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
2578DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
2579
2580DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
2581DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
2582DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
2583DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
2584
2585#undef DO_CMP_PPZI_B
2586#undef DO_CMP_PPZI_H
2587#undef DO_CMP_PPZI_S
2588#undef DO_CMP_PPZI_D
2589#undef DO_CMP_PPZI
35da316f
RH
2590
2591/* Similar to the ARM LastActive pseudocode function. */
2592static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
2593{
2594 intptr_t i;
2595
2596 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
2597 uint64_t pg = *(uint64_t *)(vg + i);
2598 if (pg) {
2599 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
2600 }
2601 }
2602 return 0;
2603}
2604
2605/* Compute a mask into RETB that is true for all G, up to and including
2606 * (if after) or excluding (if !after) the first G & N.
2607 * Return true if BRK found.
2608 */
2609static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
2610 bool brk, bool after)
2611{
2612 uint64_t b;
2613
2614 if (brk) {
2615 b = 0;
2616 } else if ((g & n) == 0) {
2617 /* For all G, no N are set; break not found. */
2618 b = g;
2619 } else {
2620 /* Break somewhere in N. Locate it. */
2621 b = g & n; /* guard true, pred true */
2622 b = b & -b; /* first such */
2623 if (after) {
2624 b = b | (b - 1); /* break after same */
2625 } else {
2626 b = b - 1; /* break before same */
2627 }
2628 brk = true;
2629 }
2630
2631 *retb = b;
2632 return brk;
2633}
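
/* E.g. with g == 0xff and n == 0x10 (first active true at bit 4):
 *   after  -> *retb == 0x1f  (break after:  bit 4 still included)
 *   !after -> *retb == 0x0f  (break before: bit 4 excluded)
 * and brk is reported as true for all subsequent words.
 */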
2634
2635/* Compute a zeroing BRK. */
2636static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
2637 intptr_t oprsz, bool after)
2638{
2639 bool brk = false;
2640 intptr_t i;
2641
2642 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2643 uint64_t this_b, this_g = g[i];
2644
2645 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2646 d[i] = this_b & this_g;
2647 }
2648}
2649
2650/* Likewise, but also compute flags. */
2651static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
2652 intptr_t oprsz, bool after)
2653{
2654 uint32_t flags = PREDTEST_INIT;
2655 bool brk = false;
2656 intptr_t i;
2657
2658 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2659 uint64_t this_b, this_d, this_g = g[i];
2660
2661 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2662 d[i] = this_d = this_b & this_g;
2663 flags = iter_predtest_fwd(this_d, this_g, flags);
2664 }
2665 return flags;
2666}
2667
2668/* Compute a merging BRK. */
2669static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
2670 intptr_t oprsz, bool after)
2671{
2672 bool brk = false;
2673 intptr_t i;
2674
2675 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2676 uint64_t this_b, this_g = g[i];
2677
2678 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2679 d[i] = (this_b & this_g) | (d[i] & ~this_g);
2680 }
2681}
2682
2683/* Likewise, but also compute flags. */
2684static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
2685 intptr_t oprsz, bool after)
2686{
2687 uint32_t flags = PREDTEST_INIT;
2688 bool brk = false;
2689 intptr_t i;
2690
2691 for (i = 0; i < oprsz / 8; ++i) {
2692 uint64_t this_b, this_d = d[i], this_g = g[i];
2693
2694 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2695 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
2696 flags = iter_predtest_fwd(this_d, this_g, flags);
2697 }
2698 return flags;
2699}
2700
2701static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
2702{
2703 /* It is quicker to zero the whole predicate than loop on OPRSZ.
2704 * The compiler should turn this into 4 64-bit integer stores.
2705 */
2706 memset(d, 0, sizeof(ARMPredicateReg));
2707 return PREDTEST_INIT;
2708}
2709
2710void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
2711 uint32_t pred_desc)
2712{
2713 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2714 if (last_active_pred(vn, vg, oprsz)) {
2715 compute_brk_z(vd, vm, vg, oprsz, true);
2716 } else {
2717 do_zero(vd, oprsz);
2718 }
2719}
2720
2721uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
2722 uint32_t pred_desc)
2723{
2724 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2725 if (last_active_pred(vn, vg, oprsz)) {
2726 return compute_brks_z(vd, vm, vg, oprsz, true);
2727 } else {
2728 return do_zero(vd, oprsz);
2729 }
2730}
2731
2732void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
2733 uint32_t pred_desc)
2734{
2735 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2736 if (last_active_pred(vn, vg, oprsz)) {
2737 compute_brk_z(vd, vm, vg, oprsz, false);
2738 } else {
2739 do_zero(vd, oprsz);
2740 }
2741}
2742
2743uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
2744 uint32_t pred_desc)
2745{
2746 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2747 if (last_active_pred(vn, vg, oprsz)) {
2748 return compute_brks_z(vd, vm, vg, oprsz, false);
2749 } else {
2750 return do_zero(vd, oprsz);
2751 }
2752}
2753
2754void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2755{
2756 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2757 compute_brk_z(vd, vn, vg, oprsz, true);
2758}
2759
2760uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2761{
2762 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2763 return compute_brks_z(vd, vn, vg, oprsz, true);
2764}
2765
2766void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2767{
2768 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2769 compute_brk_z(vd, vn, vg, oprsz, false);
2770}
2771
2772uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2773{
2774 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2775 return compute_brks_z(vd, vn, vg, oprsz, false);
2776}
2777
2778void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2779{
2780 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2781 compute_brk_m(vd, vn, vg, oprsz, true);
2782}
2783
2784uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2785{
2786 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2787 return compute_brks_m(vd, vn, vg, oprsz, true);
2788}
2789
2790void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2791{
2792 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2793 compute_brk_m(vd, vn, vg, oprsz, false);
2794}
2795
2796uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2797{
2798 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2799 return compute_brks_m(vd, vn, vg, oprsz, false);
2800}
2801
2802void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2803{
2804 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2805
2806 if (!last_active_pred(vn, vg, oprsz)) {
2807 do_zero(vd, oprsz);
2808 }
2809}
2810
2811/* As if PredTest(Ones(PL), D, esz). */
2812static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
2813 uint64_t esz_mask)
2814{
2815 uint32_t flags = PREDTEST_INIT;
2816 intptr_t i;
2817
2818 for (i = 0; i < oprsz / 8; i++) {
2819 flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
2820 }
2821 if (oprsz & 7) {
2822 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
2823 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
2824 }
2825 return flags;
2826}
2827
2828uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2829{
2830 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2831
2832 if (last_active_pred(vn, vg, oprsz)) {
2833 return predtest_ones(vd, oprsz, -1);
2834 } else {
2835 return do_zero(vd, oprsz);
2836 }
2837}
9ee3a611
RH
2838
2839uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
2840{
2841 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2842 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2843 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
2844 intptr_t i;
2845
2846 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2847 uint64_t t = n[i] & g[i] & mask;
2848 sum += ctpop64(t);
2849 }
2850 return sum;
2851}
caf1cefc
RH
2852
2853uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc)
2854{
2855 uintptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2856 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2857 uint64_t esz_mask = pred_esz_masks[esz];
2858 ARMPredicateReg *d = vd;
2859 uint32_t flags;
2860 intptr_t i;
2861
2862 /* Begin with a zero predicate register. */
2863 flags = do_zero(d, oprsz);
2864 if (count == 0) {
2865 return flags;
2866 }
2867
caf1cefc
RH
2868 /* Set all of the requested bits. */
2869 for (i = 0; i < count / 64; ++i) {
2870 d->p[i] = esz_mask;
2871 }
2872 if (count & 63) {
2873 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
2874 }
2875
2876 return predtest_ones(d, oprsz, esz_mask);
2877}
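
/* COUNT is given in predicate bits (the caller scales the element
 * count by the element size), e.g. esz == 1 with three elements gives
 * count == 6 and a result of MAKE_64BIT_MASK(0, 6) & 0x5555... ==
 * 0b010101, i.e. three active halfword elements.
 */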
c4e7c493 2878
23fbe79f
RH
2879/* Recursive reduction with a binary function;
2880 * cf. the ARM ARM function ReducePredicated.
2881 *
2882 * While it would be possible to write this without the DATA temporary,
2883 * it is much simpler to process the predicate register this way.
2884 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
2885 * little to gain with a more complex non-recursive form.
2886 */
2887#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
2888static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
2889{ \
2890 if (n == 1) { \
2891 return *data; \
2892 } else { \
2893 uintptr_t half = n / 2; \
2894 TYPE lo = NAME##_reduce(data, status, half); \
2895 TYPE hi = NAME##_reduce(data + half, status, half); \
2896 return TYPE##_##FUNC(lo, hi, status); \
2897 } \
2898} \
2899uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
2900{ \
2901 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_maxsz(desc); \
2902 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
2903 for (i = 0; i < oprsz; ) { \
2904 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2905 do { \
2906 TYPE nn = *(TYPE *)(vn + H(i)); \
2907 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
2908 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2909 } while (i & 15); \
2910 } \
2911 for (; i < maxsz; i += sizeof(TYPE)) { \
2912 *(TYPE *)((void *)data + i) = IDENT; \
2913 } \
2914 return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
2915}
2916
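/* E.g. FADDV over 8 active elements computes
 *   ((e0 + e1) + (e2 + e3)) + ((e4 + e5) + (e6 + e7)),
 * with inactive and trailing elements replaced by IDENT beforehand.
 */
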
2917DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
2918DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
2919DO_REDUCE(sve_faddv_d, float64, , add, float64_zero)
2920
2921/* Identity is floatN_default_nan, without the function call. */
2922DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
2923DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
2924DO_REDUCE(sve_fminnmv_d, float64, , minnum, 0x7FF8000000000000ULL)
2925
2926DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
2927DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
2928DO_REDUCE(sve_fmaxnmv_d, float64, , maxnum, 0x7FF8000000000000ULL)
2929
2930DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
2931DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
2932DO_REDUCE(sve_fminv_d, float64, , min, float64_infinity)
2933
2934DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
2935DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
2936DO_REDUCE(sve_fmaxv_d, float64, , max, float64_chs(float64_infinity))
2937
2938#undef DO_REDUCE
2939
7f9ddf64
RH
2940uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
2941 void *status, uint32_t desc)
2942{
2943 intptr_t i = 0, opr_sz = simd_oprsz(desc);
2944 float16 result = nn;
2945
2946 do {
2947 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2948 do {
2949 if (pg & 1) {
2950 float16 mm = *(float16 *)(vm + H1_2(i));
2951 result = float16_add(result, mm, status);
2952 }
2953 i += sizeof(float16), pg >>= sizeof(float16);
2954 } while (i & 15);
2955 } while (i < opr_sz);
2956
2957 return result;
2958}
2959
2960uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
2961 void *status, uint32_t desc)
2962{
2963 intptr_t i = 0, opr_sz = simd_oprsz(desc);
2964 float32 result = nn;
2965
2966 do {
2967 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2968 do {
2969 if (pg & 1) {
2970 float32 mm = *(float32 *)(vm + H1_2(i));
2971 result = float32_add(result, mm, status);
2972 }
2973 i += sizeof(float32), pg >>= sizeof(float32);
2974 } while (i & 15);
2975 } while (i < opr_sz);
2976
2977 return result;
2978}
2979
2980uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
2981 void *status, uint32_t desc)
2982{
2983 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
2984 uint64_t *m = vm;
2985 uint8_t *pg = vg;
2986
2987 for (i = 0; i < opr_sz; i++) {
2988 if (pg[H1(i)] & 1) {
2989 nn = float64_add(nn, m[i], status);
2990 }
2991 }
2992
2993 return nn;
2994}
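
/* FADDA is an ordered reduction: the running total is folded strictly
 * left-to-right over the active elements, e.g. ((nn + m[0]) + m[3])
 * when only elements 0 and 3 are active, so the sum is never
 * reassociated.
 */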
2995
ec3b87c2
RH
2996/* Fully general three-operand expander, controlled by a predicate,
2997 * with the extra float_status parameter.
2998 */
2999#define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
3000void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
3001 void *status, uint32_t desc) \
3002{ \
3003 intptr_t i = simd_oprsz(desc); \
3004 uint64_t *g = vg; \
3005 do { \
3006 uint64_t pg = g[(i - 1) >> 6]; \
3007 do { \
3008 i -= sizeof(TYPE); \
3009 if (likely((pg >> (i & 63)) & 1)) { \
3010 TYPE nn = *(TYPE *)(vn + H(i)); \
3011 TYPE mm = *(TYPE *)(vm + H(i)); \
3012 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
3013 } \
3014 } while (i & 63); \
3015 } while (i != 0); \
3016}
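
/* The loop walks the vector backwards: g[(i - 1) >> 6] is the 64-bit
 * predicate word covering the bytes about to be processed, and bit
 * (i & 63) within it is the predicate bit for the element starting at
 * vector byte i.
 */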
3017
3018DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
3019DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
3020DO_ZPZZ_FP(sve_fadd_d, uint64_t, , float64_add)
3021
3022DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
3023DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
3024DO_ZPZZ_FP(sve_fsub_d, uint64_t, , float64_sub)
3025
3026DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
3027DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
3028DO_ZPZZ_FP(sve_fmul_d, uint64_t, , float64_mul)
3029
3030DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
3031DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
3032DO_ZPZZ_FP(sve_fdiv_d, uint64_t, , float64_div)
3033
3034DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
3035DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
3036DO_ZPZZ_FP(sve_fmin_d, uint64_t, , float64_min)
3037
3038DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
3039DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
3040DO_ZPZZ_FP(sve_fmax_d, uint64_t, , float64_max)
3041
3042DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
3043DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
3044DO_ZPZZ_FP(sve_fminnum_d, uint64_t, , float64_minnum)
3045
3046DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
3047DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
3048DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, , float64_maxnum)
3049
3050static inline float16 abd_h(float16 a, float16 b, float_status *s)
3051{
3052 return float16_abs(float16_sub(a, b, s));
3053}
3054
3055static inline float32 abd_s(float32 a, float32 b, float_status *s)
3056{
3057 return float32_abs(float32_sub(a, b, s));
3058}
3059
3060static inline float64 abd_d(float64 a, float64 b, float_status *s)
3061{
3062 return float64_abs(float64_sub(a, b, s));
3063}
3064
3065DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
3066DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
3067DO_ZPZZ_FP(sve_fabd_d, uint64_t, , abd_d)
3068
3069static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
3070{
3071 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
3072 return float64_scalbn(a, b_int, s);
3073}
3074
3075DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
3076DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
3077DO_ZPZZ_FP(sve_fscalbn_d, int64_t, , scalbn_d)
3078
3079DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
3080DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
3081DO_ZPZZ_FP(sve_fmulx_d, uint64_t, , helper_vfp_mulxd)
3082
3083#undef DO_ZPZZ_FP
3084
cc48affe
RH
3085/* Three-operand expander, with one scalar operand, controlled by
3086 * a predicate, with the extra float_status parameter.
3087 */
3088#define DO_ZPZS_FP(NAME, TYPE, H, OP) \
3089void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
3090 void *status, uint32_t desc) \
3091{ \
3092 intptr_t i = simd_oprsz(desc); \
3093 uint64_t *g = vg; \
3094 TYPE mm = scalar; \
3095 do { \
3096 uint64_t pg = g[(i - 1) >> 6]; \
3097 do { \
3098 i -= sizeof(TYPE); \
3099 if (likely((pg >> (i & 63)) & 1)) { \
3100 TYPE nn = *(TYPE *)(vn + H(i)); \
3101 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
3102 } \
3103 } while (i & 63); \
3104 } while (i != 0); \
3105}
3106
3107DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
3108DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
3109DO_ZPZS_FP(sve_fadds_d, float64, , float64_add)
3110
3111DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
3112DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
3113DO_ZPZS_FP(sve_fsubs_d, float64, , float64_sub)
3114
3115DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
3116DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
3117DO_ZPZS_FP(sve_fmuls_d, float64, , float64_mul)
3118
3119static inline float16 subr_h(float16 a, float16 b, float_status *s)
3120{
3121 return float16_sub(b, a, s);
3122}
3123
3124static inline float32 subr_s(float32 a, float32 b, float_status *s)
3125{
3126 return float32_sub(b, a, s);
3127}
3128
3129static inline float64 subr_d(float64 a, float64 b, float_status *s)
3130{
3131 return float64_sub(b, a, s);
3132}
3133
3134DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
3135DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
3136DO_ZPZS_FP(sve_fsubrs_d, float64, , subr_d)
3137
3138DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
3139DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
3140DO_ZPZS_FP(sve_fmaxnms_d, float64, , float64_maxnum)
3141
3142DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
3143DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
3144DO_ZPZS_FP(sve_fminnms_d, float64, , float64_minnum)
3145
3146DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
3147DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
3148DO_ZPZS_FP(sve_fmaxs_d, float64, , float64_max)
3149
3150DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
3151DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
3152DO_ZPZS_FP(sve_fmins_d, float64, , float64_min)
3153
8092c6a3
RH
3154/* Fully general two-operand expander, controlled by a predicate,
3155 * with the extra float_status parameter.
3156 */
3157#define DO_ZPZ_FP(NAME, TYPE, H, OP) \
3158void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
3159{ \
3160 intptr_t i = simd_oprsz(desc); \
3161 uint64_t *g = vg; \
3162 do { \
3163 uint64_t pg = g[(i - 1) >> 6]; \
3164 do { \
3165 i -= sizeof(TYPE); \
3166 if (likely((pg >> (i & 63)) & 1)) { \
3167 TYPE nn = *(TYPE *)(vn + H(i)); \
3168 *(TYPE *)(vd + H(i)) = OP(nn, status); \
3169 } \
3170 } while (i & 63); \
3171 } while (i != 0); \
3172}
3173
46d33d1e
RH
3174/* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
3175 * FZ16. When converting from fp16, this affects flushing input denormals;
3176 * when converting to fp16, this affects flushing output denormals.
3177 */
3178static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
3179{
c120391c 3180 bool save = get_flush_inputs_to_zero(fpst);
46d33d1e
RH
3181 float32 ret;
3182
3183 set_flush_inputs_to_zero(false, fpst);
3184 ret = float16_to_float32(f, true, fpst);
3185 set_flush_inputs_to_zero(save, fpst);
3186 return ret;
3187}
3188
3189static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
3190{
c120391c 3191 bool save = get_flush_inputs_to_zero(fpst);
46d33d1e
RH
3192 float64 ret;
3193
3194 set_flush_inputs_to_zero(false, fpst);
3195 ret = float16_to_float64(f, true, fpst);
3196 set_flush_inputs_to_zero(save, fpst);
3197 return ret;
3198}
3199
3200static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
3201{
c120391c 3202 bool save = get_flush_to_zero(fpst);
46d33d1e
RH
3203 float16 ret;
3204
3205 set_flush_to_zero(false, fpst);
3206 ret = float32_to_float16(f, true, fpst);
3207 set_flush_to_zero(save, fpst);
3208 return ret;
3209}
3210
3211static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
3212{
c120391c 3213 bool save = get_flush_to_zero(fpst);
46d33d1e
RH
3214 float16 ret;
3215
3216 set_flush_to_zero(false, fpst);
3217 ret = float64_to_float16(f, true, fpst);
3218 set_flush_to_zero(save, fpst);
3219 return ret;
3220}
3221
df4de1af
RH
3222static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
3223{
3224 if (float16_is_any_nan(f)) {
3225 float_raise(float_flag_invalid, s);
3226 return 0;
3227 }
3228 return float16_to_int16_round_to_zero(f, s);
3229}
3230
3231static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
3232{
3233 if (float16_is_any_nan(f)) {
3234 float_raise(float_flag_invalid, s);
3235 return 0;
3236 }
3237 return float16_to_int64_round_to_zero(f, s);
3238}
3239
3240static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
3241{
3242 if (float32_is_any_nan(f)) {
3243 float_raise(float_flag_invalid, s);
3244 return 0;
3245 }
3246 return float32_to_int64_round_to_zero(f, s);
3247}
3248
3249static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
3250{
3251 if (float64_is_any_nan(f)) {
3252 float_raise(float_flag_invalid, s);
3253 return 0;
3254 }
3255 return float64_to_int64_round_to_zero(f, s);
3256}
3257
3258static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
3259{
3260 if (float16_is_any_nan(f)) {
3261 float_raise(float_flag_invalid, s);
3262 return 0;
3263 }
3264 return float16_to_uint16_round_to_zero(f, s);
3265}
3266
3267static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
3268{
3269 if (float16_is_any_nan(f)) {
3270 float_raise(float_flag_invalid, s);
3271 return 0;
3272 }
3273 return float16_to_uint64_round_to_zero(f, s);
3274}
3275
3276static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
3277{
3278 if (float32_is_any_nan(f)) {
3279 float_raise(float_flag_invalid, s);
3280 return 0;
3281 }
3282 return float32_to_uint64_round_to_zero(f, s);
3283}
3284
3285static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
3286{
3287 if (float64_is_any_nan(f)) {
3288 float_raise(float_flag_invalid, s);
3289 return 0;
3290 }
3291 return float64_to_uint64_round_to_zero(f, s);
3292}
3293
46d33d1e
RH
3294DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
3295DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
3296DO_ZPZ_FP(sve_fcvt_dh, uint64_t, , sve_f64_to_f16)
3297DO_ZPZ_FP(sve_fcvt_hd, uint64_t, , sve_f16_to_f64)
3298DO_ZPZ_FP(sve_fcvt_ds, uint64_t, , float64_to_float32)
3299DO_ZPZ_FP(sve_fcvt_sd, uint64_t, , float32_to_float64)
3300
df4de1af
RH
3301DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
3302DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
3303DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
3304DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, , vfp_float16_to_int64_rtz)
3305DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, , vfp_float32_to_int64_rtz)
3306DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, , helper_vfp_tosizd)
3307DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, , vfp_float64_to_int64_rtz)
3308
3309DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
3310DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
3311DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
3312DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, , vfp_float16_to_uint64_rtz)
3313DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, , vfp_float32_to_uint64_rtz)
3314DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, , helper_vfp_touizd)
3315DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, , vfp_float64_to_uint64_rtz)
3316
cda3c753
RH
3317DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
3318DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
3319DO_ZPZ_FP(sve_frint_d, uint64_t, , helper_rintd)
3320
3321DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
3322DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
3323DO_ZPZ_FP(sve_frintx_d, uint64_t, , float64_round_to_int)
3324
ec5b375b
RH
3325DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
3326DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
3327DO_ZPZ_FP(sve_frecpx_d, uint64_t, , helper_frecpx_f64)
3328
3329DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
3330DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
3331DO_ZPZ_FP(sve_fsqrt_d, uint64_t, , float64_sqrt)
3332
8092c6a3
RH
3333DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
3334DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
3335DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
3336DO_ZPZ_FP(sve_scvt_sd, uint64_t, , int32_to_float64)
3337DO_ZPZ_FP(sve_scvt_dh, uint64_t, , int64_to_float16)
3338DO_ZPZ_FP(sve_scvt_ds, uint64_t, , int64_to_float32)
3339DO_ZPZ_FP(sve_scvt_dd, uint64_t, , int64_to_float64)
3340
3341DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
3342DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
3343DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
3344DO_ZPZ_FP(sve_ucvt_sd, uint64_t, , uint32_to_float64)
3345DO_ZPZ_FP(sve_ucvt_dh, uint64_t, , uint64_to_float16)
3346DO_ZPZ_FP(sve_ucvt_ds, uint64_t, , uint64_to_float32)
3347DO_ZPZ_FP(sve_ucvt_dd, uint64_t, , uint64_to_float64)
3348
3349#undef DO_ZPZ_FP
3350
08975da9
RH
3351static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
3352 float_status *status, uint32_t desc,
6ceabaad
RH
3353 uint16_t neg1, uint16_t neg3)
3354{
3355 intptr_t i = simd_oprsz(desc);
6ceabaad
RH
3356 uint64_t *g = vg;
3357
3358 do {
3359 uint64_t pg = g[(i - 1) >> 6];
3360 do {
3361 i -= 2;
3362 if (likely((pg >> (i & 63)) & 1)) {
3363 float16 e1, e2, e3, r;
3364
3365 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
3366 e2 = *(uint16_t *)(vm + H1_2(i));
3367 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
08975da9 3368 r = float16_muladd(e1, e2, e3, 0, status);
6ceabaad
RH
3369 *(uint16_t *)(vd + H1_2(i)) = r;
3370 }
3371 } while (i & 63);
3372 } while (i != 0);
3373}
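
/* The NEG1/NEG3 arguments flip the sign bit of the first and third
 * operands, so all four flavours fall out of one muladd:
 *   fmla (0, 0), fmls (neg1), fnmla (neg1 and neg3), fnmls (neg3).
 */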
3374
08975da9
RH
3375void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3376 void *vg, void *status, uint32_t desc)
6ceabaad 3377{
08975da9 3378 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
6ceabaad
RH
3379}
3380
08975da9
RH
3381void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3382 void *vg, void *status, uint32_t desc)
6ceabaad 3383{
08975da9 3384 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
6ceabaad
RH
3385}
3386
08975da9
RH
3387void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3388 void *vg, void *status, uint32_t desc)
6ceabaad 3389{
08975da9 3390 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
6ceabaad
RH
3391}
3392
08975da9
RH
3393void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3394 void *vg, void *status, uint32_t desc)
6ceabaad 3395{
08975da9 3396 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
6ceabaad
RH
3397}
3398
08975da9
RH
3399static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
3400 float_status *status, uint32_t desc,
6ceabaad
RH
3401 uint32_t neg1, uint32_t neg3)
3402{
3403 intptr_t i = simd_oprsz(desc);
6ceabaad
RH
3404 uint64_t *g = vg;
3405
3406 do {
3407 uint64_t pg = g[(i - 1) >> 6];
3408 do {
3409 i -= 4;
3410 if (likely((pg >> (i & 63)) & 1)) {
3411 float32 e1, e2, e3, r;
3412
3413 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
3414 e2 = *(uint32_t *)(vm + H1_4(i));
3415 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
08975da9 3416 r = float32_muladd(e1, e2, e3, 0, status);
6ceabaad
RH
3417 *(uint32_t *)(vd + H1_4(i)) = r;
3418 }
3419 } while (i & 63);
3420 } while (i != 0);
3421}
3422
08975da9
RH
3423void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3424 void *vg, void *status, uint32_t desc)
6ceabaad 3425{
08975da9 3426 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
6ceabaad
RH
3427}
3428
08975da9
RH
3429void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3430 void *vg, void *status, uint32_t desc)
6ceabaad 3431{
08975da9 3432 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
6ceabaad
RH
3433}
3434
08975da9
RH
3435void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3436 void *vg, void *status, uint32_t desc)
6ceabaad 3437{
08975da9 3438 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
6ceabaad
RH
3439}
3440
08975da9
RH
3441void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3442 void *vg, void *status, uint32_t desc)
6ceabaad 3443{
08975da9 3444 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
6ceabaad
RH
3445}
3446
08975da9
RH
3447static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
3448 float_status *status, uint32_t desc,
6ceabaad
RH
3449 uint64_t neg1, uint64_t neg3)
3450{
3451 intptr_t i = simd_oprsz(desc);
6ceabaad
RH
3452 uint64_t *g = vg;
3453
3454 do {
3455 uint64_t pg = g[(i - 1) >> 6];
3456 do {
3457 i -= 8;
3458 if (likely((pg >> (i & 63)) & 1)) {
3459 float64 e1, e2, e3, r;
3460
3461 e1 = *(uint64_t *)(vn + i) ^ neg1;
3462 e2 = *(uint64_t *)(vm + i);
3463 e3 = *(uint64_t *)(va + i) ^ neg3;
08975da9 3464 r = float64_muladd(e1, e2, e3, 0, status);
6ceabaad
RH
3465 *(uint64_t *)(vd + i) = r;
3466 }
3467 } while (i & 63);
3468 } while (i != 0);
3469}
3470
08975da9
RH
3471void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3472 void *vg, void *status, uint32_t desc)
6ceabaad 3473{
08975da9 3474 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
3475}
3476
3477void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3478 void *vg, void *status, uint32_t desc)
6ceabaad 3479{
08975da9 3480 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
3481}
3482
3483void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3484 void *vg, void *status, uint32_t desc)
6ceabaad 3485{
08975da9 3486 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
3487}
3488
3489void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3490 void *vg, void *status, uint32_t desc)
6ceabaad 3491{
08975da9 3492 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
3493}
3494
3495/* Two operand floating-point comparison controlled by a predicate.
3496 * Unlike the integer version, we are not allowed to optimistically
3497 * compare operands, since the comparison may have side effects wrt
3498 * the FPSR.
3499 */
3500#define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
3501void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
3502 void *status, uint32_t desc) \
3503{ \
3504 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
3505 uint64_t *d = vd, *g = vg; \
3506 do { \
3507 uint64_t out = 0, pg = g[j]; \
3508 do { \
3509 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3510 if (likely((pg >> (i & 63)) & 1)) { \
3511 TYPE nn = *(TYPE *)(vn + H(i)); \
3512 TYPE mm = *(TYPE *)(vm + H(i)); \
3513 out |= OP(TYPE, nn, mm, status); \
3514 } \
3515 } while (i & 63); \
3516 d[j--] = out; \
3517 } while (i > 0); \
3518}
3519
3520#define DO_FPCMP_PPZZ_H(NAME, OP) \
3521 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
3522#define DO_FPCMP_PPZZ_S(NAME, OP) \
3523 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
3524#define DO_FPCMP_PPZZ_D(NAME, OP) \
3525 DO_FPCMP_PPZZ(NAME##_d, float64, , OP)
3526
3527#define DO_FPCMP_PPZZ_ALL(NAME, OP) \
3528 DO_FPCMP_PPZZ_H(NAME, OP) \
3529 DO_FPCMP_PPZZ_S(NAME, OP) \
3530 DO_FPCMP_PPZZ_D(NAME, OP)
3531
3532#define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
3533#define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
3534#define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
3535#define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
3536#define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
3537#define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
3538#define DO_FCMUO(TYPE, X, Y, ST) \
3539 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
3540#define DO_FACGE(TYPE, X, Y, ST) \
3541 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
3542#define DO_FACGT(TYPE, X, Y, ST) \
3543 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
3544
3545DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
3546DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
3547DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
3548DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
3549DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
3550DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
3551DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
3552
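/*
 * Illustration (a hedged sketch, not from the original sources): after
 * preprocessing, e.g. DO_FPCMP_PPZZ_D(sve_fcmge, DO_FCMGE) yields
 * sve_fcmge_d, whose per-element test is equivalent to the snippet
 * below.  FCMGE(X, Y) is coded as "compare(Y, X) <= 0": with the
 * operands swapped, a signalling compare returning float_relation_less
 * or float_relation_equal means X >= Y, and NaN operands raise the
 * Invalid Operation exception as the architecture requires.
 */
#if 0
static uint64_t fcmge_d_one_elt_demo(float64 nn, float64 mm, float_status *st)
{
    return float64_compare(mm, nn, st) <= 0;
}
#endif
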
3553#undef DO_FPCMP_PPZZ_ALL
3554#undef DO_FPCMP_PPZZ_D
3555#undef DO_FPCMP_PPZZ_S
3556#undef DO_FPCMP_PPZZ_H
3557#undef DO_FPCMP_PPZZ
3558
3559/* One operand floating-point comparison against zero, controlled
3560 * by a predicate.
3561 */
3562#define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
3563void HELPER(NAME)(void *vd, void *vn, void *vg, \
3564 void *status, uint32_t desc) \
3565{ \
3566 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
3567 uint64_t *d = vd, *g = vg; \
3568 do { \
3569 uint64_t out = 0, pg = g[j]; \
3570 do { \
3571 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3572 if ((pg >> (i & 63)) & 1) { \
3573 TYPE nn = *(TYPE *)(vn + H(i)); \
3574 out |= OP(TYPE, nn, 0, status); \
3575 } \
3576 } while (i & 63); \
3577 d[j--] = out; \
3578 } while (i > 0); \
3579}
3580
3581#define DO_FPCMP_PPZ0_H(NAME, OP) \
3582 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
3583#define DO_FPCMP_PPZ0_S(NAME, OP) \
3584 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
3585#define DO_FPCMP_PPZ0_D(NAME, OP) \
3586 DO_FPCMP_PPZ0(NAME##_d, float64, , OP)
3587
3588#define DO_FPCMP_PPZ0_ALL(NAME, OP) \
3589 DO_FPCMP_PPZ0_H(NAME, OP) \
3590 DO_FPCMP_PPZ0_S(NAME, OP) \
3591 DO_FPCMP_PPZ0_D(NAME, OP)
3592
3593DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
3594DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
3595DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
3596DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
3597DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
3598DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
3599
3600/* FP Trig Multiply-Add. */
3601
3602void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3603{
3604 static const float16 coeff[16] = {
3605 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3606 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3607 };
3608 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
3609 intptr_t x = simd_data(desc);
3610 float16 *d = vd, *n = vn, *m = vm;
3611 for (i = 0; i < opr_sz; i++) {
3612 float16 mm = m[i];
3613 intptr_t xx = x;
3614 if (float16_is_neg(mm)) {
3615 mm = float16_abs(mm);
3616 xx += 8;
3617 }
3618 d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
3619 }
3620}
3621
3622void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3623{
3624 static const float32 coeff[16] = {
3625 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
3626 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
3627 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
3628 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
3629 };
3630 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
3631 intptr_t x = simd_data(desc);
3632 float32 *d = vd, *n = vn, *m = vm;
3633 for (i = 0; i < opr_sz; i++) {
3634 float32 mm = m[i];
3635 intptr_t xx = x;
3636 if (float32_is_neg(mm)) {
3637 mm = float32_abs(mm);
3638 xx += 8;
3639 }
3640 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
3641 }
3642}
3643
3644void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3645{
3646 static const float64 coeff[16] = {
3647 0x3ff0000000000000ull, 0xbfc5555555555543ull,
3648 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
3649 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
3650 0x3de5d8408868552full, 0x0000000000000000ull,
3651 0x3ff0000000000000ull, 0xbfe0000000000000ull,
3652 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
3653 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
3654 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
3655 };
3656 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
3657 intptr_t x = simd_data(desc);
3658 float64 *d = vd, *n = vn, *m = vm;
3659 for (i = 0; i < opr_sz; i++) {
3660 float64 mm = m[i];
3661 intptr_t xx = x;
3662 if (float64_is_neg(mm)) {
3663 mm = float64_abs(mm);
3664 xx += 8;
3665 }
3666 d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
3667 }
3668}
3669
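/*
 * Note (hedged, editorial illustration): the coeff[] tables above appear
 * to hold the sine series (1, -1/6, 1/120, ...) in their first eight
 * entries and the cosine series (1, -1/2, 1/24, ...) in the second
 * eight; a negative multiplicand selects the second half via "xx += 8".
 * As a worked decode of one entry, 0xbfc5555555555543 in the float64
 * table has sign = 1, biased exponent 0x3fc (unbiased -3) and mantissa
 * ~1.3333, i.e. roughly -1.3333 * 2^-3 = -0.16666... = -1/6.
 */
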
3670/*
3671 * FP Complex Add
3672 */
3673
3674void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
3675 void *vs, uint32_t desc)
3676{
3677 intptr_t j, i = simd_oprsz(desc);
3678 uint64_t *g = vg;
3679 float16 neg_imag = float16_set_sign(0, simd_data(desc));
3680 float16 neg_real = float16_chs(neg_imag);
3681
3682 do {
3683 uint64_t pg = g[(i - 1) >> 6];
3684 do {
3685 float16 e0, e1, e2, e3;
3686
3687 /* I holds the real index; J holds the imag index. */
3688 j = i - sizeof(float16);
3689 i -= 2 * sizeof(float16);
3690
3691 e0 = *(float16 *)(vn + H1_2(i));
3692 e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
3693 e2 = *(float16 *)(vn + H1_2(j));
3694 e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
3695
3696 if (likely((pg >> (i & 63)) & 1)) {
3697 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
3698 }
3699 if (likely((pg >> (j & 63)) & 1)) {
3700 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
3701 }
3702 } while (i & 63);
3703 } while (i != 0);
3704}
3705
3706void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
3707 void *vs, uint32_t desc)
3708{
3709 intptr_t j, i = simd_oprsz(desc);
3710 uint64_t *g = vg;
3711 float32 neg_imag = float32_set_sign(0, simd_data(desc));
3712 float32 neg_real = float32_chs(neg_imag);
3713
3714 do {
3715 uint64_t pg = g[(i - 1) >> 6];
3716 do {
3717 float32 e0, e1, e2, e3;
3718
3719 /* I holds the real index; J holds the imag index. */
3720 j = i - sizeof(float32);
3721 i -= 2 * sizeof(float32);
3722
3723 e0 = *(float32 *)(vn + H1_2(i));
3724 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
3725 e2 = *(float32 *)(vn + H1_2(j));
3726 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
3727
3728 if (likely((pg >> (i & 63)) & 1)) {
3729 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
3730 }
3731 if (likely((pg >> (j & 63)) & 1)) {
3732 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
3733 }
3734 } while (i & 63);
3735 } while (i != 0);
3736}
3737
3738void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
3739 void *vs, uint32_t desc)
3740{
3741 intptr_t j, i = simd_oprsz(desc);
3742 uint64_t *g = vg;
3743 float64 neg_imag = float64_set_sign(0, simd_data(desc));
3744 float64 neg_real = float64_chs(neg_imag);
3745
3746 do {
3747 uint64_t pg = g[(i - 1) >> 6];
3748 do {
3749 float64 e0, e1, e2, e3;
3750
3751 /* I holds the real index; J holds the imag index. */
3752 j = i - sizeof(float64);
3753 i -= 2 * sizeof(float64);
3754
3755 e0 = *(float64 *)(vn + H1_2(i));
3756 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
3757 e2 = *(float64 *)(vn + H1_2(j));
3758 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
3759
3760 if (likely((pg >> (i & 63)) & 1)) {
3761 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
3762 }
3763 if (likely((pg >> (j & 63)) & 1)) {
3764 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
3765 }
3766 } while (i & 63);
3767 } while (i != 0);
3768}
3769
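/*
 * Illustration (hedged, assuming the architectural 90/270 degree
 * encoding of the rot bit): written out for one complex element pair,
 * the fcadd loops above compute
 *   rot = 0 (90 deg):   d_real = n_real - m_imag;  d_imag = n_imag + m_real;
 *   rot = 1 (270 deg):  d_real = n_real + m_imag;  d_imag = n_imag - m_real;
 * where the subtractions come from XORing the sign bit (neg_real or
 * neg_imag) into the relevant operand before the ordinary float*_add.
 */
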
3770/*
3771 * FP Complex Multiply
3772 */
3773
3774void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3775 void *vg, void *status, uint32_t desc)
3776{
3777 intptr_t j, i = simd_oprsz(desc);
08975da9 3778 unsigned rot = simd_data(desc);
3779 bool flip = rot & 1;
3780 float16 neg_imag, neg_real;
3781 uint64_t *g = vg;
3782
3783 neg_imag = float16_set_sign(0, (rot & 2) != 0);
3784 neg_real = float16_set_sign(0, rot == 1 || rot == 2);
3785
3786 do {
3787 uint64_t pg = g[(i - 1) >> 6];
3788 do {
3789 float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
3790
3791 /* I holds the real index; J holds the imag index. */
3792 j = i - sizeof(float16);
3793 i -= 2 * sizeof(float16);
3794
3795 nr = *(float16 *)(vn + H1_2(i));
3796 ni = *(float16 *)(vn + H1_2(j));
3797 mr = *(float16 *)(vm + H1_2(i));
3798 mi = *(float16 *)(vm + H1_2(j));
3799
3800 e2 = (flip ? ni : nr);
3801 e1 = (flip ? mi : mr) ^ neg_real;
3802 e4 = e2;
3803 e3 = (flip ? mr : mi) ^ neg_imag;
3804
3805 if (likely((pg >> (i & 63)) & 1)) {
3806 d = *(float16 *)(va + H1_2(i));
08975da9 3807 d = float16_muladd(e2, e1, d, 0, status);
3808 *(float16 *)(vd + H1_2(i)) = d;
3809 }
3810 if (likely((pg >> (j & 63)) & 1)) {
3811 d = *(float16 *)(va + H1_2(j));
08975da9 3812 d = float16_muladd(e4, e3, d, 0, status);
3813 *(float16 *)(vd + H1_2(j)) = d;
3814 }
3815 } while (i & 63);
3816 } while (i != 0);
3817}
3818
3819void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3820 void *vg, void *status, uint32_t desc)
3821{
3822 intptr_t j, i = simd_oprsz(desc);
08975da9 3823 unsigned rot = simd_data(desc);
3824 bool flip = rot & 1;
3825 float32 neg_imag, neg_real;
3826 uint64_t *g = vg;
3827
3828 neg_imag = float32_set_sign(0, (rot & 2) != 0);
3829 neg_real = float32_set_sign(0, rot == 1 || rot == 2);
3830
3831 do {
3832 uint64_t pg = g[(i - 1) >> 6];
3833 do {
3834 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
3835
3836 /* I holds the real index; J holds the imag index. */
3837 j = i - sizeof(float32);
3838 i -= 2 * sizeof(float32);
3839
3840 nr = *(float32 *)(vn + H1_2(i));
3841 ni = *(float32 *)(vn + H1_2(j));
3842 mr = *(float32 *)(vm + H1_2(i));
3843 mi = *(float32 *)(vm + H1_2(j));
3844
3845 e2 = (flip ? ni : nr);
3846 e1 = (flip ? mi : mr) ^ neg_real;
3847 e4 = e2;
3848 e3 = (flip ? mr : mi) ^ neg_imag;
3849
3850 if (likely((pg >> (i & 63)) & 1)) {
3851 d = *(float32 *)(va + H1_2(i));
08975da9 3852 d = float32_muladd(e2, e1, d, 0, status);
3853 *(float32 *)(vd + H1_2(i)) = d;
3854 }
3855 if (likely((pg >> (j & 63)) & 1)) {
3856 d = *(float32 *)(va + H1_2(j));
08975da9 3857 d = float32_muladd(e4, e3, d, 0, status);
3858 *(float32 *)(vd + H1_2(j)) = d;
3859 }
3860 } while (i & 63);
3861 } while (i != 0);
3862}
3863
3864void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
3865 void *vg, void *status, uint32_t desc)
3866{
3867 intptr_t j, i = simd_oprsz(desc);
08975da9 3868 unsigned rot = simd_data(desc);
3869 bool flip = rot & 1;
3870 float64 neg_imag, neg_real;
3871 uint64_t *g = vg;
3872
3873 neg_imag = float64_set_sign(0, (rot & 2) != 0);
3874 neg_real = float64_set_sign(0, rot == 1 || rot == 2);
3875
3876 do {
3877 uint64_t pg = g[(i - 1) >> 6];
3878 do {
3879 float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
3880
3881 /* I holds the real index; J holds the imag index. */
3882 j = i - sizeof(float64);
3883 i -= 2 * sizeof(float64);
3884
3885 nr = *(float64 *)(vn + H1_2(i));
3886 ni = *(float64 *)(vn + H1_2(j));
3887 mr = *(float64 *)(vm + H1_2(i));
3888 mi = *(float64 *)(vm + H1_2(j));
3889
3890 e2 = (flip ? ni : nr);
3891 e1 = (flip ? mi : mr) ^ neg_real;
3892 e4 = e2;
3893 e3 = (flip ? mr : mi) ^ neg_imag;
3894
3895 if (likely((pg >> (i & 63)) & 1)) {
3896 d = *(float64 *)(va + H1_2(i));
08975da9 3897 d = float64_muladd(e2, e1, d, 0, status);
3898 *(float64 *)(vd + H1_2(i)) = d;
3899 }
3900 if (likely((pg >> (j & 63)) & 1)) {
3901 d = *(float64 *)(va + H1_2(j));
08975da9 3902 d = float64_muladd(e4, e3, d, 0, status);
3903 *(float64 *)(vd + H1_2(j)) = d;
3904 }
3905 } while (i & 63);
3906 } while (i != 0);
3907}
3908
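/*
 * Illustration (editorial, derived from the decode above): the
 * flip/neg_real/neg_imag logic expands the two-bit rot value into the
 * four FCMLA rotations, accumulating into the addend as
 *   rot 0:  d_real += n_real * m_real;   d_imag += n_real * m_imag;
 *   rot 1:  d_real -= n_imag * m_imag;   d_imag += n_imag * m_real;
 *   rot 2:  d_real -= n_real * m_real;   d_imag -= n_real * m_imag;
 *   rot 3:  d_real += n_imag * m_imag;   d_imag -= n_imag * m_real;
 * so a full complex multiply-accumulate is built from a pair of these
 * operations using rotations 0 and 1 (or 2 and 3).
 */
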
3909/*
3910 * Load contiguous data, protected by a governing predicate.
3911 */
3912
3913/*
3914 * Load one element into @vd + @reg_off from @host.
3915 * The controlling predicate is known to be true.
9123aeb6 3916 */
cf4a49b7 3917typedef void sve_ldst1_host_fn(void *vd, intptr_t reg_off, void *host);
3918
3919/*
3920 * Load one element into @vd + @reg_off from (@env, @vaddr, @ra).
3921 * The controlling predicate is known to be true.
3922 */
3923typedef void sve_ldst1_tlb_fn(CPUARMState *env, void *vd, intptr_t reg_off,
3924 target_ulong vaddr, uintptr_t retaddr);
3925
3926/*
3927 * Generate the above primitives.
3928 */
3929
3930#define DO_LD_HOST(NAME, H, TYPEE, TYPEM, HOST) \
3931static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
3932{ \
3933 TYPEM val = HOST(host); \
3934 *(TYPEE *)(vd + H(reg_off)) = val; \
3935}
3936
3937#define DO_ST_HOST(NAME, H, TYPEE, TYPEM, HOST) \
3938static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
3939{ HOST(host, (TYPEM)*(TYPEE *)(vd + H(reg_off))); }
3940
6799ce7b 3941#define DO_LD_TLB(NAME, H, TYPEE, TYPEM, TLB) \
9123aeb6 3942static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
6799ce7b 3943 target_ulong addr, uintptr_t ra) \
9123aeb6 3944{ \
3945 *(TYPEE *)(vd + H(reg_off)) = \
3946 (TYPEM)TLB(env, useronly_clean_ptr(addr), ra); \
9123aeb6 3947}
3948
3949#define DO_ST_TLB(NAME, H, TYPEE, TYPEM, TLB) \
9123aeb6 3950static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
6799ce7b 3951 target_ulong addr, uintptr_t ra) \
9123aeb6 3952{ \
3953 TLB(env, useronly_clean_ptr(addr), \
3954 (TYPEM)*(TYPEE *)(vd + H(reg_off)), ra); \
9123aeb6 3955}
3956
3957#define DO_LD_PRIM_1(NAME, H, TE, TM) \
3958 DO_LD_HOST(NAME, H, TE, TM, ldub_p) \
6799ce7b 3959 DO_LD_TLB(NAME, H, TE, TM, cpu_ldub_data_ra)
3960
3961DO_LD_PRIM_1(ld1bb, H1, uint8_t, uint8_t)
3962DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t)
3963DO_LD_PRIM_1(ld1bhs, H1_2, uint16_t, int8_t)
3964DO_LD_PRIM_1(ld1bsu, H1_4, uint32_t, uint8_t)
3965DO_LD_PRIM_1(ld1bss, H1_4, uint32_t, int8_t)
3966DO_LD_PRIM_1(ld1bdu, , uint64_t, uint8_t)
3967DO_LD_PRIM_1(ld1bds, , uint64_t, int8_t)
3968
6799ce7b 3969#define DO_ST_PRIM_1(NAME, H, TE, TM) \
0fa476c1 3970 DO_ST_HOST(st1##NAME, H, TE, TM, stb_p) \
3971 DO_ST_TLB(st1##NAME, H, TE, TM, cpu_stb_data_ra)
3972
3973DO_ST_PRIM_1(bb, H1, uint8_t, uint8_t)
3974DO_ST_PRIM_1(bh, H1_2, uint16_t, uint8_t)
3975DO_ST_PRIM_1(bs, H1_4, uint32_t, uint8_t)
3976DO_ST_PRIM_1(bd, , uint64_t, uint8_t)
9123aeb6 3977
3978#define DO_LD_PRIM_2(NAME, H, TE, TM, LD) \
3979 DO_LD_HOST(ld1##NAME##_be, H, TE, TM, LD##_be_p) \
3980 DO_LD_HOST(ld1##NAME##_le, H, TE, TM, LD##_le_p) \
3981 DO_LD_TLB(ld1##NAME##_be, H, TE, TM, cpu_##LD##_be_data_ra) \
3982 DO_LD_TLB(ld1##NAME##_le, H, TE, TM, cpu_##LD##_le_data_ra)
9123aeb6 3983
6799ce7b 3984#define DO_ST_PRIM_2(NAME, H, TE, TM, ST) \
3985 DO_ST_HOST(st1##NAME##_be, H, TE, TM, ST##_be_p) \
3986 DO_ST_HOST(st1##NAME##_le, H, TE, TM, ST##_le_p) \
3987 DO_ST_TLB(st1##NAME##_be, H, TE, TM, cpu_##ST##_be_data_ra) \
3988 DO_ST_TLB(st1##NAME##_le, H, TE, TM, cpu_##ST##_le_data_ra)
9123aeb6 3989
3990DO_LD_PRIM_2(hh, H1_2, uint16_t, uint16_t, lduw)
3991DO_LD_PRIM_2(hsu, H1_4, uint32_t, uint16_t, lduw)
3992DO_LD_PRIM_2(hss, H1_4, uint32_t, int16_t, lduw)
3993DO_LD_PRIM_2(hdu, , uint64_t, uint16_t, lduw)
3994DO_LD_PRIM_2(hds, , uint64_t, int16_t, lduw)
9123aeb6 3995
3996DO_ST_PRIM_2(hh, H1_2, uint16_t, uint16_t, stw)
3997DO_ST_PRIM_2(hs, H1_4, uint32_t, uint16_t, stw)
3998DO_ST_PRIM_2(hd, , uint64_t, uint16_t, stw)
9123aeb6 3999
4000DO_LD_PRIM_2(ss, H1_4, uint32_t, uint32_t, ldl)
4001DO_LD_PRIM_2(sdu, , uint64_t, uint32_t, ldl)
4002DO_LD_PRIM_2(sds, , uint64_t, int32_t, ldl)
9123aeb6 4003
4004DO_ST_PRIM_2(ss, H1_4, uint32_t, uint32_t, stl)
4005DO_ST_PRIM_2(sd, , uint64_t, uint32_t, stl)
4006
4007DO_LD_PRIM_2(dd, , uint64_t, uint64_t, ldq)
4008DO_ST_PRIM_2(dd, , uint64_t, uint64_t, stq)
4009
4010#undef DO_LD_TLB
6799ce7b 4011#undef DO_ST_TLB
4012#undef DO_LD_HOST
4013#undef DO_LD_PRIM_1
6799ce7b 4014#undef DO_ST_PRIM_1
9123aeb6 4015#undef DO_LD_PRIM_2
6799ce7b 4016#undef DO_ST_PRIM_2
4017
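/*
 * Illustration (editorial): as an example of what the generators above
 * produce, DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t) expands to two
 * primitives: sve_ld1bhu_host(), which widens one byte read with
 * ldub_p() from host memory into a 16-bit vector element, and
 * sve_ld1bhu_tlb(), which performs the same access through
 * cpu_ldub_data_ra() and may therefore fault and unwind via retaddr.
 */
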
4018/*
4019 * Skip through a sequence of inactive elements in the guarding predicate @vg,
4020 * beginning at @reg_off bounded by @reg_max. Return the offset of the active
4021 * element >= @reg_off, or @reg_max if there were no active elements at all.
4022 */
4023static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
4024 intptr_t reg_max, int esz)
4025{
4026 uint64_t pg_mask = pred_esz_masks[esz];
4027 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
4028
4029 /* In normal usage, the first element is active. */
4030 if (likely(pg & 1)) {
4031 return reg_off;
4032 }
4033
4034 if (pg == 0) {
4035 reg_off &= -64;
4036 do {
4037 reg_off += 64;
4038 if (unlikely(reg_off >= reg_max)) {
4039 /* The entire predicate was false. */
4040 return reg_max;
4041 }
4042 pg = vg[reg_off >> 6] & pg_mask;
4043 } while (pg == 0);
4044 }
4045 reg_off += ctz64(pg);
4046
4047 /* We should never see an out of range predicate bit set. */
4048 tcg_debug_assert(reg_off < reg_max);
4049 return reg_off;
4050}
4051
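/*
 * Worked example (hypothetical values): with esz == 2 (32-bit elements),
 * pred_esz_masks[2] keeps only every fourth predicate bit.  Starting
 * from reg_off == 4 with element 1 inactive and element 2 the next
 * active one (predicate bit 8 set), the masked-and-shifted pg has bit 4
 * set, so pg & 1 is clear and ctz64(pg) == 4, returning reg_off == 8,
 * the byte offset of element 2.
 */
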
4052/*
4053 * Resolve the guest virtual address to info->host and info->flags.
4054 * If @nofault, return false if the page is invalid, otherwise
4055 * exit via page fault exception.
4056 */
4057
4058typedef struct {
4059 void *host;
4060 int flags;
4061 MemTxAttrs attrs;
4062} SVEHostPage;
4063
4064static bool sve_probe_page(SVEHostPage *info, bool nofault,
4065 CPUARMState *env, target_ulong addr,
4066 int mem_off, MMUAccessType access_type,
4067 int mmu_idx, uintptr_t retaddr)
4068{
4069 int flags;
4070
4071 addr += mem_off;
4072
4073 /*
4074 * User-only currently always issues with TBI. See the comment
4075 * above useronly_clean_ptr. Usually we clean this top byte away
4076 * during translation, but we can't do that for e.g. vector + imm
4077 * addressing modes.
4078 *
4079 * We currently always enable TBI for user-only, and do not provide
4080 * a way to turn it off. So clean the pointer unconditionally here,
4081 * rather than look it up here, or pass it down from above.
4082 */
4083 addr = useronly_clean_ptr(addr);
4084
4085 flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault,
4086 &info->host, retaddr);
4087 info->flags = flags;
4088
4089 if (flags & TLB_INVALID_MASK) {
4090 g_assert(nofault);
4091 return false;
4092 }
4093
4094 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
4095 info->host -= mem_off;
4096
4097#ifdef CONFIG_USER_ONLY
4098 memset(&info->attrs, 0, sizeof(info->attrs));
4099#else
4100 /*
4101 * Find the iotlbentry for addr and return the transaction attributes.
4102 * This *must* be present in the TLB because we just found the mapping.
4103 */
4104 {
4105 uintptr_t index = tlb_index(env, mmu_idx, addr);
4106
4107# ifdef CONFIG_DEBUG_TCG
4108 CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
4109 target_ulong comparator = (access_type == MMU_DATA_LOAD
4110 ? entry->addr_read
4111 : tlb_addr_write(entry));
4112 g_assert(tlb_hit(comparator, addr));
4113# endif
4114
4115 CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
4116 info->attrs = iotlbentry->attrs;
4117 }
4118#endif
4119
4120 return true;
4121}
4122
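/*
 * Usage sketch (editorial, hypothetical caller): a caller that only
 * wants to know "is this one page accessible without faulting?" could
 * use sve_probe_page() roughly as below; when it returns false the host
 * pointer and flags in info are not meaningful.
 */
#if 0
static bool page_readable_demo(CPUARMState *env, target_ulong addr,
                               uintptr_t ra)
{
    SVEHostPage info;
    return sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
                          cpu_mmu_index(env, false), ra);
}
#endif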
4123
4124/*
4125 * Analyse contiguous data, protected by a governing predicate.
4126 */
4127
4128typedef enum {
4129 FAULT_NO,
4130 FAULT_FIRST,
4131 FAULT_ALL,
4132} SVEContFault;
4133
4134typedef struct {
4135 /*
4136 * First and last element wholly contained within the two pages.
4137 * mem_off_first[0] and reg_off_first[0] are always set >= 0.
4138 * reg_off_last[0] may be < 0 if the first element crosses pages.
4139 * All of mem_off_first[1], reg_off_first[1] and reg_off_last[1]
4140 * are set >= 0 only if there are complete elements on a second page.
4141 *
4142 * The reg_off_* offsets are relative to the internal vector register.
4143 * The mem_off_first offset is relative to the memory address; the
4144 * two offsets are different when a load operation extends, a store
4145 * operation truncates, or for multi-register operations.
4146 */
4147 int16_t mem_off_first[2];
4148 int16_t reg_off_first[2];
4149 int16_t reg_off_last[2];
4150
4151 /*
4152 * One element that is misaligned and spans both pages,
4153 * or -1 if there is no such active element.
4154 */
4155 int16_t mem_off_split;
4156 int16_t reg_off_split;
4157
4158 /*
4159 * The byte offset at which the entire operation crosses a page boundary.
4160 * Set >= 0 if and only if the entire operation spans two pages.
4161 */
4162 int16_t page_split;
4163
4164 /* TLB data for the two pages. */
4165 SVEHostPage page[2];
4166} SVEContLdSt;
4167
4168/*
4169 * Find first active element on each page, and a loose bound for the
4170 * final element on each page. Identify any single element that spans
4171 * the page boundary. Return true if there are any active elements.
4172 */
4173static bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr,
4174 uint64_t *vg, intptr_t reg_max,
4175 int esz, int msize)
4176{
4177 const int esize = 1 << esz;
4178 const uint64_t pg_mask = pred_esz_masks[esz];
4179 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
4180 intptr_t mem_off_last, mem_off_split;
4181 intptr_t page_split, elt_split;
4182 intptr_t i;
4183
4184 /* Set all of the element indices to -1, and the TLB data to 0. */
4185 memset(info, -1, offsetof(SVEContLdSt, page));
4186 memset(info->page, 0, sizeof(info->page));
4187
4188 /* Gross scan over the entire predicate to find bounds. */
4189 i = 0;
4190 do {
4191 uint64_t pg = vg[i] & pg_mask;
4192 if (pg) {
4193 reg_off_last = i * 64 + 63 - clz64(pg);
4194 if (reg_off_first < 0) {
4195 reg_off_first = i * 64 + ctz64(pg);
4196 }
4197 }
4198 } while (++i * 64 < reg_max);
4199
4200 if (unlikely(reg_off_first < 0)) {
4201 /* No active elements, no pages touched. */
4202 return false;
4203 }
4204 tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
4205
4206 info->reg_off_first[0] = reg_off_first;
4207 info->mem_off_first[0] = (reg_off_first >> esz) * msize;
4208 mem_off_last = (reg_off_last >> esz) * msize;
4209
4210 page_split = -(addr | TARGET_PAGE_MASK);
4211 if (likely(mem_off_last + msize <= page_split)) {
4212 /* The entire operation fits within a single page. */
4213 info->reg_off_last[0] = reg_off_last;
4214 return true;
4215 }
4216
4217 info->page_split = page_split;
4218 elt_split = page_split / msize;
4219 reg_off_split = elt_split << esz;
4220 mem_off_split = elt_split * msize;
4221
4222 /*
4223 * This is the last full element on the first page, but it is not
4224 * necessarily active. If there is no full element, i.e. the first
4225 * active element is the one that's split, this value remains -1.
4226 * It is useful as iteration bounds.
4227 */
4228 if (elt_split != 0) {
4229 info->reg_off_last[0] = reg_off_split - esize;
4230 }
4231
4232 /* Determine if an unaligned element spans the pages. */
4233 if (page_split % msize != 0) {
4234 /* It is helpful to know if the split element is active. */
4235 if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
4236 info->reg_off_split = reg_off_split;
4237 info->mem_off_split = mem_off_split;
4238
4239 if (reg_off_split == reg_off_last) {
4240 /* The page crossing element is last. */
4241 return true;
4242 }
4243 }
4244 reg_off_split += esize;
4245 mem_off_split += msize;
4246 }
4247
4248 /*
4249 * We do want the first active element on the second page, because
4250 * this may affect the address reported in an exception.
4251 */
4252 reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
4253 tcg_debug_assert(reg_off_split <= reg_off_last);
4254 info->reg_off_first[1] = reg_off_split;
4255 info->mem_off_first[1] = (reg_off_split >> esz) * msize;
4256 info->reg_off_last[1] = reg_off_last;
4257 return true;
4258}
4259
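/*
 * Worked example (hypothetical values): for an all-true predicate,
 * esz == 2 / msize == 4 (ld1w), reg_max == 32 and an address 10 bytes
 * short of the page end, page_split == 10 and elt_split == 2, so the
 * function records
 *   reg_off_first[0] = 0,  reg_off_last[0] = 4    (elements 0 and 1)
 *   reg_off_split    = 8,  mem_off_split   = 8    (element 2 straddles)
 *   reg_off_first[1] = 12, reg_off_last[1] = 28   (elements 3 to 7)
 * letting the main loops stay within one page and only the straddling
 * element take the slow cross-page path.
 */
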
4260/*
4261 * Resolve the guest virtual addresses to info->page[].
4262 * Control the generation of page faults with @fault. Return false if
4263 * there is no work to do, which can only happen with @fault == FAULT_NO.
4264 */
4265static bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
4266 CPUARMState *env, target_ulong addr,
4267 MMUAccessType access_type, uintptr_t retaddr)
4268{
4269 int mmu_idx = cpu_mmu_index(env, false);
4270 int mem_off = info->mem_off_first[0];
4271 bool nofault = fault == FAULT_NO;
4272 bool have_work = true;
4273
4274 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
4275 access_type, mmu_idx, retaddr)) {
4276 /* No work to be done. */
4277 return false;
4278 }
4279
4280 if (likely(info->page_split < 0)) {
4281 /* The entire operation was on the one page. */
4282 return true;
4283 }
4284
4285 /*
4286 * If the second page is invalid, then we want the fault address to be
4287 * the first byte on that page which is accessed.
4288 */
4289 if (info->mem_off_split >= 0) {
4290 /*
4291 * There is an element split across the pages. The fault address
4292 * should be the first byte of the second page.
4293 */
4294 mem_off = info->page_split;
4295 /*
4296 * If the split element is also the first active element
4297 * of the vector, then: For first-fault we should continue
4298 * to generate faults for the second page. For no-fault,
4299 * we have work only if the second page is valid.
4300 */
4301 if (info->mem_off_first[0] < info->mem_off_split) {
4302 nofault = FAULT_FIRST;
4303 have_work = false;
4304 }
4305 } else {
4306 /*
4307 * There is no element split across the pages. The fault address
4308 * should be the first active element on the second page.
4309 */
4310 mem_off = info->mem_off_first[1];
4311 /*
4312 * There must have been one active element on the first page,
4313 * so we're out of first-fault territory.
4314 */
4315 nofault = fault != FAULT_ALL;
4316 }
4317
4318 have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
4319 access_type, mmu_idx, retaddr);
4320 return have_work;
4321}
4322
4323static void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
4324 uint64_t *vg, target_ulong addr,
4325 int esize, int msize, int wp_access,
4326 uintptr_t retaddr)
4327{
4328#ifndef CONFIG_USER_ONLY
4329 intptr_t mem_off, reg_off, reg_last;
4330 int flags0 = info->page[0].flags;
4331 int flags1 = info->page[1].flags;
4332
4333 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
4334 return;
4335 }
4336
4337 /* Indicate that watchpoints are handled. */
4338 info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
4339 info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
4340
4341 if (flags0 & TLB_WATCHPOINT) {
4342 mem_off = info->mem_off_first[0];
4343 reg_off = info->reg_off_first[0];
4344 reg_last = info->reg_off_last[0];
4345
4346 while (reg_off <= reg_last) {
4347 uint64_t pg = vg[reg_off >> 6];
4348 do {
4349 if ((pg >> (reg_off & 63)) & 1) {
4350 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
4351 msize, info->page[0].attrs,
4352 wp_access, retaddr);
4353 }
4354 reg_off += esize;
4355 mem_off += msize;
4356 } while (reg_off <= reg_last && (reg_off & 63));
4357 }
4358 }
4359
4360 mem_off = info->mem_off_split;
4361 if (mem_off >= 0) {
4362 cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
4363 info->page[0].attrs, wp_access, retaddr);
4364 }
4365
4366 mem_off = info->mem_off_first[1];
4367 if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
4368 reg_off = info->reg_off_first[1];
4369 reg_last = info->reg_off_last[1];
4370
4371 do {
4372 uint64_t pg = vg[reg_off >> 6];
4373 do {
4374 if ((pg >> (reg_off & 63)) & 1) {
4375 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
4376 msize, info->page[1].attrs,
4377 wp_access, retaddr);
4378 }
4379 reg_off += esize;
4380 mem_off += msize;
4381 } while (reg_off & 63);
4382 } while (reg_off <= reg_last);
4383 }
4384#endif
4385}
4386
4387typedef uint64_t mte_check_fn(CPUARMState *, uint32_t, uint64_t, uintptr_t);
4388
4389static inline QEMU_ALWAYS_INLINE
4390void sve_cont_ldst_mte_check_int(SVEContLdSt *info, CPUARMState *env,
4391 uint64_t *vg, target_ulong addr, int esize,
4392 int msize, uint32_t mtedesc, uintptr_t ra,
4393 mte_check_fn *check)
4394{
4395 intptr_t mem_off, reg_off, reg_last;
4396
4397 /* Process the page only if MemAttr == Tagged. */
4398 if (arm_tlb_mte_tagged(&info->page[0].attrs)) {
4399 mem_off = info->mem_off_first[0];
4400 reg_off = info->reg_off_first[0];
4401 reg_last = info->reg_off_split;
4402 if (reg_last < 0) {
4403 reg_last = info->reg_off_last[0];
4404 }
4405
4406 do {
4407 uint64_t pg = vg[reg_off >> 6];
4408 do {
4409 if ((pg >> (reg_off & 63)) & 1) {
4410 check(env, mtedesc, addr, ra);
4411 }
4412 reg_off += esize;
4413 mem_off += msize;
4414 } while (reg_off <= reg_last && (reg_off & 63));
4415 } while (reg_off <= reg_last);
4416 }
4417
4418 mem_off = info->mem_off_first[1];
4419 if (mem_off >= 0 && arm_tlb_mte_tagged(&info->page[1].attrs)) {
4420 reg_off = info->reg_off_first[1];
4421 reg_last = info->reg_off_last[1];
4422
4423 do {
4424 uint64_t pg = vg[reg_off >> 6];
4425 do {
4426 if ((pg >> (reg_off & 63)) & 1) {
4427 check(env, mtedesc, addr, ra);
4428 }
4429 reg_off += esize;
4430 mem_off += msize;
4431 } while (reg_off & 63);
4432 } while (reg_off <= reg_last);
4433 }
4434}
4435
4436typedef void sve_cont_ldst_mte_check_fn(SVEContLdSt *info, CPUARMState *env,
4437 uint64_t *vg, target_ulong addr,
4438 int esize, int msize, uint32_t mtedesc,
4439 uintptr_t ra);
4440
4441static void sve_cont_ldst_mte_check1(SVEContLdSt *info, CPUARMState *env,
4442 uint64_t *vg, target_ulong addr,
4443 int esize, int msize, uint32_t mtedesc,
4444 uintptr_t ra)
4445{
4446 sve_cont_ldst_mte_check_int(info, env, vg, addr, esize, msize,
4447 mtedesc, ra, mte_check1);
4448}
4449
4450static void sve_cont_ldst_mte_checkN(SVEContLdSt *info, CPUARMState *env,
4451 uint64_t *vg, target_ulong addr,
4452 int esize, int msize, uint32_t mtedesc,
4453 uintptr_t ra)
4454{
4455 sve_cont_ldst_mte_check_int(info, env, vg, addr, esize, msize,
4456 mtedesc, ra, mte_checkN);
4457}
4458
4459
9123aeb6 4460/*
5c9b8458 4461 * Common helper for all contiguous 1,2,3,4-register predicated loads.
9123aeb6 4462 */
b854fd06 4463static inline QEMU_ALWAYS_INLINE
5c9b8458 4464void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
b854fd06 4465 uint32_t desc, const uintptr_t retaddr,
206adacf 4466 const int esz, const int msz, const int N, uint32_t mtedesc,
b854fd06 4467 sve_ldst1_host_fn *host_fn,
4468 sve_ldst1_tlb_fn *tlb_fn,
4469 sve_cont_ldst_mte_check_fn *mte_check_fn)
b854fd06 4470{
ba080b86 4471 const unsigned rd = simd_data(desc);
9123aeb6 4472 const intptr_t reg_max = simd_oprsz(desc);
4473 intptr_t reg_off, reg_last, mem_off;
4474 SVEContLdSt info;
9123aeb6 4475 void *host;
5c9b8458 4476 int flags, i;
9123aeb6 4477
b854fd06 4478 /* Find the active elements. */
5c9b8458 4479 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
9123aeb6 4480 /* The entire predicate was false; no load occurs. */
4481 for (i = 0; i < N; ++i) {
4482 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
4483 }
4484 return;
4485 }
9123aeb6 4486
4487 /* Probe the page(s). Exit with exception for any invalid page. */
4488 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
9123aeb6 4489
4bcc3f0f 4490 /* Handle watchpoints for all active elements. */
5c9b8458 4491 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
4492 BP_MEM_READ, retaddr);
4493
4494 /*
4495 * Handle mte checks for all active elements.
4496 * Since TBI must be set for MTE, !mtedesc => !mte_active.
4497 */
4498 if (mte_check_fn && mtedesc) {
4499 mte_check_fn(&info, env, vg, addr, 1 << esz, N << msz,
4500 mtedesc, retaddr);
4501 }
4bcc3f0f 4502
4503 flags = info.page[0].flags | info.page[1].flags;
4504 if (unlikely(flags != 0)) {
9123aeb6 4505#ifdef CONFIG_USER_ONLY
b854fd06 4506 g_assert_not_reached();
9123aeb6 4507#else
b854fd06 4508 /*
4bcc3f0f 4509 * At least one page includes MMIO.
4510 * Any bus operation can fail with cpu_transaction_failed,
4511 * which for ARM will raise SyncExternal. Perform the load
4512 * into scratch memory to preserve register state until the end.
4513 */
5c9b8458 4514 ARMVectorReg scratch[4] = { };
b854fd06 4515
4516 mem_off = info.mem_off_first[0];
4517 reg_off = info.reg_off_first[0];
4518 reg_last = info.reg_off_last[1];
4519 if (reg_last < 0) {
4520 reg_last = info.reg_off_split;
4521 if (reg_last < 0) {
4522 reg_last = info.reg_off_last[0];
4523 }
4524 }
4525
4526 do {
4527 uint64_t pg = vg[reg_off >> 6];
4528 do {
4529 if ((pg >> (reg_off & 63)) & 1) {
4530 for (i = 0; i < N; ++i) {
4531 tlb_fn(env, &scratch[i], reg_off,
4532 addr + mem_off + (i << msz), retaddr);
4533 }
4534 }
4535 reg_off += 1 << esz;
5c9b8458 4536 mem_off += N << msz;
4537 } while (reg_off & 63);
4538 } while (reg_off <= reg_last);
4539
4540 for (i = 0; i < N; ++i) {
4541 memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
4542 }
b854fd06 4543 return;
9123aeb6 4544#endif
4545 }
4546
4547 /* The entire operation is in RAM, on valid pages. */
4548
4549 for (i = 0; i < N; ++i) {
4550 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
4551 }
4552
4553 mem_off = info.mem_off_first[0];
4554 reg_off = info.reg_off_first[0];
4555 reg_last = info.reg_off_last[0];
4556 host = info.page[0].host;
4557
4558 while (reg_off <= reg_last) {
4559 uint64_t pg = vg[reg_off >> 6];
4560 do {
4561 if ((pg >> (reg_off & 63)) & 1) {
4562 for (i = 0; i < N; ++i) {
4563 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
4564 host + mem_off + (i << msz));
4565 }
4566 }
4567 reg_off += 1 << esz;
5c9b8458 4568 mem_off += N << msz;
4569 } while (reg_off <= reg_last && (reg_off & 63));
4570 }
9123aeb6 4571
4572 /*
4573 * Use the slow path to manage the cross-page misalignment.
4574 * But we know this is RAM and cannot trap.
4575 */
4576 mem_off = info.mem_off_split;
4577 if (unlikely(mem_off >= 0)) {
4578 reg_off = info.reg_off_split;
4579 for (i = 0; i < N; ++i) {
4580 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
4581 addr + mem_off + (i << msz), retaddr);
4582 }
4583 }
4584
4585 mem_off = info.mem_off_first[1];
4586 if (unlikely(mem_off >= 0)) {
4587 reg_off = info.reg_off_first[1];
4588 reg_last = info.reg_off_last[1];
4589 host = info.page[1].host;
4590
4591 do {
4592 uint64_t pg = vg[reg_off >> 6];
4593 do {
4594 if ((pg >> (reg_off & 63)) & 1) {
4595 for (i = 0; i < N; ++i) {
4596 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
4597 host + mem_off + (i << msz));
4598 }
4599 }
4600 reg_off += 1 << esz;
5c9b8458 4601 mem_off += N << msz;
b854fd06
RH
4602 } while (reg_off & 63);
4603 } while (reg_off <= reg_last);
4604 }
4605}
4606
4607static inline QEMU_ALWAYS_INLINE
4608void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
4609 uint32_t desc, const uintptr_t ra,
4610 const int esz, const int msz, const int N,
4611 sve_ldst1_host_fn *host_fn,
4612 sve_ldst1_tlb_fn *tlb_fn)
4613{
4614 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
4615 int bit55 = extract64(addr, 55, 1);
4616
4617 /* Remove mtedesc from the normal sve descriptor. */
4618 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
4619
4620 /* Perform gross MTE suppression early. */
4621 if (!tbi_check(desc, bit55) ||
4622 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
4623 mtedesc = 0;
4624 }
4625
4626 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn,
4627 N == 1 ? sve_cont_ldst_mte_check1 : sve_cont_ldst_mte_checkN);
4628}
4629
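/*
 * Note (hedged, editorial): the descriptor handed to the *_mte helpers
 * packs the usual simd_oprsz()/simd_data() fields in its low bits and an
 * MTE descriptor in the bits from SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT
 * upward; the shift and extract32() pair above simply split those two
 * halves apart again before delegating to sve_ldN_r().
 */
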
4630#define DO_LD1_1(NAME, ESZ) \
4631void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \
4632 target_ulong addr, uint32_t desc) \
4633{ \
4634 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \
4635 sve_##NAME##_host, sve_##NAME##_tlb, NULL); \
4636} \
4637void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \
4638 target_ulong addr, uint32_t desc) \
4639{ \
4640 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \
4641 sve_##NAME##_host, sve_##NAME##_tlb); \
4642}
4643
4644#define DO_LD1_2(NAME, ESZ, MSZ) \
4645void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \
4646 target_ulong addr, uint32_t desc) \
4647{ \
4648 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
4649 sve_##NAME##_le_host, sve_##NAME##_le_tlb, NULL); \
4650} \
4651void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \
4652 target_ulong addr, uint32_t desc) \
4653{ \
4654 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
4655 sve_##NAME##_be_host, sve_##NAME##_be_tlb, NULL); \
4656} \
4657void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
4658 target_ulong addr, uint32_t desc) \
4659{ \
4660 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
4661 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
4662} \
4663void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
4664 target_ulong addr, uint32_t desc) \
4665{ \
4666 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
4667 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
4668}
4669
4670DO_LD1_1(ld1bb, MO_8)
4671DO_LD1_1(ld1bhu, MO_16)
4672DO_LD1_1(ld1bhs, MO_16)
4673DO_LD1_1(ld1bsu, MO_32)
4674DO_LD1_1(ld1bss, MO_32)
4675DO_LD1_1(ld1bdu, MO_64)
4676DO_LD1_1(ld1bds, MO_64)
9123aeb6 4677
4678DO_LD1_2(ld1hh, MO_16, MO_16)
4679DO_LD1_2(ld1hsu, MO_32, MO_16)
4680DO_LD1_2(ld1hss, MO_32, MO_16)
4681DO_LD1_2(ld1hdu, MO_64, MO_16)
4682DO_LD1_2(ld1hds, MO_64, MO_16)
9123aeb6 4683
4684DO_LD1_2(ld1ss, MO_32, MO_32)
4685DO_LD1_2(ld1sdu, MO_64, MO_32)
4686DO_LD1_2(ld1sds, MO_64, MO_32)
9123aeb6 4687
5c9b8458 4688DO_LD1_2(ld1dd, MO_64, MO_64)
4689
4690#undef DO_LD1_1
4691#undef DO_LD1_2
4692
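/*
 * Illustration (editorial): each DO_LD1_2 instantiation above, e.g.
 * DO_LD1_2(ld1hh, MO_16, MO_16), expanded into four helpers --
 * sve_ld1hh_le_r, sve_ld1hh_be_r and their _mte variants -- all of which
 * funnel into sve_ldN_r()/sve_ldN_r_mte() with N == 1 and the
 * endian-specific host/tlb primitives generated earlier by
 * DO_LD_PRIM_2(hh, ...).
 */
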
4693#define DO_LDN_1(N) \
4694void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \
4695 target_ulong addr, uint32_t desc) \
4696{ \
4697 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \
4698 sve_ld1bb_host, sve_ld1bb_tlb, NULL); \
4699} \
4700void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \
4701 target_ulong addr, uint32_t desc) \
4702{ \
4703 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \
4704 sve_ld1bb_host, sve_ld1bb_tlb); \
4705}
4706
4707#define DO_LDN_2(N, SUFF, ESZ) \
4708void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \
4709 target_ulong addr, uint32_t desc) \
4710{ \
4711 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
4712 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb, NULL); \
4713} \
4714void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \
4715 target_ulong addr, uint32_t desc) \
4716{ \
4717 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
4718 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb, NULL); \
4719} \
4720void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \
4721 target_ulong addr, uint32_t desc) \
4722{ \
4723 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
4724 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
4725} \
4726void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \
4727 target_ulong addr, uint32_t desc) \
4728{ \
4729 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
4730 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
4731}
4732
4733DO_LDN_1(2)
4734DO_LDN_1(3)
4735DO_LDN_1(4)
c4e7c493 4736
4737DO_LDN_2(2, hh, MO_16)
4738DO_LDN_2(3, hh, MO_16)
4739DO_LDN_2(4, hh, MO_16)
c4e7c493 4740
4741DO_LDN_2(2, ss, MO_32)
4742DO_LDN_2(3, ss, MO_32)
4743DO_LDN_2(4, ss, MO_32)
c4e7c493 4744
4745DO_LDN_2(2, dd, MO_64)
4746DO_LDN_2(3, dd, MO_64)
4747DO_LDN_2(4, dd, MO_64)
c4e7c493 4748
4749#undef DO_LDN_1
4750#undef DO_LDN_2
4751
4752/*
4753 * Load contiguous data, first-fault and no-fault.
4754 *
4755 * For user-only, one could argue that we should hold the mmap_lock during
4756 * the operation so that there is no race between page_check_range and the
4757 * load operation. However, unmapping pages out from under a running thread
4758 * is extraordinarily unlikely. This theoretical race condition also affects
4759 * linux-user/ in its get_user/put_user macros.
4760 *
4761 * TODO: Construct some helpers, written in assembly, that interact with
4762 * handle_cpu_signal to produce memory ops which can properly report errors
4763 * without racing.
4764 */
4765
4766/* Fault on byte I. All bits in FFR from I are cleared. The vector
4767 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
4768 * option, which leaves subsequent data unchanged.
4769 */
4770static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
4771{
4772 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
4773
4774 if (i & 63) {
4775 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
4776 i = ROUND_UP(i, 64);
4777 }
4778 for (; i < oprsz; i += 64) {
4779 ffr[i / 64] = 0;
4780 }
4781}
4782
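/*
 * Worked example (hypothetical values): record_fault(env, 10, 256) keeps
 * FFR bits 0..9 of the first word via MAKE_64BIT_MASK(0, 10), clears
 * bits 10..63, and then zeroes the remaining FFR words covering bytes
 * 64..255.  That matches the MERGE choice described above: elements
 * before the fault survive, later ones are reported as not loaded.
 */
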
9123aeb6 4783/*
c647673c 4784 * Common helper for all contiguous no-fault and first-fault loads.
9123aeb6 4785 */
4786static inline QEMU_ALWAYS_INLINE
4787void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
aa13f7c3 4788 uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
4789 const int esz, const int msz, const SVEContFault fault,
4790 sve_ldst1_host_fn *host_fn,
4791 sve_ldst1_tlb_fn *tlb_fn)
4792{
ba080b86 4793 const unsigned rd = simd_data(desc);
500d0484 4794 void *vd = &env->vfp.zregs[rd];
9123aeb6 4795 const intptr_t reg_max = simd_oprsz(desc);
4796 intptr_t reg_off, mem_off, reg_last;
4797 SVEContLdSt info;
4798 int flags;
4799 void *host;
4800
4801 /* Find the active elements. */
4802 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
4803 /* The entire predicate was false; no load occurs. */
4804 memset(vd, 0, reg_max);
4805 return;
4806 }
c647673c 4807 reg_off = info.reg_off_first[0];
9123aeb6 4808
4809 /* Probe the page(s). */
4810 if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
4811 /* Fault on first element. */
4812 tcg_debug_assert(fault == FAULT_NO);
4813 memset(vd, 0, reg_max);
4814 goto do_fault;
4815 }
4816
4817 mem_off = info.mem_off_first[0];
4818 flags = info.page[0].flags;
4819
4820 /*
4821 * Disable MTE checking if the Tagged bit is not set. Since TBI must
4822 * be set within MTEDESC for MTE, !mtedesc => !mte_active.
4823 */
4824 if (arm_tlb_mte_tagged(&info.page[0].attrs)) {
4825 mtedesc = 0;
4826 }
4827
c647673c 4828 if (fault == FAULT_FIRST) {
4829 /* Trapping mte check for the first-fault element. */
4830 if (mtedesc) {
4831 mte_check1(env, mtedesc, addr + mem_off, retaddr);
4832 }
4833
4834 /*
4835 * Special handling of the first active element,
4836 * if it crosses a page boundary or is MMIO.
4837 */
4838 bool is_split = mem_off == info.mem_off_split;
4839 if (unlikely(flags != 0) || unlikely(is_split)) {
4840 /*
4841 * Use the slow path for cross-page handling.
4842 * Might trap for MMIO or watchpoints.
4843 */
4844 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
4845
4846 /* After any fault, zero the other elements. */
9123aeb6 4847 swap_memzero(vd, reg_off);
4848 reg_off += 1 << esz;
4849 mem_off += 1 << msz;
4850 swap_memzero(vd + reg_off, reg_max - reg_off);
4851
4852 if (is_split) {
4853 goto second_page;
4854 }
4855 } else {
4856 memset(vd, 0, reg_max);
4857 }
4858 } else {
4859 memset(vd, 0, reg_max);
4860 if (unlikely(mem_off == info.mem_off_split)) {
4861 /* The first active element crosses a page boundary. */
4862 flags |= info.page[1].flags;
4863 if (unlikely(flags & TLB_MMIO)) {
4864 /* Some page is MMIO, see below. */
4865 goto do_fault;
4866 }
4867 if (unlikely(flags & TLB_WATCHPOINT) &&
4868 (cpu_watchpoint_address_matches
4869 (env_cpu(env), addr + mem_off, 1 << msz)
4870 & BP_MEM_READ)) {
4871 /* Watchpoint hit, see below. */
4872 goto do_fault;
4873 }
4874 if (mtedesc && !mte_probe1(env, mtedesc, addr + mem_off)) {
4875 goto do_fault;
4876 }
4877 /*
4878 * Use the slow path for cross-page handling.
4879 * This is RAM, without a watchpoint, and will not trap.
4880 */
4881 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
4882 goto second_page;
4883 }
4884 }
4885
9123aeb6 4886 /*
4887 * From this point on, all memory operations are MemSingleNF.
4888 *
4889 * Per the MemSingleNF pseudocode, a no-fault load from Device memory
4890 * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
4891 *
4892 * Unfortunately we do not have access to the memory attributes from the
4893 * PTE to tell Device memory from Normal memory. So we make a mostly
4894 * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
4895 * This gives the right answer for the common cases of "Normal memory,
4896 * backed by host RAM" and "Device memory, backed by MMIO".
4897 * The architecture allows us to suppress an NF load and return
4898 * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
4899 * case of "Normal memory, backed by MMIO" is permitted. The case we
4900 * get wrong is "Device memory, backed by host RAM", for which we
4901 * should return (UNKNOWN, FAULT) but do not.
4902 *
4903 * Similarly, CPU_BP breakpoints would raise exceptions, and so
4904 * return (UNKNOWN, FAULT). For simplicity, we consider gdb and
4905 * architectural breakpoints the same.
9123aeb6 4906 */
4907 if (unlikely(flags & TLB_MMIO)) {
4908 goto do_fault;
9123aeb6 4909 }
9123aeb6 4910
4911 reg_last = info.reg_off_last[0];
4912 host = info.page[0].host;
9123aeb6 4913
4914 do {
4915 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
cf4a49b7 4916 do {
4917 if ((pg >> (reg_off & 63)) & 1) {
4918 if (unlikely(flags & TLB_WATCHPOINT) &&
4919 (cpu_watchpoint_address_matches
4920 (env_cpu(env), addr + mem_off, 1 << msz)
4921 & BP_MEM_READ)) {
4922 goto do_fault;
4923 }
4924 if (mtedesc && !mte_probe1(env, mtedesc, addr + mem_off)) {
4925 goto do_fault;
4926 }
4927 host_fn(vd, reg_off, host + mem_off);
4928 }
cf4a49b7 4929 reg_off += 1 << esz;
4930 mem_off += 1 << msz;
4931 } while (reg_off <= reg_last && (reg_off & 63));
4932 } while (reg_off <= reg_last);
9123aeb6 4933
4934 /*
4935 * MemSingleNF is allowed to fail for any reason. We have special
4936 * code above to handle the first element crossing a page boundary.
4937 * As an implementation choice, decline to handle a cross-page element
4938 * in any other position.
4939 */
4940 reg_off = info.reg_off_split;
4941 if (reg_off >= 0) {
4942 goto do_fault;
4943 }
9123aeb6 4944
4945 second_page:
4946 reg_off = info.reg_off_first[1];
4947 if (likely(reg_off < 0)) {
4948 /* No active elements on the second page. All done. */
4949 return;
4950 }
9123aeb6 4951
9123aeb6 4952 /*
4953 * MemSingleNF is allowed to fail for any reason. As an implementation
4954 * choice, decline to handle elements on the second page. This should
4955 * be low frequency as the guest walks through memory -- the next
4956 * iteration of the guest's loop should be aligned on the page boundary,
4957 * and then all following iterations will stay aligned.
9123aeb6 4958 */
9123aeb6 4959
c647673c 4960 do_fault:
4961 record_fault(env, reg_off, reg_max);
4962}
4963
4964static inline QEMU_ALWAYS_INLINE
4965void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
4966 uint32_t desc, const uintptr_t retaddr,
4967 const int esz, const int msz, const SVEContFault fault,
4968 sve_ldst1_host_fn *host_fn,
4969 sve_ldst1_tlb_fn *tlb_fn)
4970{
4971 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
4972 int bit55 = extract64(addr, 55, 1);
4973
4974 /* Remove mtedesc from the normal sve descriptor. */
4975 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
4976
4977 /* Perform gross MTE suppression early. */
4978 if (!tbi_check(desc, bit55) ||
4979 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
4980 mtedesc = 0;
4981 }
4982
4983 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
4984 esz, msz, fault, host_fn, tlb_fn);
4985}
4986
4987#define DO_LDFF1_LDNF1_1(PART, ESZ) \
4988void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \
4989 target_ulong addr, uint32_t desc) \
e2654d75 4990{ \
aa13f7c3 4991 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
c647673c 4992 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
e2654d75 4993} \
4994void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \
4995 target_ulong addr, uint32_t desc) \
e2654d75 4996{ \
4997 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
4998 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
4999} \
5000void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \
5001 target_ulong addr, uint32_t desc) \
5002{ \
5003 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
5004 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
5005} \
5006void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \
5007 target_ulong addr, uint32_t desc) \
5008{ \
5009 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
c647673c 5010 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
5011}
5012
aa13f7c3 5013#define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
5014void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \
5015 target_ulong addr, uint32_t desc) \
e2654d75 5016{ \
aa13f7c3 5017 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
c647673c 5018 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
9123aeb6 5019} \
5020void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \
5021 target_ulong addr, uint32_t desc) \
9123aeb6 5022{ \
aa13f7c3 5023 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
c647673c 5024 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
5025} \
5026void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \
5027 target_ulong addr, uint32_t desc) \
5028{ \
aa13f7c3 5029 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
c647673c 5030 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
5031} \
5032void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \
5033 target_ulong addr, uint32_t desc) \
5034{ \
aa13f7c3 5035 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
c647673c 5036 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
5037} \
5038void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
5039 target_ulong addr, uint32_t desc) \
5040{ \
5041 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
5042 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
5043} \
5044void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
5045 target_ulong addr, uint32_t desc) \
5046{ \
5047 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
5048 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
5049} \
5050void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
5051 target_ulong addr, uint32_t desc) \
5052{ \
5053 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
5054 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
5055} \
5056void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
5057 target_ulong addr, uint32_t desc) \
5058{ \
5059 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
5060 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
5061}
5062
5063DO_LDFF1_LDNF1_1(bb, MO_8)
5064DO_LDFF1_LDNF1_1(bhu, MO_16)
5065DO_LDFF1_LDNF1_1(bhs, MO_16)
5066DO_LDFF1_LDNF1_1(bsu, MO_32)
5067DO_LDFF1_LDNF1_1(bss, MO_32)
5068DO_LDFF1_LDNF1_1(bdu, MO_64)
5069DO_LDFF1_LDNF1_1(bds, MO_64)
e2654d75 5070
5071DO_LDFF1_LDNF1_2(hh, MO_16, MO_16)
5072DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
5073DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
5074DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
5075DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
e2654d75 5076
5077DO_LDFF1_LDNF1_2(ss, MO_32, MO_32)
5078DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
5079DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
e2654d75 5080
c647673c 5081DO_LDFF1_LDNF1_2(dd, MO_64, MO_64)
e2654d75 5082
5083#undef DO_LDFF1_LDNF1_1
5084#undef DO_LDFF1_LDNF1_2
1a039c7e 5085
9fd46c83 5086/*
0fa476c1 5087 * Common helper for all contiguous 1,2,3,4-register predicated stores.
9fd46c83 5088 */
5089
5090static inline QEMU_ALWAYS_INLINE
5091void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
5092 uint32_t desc, const uintptr_t retaddr,
5093 const int esz, const int msz, const int N, uint32_t mtedesc,
0fa476c1 5094 sve_ldst1_host_fn *host_fn,
5095 sve_ldst1_tlb_fn *tlb_fn,
5096 sve_cont_ldst_mte_check_fn *mte_check_fn)
9fd46c83 5097{
ba080b86 5098 const unsigned rd = simd_data(desc);
5099 const intptr_t reg_max = simd_oprsz(desc);
5100 intptr_t reg_off, reg_last, mem_off;
5101 SVEContLdSt info;
5102 void *host;
5103 int i, flags;
1a039c7e 5104
5105 /* Find the active elements. */
5106 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
5107 /* The entire predicate was false; no store occurs. */
5108 return;
9fd46c83 5109 }
1a039c7e 5110
5111 /* Probe the page(s). Exit with exception for any invalid page. */
5112 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
1a039c7e 5113
5114 /* Handle watchpoints for all active elements. */
5115 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
5116 BP_MEM_WRITE, retaddr);
5117
5118 /*
5119 * Handle mte checks for all active elements.
5120 * Since TBI must be set for MTE, !mtedesc => !mte_active.
5121 */
5122 if (mte_check_fn && mtedesc) {
5123 mte_check_fn(&info, env, vg, addr, 1 << esz, N << msz,
5124 mtedesc, retaddr);
5125 }
0fa476c1
RH
5126
5127 flags = info.page[0].flags | info.page[1].flags;
5128 if (unlikely(flags != 0)) {
5129#ifdef CONFIG_USER_ONLY
5130 g_assert_not_reached();
5131#else
5132 /*
5133 * At least one page includes MMIO.
5134 * Any bus operation can fail with cpu_transaction_failed,
5135 * which for ARM will raise SyncExternal. We cannot avoid
5136 * this fault and will leave with the store incomplete.
5137 */
5138 mem_off = info.mem_off_first[0];
5139 reg_off = info.reg_off_first[0];
5140 reg_last = info.reg_off_last[1];
5141 if (reg_last < 0) {
5142 reg_last = info.reg_off_split;
5143 if (reg_last < 0) {
5144 reg_last = info.reg_off_last[0];
9fd46c83 5145 }
0fa476c1
RH
5146 }
5147
5148 do {
5149 uint64_t pg = vg[reg_off >> 6];
5150 do {
5151 if ((pg >> (reg_off & 63)) & 1) {
5152 for (i = 0; i < N; ++i) {
5153 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5154 addr + mem_off + (i << msz), retaddr);
5155 }
5156 }
5157 reg_off += 1 << esz;
5158 mem_off += N << msz;
5159 } while (reg_off & 63);
5160 } while (reg_off <= reg_last);
5161 return;
5162#endif
1a039c7e 5163 }
1a039c7e 5164
0fa476c1
RH
5165 mem_off = info.mem_off_first[0];
5166 reg_off = info.reg_off_first[0];
5167 reg_last = info.reg_off_last[0];
5168 host = info.page[0].host;
1a039c7e 5169
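    /*
     * The predicate is consumed one 64-bit word at a time: the outer loop
     * reloads pg for each word, the inner loop walks the elements covered
     * by that word.  At this point flags == 0, so the page is ordinary
     * RAM and host_fn can be used directly.
     */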
0fa476c1
RH
5170 while (reg_off <= reg_last) {
5171 uint64_t pg = vg[reg_off >> 6];
9fd46c83 5172 do {
0fa476c1
RH
5173 if ((pg >> (reg_off & 63)) & 1) {
5174 for (i = 0; i < N; ++i) {
5175 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5176 host + mem_off + (i << msz));
5177 }
9fd46c83 5178 }
0fa476c1
RH
5179 reg_off += 1 << esz;
5180 mem_off += N << msz;
5181 } while (reg_off <= reg_last && (reg_off & 63));
1a039c7e 5182 }
1a039c7e 5183
0fa476c1
RH
5184 /*
5185 * Use the slow path to manage the cross-page misalignment.
5186 * But we know this is RAM and cannot trap.
5187 */
5188 mem_off = info.mem_off_split;
5189 if (unlikely(mem_off >= 0)) {
5190 reg_off = info.reg_off_split;
5191 for (i = 0; i < N; ++i) {
5192 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5193 addr + mem_off + (i << msz), retaddr);
5194 }
5195 }
5196
5197 mem_off = info.mem_off_first[1];
5198 if (unlikely(mem_off >= 0)) {
5199 reg_off = info.reg_off_first[1];
5200 reg_last = info.reg_off_last[1];
5201 host = info.page[1].host;
1a039c7e 5202
9fd46c83 5203 do {
0fa476c1
RH
5204 uint64_t pg = vg[reg_off >> 6];
5205 do {
5206 if ((pg >> (reg_off & 63)) & 1) {
5207 for (i = 0; i < N; ++i) {
5208 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5209 host + mem_off + (i << msz));
5210 }
5211 }
5212 reg_off += 1 << esz;
5213 mem_off += N << msz;
5214 } while (reg_off & 63);
5215 } while (reg_off <= reg_last);
1a039c7e 5216 }
9fd46c83
RH
5217}
5218
71b9f394
RH
5219static inline QEMU_ALWAYS_INLINE
5220void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
5221 uint32_t desc, const uintptr_t ra,
5222 const int esz, const int msz, const int N,
5223 sve_ldst1_host_fn *host_fn,
5224 sve_ldst1_tlb_fn *tlb_fn)
5225{
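    /*
     * The incoming descriptor packs the MTE parameters in the bits at and
     * above SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT; the low bits are the
     * ordinary SVE descriptor that sve_stN_r() expects, so the two parts
     * are split apart below.
     */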
5226 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5227 int bit55 = extract64(addr, 55, 1);
5228
5229 /* Remove mtedesc from the normal sve descriptor. */
5230 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5231
5232 /* Perform gross MTE suppression early. */
5233 if (!tbi_check(desc, bit55) ||
5234 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
5235 mtedesc = 0;
5236 }
5237
5238 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn,
5239 N == 1 ? sve_cont_ldst_mte_check1 : sve_cont_ldst_mte_checkN);
1a039c7e 5240}
f6dbf62a 5241
71b9f394
RH
5242#define DO_STN_1(N, NAME, ESZ) \
5243void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \
5244 target_ulong addr, uint32_t desc) \
5245{ \
5246 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \
5247 sve_st1##NAME##_host, sve_st1##NAME##_tlb, NULL); \
5248} \
5249void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \
5250 target_ulong addr, uint32_t desc) \
5251{ \
5252 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \
5253 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
5254}
5255
5256#define DO_STN_2(N, NAME, ESZ, MSZ) \
5257void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \
5258 target_ulong addr, uint32_t desc) \
5259{ \
5260 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
5261 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb, NULL); \
5262} \
5263void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \
5264 target_ulong addr, uint32_t desc) \
5265{ \
5266 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
5267 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb, NULL); \
5268} \
5269void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
5270 target_ulong addr, uint32_t desc) \
5271{ \
5272 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
5273 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
5274} \
5275void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
5276 target_ulong addr, uint32_t desc) \
5277{ \
5278 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
5279 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
0fa476c1
RH
5280}
5281
5282DO_STN_1(1, bb, MO_8)
5283DO_STN_1(1, bh, MO_16)
5284DO_STN_1(1, bs, MO_32)
5285DO_STN_1(1, bd, MO_64)
5286DO_STN_1(2, bb, MO_8)
5287DO_STN_1(3, bb, MO_8)
5288DO_STN_1(4, bb, MO_8)
5289
5290DO_STN_2(1, hh, MO_16, MO_16)
5291DO_STN_2(1, hs, MO_32, MO_16)
5292DO_STN_2(1, hd, MO_64, MO_16)
5293DO_STN_2(2, hh, MO_16, MO_16)
5294DO_STN_2(3, hh, MO_16, MO_16)
5295DO_STN_2(4, hh, MO_16, MO_16)
5296
5297DO_STN_2(1, ss, MO_32, MO_32)
5298DO_STN_2(1, sd, MO_64, MO_32)
5299DO_STN_2(2, ss, MO_32, MO_32)
5300DO_STN_2(3, ss, MO_32, MO_32)
5301DO_STN_2(4, ss, MO_32, MO_32)
5302
5303DO_STN_2(1, dd, MO_64, MO_64)
5304DO_STN_2(2, dd, MO_64, MO_64)
5305DO_STN_2(3, dd, MO_64, MO_64)
5306DO_STN_2(4, dd, MO_64, MO_64)
9fd46c83
RH
5307
5308#undef DO_STN_1
5309#undef DO_STN_2
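/*
 * As an illustration (a sketch, not literal source text), the
 * instantiation DO_STN_2(4, dd, MO_64, MO_64) above defined
 *
 *   void HELPER(sve_st4dd_le_r)(CPUARMState *env, void *vg,
 *                               target_ulong addr, uint32_t desc)
 *   {
 *       sve_stN_r(env, vg, addr, desc, GETPC(), MO_64, MO_64, 4, 0,
 *                 sve_st1dd_le_host, sve_st1dd_le_tlb, NULL);
 *   }
 *
 * i.e. the contiguous ST4D store of 64-bit elements, along with its
 * big-endian and MTE variants.
 */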
5310
d4f75f25
RH
5311/*
5312 * Loads with a vector index.
5313 */
673e9fa6 5314
d4f75f25
RH
5315/*
5316 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
5317 */
5318typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
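/*
 * Naming convention for the extractors below: "zsu" and "zss" treat the
 * offset element as an unsigned or signed 32-bit value, "zd" as a full
 * 64-bit value; the trailing _s or _d gives the width of the offset
 * vector's elements (matching the 32-bit or 64-bit data elements).
 */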
5319
5320static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
5321{
5322 return *(uint32_t *)(reg + H1_4(reg_ofs));
673e9fa6
RH
5323}
5324
d4f75f25
RH
5325static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
5326{
5327 return *(int32_t *)(reg + H1_4(reg_ofs));
5328}
5329
5330static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
5331{
5332 return (uint32_t)*(uint64_t *)(reg + reg_ofs);
5333}
5334
5335static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
5336{
5337 return (int32_t)*(uint64_t *)(reg + reg_ofs);
5338}
5339
5340static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
5341{
5342 return *(uint64_t *)(reg + reg_ofs);
673e9fa6
RH
5343}
5344
10a85e2c
RH
5345static inline QEMU_ALWAYS_INLINE
5346void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5347 target_ulong base, uint32_t desc, uintptr_t retaddr,
d28d12f0
RH
5348 uint32_t mtedesc, int esize, int msize,
5349 zreg_off_fn *off_fn,
10a85e2c
RH
5350 sve_ldst1_host_fn *host_fn,
5351 sve_ldst1_tlb_fn *tlb_fn)
d4f75f25 5352{
10a85e2c
RH
5353 const int mmu_idx = cpu_mmu_index(env, false);
5354 const intptr_t reg_max = simd_oprsz(desc);
ba080b86 5355 const int scale = simd_data(desc);
10a85e2c
RH
5356 ARMVectorReg scratch;
5357 intptr_t reg_off;
5358 SVEHostPage info, info2;
d4f75f25 5359
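    /*
     * Loads are staged through a zero-initialized scratch register; vd is
     * only written after every element has been probed and loaded, so a
     * fault taken part-way through leaves the destination unmodified.
     */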
10a85e2c
RH
5360 memset(&scratch, 0, reg_max);
5361 reg_off = 0;
5362 do {
5363 uint64_t pg = vg[reg_off >> 6];
d4f75f25
RH
5364 do {
5365 if (likely(pg & 1)) {
10a85e2c
RH
5366 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
5367 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
5368
5369 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
5370 mmu_idx, retaddr);
5371
5372 if (likely(in_page >= msize)) {
5373 if (unlikely(info.flags & TLB_WATCHPOINT)) {
5374 cpu_check_watchpoint(env_cpu(env), addr, msize,
5375 info.attrs, BP_MEM_READ, retaddr);
5376 }
d28d12f0
RH
5377 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
5378 mte_check1(env, mtedesc, addr, retaddr);
5379 }
10a85e2c
RH
5380 host_fn(&scratch, reg_off, info.host);
5381 } else {
5382 /* Element crosses the page boundary. */
5383 sve_probe_page(&info2, false, env, addr + in_page, 0,
5384 MMU_DATA_LOAD, mmu_idx, retaddr);
5385 if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
5386 cpu_check_watchpoint(env_cpu(env), addr,
5387 msize, info.attrs,
5388 BP_MEM_READ, retaddr);
5389 }
d28d12f0
RH
5390 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
5391 mte_check1(env, mtedesc, addr, retaddr);
5392 }
10a85e2c
RH
5393 tlb_fn(env, &scratch, reg_off, addr, retaddr);
5394 }
d4f75f25 5395 }
10a85e2c
RH
5396 reg_off += esize;
5397 pg >>= esize;
5398 } while (reg_off & 63);
5399 } while (reg_off < reg_max);
d4f75f25
RH
5400
5401 /* Wait until all exceptions have been raised to write back. */
10a85e2c 5402 memcpy(vd, &scratch, reg_max);
d4f75f25
RH
5403}
5404
d28d12f0
RH
5405static inline QEMU_ALWAYS_INLINE
5406void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5407 target_ulong base, uint32_t desc, uintptr_t retaddr,
5408 int esize, int msize, zreg_off_fn *off_fn,
5409 sve_ldst1_host_fn *host_fn,
5410 sve_ldst1_tlb_fn *tlb_fn)
5411{
5412 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5413 /* Remove mtedesc from the normal sve descriptor. */
5414 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5415
5416 /*
5417 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
5418 * offset base entirely over the address space hole to change the
5419 * pointer tag, or change the bit55 selector. So we could here
5420 * examine TBI + TCMA like we do for sve_ldN_r_mte().
5421 */
5422 sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
5423 esize, msize, off_fn, host_fn, tlb_fn);
5424}
5425
10a85e2c
RH
5426#define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
5427void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
5428 void *vm, target_ulong base, uint32_t desc) \
5429{ \
d28d12f0 5430 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
10a85e2c 5431 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
d28d12f0
RH
5432} \
5433void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
5434 void *vm, target_ulong base, uint32_t desc) \
5435{ \
5436 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
5437 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
10a85e2c 5438}
d4f75f25 5439
10a85e2c
RH
5440#define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
5441void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
5442 void *vm, target_ulong base, uint32_t desc) \
5443{ \
d28d12f0 5444 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
10a85e2c 5445 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
d28d12f0
RH
5446} \
5447void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
5448 void *vm, target_ulong base, uint32_t desc) \
5449{ \
5450 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
5451 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
10a85e2c
RH
5452}
5453
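/*
 * In the instantiations below, MEM encodes the memory element type,
 * extension and endianness (e.g. "hsu_le": halfword, zero-extended into
 * a 32-bit element, little-endian) and OFS selects one of the offset
 * extractors defined above (zsu, zss or zd).
 */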
5454DO_LD1_ZPZ_S(bsu, zsu, MO_8)
5455DO_LD1_ZPZ_S(bsu, zss, MO_8)
5456DO_LD1_ZPZ_D(bdu, zsu, MO_8)
5457DO_LD1_ZPZ_D(bdu, zss, MO_8)
5458DO_LD1_ZPZ_D(bdu, zd, MO_8)
5459
5460DO_LD1_ZPZ_S(bss, zsu, MO_8)
5461DO_LD1_ZPZ_S(bss, zss, MO_8)
5462DO_LD1_ZPZ_D(bds, zsu, MO_8)
5463DO_LD1_ZPZ_D(bds, zss, MO_8)
5464DO_LD1_ZPZ_D(bds, zd, MO_8)
5465
5466DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
5467DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
5468DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
5469DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
5470DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
5471
5472DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
5473DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
5474DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
5475DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
5476DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
5477
5478DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
5479DO_LD1_ZPZ_S(hss_le, zss, MO_16)
5480DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
5481DO_LD1_ZPZ_D(hds_le, zss, MO_16)
5482DO_LD1_ZPZ_D(hds_le, zd, MO_16)
5483
5484DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
5485DO_LD1_ZPZ_S(hss_be, zss, MO_16)
5486DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
5487DO_LD1_ZPZ_D(hds_be, zss, MO_16)
5488DO_LD1_ZPZ_D(hds_be, zd, MO_16)
5489
5490DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
5491DO_LD1_ZPZ_S(ss_le, zss, MO_32)
5492DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
5493DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
5494DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
5495
5496DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
5497DO_LD1_ZPZ_S(ss_be, zss, MO_32)
5498DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
5499DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
5500DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
5501
5502DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
5503DO_LD1_ZPZ_D(sds_le, zss, MO_32)
5504DO_LD1_ZPZ_D(sds_le, zd, MO_32)
5505
5506DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
5507DO_LD1_ZPZ_D(sds_be, zss, MO_32)
5508DO_LD1_ZPZ_D(sds_be, zd, MO_32)
5509
5510DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
5511DO_LD1_ZPZ_D(dd_le, zss, MO_64)
5512DO_LD1_ZPZ_D(dd_le, zd, MO_64)
5513
5514DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
5515DO_LD1_ZPZ_D(dd_be, zss, MO_64)
5516DO_LD1_ZPZ_D(dd_be, zd, MO_64)
d4f75f25
RH
5517
5518#undef DO_LD1_ZPZ_S
5519#undef DO_LD1_ZPZ_D
673e9fa6 5520
ed67eb7f
RH
 5521/* First-fault loads with a vector index. */
5522
116347ce 5523/*
50de9b78 5524 * Common helpers for all gather first-faulting loads.
116347ce 5525 */
50de9b78
RH
5526
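/*
 * A sketch of the scheme used below: the first active element is loaded
 * normally, with any fault delivered as usual; the remaining elements
 * are probed without faulting, and the first element that cannot be
 * loaded (invalid page, MMIO, watchpoint hit, or MTE failure) stops the
 * loop and is recorded via record_fault().
 */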
5527static inline QEMU_ALWAYS_INLINE
5528void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5529 target_ulong base, uint32_t desc, uintptr_t retaddr,
d28d12f0
RH
5530 uint32_t mtedesc, const int esz, const int msz,
5531 zreg_off_fn *off_fn,
50de9b78
RH
5532 sve_ldst1_host_fn *host_fn,
5533 sve_ldst1_tlb_fn *tlb_fn)
116347ce 5534{
50de9b78 5535 const int mmu_idx = cpu_mmu_index(env, false);
ba080b86
RH
5536 const intptr_t reg_max = simd_oprsz(desc);
5537 const int scale = simd_data(desc);
50de9b78
RH
5538 const int esize = 1 << esz;
5539 const int msize = 1 << msz;
50de9b78
RH
5540 intptr_t reg_off;
5541 SVEHostPage info;
5542 target_ulong addr, in_page;
116347ce
RH
5543
5544 /* Skip to the first true predicate. */
50de9b78
RH
5545 reg_off = find_next_active(vg, 0, reg_max, esz);
5546 if (unlikely(reg_off >= reg_max)) {
5547 /* The entire predicate was false; no load occurs. */
5548 memset(vd, 0, reg_max);
5549 return;
116347ce
RH
5550 }
5551
50de9b78
RH
5552 /*
5553 * Probe the first element, allowing faults.
5554 */
5555 addr = base + (off_fn(vm, reg_off) << scale);
d28d12f0
RH
5556 if (mtedesc) {
5557 mte_check1(env, mtedesc, addr, retaddr);
5558 }
50de9b78 5559 tlb_fn(env, vd, reg_off, addr, retaddr);
ed67eb7f 5560
50de9b78
RH
5561 /* After any fault, zero the other elements. */
5562 swap_memzero(vd, reg_off);
5563 reg_off += esize;
5564 swap_memzero(vd + reg_off, reg_max - reg_off);
116347ce 5565
50de9b78
RH
5566 /*
5567 * Probe the remaining elements, not allowing faults.
5568 */
5569 while (reg_off < reg_max) {
5570 uint64_t pg = vg[reg_off >> 6];
5571 do {
5572 if (likely((pg >> (reg_off & 63)) & 1)) {
5573 addr = base + (off_fn(vm, reg_off) << scale);
5574 in_page = -(addr | TARGET_PAGE_MASK);
116347ce 5575
50de9b78
RH
5576 if (unlikely(in_page < msize)) {
5577 /* Stop if the element crosses a page boundary. */
5578 goto fault;
5579 }
ed67eb7f 5580
50de9b78
RH
5581 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
5582 mmu_idx, retaddr);
5583 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
5584 goto fault;
5585 }
5586 if (unlikely(info.flags & TLB_WATCHPOINT) &&
5587 (cpu_watchpoint_address_matches
5588 (env_cpu(env), addr, msize) & BP_MEM_READ)) {
5589 goto fault;
5590 }
d28d12f0
RH
5591 if (mtedesc &&
5592 arm_tlb_mte_tagged(&info.attrs) &&
5593 !mte_probe1(env, mtedesc, addr)) {
5594 goto fault;
5595 }
116347ce 5596
50de9b78 5597 host_fn(vd, reg_off, info.host);
116347ce 5598 }
50de9b78
RH
5599 reg_off += esize;
5600 } while (reg_off & 63);
116347ce 5601 }
50de9b78 5602 return;
116347ce 5603
50de9b78
RH
5604 fault:
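    /*
     * record_fault(), defined earlier in this file, trims the first-fault
     * register so that only the elements before reg_off remain marked as
     * successfully loaded.
     */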
5605 record_fault(env, reg_off, reg_max);
116347ce
RH
5606}
5607
d28d12f0
RH
5608static inline QEMU_ALWAYS_INLINE
5609void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5610 target_ulong base, uint32_t desc, uintptr_t retaddr,
5611 const int esz, const int msz,
5612 zreg_off_fn *off_fn,
5613 sve_ldst1_host_fn *host_fn,
5614 sve_ldst1_tlb_fn *tlb_fn)
5615{
5616 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5617 /* Remove mtedesc from the normal sve descriptor. */
5618 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5619
5620 /*
5621 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
5622 * offset base entirely over the address space hole to change the
5623 * pointer tag, or change the bit55 selector. So we could here
5624 * examine TBI + TCMA like we do for sve_ldN_r_mte().
5625 */
5626 sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
5627 esz, msz, off_fn, host_fn, tlb_fn);
50de9b78
RH
5628}
5629
d28d12f0
RH
5630#define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \
5631void HELPER(sve_ldff##MEM##_##OFS) \
5632 (CPUARMState *env, void *vd, void *vg, \
5633 void *vm, target_ulong base, uint32_t desc) \
5634{ \
5635 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \
5636 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
5637} \
5638void HELPER(sve_ldff##MEM##_##OFS##_mte) \
5639 (CPUARMState *env, void *vd, void *vg, \
5640 void *vm, target_ulong base, uint32_t desc) \
5641{ \
5642 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \
5643 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
5644}
5645
5646#define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \
5647void HELPER(sve_ldff##MEM##_##OFS) \
5648 (CPUARMState *env, void *vd, void *vg, \
5649 void *vm, target_ulong base, uint32_t desc) \
5650{ \
5651 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \
5652 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
5653} \
5654void HELPER(sve_ldff##MEM##_##OFS##_mte) \
5655 (CPUARMState *env, void *vd, void *vg, \
5656 void *vm, target_ulong base, uint32_t desc) \
5657{ \
5658 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \
5659 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
50de9b78
RH
5660}
5661
5662DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
5663DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
5664DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
5665DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
5666DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
5667
5668DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
5669DO_LDFF1_ZPZ_S(bss, zss, MO_8)
5670DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
5671DO_LDFF1_ZPZ_D(bds, zss, MO_8)
5672DO_LDFF1_ZPZ_D(bds, zd, MO_8)
5673
5674DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
5675DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
5676DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
5677DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
5678DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
5679
5680DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
5681DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
5682DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
5683DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
5684DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
5685
5686DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
5687DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
5688DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
5689DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
5690DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
5691
5692DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
5693DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
5694DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
5695DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
5696DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
5697
5698DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32)
5699DO_LDFF1_ZPZ_S(ss_le, zss, MO_32)
5700DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
5701DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
5702DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
5703
5704DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32)
5705DO_LDFF1_ZPZ_S(ss_be, zss, MO_32)
5706DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
5707DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
5708DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
5709
5710DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
5711DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
5712DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
5713
5714DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
5715DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
5716DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
5717
5718DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
5719DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
5720DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
5721
5722DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
5723DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
5724DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
ed67eb7f 5725
f6dbf62a
RH
5726/* Stores with a vector index. */
5727
88a660a4
RH
5728static inline QEMU_ALWAYS_INLINE
5729void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5730 target_ulong base, uint32_t desc, uintptr_t retaddr,
d28d12f0
RH
5731 uint32_t mtedesc, int esize, int msize,
5732 zreg_off_fn *off_fn,
88a660a4
RH
5733 sve_ldst1_host_fn *host_fn,
5734 sve_ldst1_tlb_fn *tlb_fn)
78cf1b88 5735{
88a660a4
RH
5736 const int mmu_idx = cpu_mmu_index(env, false);
5737 const intptr_t reg_max = simd_oprsz(desc);
ba080b86 5738 const int scale = simd_data(desc);
88a660a4
RH
5739 void *host[ARM_MAX_VQ * 4];
5740 intptr_t reg_off, i;
5741 SVEHostPage info, info2;
f6dbf62a 5742
88a660a4
RH
5743 /*
5744 * Probe all of the elements for host addresses and flags.
5745 */
5746 i = reg_off = 0;
5747 do {
5748 uint64_t pg = vg[reg_off >> 6];
78cf1b88 5749 do {
88a660a4
RH
5750 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
5751 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
f6dbf62a 5752
88a660a4
RH
5753 host[i] = NULL;
5754 if (likely((pg >> (reg_off & 63)) & 1)) {
5755 if (likely(in_page >= msize)) {
5756 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
5757 mmu_idx, retaddr);
5758 host[i] = info.host;
5759 } else {
5760 /*
5761 * Element crosses the page boundary.
5762 * Probe both pages, but do not record the host address,
5763 * so that we use the slow path.
5764 */
5765 sve_probe_page(&info, false, env, addr, 0,
5766 MMU_DATA_STORE, mmu_idx, retaddr);
5767 sve_probe_page(&info2, false, env, addr + in_page, 0,
5768 MMU_DATA_STORE, mmu_idx, retaddr);
5769 info.flags |= info2.flags;
5770 }
f6dbf62a 5771
88a660a4
RH
5772 if (unlikely(info.flags & TLB_WATCHPOINT)) {
5773 cpu_check_watchpoint(env_cpu(env), addr, msize,
5774 info.attrs, BP_MEM_WRITE, retaddr);
5775 }
d28d12f0
RH
5776
5777 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
5778 mte_check1(env, mtedesc, addr, retaddr);
5779 }
88a660a4
RH
5780 }
5781 i += 1;
5782 reg_off += esize;
5783 } while (reg_off & 63);
5784 } while (reg_off < reg_max);
5785
5786 /*
5787 * Now that we have recognized all exceptions except SyncExternal
5788 * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
5789 *
5790 * Note for the common case of an element in RAM, not crossing a page
5791 * boundary, we have stored the host address in host[]. This doubles
5792 * as a first-level check against the predicate, since only enabled
5793 * elements have non-null host addresses.
5794 */
5795 i = reg_off = 0;
5796 do {
5797 void *h = host[i];
5798 if (likely(h != NULL)) {
5799 host_fn(vd, reg_off, h);
5800 } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
5801 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
5802 tlb_fn(env, vd, reg_off, addr, retaddr);
78cf1b88 5803 }
88a660a4
RH
5804 i += 1;
5805 reg_off += esize;
5806 } while (reg_off < reg_max);
78cf1b88 5807}
f6dbf62a 5808
d28d12f0
RH
5809static inline QEMU_ALWAYS_INLINE
5810void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5811 target_ulong base, uint32_t desc, uintptr_t retaddr,
5812 int esize, int msize, zreg_off_fn *off_fn,
5813 sve_ldst1_host_fn *host_fn,
5814 sve_ldst1_tlb_fn *tlb_fn)
5815{
5816 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5817 /* Remove mtedesc from the normal sve descriptor. */
5818 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5819
5820 /*
5821 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
5822 * offset base entirely over the address space hole to change the
5823 * pointer tag, or change the bit55 selector. So we could here
5824 * examine TBI + TCMA like we do for sve_ldN_r_mte().
5825 */
5826 sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
5827 esize, msize, off_fn, host_fn, tlb_fn);
5828}
5829
5830#define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \
5831void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
88a660a4 5832 void *vm, target_ulong base, uint32_t desc) \
d28d12f0
RH
5833{ \
5834 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
5835 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
5836} \
5837void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
5838 void *vm, target_ulong base, uint32_t desc) \
5839{ \
5840 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
5841 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
78cf1b88 5842}
f6dbf62a 5843
d28d12f0
RH
5844#define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \
5845void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
88a660a4 5846 void *vm, target_ulong base, uint32_t desc) \
d28d12f0
RH
5847{ \
5848 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
5849 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
5850} \
5851void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
5852 void *vm, target_ulong base, uint32_t desc) \
5853{ \
5854 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
5855 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
88a660a4
RH
5856}
5857
5858DO_ST1_ZPZ_S(bs, zsu, MO_8)
5859DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
5860DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
5861DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
5862DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
5863
5864DO_ST1_ZPZ_S(bs, zss, MO_8)
5865DO_ST1_ZPZ_S(hs_le, zss, MO_16)
5866DO_ST1_ZPZ_S(hs_be, zss, MO_16)
5867DO_ST1_ZPZ_S(ss_le, zss, MO_32)
5868DO_ST1_ZPZ_S(ss_be, zss, MO_32)
5869
5870DO_ST1_ZPZ_D(bd, zsu, MO_8)
5871DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
5872DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
5873DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
5874DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
5875DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
5876DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
5877
5878DO_ST1_ZPZ_D(bd, zss, MO_8)
5879DO_ST1_ZPZ_D(hd_le, zss, MO_16)
5880DO_ST1_ZPZ_D(hd_be, zss, MO_16)
5881DO_ST1_ZPZ_D(sd_le, zss, MO_32)
5882DO_ST1_ZPZ_D(sd_be, zss, MO_32)
5883DO_ST1_ZPZ_D(dd_le, zss, MO_64)
5884DO_ST1_ZPZ_D(dd_be, zss, MO_64)
5885
5886DO_ST1_ZPZ_D(bd, zd, MO_8)
5887DO_ST1_ZPZ_D(hd_le, zd, MO_16)
5888DO_ST1_ZPZ_D(hd_be, zd, MO_16)
5889DO_ST1_ZPZ_D(sd_le, zd, MO_32)
5890DO_ST1_ZPZ_D(sd_be, zd, MO_32)
5891DO_ST1_ZPZ_D(dd_le, zd, MO_64)
5892DO_ST1_ZPZ_D(dd_be, zd, MO_64)
78cf1b88
RH
5893
5894#undef DO_ST1_ZPZ_S
5895#undef DO_ST1_ZPZ_D