1/*
2 * ARM SVE Operations
3 *
4 * Copyright (c) 2018 Linaro, Ltd.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
   9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18 */
19
20#include "qemu/osdep.h"
21#include "cpu.h"
  22#include "internals.h"
  23#include "exec/exec-all.h"
  24#include "exec/cpu_ldst.h"
  25#include "exec/helper-proto.h"
  26#include "tcg/tcg-gvec-desc.h"
  27#include "fpu/softfloat.h"
  28#include "tcg/tcg.h"
  29#include "vec_internal.h"
30
31
32/* Note that vector data is stored in host-endian 64-bit chunks,
 33 so addressing units smaller than that need a host-endian fixup. */
34#ifdef HOST_WORDS_BIGENDIAN
35#define H1(x) ((x) ^ 7)
36#define H1_2(x) ((x) ^ 6)
37#define H1_4(x) ((x) ^ 4)
38#define H2(x) ((x) ^ 3)
39#define H4(x) ((x) ^ 1)
40#else
41#define H1(x) (x)
42#define H1_2(x) (x)
43#define H1_4(x) (x)
44#define H2(x) (x)
45#define H4(x) (x)
46#endif
47
48/* Return a value for NZCV as per the ARM PredTest pseudofunction.
49 *
50 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
51 * and bit 0 set if C is set. Compare the definitions of these variables
52 * within CPUARMState.
53 */
54
55/* For no G bits set, NZCV = C. */
56#define PREDTEST_INIT 1
57
58/* This is an iterative function, called for each Pd and Pg word
59 * moving forward.
60 */
61static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
62{
63 if (likely(g)) {
64 /* Compute N from first D & G.
65 Use bit 2 to signal first G bit seen. */
66 if (!(flags & 4)) {
67 flags |= ((d & (g & -g)) != 0) << 31;
68 flags |= 4;
69 }
70
71 /* Accumulate Z from each D & G. */
72 flags |= ((d & g) != 0) << 1;
73
74 /* Compute C from last !(D & G). Replace previous. */
75 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
76 }
77 return flags;
78}
79
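/*
 * Worked example, byte elements: with g = 0x0101010101010101 (all eight
 * elements active) and d = 0x0000000000000001 (only the first element
 * true), the first active bit of D is set so N = 1, some active bit is
 * set so Z is clear (bit 1 set), and the last active bit of D is clear
 * so C = 1; iter_predtest_fwd() returns 0x80000007 (bit 2 is only the
 * internal "first G bit seen" marker).
 */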
80/* This is an iterative function, called for each Pd and Pg word
81 * moving backward.
82 */
83static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
84{
85 if (likely(g)) {
 86 /* Compute C from first (i.e. last) !(D & G).
87 Use bit 2 to signal first G bit seen. */
88 if (!(flags & 4)) {
89 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
90 flags |= (d & pow2floor(g)) == 0;
91 }
92
93 /* Accumulate Z from each D & G. */
94 flags |= ((d & g) != 0) << 1;
95
 96 /* Compute N from last (i.e. first) D & G. Replace previous. */
97 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
98 }
99 return flags;
100}
101
102/* The same for a single word predicate. */
103uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
104{
105 return iter_predtest_fwd(d, g, PREDTEST_INIT);
106}
107
108/* The same for a multi-word predicate. */
109uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
110{
111 uint32_t flags = PREDTEST_INIT;
112 uint64_t *d = vd, *g = vg;
113 uintptr_t i = 0;
114
115 do {
116 flags = iter_predtest_fwd(d[i], g[i], flags);
117 } while (++i < words);
118
119 return flags;
120}
 121
122/* Expand active predicate bits to bytes, for byte elements.
123 * for (i = 0; i < 256; ++i) {
124 * unsigned long m = 0;
125 * for (j = 0; j < 8; j++) {
126 * if ((i >> j) & 1) {
127 * m |= 0xfful << (j << 3);
128 * }
129 * }
130 * printf("0x%016lx,\n", m);
131 * }
132 */
133static inline uint64_t expand_pred_b(uint8_t byte)
134{
135 static const uint64_t word[256] = {
136 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
137 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
138 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
139 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
140 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
141 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
142 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
143 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
144 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
145 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
146 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
147 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
148 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
149 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
150 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
151 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
152 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
153 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
154 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
155 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
156 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
157 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
158 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
159 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
160 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
161 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
162 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
163 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
164 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
165 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
166 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
167 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
168 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
169 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
170 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
171 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
172 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
173 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
174 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
175 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
176 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
177 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
178 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
179 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
180 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
181 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
182 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
183 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
184 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
185 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
186 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
187 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
188 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
189 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
190 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
191 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
192 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
193 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
194 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
195 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
196 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
197 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
198 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
199 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
200 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
201 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
202 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
203 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
204 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
205 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
206 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
207 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
208 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
209 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
210 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
211 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
212 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
213 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
214 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
215 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
216 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
217 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
218 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
219 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
220 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
221 0xffffffffffffffff,
222 };
223 return word[byte];
224}
225
226/* Similarly for half-word elements.
227 * for (i = 0; i < 256; ++i) {
228 * unsigned long m = 0;
229 * if (i & 0xaa) {
230 * continue;
231 * }
232 * for (j = 0; j < 8; j += 2) {
233 * if ((i >> j) & 1) {
234 * m |= 0xfffful << (j << 3);
235 * }
236 * }
237 * printf("[0x%x] = 0x%016lx,\n", i, m);
238 * }
239 */
240static inline uint64_t expand_pred_h(uint8_t byte)
241{
242 static const uint64_t word[] = {
243 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
244 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
245 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
246 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
247 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
248 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
249 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
250 [0x55] = 0xffffffffffffffff,
251 };
252 return word[byte & 0x55];
253}
254
255/* Similarly for single word elements. */
256static inline uint64_t expand_pred_s(uint8_t byte)
257{
258 static const uint64_t word[] = {
259 [0x01] = 0x00000000ffffffffull,
260 [0x10] = 0xffffffff00000000ull,
261 [0x11] = 0xffffffffffffffffull,
262 };
263 return word[byte & 0x11];
264}
265
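/*
 * For example, expand_pred_b(0x05) is 0x0000000000ff00ff (byte elements
 * 0 and 2 active), expand_pred_h(0x05) is 0x00000000ffffffff (halfword
 * elements 0 and 1 active), and expand_pred_s(0x11) is all-ones (both
 * word elements active).
 */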
266/* Swap 16-bit words within a 32-bit word. */
267static inline uint32_t hswap32(uint32_t h)
268{
269 return rol32(h, 16);
270}
271
272/* Swap 16-bit words within a 64-bit word. */
273static inline uint64_t hswap64(uint64_t h)
274{
275 uint64_t m = 0x0000ffff0000ffffull;
276 h = rol64(h, 32);
277 return ((h & m) << 16) | ((h >> 16) & m);
278}
279
280/* Swap 32-bit words within a 64-bit word. */
281static inline uint64_t wswap64(uint64_t h)
282{
283 return rol64(h, 32);
284}
285
286#define LOGICAL_PPPP(NAME, FUNC) \
287void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
288{ \
289 uintptr_t opr_sz = simd_oprsz(desc); \
290 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
291 uintptr_t i; \
292 for (i = 0; i < opr_sz / 8; ++i) { \
293 d[i] = FUNC(n[i], m[i], g[i]); \
294 } \
295}
296
297#define DO_AND(N, M, G) (((N) & (M)) & (G))
298#define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
299#define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
300#define DO_ORR(N, M, G) (((N) | (M)) & (G))
301#define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
302#define DO_NOR(N, M, G) (~((N) | (M)) & (G))
303#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
304#define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
305
306LOGICAL_PPPP(sve_and_pppp, DO_AND)
307LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
308LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
309LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
310LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
311LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
312LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
313LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
314
315#undef DO_AND
316#undef DO_BIC
317#undef DO_EOR
318#undef DO_ORR
319#undef DO_ORN
320#undef DO_NOR
321#undef DO_NAND
322#undef DO_SEL
323#undef LOGICAL_PPPP
 324
325/* Fully general three-operand expander, controlled by a predicate.
326 * This is complicated by the host-endian storage of the register file.
327 */
328/* ??? I don't expect the compiler could ever vectorize this itself.
329 * With some tables we can convert bit masks to byte masks, and with
330 * extra care wrt byte/word ordering we could use gcc generic vectors
331 * and do 16 bytes at a time.
332 */
333#define DO_ZPZZ(NAME, TYPE, H, OP) \
334void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
335{ \
336 intptr_t i, opr_sz = simd_oprsz(desc); \
337 for (i = 0; i < opr_sz; ) { \
338 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
339 do { \
340 if (pg & 1) { \
341 TYPE nn = *(TYPE *)(vn + H(i)); \
342 TYPE mm = *(TYPE *)(vm + H(i)); \
343 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
344 } \
345 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
346 } while (i & 15); \
347 } \
348}
349
350/* Similarly, specialized for 64-bit operands. */
351#define DO_ZPZZ_D(NAME, TYPE, OP) \
352void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
353{ \
354 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
355 TYPE *d = vd, *n = vn, *m = vm; \
356 uint8_t *pg = vg; \
357 for (i = 0; i < opr_sz; i += 1) { \
358 if (pg[H1(i)] & 1) { \
359 TYPE nn = n[i], mm = m[i]; \
360 d[i] = OP(nn, mm); \
361 } \
362 } \
363}
364
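/*
 * The governing predicate supplies one bit per byte of the vector, and an
 * element is active when the lowest bit of its group is set; hence the
 * inner loop tests only (pg & 1) and advances with pg >>= sizeof(TYPE).
 * The 64-bit specialization reads one predicate byte per element and
 * likewise tests only its low bit.
 */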
365#define DO_AND(N, M) (N & M)
366#define DO_EOR(N, M) (N ^ M)
367#define DO_ORR(N, M) (N | M)
368#define DO_BIC(N, M) (N & ~M)
369#define DO_ADD(N, M) (N + M)
370#define DO_SUB(N, M) (N - M)
371#define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
372#define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
373#define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
374#define DO_MUL(N, M) (N * M)
375
376
377/*
378 * We must avoid the C undefined behaviour cases: division by
379 * zero and signed division of INT_MIN by -1. Both of these
380 * have architecturally defined required results for Arm.
381 * We special case all signed divisions by -1 to avoid having
382 * to deduce the minimum integer for the type involved.
383 */
384#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
385#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
386
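/*
 * E.g. DO_SDIV(n, 0) and DO_UDIV(n, 0) are 0, and DO_SDIV(INT32_MIN, -1)
 * takes the -N path and wraps back to INT32_MIN, so neither problem case
 * ever reaches the host division instruction.
 */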
387DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
388DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
389DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
390DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
391
392DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
393DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
394DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
395DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
396
397DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
398DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
399DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
400DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
401
402DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
403DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
404DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
405DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
406
407DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
408DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
409DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
410DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
411
412DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
413DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
414DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
415DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
416
417DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
418DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
419DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
420DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
421
422DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
423DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
424DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
425DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
426
427DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
428DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
429DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
430DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
431
432DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
433DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
434DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
435DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
436
437DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
438DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
439DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
440DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
441
442DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
443DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
444DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
445DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
446
447/* Because the computation type is at least twice as large as required,
448 these work for both signed and unsigned source types. */
449static inline uint8_t do_mulh_b(int32_t n, int32_t m)
450{
451 return (n * m) >> 8;
452}
453
454static inline uint16_t do_mulh_h(int32_t n, int32_t m)
455{
456 return (n * m) >> 16;
457}
458
459static inline uint32_t do_mulh_s(int64_t n, int64_t m)
460{
461 return (n * m) >> 32;
462}
463
464static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
465{
466 uint64_t lo, hi;
467 muls64(&lo, &hi, n, m);
468 return hi;
469}
470
471static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
472{
473 uint64_t lo, hi;
474 mulu64(&lo, &hi, n, m);
475 return hi;
476}
477
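/*
 * E.g. do_mulh_b(-2, 3) is (-6) >> 8 = -1, whose low byte 0xff is the
 * SMULH result, while do_mulh_b(254, 3) is 762 >> 8 = 2, the UMULH
 * result; the instantiations below sign- or zero-extend the inputs
 * according to the element type.
 */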
478DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
479DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
480DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
481DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
482
483DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
484DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
485DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
486DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
487
488DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
489DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
490DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
491DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
492
493DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
494DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
 495
496DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
497DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
 498
499/* Note that all bits of the shift are significant
500 and not modulo the element size. */
501#define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
502#define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
503#define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
504
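/*
 * E.g. an ASR of a negative element by a count of 200 clamps the count
 * to (element size - 1) and yields -1, while DO_LSR and DO_LSL yield 0
 * for any count greater than or equal to the element size.
 */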
505DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
506DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
507DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)
508
509DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
510DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
511DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)
512
513DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
514DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
515DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
516
517DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
518DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
519DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
520
521static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
522{
523 int8_t n1 = n, n2 = n >> 8;
524 return m + n1 + n2;
525}
526
527static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
528{
529 int16_t n1 = n, n2 = n >> 16;
530 return m + n1 + n2;
531}
532
533static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
534{
535 int32_t n1 = n, n2 = n >> 32;
536 return m + n1 + n2;
537}
538
539DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
540DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
541DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)
542
543static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
544{
545 uint8_t n1 = n, n2 = n >> 8;
546 return m + n1 + n2;
547}
548
549static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
550{
551 uint16_t n1 = n, n2 = n >> 16;
552 return m + n1 + n2;
553}
554
555static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
556{
557 uint32_t n1 = n, n2 = n >> 32;
558 return m + n1 + n2;
559}
560
561DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
562DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
563DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)
564
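/*
 * E.g. do_sadalp_h(0x01ff, 100) splits n into the signed bytes -1 and 1
 * and returns 100 + (-1) + 1 = 100, the pairwise-widened accumulation.
 */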
565#define do_srshl_b(n, m) do_sqrshl_bhs(n, m, 8, true, NULL)
566#define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL)
567#define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL)
568#define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL)
569
570DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
571DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
572DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
573DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)
574
575#define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
576#define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
577#define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL)
578#define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL)
579
580DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
581DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
582DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
583DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)
584
585/*
586 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
587 * We pass in a pointer to a dummy saturation field to trigger
588 * the saturating arithmetic but discard the information about
589 * whether it has occurred.
590 */
591#define do_sqshl_b(n, m) \
592 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
593#define do_sqshl_h(n, m) \
594 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
595#define do_sqshl_s(n, m) \
596 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
597#define do_sqshl_d(n, m) \
598 ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
599
600DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b)
601DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
602DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
603DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)
604
605#define do_uqshl_b(n, m) \
606 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
607#define do_uqshl_h(n, m) \
608 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
609#define do_uqshl_s(n, m) \
610 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
611#define do_uqshl_d(n, m) \
612 ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
613
614DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b)
615DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
616DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
617DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)
618
619#define do_sqrshl_b(n, m) \
620 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
621#define do_sqrshl_h(n, m) \
622 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
623#define do_sqrshl_s(n, m) \
624 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
625#define do_sqrshl_d(n, m) \
626 ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
627
628DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b)
629DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
630DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
631DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)
632
633#undef do_sqrshl_d
634
635#define do_uqrshl_b(n, m) \
636 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
637#define do_uqrshl_h(n, m) \
638 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
639#define do_uqrshl_s(n, m) \
640 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
641#define do_uqrshl_d(n, m) \
642 ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
643
644DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b)
645DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
646DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
647DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
648
649#undef do_uqrshl_d
650
651#define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1)
652#define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1))
653
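/*
 * The doubleword form cannot widen, so it halves each operand and adds
 * back the shared low bit: e.g. DO_HADD_D(UINT64_MAX, UINT64_MAX)
 * returns UINT64_MAX with no intermediate overflow.
 */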
654DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
655DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
656DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
657DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)
658
659DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
660DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
661DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
662DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)
663
664#define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1)
665#define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1))
666
667DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
668DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
669DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
670DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)
671
672DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
673DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
674DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
675DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)
676
677#define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1)
678#define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1))
679
680DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
681DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
682DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
683DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)
684
685DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
686DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
687DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
688DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
689
690static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
691{
692 return val >= max ? max : val <= min ? min : val;
693}
694
695#define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
696#define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
697#define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)
698
699static inline int64_t do_sqadd_d(int64_t n, int64_t m)
700{
701 int64_t r = n + m;
702 if (((r ^ n) & ~(n ^ m)) < 0) {
703 /* Signed overflow. */
704 return r < 0 ? INT64_MAX : INT64_MIN;
705 }
706 return r;
707}
708
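/*
 * The test ((r ^ n) & ~(n ^ m)) < 0 checks the sign bits: overflow has
 * occurred exactly when n and m have the same sign but the sum r does
 * not, and the result then saturates toward the appropriate extreme.
 */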
709DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
710DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
711DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
712DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)
713
714#define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
715#define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
716#define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)
717
718static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
719{
720 uint64_t r = n + m;
721 return r < n ? UINT64_MAX : r;
722}
723
724DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
725DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
726DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
727DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)
728
729#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
730#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
731#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)
732
733static inline int64_t do_sqsub_d(int64_t n, int64_t m)
734{
735 int64_t r = n - m;
736 if (((r ^ n) & (n ^ m)) < 0) {
737 /* Signed overflow. */
738 return r < 0 ? INT64_MAX : INT64_MIN;
739 }
740 return r;
741}
742
743DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
744DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
745DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
746DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)
747
748#define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
749#define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
750#define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)
751
752static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
753{
754 return n > m ? n - m : 0;
755}
756
757DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
758DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
759DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
760DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)
761
762#define DO_SUQADD_B(n, m) \
763 do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
764#define DO_SUQADD_H(n, m) \
765 do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
766#define DO_SUQADD_S(n, m) \
767 do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)
768
769static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
770{
771 uint64_t r = n + m;
772
773 if (n < 0) {
774 /* Note that m - abs(n) cannot underflow. */
775 if (r > INT64_MAX) {
776 /* Result is either very large positive or negative. */
777 if (m > -n) {
778 /* m > abs(n), so r is a very large positive. */
779 return INT64_MAX;
780 }
781 /* Result is negative. */
782 }
783 } else {
784 /* Both inputs are positive: check for overflow. */
785 if (r < m || r > INT64_MAX) {
786 return INT64_MAX;
787 }
788 }
789 return r;
790}
791
792DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
793DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
794DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
795DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)
796
797#define DO_USQADD_B(n, m) \
798 do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
799#define DO_USQADD_H(n, m) \
800 do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
801#define DO_USQADD_S(n, m) \
802 do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)
803
804static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
805{
806 uint64_t r = n + m;
807
808 if (m < 0) {
809 return n < -m ? 0 : r;
810 }
811 return r < n ? UINT64_MAX : r;
812}
813
814DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
815DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
816DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
817DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)
818
819#undef DO_ZPZZ
820#undef DO_ZPZZ_D
 821
822/*
823 * Three operand expander, operating on element pairs.
 824 * If the slot I is even, the elements are from VN {I, I+1}.
 825 * If the slot I is odd, the elements are from VM {I-1, I}.
826 * Load all of the input elements in each pair before overwriting output.
827 */
828#define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
829void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
830{ \
831 intptr_t i, opr_sz = simd_oprsz(desc); \
832 for (i = 0; i < opr_sz; ) { \
833 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
834 do { \
835 TYPE n0 = *(TYPE *)(vn + H(i)); \
836 TYPE m0 = *(TYPE *)(vm + H(i)); \
837 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
838 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
839 if (pg & 1) { \
840 *(TYPE *)(vd + H(i)) = OP(n0, n1); \
841 } \
842 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
843 if (pg & 1) { \
844 *(TYPE *)(vd + H(i)) = OP(m0, m1); \
845 } \
846 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
847 } while (i & 15); \
848 } \
849}
850
851/* Similarly, specialized for 64-bit operands. */
852#define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
853void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
854{ \
855 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
856 TYPE *d = vd, *n = vn, *m = vm; \
857 uint8_t *pg = vg; \
858 for (i = 0; i < opr_sz; i += 2) { \
859 TYPE n0 = n[i], n1 = n[i + 1]; \
860 TYPE m0 = m[i], m1 = m[i + 1]; \
861 if (pg[H1(i)] & 1) { \
862 d[i] = OP(n0, n1); \
863 } \
864 if (pg[H1(i + 1)] & 1) { \
865 d[i + 1] = OP(m0, m1); \
866 } \
867 } \
868}
869
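/*
 * E.g. for ADDP on byte elements, output slot 0 receives n[0] + n[1],
 * slot 1 receives m[0] + m[1], slot 2 receives n[2] + n[3], and so on,
 * each written only when its own predicate bit is set.
 */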
870DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
871DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
872DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
873DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)
874
875DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
876DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
877DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
878DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)
879
880DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
881DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
882DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
883DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)
884
885DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
886DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
887DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
888DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)
889
890DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
891DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
892DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
893DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)
894
895#undef DO_ZPZZ_PAIR
896#undef DO_ZPZZ_PAIR_D
897
898#define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP) \
899void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
900 void *status, uint32_t desc) \
901{ \
902 intptr_t i, opr_sz = simd_oprsz(desc); \
903 for (i = 0; i < opr_sz; ) { \
904 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
905 do { \
906 TYPE n0 = *(TYPE *)(vn + H(i)); \
907 TYPE m0 = *(TYPE *)(vm + H(i)); \
908 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
909 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
910 if (pg & 1) { \
911 *(TYPE *)(vd + H(i)) = OP(n0, n1, status); \
912 } \
913 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
914 if (pg & 1) { \
915 *(TYPE *)(vd + H(i)) = OP(m0, m1, status); \
916 } \
917 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
918 } while (i & 15); \
919 } \
920}
921
922DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
923DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
924DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, , float64_add)
925
926DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
927DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
928DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, , float64_maxnum)
929
930DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
931DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
932DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, , float64_minnum)
933
934DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
935DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
936DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, , float64_max)
937
938DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
939DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
940DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, , float64_min)
941
942#undef DO_ZPZZ_PAIR_FP
943
944/* Three-operand expander, controlled by a predicate, in which the
945 * third operand is "wide". That is, for D = N op M, the same 64-bit
946 * value of M is used with all of the narrower values of N.
947 */
948#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
949void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
950{ \
951 intptr_t i, opr_sz = simd_oprsz(desc); \
952 for (i = 0; i < opr_sz; ) { \
953 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
954 TYPEW mm = *(TYPEW *)(vm + i); \
955 do { \
956 if (pg & 1) { \
957 TYPE nn = *(TYPE *)(vn + H(i)); \
958 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
959 } \
960 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
961 } while (i & 7); \
962 } \
963}
964
965DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
966DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
967DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
968
969DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
970DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
971DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
972
973DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
974DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
975DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
976
977#undef DO_ZPZW
978
979/* Fully general two-operand expander, controlled by a predicate.
980 */
981#define DO_ZPZ(NAME, TYPE, H, OP) \
982void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
983{ \
984 intptr_t i, opr_sz = simd_oprsz(desc); \
985 for (i = 0; i < opr_sz; ) { \
986 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
987 do { \
988 if (pg & 1) { \
989 TYPE nn = *(TYPE *)(vn + H(i)); \
990 *(TYPE *)(vd + H(i)) = OP(nn); \
991 } \
992 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
993 } while (i & 15); \
994 } \
995}
996
997/* Similarly, specialized for 64-bit operands. */
998#define DO_ZPZ_D(NAME, TYPE, OP) \
999void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1000{ \
1001 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1002 TYPE *d = vd, *n = vn; \
1003 uint8_t *pg = vg; \
1004 for (i = 0; i < opr_sz; i += 1) { \
1005 if (pg[H1(i)] & 1) { \
1006 TYPE nn = n[i]; \
1007 d[i] = OP(nn); \
1008 } \
1009 } \
1010}
1011
1012#define DO_CLS_B(N) (clrsb32(N) - 24)
1013#define DO_CLS_H(N) (clrsb32(N) - 16)
1014
1015DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
1016DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
1017DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
1018DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
1019
1020#define DO_CLZ_B(N) (clz32(N) - 24)
1021#define DO_CLZ_H(N) (clz32(N) - 16)
1022
1023DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
1024DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
1025DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
1026DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
1027
1028DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
1029DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
1030DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
1031DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
1032
1033#define DO_CNOT(N) (N == 0)
1034
1035DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
1036DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
1037DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
1038DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
1039
1040#define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
1041
1042DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
1043DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
1044DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
1045
1046#define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
1047
1048DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
1049DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
1050DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
1051
1052#define DO_NOT(N) (~N)
1053
1054DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
1055DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
1056DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
1057DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
1058
1059#define DO_SXTB(N) ((int8_t)N)
1060#define DO_SXTH(N) ((int16_t)N)
1061#define DO_SXTS(N) ((int32_t)N)
1062#define DO_UXTB(N) ((uint8_t)N)
1063#define DO_UXTH(N) ((uint16_t)N)
1064#define DO_UXTS(N) ((uint32_t)N)
1065
1066DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
1067DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
1068DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
1069DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
1070DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
1071DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
1072
1073DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
1074DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
1075DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
1076DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
1077DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
1078DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
1079
1080#define DO_ABS(N) (N < 0 ? -N : N)
1081
1082DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
1083DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
1084DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
1085DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
1086
1087#define DO_NEG(N) (-N)
1088
1089DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
1090DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
1091DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
1092DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
1093
1094DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
1095DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
1096DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
1097
1098DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
1099DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
1100
1101DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
1102
1103DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
1104DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
1105DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
1106DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
1107
1108#define DO_SQABS(X) \
1109 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
1110 x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })
1111
1112DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
1113DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
1114DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
1115DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)
1116
1117#define DO_SQNEG(X) \
1118 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
1119 x_ == min_ ? -min_ - 1 : -x_; })
1120
1121DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
1122DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
1123DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
1124DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)
1125
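/*
 * DO_SQABS and DO_SQNEG saturate only at the most negative input:
 * e.g. applied to an int8_t value of -128, both return 127.
 */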
1126DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
1127DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
1128
1129/* Three-operand expander, unpredicated, in which the third operand is "wide".
1130 */
1131#define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
1132void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1133{ \
1134 intptr_t i, opr_sz = simd_oprsz(desc); \
1135 for (i = 0; i < opr_sz; ) { \
1136 TYPEW mm = *(TYPEW *)(vm + i); \
1137 do { \
1138 TYPE nn = *(TYPE *)(vn + H(i)); \
1139 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
1140 i += sizeof(TYPE); \
1141 } while (i & 7); \
1142 } \
1143}
1144
1145DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
1146DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
1147DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
1148
1149DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
1150DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
1151DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
1152
1153DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
1154DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
1155DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
1156
1157#undef DO_ZZW
1158
1159#undef DO_CLS_B
1160#undef DO_CLS_H
1161#undef DO_CLZ_B
1162#undef DO_CLZ_H
1163#undef DO_CNOT
1164#undef DO_FABS
1165#undef DO_FNEG
1166#undef DO_ABS
1167#undef DO_NEG
1168#undef DO_ZPZ
1169#undef DO_ZPZ_D
1170
1171/*
1172 * Three-operand expander, unpredicated, in which the two inputs are
1173 * selected from the top or bottom half of the wide column.
1174 */
1175#define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1176void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1177{ \
1178 intptr_t i, opr_sz = simd_oprsz(desc); \
1179 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1180 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1181 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1182 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1183 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1184 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
1185 } \
1186}
1187
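/*
 * sel1 and sel2 pick the even ("bottom") or odd ("top") numbered narrow
 * elements of each input: e.g. SADDLB widens and adds elements 0, 2, 4,
 * ... of both operands, while SADDLT uses elements 1, 3, 5, ...
 */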
1188DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1189DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1190DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, , H1_4, DO_ADD)
1191
1192DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1193DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1194DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, , H1_4, DO_SUB)
1195
1196DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1197DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1198DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, , H1_4, DO_ABD)
1199
1200DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1201DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1202DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, , H1_4, DO_ADD)
1203
1204DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1205DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1206DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, , H1_4, DO_SUB)
1207
1208DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1209DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1210DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, , H1_4, DO_ABD)
1211
1212DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1213DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1214DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, , H1_4, DO_MUL)
1215
1216DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1217DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1218DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, , H1_4, DO_MUL)
1219
1220/* Note that the multiply cannot overflow, but the doubling can. */
1221static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
1222{
1223 int16_t val = n * m;
1224 return DO_SQADD_H(val, val);
1225}
1226
1227static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
1228{
1229 int32_t val = n * m;
1230 return DO_SQADD_S(val, val);
1231}
1232
1233static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
1234{
1235 int64_t val = n * m;
1236 return do_sqadd_d(val, val);
1237}
1238
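/*
 * E.g. for the halfword form, inputs of -128 and -128 (widened from
 * int8_t) give val = 16384 and the doubling saturates to INT16_MAX;
 * for int8_t source values this is the only pair that saturates.
 */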
1239DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
1240DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1241DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, , H1_4, do_sqdmull_d)
1242
1243#undef DO_ZZZ_TB
1244
1245#define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1246void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1247{ \
1248 intptr_t i, opr_sz = simd_oprsz(desc); \
1249 int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1250 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1251 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
1252 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1253 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
1254 } \
1255}
1256
1257DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1258DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1259DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, , H1_4, DO_ADD)
1260
1261DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1262DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1263DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, , H1_4, DO_SUB)
1264
1265DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1266DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1267DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, , H1_4, DO_ADD)
1268
1269DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1270DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1271DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, , H1_4, DO_SUB)
1272
1273#undef DO_ZZZ_WTB
1274
1275#define DO_ZZZ_NTB(NAME, TYPE, H, OP) \
1276void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1277{ \
1278 intptr_t i, opr_sz = simd_oprsz(desc); \
1279 intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
1280 intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
1281 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1282 TYPE nn = *(TYPE *)(vn + H(i + sel1)); \
1283 TYPE mm = *(TYPE *)(vm + H(i + sel2)); \
1284 *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \
1285 } \
1286}
1287
1288DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
1289DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
1290DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
1291DO_ZZZ_NTB(sve2_eoril_d, uint64_t, , DO_EOR)
1292
1293#undef DO_ZZZ_NTB
1294
1295#define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
1296void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1297{ \
1298 intptr_t i, opr_sz = simd_oprsz(desc); \
1299 intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \
1300 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1301 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1302 TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \
1303 TYPEW aa = *(TYPEW *)(va + HW(i)); \
1304 *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \
1305 } \
1306}
1307
1308DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1309DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1310DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, , H1_4, DO_ABD)
1311
1312DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1313DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1314DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, , H1_4, DO_ABD)
1315
1316#undef DO_ZZZW_ACC
1317
1318#define DO_XTNB(NAME, TYPE, OP) \
1319void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1320{ \
1321 intptr_t i, opr_sz = simd_oprsz(desc); \
1322 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1323 TYPE nn = *(TYPE *)(vn + i); \
1324 nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \
1325 *(TYPE *)(vd + i) = nn; \
1326 } \
1327}
1328
1329#define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \
1330void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1331{ \
1332 intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \
1333 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1334 TYPE nn = *(TYPE *)(vn + i); \
1335 *(TYPEN *)(vd + i + odd) = OP(nn); \
1336 } \
1337}
1338
1339#define DO_SQXTN_H(n) do_sat_bhs(n, INT8_MIN, INT8_MAX)
1340#define DO_SQXTN_S(n) do_sat_bhs(n, INT16_MIN, INT16_MAX)
1341#define DO_SQXTN_D(n) do_sat_bhs(n, INT32_MIN, INT32_MAX)
1342
1343DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
1344DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
1345DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)
1346
1347DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
1348DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
1349DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)
1350
1351#define DO_UQXTN_H(n) do_sat_bhs(n, 0, UINT8_MAX)
1352#define DO_UQXTN_S(n) do_sat_bhs(n, 0, UINT16_MAX)
1353#define DO_UQXTN_D(n) do_sat_bhs(n, 0, UINT32_MAX)
1354
1355DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
1356DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
1357DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)
1358
1359DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
1360DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
1361DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)
1362
1363DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
1364DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
1365DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)
1366
1367DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
1368DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
1369DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)
1370
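/*
 * E.g. SQXTNB narrows a 16-bit source of 300 to 127 and -300 to -128,
 * while SQXTUNB clamps the same values to 255 and 0; the "T" forms
 * write the narrowed result into the odd-numbered (upper) half of each
 * wide element and leave the other half of the destination unchanged.
 */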
1371#undef DO_XTNB
1372#undef DO_XTNT
1373
1374void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1375{
1376 intptr_t i, opr_sz = simd_oprsz(desc);
1377 int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
1378 uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1379 uint32_t *a = va, *n = vn;
1380 uint64_t *d = vd, *m = vm;
1381
1382 for (i = 0; i < opr_sz / 8; ++i) {
1383 uint32_t e1 = a[2 * i + H4(0)];
1384 uint32_t e2 = n[2 * i + sel] ^ inv;
1385 uint64_t c = extract64(m[i], 32, 1);
1386 /* Compute and store the entire 33-bit result at once. */
1387 d[i] = c + e1 + e2;
1388 }
1389}
1390
1391void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1392{
1393 intptr_t i, opr_sz = simd_oprsz(desc);
1394 int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
1395 uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1396 uint64_t *d = vd, *a = va, *n = vn, *m = vm;
1397
1398 for (i = 0; i < opr_sz / 8; i += 2) {
1399 Int128 e1 = int128_make64(a[i]);
1400 Int128 e2 = int128_make64(n[i + sel] ^ inv);
1401 Int128 c = int128_make64(m[i + 1] & 1);
1402 Int128 r = int128_add(int128_add(e1, e2), c);
1403 d[i + 0] = int128_getlo(r);
1404 d[i + 1] = int128_gethi(r);
1405 }
1406}
1407
1408#define DO_BITPERM(NAME, TYPE, OP) \
1409void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1410{ \
1411 intptr_t i, opr_sz = simd_oprsz(desc); \
1412 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1413 TYPE nn = *(TYPE *)(vn + i); \
1414 TYPE mm = *(TYPE *)(vm + i); \
1415 *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \
1416 } \
1417}
1418
1419static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
1420{
1421 uint64_t res = 0;
1422 int db, rb = 0;
1423
1424 for (db = 0; db < n; ++db) {
1425 if ((mask >> db) & 1) {
1426 res |= ((data >> db) & 1) << rb;
1427 ++rb;
1428 }
1429 }
1430 return res;
1431}
1432
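/*
 * E.g. bitextract(0xb4, 0xf0, 8) gathers bits 4..7 of the data
 * (1, 1, 0, 1 reading upward from bit 4) into the low bits of the
 * result, giving 0x0b; this is the per-element BEXT behaviour.
 */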
1433DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
1434DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
1435DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
1436DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
1437
1438static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
1439{
1440 uint64_t res = 0;
1441 int rb, db = 0;
1442
1443 for (rb = 0; rb < n; ++rb) {
1444 if ((mask >> rb) & 1) {
1445 res |= ((data >> db) & 1) << rb;
1446 ++db;
1447 }
1448 }
1449 return res;
1450}
1451
1452DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
1453DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
1454DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
1455DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
1456
1457static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
1458{
1459 uint64_t resm = 0, resu = 0;
1460 int db, rbm = 0, rbu = 0;
1461
1462 for (db = 0; db < n; ++db) {
1463 uint64_t val = (data >> db) & 1;
1464 if ((mask >> db) & 1) {
1465 resm |= val << rbm++;
1466 } else {
1467 resu |= val << rbu++;
1468 }
1469 }
1470
1471 return resm | (resu << rbm);
1472}
1473
1474DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
1475DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
1476DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
1477DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)
1478
1479#undef DO_BITPERM
1480
1481#define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \
1482void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1483{ \
1484 intptr_t i, opr_sz = simd_oprsz(desc); \
1485 int sub_r = simd_data(desc); \
1486 if (sub_r) { \
1487 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1488 TYPE acc_r = *(TYPE *)(vn + H(i)); \
1489 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
1490 TYPE el2_r = *(TYPE *)(vm + H(i)); \
1491 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
1492 acc_r = ADD_OP(acc_r, el2_i); \
1493 acc_i = SUB_OP(acc_i, el2_r); \
1494 *(TYPE *)(vd + H(i)) = acc_r; \
1495 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
1496 } \
1497 } else { \
1498 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1499 TYPE acc_r = *(TYPE *)(vn + H(i)); \
1500 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
1501 TYPE el2_r = *(TYPE *)(vm + H(i)); \
1502 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
1503 acc_r = SUB_OP(acc_r, el2_i); \
1504 acc_i = ADD_OP(acc_i, el2_r); \
1505 *(TYPE *)(vd + H(i)) = acc_r; \
1506 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
1507 } \
1508 } \
1509}
1510
1511DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
1512DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
1513DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
1514DO_CADD(sve2_cadd_d, int64_t, , DO_ADD, DO_SUB)
1515
1516DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
1517DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
1518DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
1519DO_CADD(sve2_sqcadd_d, int64_t, , do_sqadd_d, do_sqsub_d)
1520
1521#undef DO_CADD
1522
1523#define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
1524void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1525{ \
1526 intptr_t i, opr_sz = simd_oprsz(desc); \
1527 intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \
1528 int shift = simd_data(desc) >> 1; \
1529 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1530 TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \
1531 *(TYPEW *)(vd + HW(i)) = nn << shift; \
1532 } \
1533}
1534
1535DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
1536DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
1537DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, , H1_4)
1538
1539DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
1540DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
1541DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, , H1_4)
1542
1543#undef DO_ZZI_SHLL
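/*
 * For example, sve2_sshll_h with sel == 0 and shift == 2 widens each
 * even-numbered (bottom) byte with sign extension and then shifts:
 * input byte 0x80 (-128) becomes (int16_t)-128 << 2 == 0xfe00 (-512).
 * With sel == 1 the odd-numbered (top) narrow elements are selected instead,
 * and the ushll variants zero-extend rather than sign-extend.
 */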
1544
047cec97
RH
1545/* Two-operand reduction expander, controlled by a predicate.
1546 * The difference between TYPERED and TYPERET has to do with
1547 * sign-extension. E.g. for SMAX, TYPERED must be signed,
1548 * but TYPERET must be unsigned so that e.g. a 32-bit value
1549 * is not sign-extended to the ABI uint64_t return type.
1550 */
1551/* ??? If we were to vectorize this by hand the reduction ordering
1552 * would change. For integer operands, this is perfectly fine.
1553 */
1554#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
1555uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
1556{ \
1557 intptr_t i, opr_sz = simd_oprsz(desc); \
1558 TYPERED ret = INIT; \
1559 for (i = 0; i < opr_sz; ) { \
1560 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1561 do { \
1562 if (pg & 1) { \
1563 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
1564 ret = OP(ret, nn); \
1565 } \
1566 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
1567 } while (i & 15); \
1568 } \
1569 return (TYPERET)ret; \
1570}
1571
1572#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
1573uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
1574{ \
1575 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1576 TYPEE *n = vn; \
1577 uint8_t *pg = vg; \
1578 TYPER ret = INIT; \
1579 for (i = 0; i < opr_sz; i += 1) { \
1580 if (pg[H1(i)] & 1) { \
1581 TYPEE nn = n[i]; \
1582 ret = OP(ret, nn); \
1583 } \
1584 } \
1585 return ret; \
1586}
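/*
 * For example, sve_smaxv_s uses TYPERED = int32_t so that DO_MAX compares
 * signed values, but TYPERET = uint32_t so that a negative maximum such as
 * -5 (0xfffffffb) is returned as 0x00000000fffffffb rather than being
 * sign-extended into the uint64_t ABI return value.
 */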
1587
1588DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
1589DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
1590DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
1591DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
1592
1593DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
1594DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
1595DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
1596DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
1597
1598DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
1599DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
1600DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
1601DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
1602
1603DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1604DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1605DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1606
1607DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1608DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1609DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1610DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
1611
1612DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
1613DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
1614DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
1615DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
1616
1617DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
1618DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
1619DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
1620DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
1621
1622DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
1623DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
1624DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
1625DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
1626
1627DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
1628DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
1629DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
1630DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
1631
1632#undef DO_VPZ
1633#undef DO_VPZ_D
1634
6e6a157d
RH
1635/* Two vector operand, one scalar operand, unpredicated. */
1636#define DO_ZZI(NAME, TYPE, OP) \
1637void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
1638{ \
1639 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
1640 TYPE s = s64, *d = vd, *n = vn; \
1641 for (i = 0; i < opr_sz; ++i) { \
1642 d[i] = OP(n[i], s); \
1643 } \
1644}
1645
1646#define DO_SUBR(X, Y) (Y - X)
1647
1648DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
1649DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
1650DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
1651DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
1652
1653DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
1654DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
1655DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
1656DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
1657
1658DO_ZZI(sve_smini_b, int8_t, DO_MIN)
1659DO_ZZI(sve_smini_h, int16_t, DO_MIN)
1660DO_ZZI(sve_smini_s, int32_t, DO_MIN)
1661DO_ZZI(sve_smini_d, int64_t, DO_MIN)
1662
1663DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
1664DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
1665DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
1666DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
1667
1668DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
1669DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
1670DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
1671DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
1672
1673#undef DO_ZZI
1674
f97cfd59
RH
1675#undef DO_AND
1676#undef DO_ORR
1677#undef DO_EOR
1678#undef DO_BIC
1679#undef DO_ADD
1680#undef DO_SUB
1681#undef DO_MAX
1682#undef DO_MIN
1683#undef DO_ABD
1684#undef DO_MUL
1685#undef DO_DIV
27721dbb
RH
1686#undef DO_ASR
1687#undef DO_LSR
1688#undef DO_LSL
6e6a157d 1689#undef DO_SUBR
f97cfd59 1690
028e2a7b
RH
1691/* Similar to the ARM LastActiveElement pseudocode function, except the
1692 result is multiplied by the element size. This includes the not found
1693 indication; e.g. not found for esz=3 is -8. */
1694static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
1695{
1696 uint64_t mask = pred_esz_masks[esz];
1697 intptr_t i = words;
1698
1699 do {
1700 uint64_t this_g = g[--i] & mask;
1701 if (this_g) {
1702 return i * 64 + (63 - clz64(this_g));
1703 }
1704 } while (i > 0);
1705 return (intptr_t)-1 << esz;
1706}
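/*
 * E.g. for esz == 2 (word elements) the mask is 0x1111...1111, one canonical
 * predicate bit per 4 bytes.  With words == 1 and g[0] == 0x10, the last
 * active element is element 1 and the function returns 1 * 4 == 4; with no
 * bits set it returns -1 << 2 == -4.
 */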
1707
86300b5d 1708uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
028e2a7b 1709{
86300b5d 1710 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
028e2a7b
RH
1711 uint32_t flags = PREDTEST_INIT;
1712 uint64_t *d = vd, *g = vg;
1713 intptr_t i = 0;
1714
1715 do {
1716 uint64_t this_d = d[i];
1717 uint64_t this_g = g[i];
1718
1719 if (this_g) {
1720 if (!(flags & 4)) {
1721 /* Set in D the first bit of G. */
1722 this_d |= this_g & -this_g;
1723 d[i] = this_d;
1724 }
1725 flags = iter_predtest_fwd(this_d, this_g, flags);
1726 }
1727 } while (++i < words);
1728
1729 return flags;
1730}
1731
1732uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
1733{
86300b5d
RH
1734 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1735 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
028e2a7b
RH
1736 uint32_t flags = PREDTEST_INIT;
1737 uint64_t *d = vd, *g = vg, esz_mask;
1738 intptr_t i, next;
1739
1740 next = last_active_element(vd, words, esz) + (1 << esz);
1741 esz_mask = pred_esz_masks[esz];
1742
1743 /* Similar to the pseudocode for pnext, but scaled by ESZ
1744 so that we find the correct bit. */
1745 if (next < words * 64) {
1746 uint64_t mask = -1;
1747
1748 if (next & 63) {
1749 mask = ~((1ull << (next & 63)) - 1);
1750 next &= -64;
1751 }
1752 do {
1753 uint64_t this_g = g[next / 64] & esz_mask & mask;
1754 if (this_g != 0) {
1755 next = (next & -64) + ctz64(this_g);
1756 break;
1757 }
1758 next += 64;
1759 mask = -1;
1760 } while (next < words * 64);
1761 }
1762
1763 i = 0;
1764 do {
1765 uint64_t this_d = 0;
1766 if (i == next / 64) {
1767 this_d = 1ull << (next & 63);
1768 }
1769 d[i] = this_d;
1770 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
1771 } while (++i < words);
1772
1773 return flags;
1774}
ccd841c3 1775
60245996
RH
1776/*
1777 * Copy Zn into Zd, and store zero into inactive elements.
1778 * If inv, store zeros into the active elements.
ccd841c3 1779 */
68459864
RH
1780void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1781{
1782 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
60245996 1783 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
68459864
RH
1784 uint64_t *d = vd, *n = vn;
1785 uint8_t *pg = vg;
60245996 1786
68459864 1787 for (i = 0; i < opr_sz; i += 1) {
60245996 1788 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
68459864
RH
1789 }
1790}
1791
1792void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1793{
1794 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
60245996 1795 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
68459864
RH
1796 uint64_t *d = vd, *n = vn;
1797 uint8_t *pg = vg;
60245996 1798
68459864 1799 for (i = 0; i < opr_sz; i += 1) {
60245996 1800 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
68459864
RH
1801 }
1802}
1803
1804void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
1805{
1806 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
60245996 1807 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
68459864
RH
1808 uint64_t *d = vd, *n = vn;
1809 uint8_t *pg = vg;
60245996 1810
68459864 1811 for (i = 0; i < opr_sz; i += 1) {
60245996 1812 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
68459864
RH
1813 }
1814}
1815
1816void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1817{
1818 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1819 uint64_t *d = vd, *n = vn;
1820 uint8_t *pg = vg;
60245996
RH
1821 uint8_t inv = simd_data(desc);
1822
68459864 1823 for (i = 0; i < opr_sz; i += 1) {
60245996 1824 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
68459864
RH
1825 }
1826}
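/*
 * E.g. for the byte form with inv == 0 and pg[0] == 0x05, expand_pred_b
 * yields 0x0000000000ff00ff, so bytes 0 and 2 of n[0] are kept and the
 * remaining bytes are zeroed; with inv == -1 the mask is complemented and
 * the active bytes are zeroed instead.
 */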
1827
ccd841c3
RH
1828/* Three-operand expander, immediate operand, controlled by a predicate.
1829 */
1830#define DO_ZPZI(NAME, TYPE, H, OP) \
1831void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1832{ \
1833 intptr_t i, opr_sz = simd_oprsz(desc); \
1834 TYPE imm = simd_data(desc); \
1835 for (i = 0; i < opr_sz; ) { \
1836 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1837 do { \
1838 if (pg & 1) { \
1839 TYPE nn = *(TYPE *)(vn + H(i)); \
1840 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
1841 } \
1842 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1843 } while (i & 15); \
1844 } \
1845}
1846
1847/* Similarly, specialized for 64-bit operands. */
1848#define DO_ZPZI_D(NAME, TYPE, OP) \
1849void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1850{ \
1851 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1852 TYPE *d = vd, *n = vn; \
1853 TYPE imm = simd_data(desc); \
1854 uint8_t *pg = vg; \
1855 for (i = 0; i < opr_sz; i += 1) { \
1856 if (pg[H1(i)] & 1) { \
1857 TYPE nn = n[i]; \
1858 d[i] = OP(nn, imm); \
1859 } \
1860 } \
1861}
1862
1863#define DO_SHR(N, M) (N >> M)
1864#define DO_SHL(N, M) (N << M)
1865
1866/* Arithmetic shift right for division. This rounds negative numbers
1867 toward zero as per signed division. Therefore before shifting,
1868 when N is negative, add 2**M-1. */
1869#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
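/* E.g. DO_ASRD(-7, 2) computes (-7 + 3) >> 2 == -1, matching -7 / 4 with
   truncation toward zero, where a plain arithmetic shift would give -2. */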
1870
46d111b2
RH
1871static inline uint64_t do_urshr(uint64_t x, unsigned sh)
1872{
1873 if (likely(sh < 64)) {
1874 return (x >> sh) + ((x >> (sh - 1)) & 1);
1875 } else if (sh == 64) {
1876 return x >> 63;
1877 } else {
1878 return 0;
1879 }
1880}
1881
81fd3e6e
RH
1882static inline int64_t do_srshr(int64_t x, unsigned sh)
1883{
1884 if (likely(sh < 64)) {
1885 return (x >> sh) + ((x >> (sh - 1)) & 1);
1886 } else {
1887 /* Rounding the sign bit always produces 0. */
1888 return 0;
1889 }
1890}
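/*
 * Both helpers add back the last bit shifted out, i.e. they round to
 * nearest with ties rounded up:
 *   do_urshr(7, 2)  == (7 >> 2) + 1 == 2    (7/4 == 1.75)
 *   do_srshr(-7, 2) == (-7 >> 2) + 0 == -2  (-7/4 == -1.75)
 */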
1891
ccd841c3
RH
1892DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
1893DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
1894DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
1895DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
1896
1897DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
1898DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
1899DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
1900DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
1901
1902DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
1903DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
1904DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
1905DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
1906
1907DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
1908DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
1909DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
1910DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
1911
ccd841c3
RH
1912#undef DO_ASRD
1913#undef DO_ZPZI
1914#undef DO_ZPZI_D
96a36e4a 1915
46d111b2
RH
1916#define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \
1917void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1918{ \
1919 intptr_t i, opr_sz = simd_oprsz(desc); \
1920 int shift = simd_data(desc); \
1921 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1922 TYPEW nn = *(TYPEW *)(vn + i); \
1923 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift); \
1924 } \
1925}
1926
1927#define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP) \
1928void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1929{ \
1930 intptr_t i, opr_sz = simd_oprsz(desc); \
1931 int shift = simd_data(desc); \
1932 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1933 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
1934 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift); \
1935 } \
1936}
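/*
 * DO_SHRNB stores the narrowed result as a zero-extended wide element, so
 * the odd-numbered (top) narrow elements of the destination become zero;
 * DO_SHRNT stores only into the top narrow element of each wide element,
 * leaving the bottom halves of the destination unchanged.
 */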
1937
1938DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
1939DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
1940DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)
1941
1942DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
1943DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
1944DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, , H1_4, DO_SHR)
1945
1946DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
1947DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
1948DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)
1949
1950DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
1951DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
1952DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, , H1_4, do_urshr)
1953
81fd3e6e
RH
1954#define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
1955#define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
1956#define DO_SQSHRUN_D(x, sh) \
1957 do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)
1958
1959DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
1960DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
1961DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)
1962
1963DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
1964DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
1965DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, , H1_4, DO_SQSHRUN_D)
1966
1967#define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
1968#define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
1969#define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)
1970
1971DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
1972DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
1973DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)
1974
1975DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
1976DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
1977DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, , H1_4, DO_SQRSHRUN_D)
1978
743bb147
RH
1979#define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
1980#define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
1981#define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)
1982
1983DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
1984DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
1985DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)
1986
1987DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
1988DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
1989DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, , H1_4, DO_SQSHRN_D)
1990
1991#define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
1992#define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
1993#define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)
1994
1995DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
1996DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
1997DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)
1998
1999DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
2000DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
2001DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, , H1_4, DO_SQRSHRN_D)
2002
c13418da
RH
2003#define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
2004#define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
2005#define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)
2006
2007DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
2008DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
2009DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)
2010
2011DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
2012DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
2013DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, , H1_4, DO_UQSHRN_D)
2014
2015#define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
2016#define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
2017#define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)
2018
2019DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
2020DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
2021DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)
2022
2023DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
2024DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
2025DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, , H1_4, DO_UQRSHRN_D)
2026
46d111b2
RH
2027#undef DO_SHRNB
2028#undef DO_SHRNT
2029
96a36e4a
RH
2030/* Fully general four-operand expander, controlled by a predicate.
2031 */
2032#define DO_ZPZZZ(NAME, TYPE, H, OP) \
2033void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
2034 void *vg, uint32_t desc) \
2035{ \
2036 intptr_t i, opr_sz = simd_oprsz(desc); \
2037 for (i = 0; i < opr_sz; ) { \
2038 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2039 do { \
2040 if (pg & 1) { \
2041 TYPE nn = *(TYPE *)(vn + H(i)); \
2042 TYPE mm = *(TYPE *)(vm + H(i)); \
2043 TYPE aa = *(TYPE *)(va + H(i)); \
2044 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
2045 } \
2046 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2047 } while (i & 15); \
2048 } \
2049}
2050
2051/* Similarly, specialized for 64-bit operands. */
2052#define DO_ZPZZZ_D(NAME, TYPE, OP) \
2053void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
2054 void *vg, uint32_t desc) \
2055{ \
2056 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
2057 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
2058 uint8_t *pg = vg; \
2059 for (i = 0; i < opr_sz; i += 1) { \
2060 if (pg[H1(i)] & 1) { \
2061 TYPE aa = a[i], nn = n[i], mm = m[i]; \
2062 d[i] = OP(aa, nn, mm); \
2063 } \
2064 } \
2065}
2066
2067#define DO_MLA(A, N, M) (A + N * M)
2068#define DO_MLS(A, N, M) (A - N * M)
2069
2070DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
2071DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
2072
2073DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
2074DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
2075
2076DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
2077DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
2078
2079DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
2080DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
2081
2082#undef DO_MLA
2083#undef DO_MLS
2084#undef DO_ZPZZZ
2085#undef DO_ZPZZZ_D
9a56c9c3
RH
2086
2087void HELPER(sve_index_b)(void *vd, uint32_t start,
2088 uint32_t incr, uint32_t desc)
2089{
2090 intptr_t i, opr_sz = simd_oprsz(desc);
2091 uint8_t *d = vd;
2092 for (i = 0; i < opr_sz; i += 1) {
2093 d[H1(i)] = start + i * incr;
2094 }
2095}
2096
2097void HELPER(sve_index_h)(void *vd, uint32_t start,
2098 uint32_t incr, uint32_t desc)
2099{
2100 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2101 uint16_t *d = vd;
2102 for (i = 0; i < opr_sz; i += 1) {
2103 d[H2(i)] = start + i * incr;
2104 }
2105}
2106
2107void HELPER(sve_index_s)(void *vd, uint32_t start,
2108 uint32_t incr, uint32_t desc)
2109{
2110 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2111 uint32_t *d = vd;
2112 for (i = 0; i < opr_sz; i += 1) {
2113 d[H4(i)] = start + i * incr;
2114 }
2115}
2116
2117void HELPER(sve_index_d)(void *vd, uint64_t start,
2118 uint64_t incr, uint32_t desc)
2119{
2120 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2121 uint64_t *d = vd;
2122 for (i = 0; i < opr_sz; i += 1) {
2123 d[i] = start + i * incr;
2124 }
2125}
4b242d9c
RH
2126
2127void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
2128{
2129 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2130 uint32_t sh = simd_data(desc);
2131 uint32_t *d = vd, *n = vn, *m = vm;
2132 for (i = 0; i < opr_sz; i += 1) {
2133 d[i] = n[i] + (m[i] << sh);
2134 }
2135}
2136
2137void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
2138{
2139 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2140 uint64_t sh = simd_data(desc);
2141 uint64_t *d = vd, *n = vn, *m = vm;
2142 for (i = 0; i < opr_sz; i += 1) {
2143 d[i] = n[i] + (m[i] << sh);
2144 }
2145}
2146
2147void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
2148{
2149 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2150 uint64_t sh = simd_data(desc);
2151 uint64_t *d = vd, *n = vn, *m = vm;
2152 for (i = 0; i < opr_sz; i += 1) {
2153 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
2154 }
2155}
2156
2157void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
2158{
2159 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2160 uint64_t sh = simd_data(desc);
2161 uint64_t *d = vd, *n = vn, *m = vm;
2162 for (i = 0; i < opr_sz; i += 1) {
2163 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
2164 }
2165}
0762cd42
RH
2166
2167void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
2168{
2169 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2170 static const uint16_t coeff[] = {
2171 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
2172 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
2173 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
2174 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
2175 };
2176 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2177 uint16_t *d = vd, *n = vn;
2178
2179 for (i = 0; i < opr_sz; i++) {
2180 uint16_t nn = n[i];
2181 intptr_t idx = extract32(nn, 0, 5);
2182 uint16_t exp = extract32(nn, 5, 5);
2183 d[i] = coeff[idx] | (exp << 10);
2184 }
2185}
2186
2187void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
2188{
2189 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2190 static const uint32_t coeff[] = {
2191 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
2192 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
2193 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
2194 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
2195 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
2196 0x1ef532, 0x20b051, 0x227043, 0x243516,
2197 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
2198 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
2199 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
2200 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
2201 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
2202 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
2203 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
2204 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
2205 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
2206 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
2207 };
2208 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2209 uint32_t *d = vd, *n = vn;
2210
2211 for (i = 0; i < opr_sz; i++) {
2212 uint32_t nn = n[i];
2213 intptr_t idx = extract32(nn, 0, 6);
2214 uint32_t exp = extract32(nn, 6, 8);
2215 d[i] = coeff[idx] | (exp << 23);
2216 }
2217}
2218
2219void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
2220{
2221 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2222 static const uint64_t coeff[] = {
2223 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
2224 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
2225 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
2226 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
2227 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
2228 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
2229 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
2230 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
2231 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
2232 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
2233 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
2234 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
2235 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
2236 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
2237 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
2238 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
2239 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
2240 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
2241 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
2242 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
2243 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
2244 0xFA7C1819E90D8ull,
2245 };
2246 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2247 uint64_t *d = vd, *n = vn;
2248
2249 for (i = 0; i < opr_sz; i++) {
2250 uint64_t nn = n[i];
2251 intptr_t idx = extract32(nn, 0, 6);
2252 uint64_t exp = extract32(nn, 6, 11);
2253 d[i] = coeff[idx] | (exp << 52);
2254 }
2255}
a1f233f2
RH
2256
2257void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
2258{
2259 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2260 uint16_t *d = vd, *n = vn, *m = vm;
2261 for (i = 0; i < opr_sz; i += 1) {
2262 uint16_t nn = n[i];
2263 uint16_t mm = m[i];
2264 if (mm & 1) {
2265 nn = float16_one;
2266 }
2267 d[i] = nn ^ (mm & 2) << 14;
2268 }
2269}
2270
2271void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
2272{
2273 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2274 uint32_t *d = vd, *n = vn, *m = vm;
2275 for (i = 0; i < opr_sz; i += 1) {
2276 uint32_t nn = n[i];
2277 uint32_t mm = m[i];
2278 if (mm & 1) {
2279 nn = float32_one;
2280 }
2281 d[i] = nn ^ (mm & 2) << 30;
2282 }
2283}
2284
2285void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
2286{
2287 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2288 uint64_t *d = vd, *n = vn, *m = vm;
2289 for (i = 0; i < opr_sz; i += 1) {
2290 uint64_t nn = n[i];
2291 uint64_t mm = m[i];
2292 if (mm & 1) {
2293 nn = float64_one;
2294 }
2295 d[i] = nn ^ (mm & 2) << 62;
2296 }
2297}
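/*
 * Bit 0 of the control operand selects 1.0 in place of the input and bit 1
 * flips the sign.  E.g. for the half-precision form, nn == 0x4200 (3.0)
 * with mm == 3 yields 0x3c00 ^ 0x8000 == 0xbc00 (-1.0).
 */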
24e82e68
RH
2298
2299/*
2300 * Signed saturating addition with scalar operand.
2301 */
2302
2303void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2304{
2305 intptr_t i, oprsz = simd_oprsz(desc);
2306
2307 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
4f07fbeb 2308 *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
24e82e68
RH
2309 }
2310}
2311
2312void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2313{
2314 intptr_t i, oprsz = simd_oprsz(desc);
2315
2316 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
4f07fbeb 2317 *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
24e82e68
RH
2318 }
2319}
2320
2321void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2322{
2323 intptr_t i, oprsz = simd_oprsz(desc);
2324
2325 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
4f07fbeb 2326 *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
24e82e68
RH
2327 }
2328}
2329
2330void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
2331{
2332 intptr_t i, oprsz = simd_oprsz(desc);
2333
2334 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
4f07fbeb 2335 *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
24e82e68
RH
2336 }
2337}
2338
2339/*
2340 * Unsigned saturating addition with scalar operand.
2341 */
2342
2343void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2344{
2345 intptr_t i, oprsz = simd_oprsz(desc);
2346
2347 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
4f07fbeb 2348 *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
24e82e68
RH
2349 }
2350}
2351
2352void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2353{
2354 intptr_t i, oprsz = simd_oprsz(desc);
2355
2356 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
4f07fbeb 2357 *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
24e82e68
RH
2358 }
2359}
2360
2361void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2362{
2363 intptr_t i, oprsz = simd_oprsz(desc);
2364
2365 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
4f07fbeb 2366 *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
24e82e68
RH
2367 }
2368}
2369
2370void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2371{
2372 intptr_t i, oprsz = simd_oprsz(desc);
2373
2374 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
4f07fbeb 2375 *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
24e82e68
RH
2376 }
2377}
2378
2379void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2380{
2381 intptr_t i, oprsz = simd_oprsz(desc);
2382
2383 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
4f07fbeb 2384 *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
24e82e68
RH
2385 }
2386}
f25a2361
RH
2387
2388/* Two operand predicated copy immediate with merge. All valid immediates
2389 * can fit within 17 signed bits in the simd_data field.
2390 */
2391void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
2392 uint64_t mm, uint32_t desc)
2393{
2394 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2395 uint64_t *d = vd, *n = vn;
2396 uint8_t *pg = vg;
2397
2398 mm = dup_const(MO_8, mm);
2399 for (i = 0; i < opr_sz; i += 1) {
2400 uint64_t nn = n[i];
2401 uint64_t pp = expand_pred_b(pg[H1(i)]);
2402 d[i] = (mm & pp) | (nn & ~pp);
2403 }
2404}
2405
2406void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
2407 uint64_t mm, uint32_t desc)
2408{
2409 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2410 uint64_t *d = vd, *n = vn;
2411 uint8_t *pg = vg;
2412
2413 mm = dup_const(MO_16, mm);
2414 for (i = 0; i < opr_sz; i += 1) {
2415 uint64_t nn = n[i];
2416 uint64_t pp = expand_pred_h(pg[H1(i)]);
2417 d[i] = (mm & pp) | (nn & ~pp);
2418 }
2419}
2420
2421void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
2422 uint64_t mm, uint32_t desc)
2423{
2424 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2425 uint64_t *d = vd, *n = vn;
2426 uint8_t *pg = vg;
2427
2428 mm = dup_const(MO_32, mm);
2429 for (i = 0; i < opr_sz; i += 1) {
2430 uint64_t nn = n[i];
2431 uint64_t pp = expand_pred_s(pg[H1(i)]);
2432 d[i] = (mm & pp) | (nn & ~pp);
2433 }
2434}
2435
2436void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
2437 uint64_t mm, uint32_t desc)
2438{
2439 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2440 uint64_t *d = vd, *n = vn;
2441 uint8_t *pg = vg;
2442
2443 for (i = 0; i < opr_sz; i += 1) {
2444 uint64_t nn = n[i];
2445 d[i] = (pg[H1(i)] & 1 ? mm : nn);
2446 }
2447}
2448
2449void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
2450{
2451 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2452 uint64_t *d = vd;
2453 uint8_t *pg = vg;
2454
2455 val = dup_const(MO_8, val);
2456 for (i = 0; i < opr_sz; i += 1) {
2457 d[i] = val & expand_pred_b(pg[H1(i)]);
2458 }
2459}
2460
2461void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
2462{
2463 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2464 uint64_t *d = vd;
2465 uint8_t *pg = vg;
2466
2467 val = dup_const(MO_16, val);
2468 for (i = 0; i < opr_sz; i += 1) {
2469 d[i] = val & expand_pred_h(pg[H1(i)]);
2470 }
2471}
2472
2473void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
2474{
2475 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2476 uint64_t *d = vd;
2477 uint8_t *pg = vg;
2478
2479 val = dup_const(MO_32, val);
2480 for (i = 0; i < opr_sz; i += 1) {
2481 d[i] = val & expand_pred_s(pg[H1(i)]);
2482 }
2483}
2484
2485void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
2486{
2487 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2488 uint64_t *d = vd;
2489 uint8_t *pg = vg;
2490
2491 for (i = 0; i < opr_sz; i += 1) {
2492 d[i] = (pg[H1(i)] & 1 ? val : 0);
2493 }
2494}
b94f8f60 2495
b4cd95d2 2496/* Big-endian hosts need to frob the byte indices. If the copy
b94f8f60
RH
2497 * happens to be 8-byte aligned, then no frobbing is necessary.
2498 */
2499static void swap_memmove(void *vd, void *vs, size_t n)
2500{
2501 uintptr_t d = (uintptr_t)vd;
2502 uintptr_t s = (uintptr_t)vs;
2503 uintptr_t o = (d | s | n) & 7;
2504 size_t i;
2505
2506#ifndef HOST_WORDS_BIGENDIAN
2507 o = 0;
2508#endif
2509 switch (o) {
2510 case 0:
2511 memmove(vd, vs, n);
2512 break;
2513
2514 case 4:
2515 if (d < s || d >= s + n) {
2516 for (i = 0; i < n; i += 4) {
2517 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2518 }
2519 } else {
2520 for (i = n; i > 0; ) {
2521 i -= 4;
2522 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2523 }
2524 }
2525 break;
2526
2527 case 2:
2528 case 6:
2529 if (d < s || d >= s + n) {
2530 for (i = 0; i < n; i += 2) {
2531 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2532 }
2533 } else {
2534 for (i = n; i > 0; ) {
2535 i -= 2;
2536 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2537 }
2538 }
2539 break;
2540
2541 default:
2542 if (d < s || d >= s + n) {
2543 for (i = 0; i < n; i++) {
2544 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2545 }
2546 } else {
2547 for (i = n; i > 0; ) {
2548 i -= 1;
2549 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2550 }
2551 }
2552 break;
2553 }
2554}
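/*
 * E.g. on a big-endian host with o == 4, the uint32_t at SVE byte offset 0
 * lives at host offset 4 of its 64-bit chunk (H1_4(0) == 4), so the copy
 * must move 4-byte units through H1_4 rather than using a raw memmove.
 */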
2555
9123aeb6
RH
2556/* Similarly for memset of 0. */
2557static void swap_memzero(void *vd, size_t n)
2558{
2559 uintptr_t d = (uintptr_t)vd;
2560 uintptr_t o = (d | n) & 7;
2561 size_t i;
2562
2563 /* Usually, the first bit of a predicate is set, so N is 0. */
2564 if (likely(n == 0)) {
2565 return;
2566 }
2567
2568#ifndef HOST_WORDS_BIGENDIAN
2569 o = 0;
2570#endif
2571 switch (o) {
2572 case 0:
2573 memset(vd, 0, n);
2574 break;
2575
2576 case 4:
2577 for (i = 0; i < n; i += 4) {
2578 *(uint32_t *)H1_4(d + i) = 0;
2579 }
2580 break;
2581
2582 case 2:
2583 case 6:
2584 for (i = 0; i < n; i += 2) {
2585 *(uint16_t *)H1_2(d + i) = 0;
2586 }
2587 break;
2588
2589 default:
2590 for (i = 0; i < n; i++) {
2591 *(uint8_t *)H1(d + i) = 0;
2592 }
2593 break;
2594 }
2595}
2596
b94f8f60
RH
2597void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
2598{
2599 intptr_t opr_sz = simd_oprsz(desc);
2600 size_t n_ofs = simd_data(desc);
2601 size_t n_siz = opr_sz - n_ofs;
2602
2603 if (vd != vm) {
2604 swap_memmove(vd, vn + n_ofs, n_siz);
2605 swap_memmove(vd + n_siz, vm, n_ofs);
2606 } else if (vd != vn) {
2607 swap_memmove(vd + n_siz, vd, n_ofs);
2608 swap_memmove(vd, vn + n_ofs, n_siz);
2609 } else {
2610 /* vd == vn == vm. Need temp space. */
2611 ARMVectorReg tmp;
2612 swap_memmove(&tmp, vm, n_ofs);
2613 swap_memmove(vd, vd + n_ofs, n_siz);
2614 memcpy(vd + n_siz, &tmp, n_ofs);
2615 }
2616}
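/*
 * I.e. the result is bytes [n_ofs .. opr_sz-1] of Zn followed by bytes
 * [0 .. n_ofs-1] of Zm.  For example with opr_sz == 32 and n_ofs == 5,
 * d[0..26] comes from n[5..31] and d[27..31] comes from m[0..4].
 */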
30562ab7
RH
2617
2618#define DO_INSR(NAME, TYPE, H) \
2619void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
2620{ \
2621 intptr_t opr_sz = simd_oprsz(desc); \
2622 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
2623 *(TYPE *)(vd + H(0)) = val; \
2624}
2625
2626DO_INSR(sve_insr_b, uint8_t, H1)
2627DO_INSR(sve_insr_h, uint16_t, H1_2)
2628DO_INSR(sve_insr_s, uint32_t, H1_4)
2629DO_INSR(sve_insr_d, uint64_t, )
2630
2631#undef DO_INSR
2632
2633void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
2634{
2635 intptr_t i, j, opr_sz = simd_oprsz(desc);
2636 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2637 uint64_t f = *(uint64_t *)(vn + i);
2638 uint64_t b = *(uint64_t *)(vn + j);
2639 *(uint64_t *)(vd + i) = bswap64(b);
2640 *(uint64_t *)(vd + j) = bswap64(f);
2641 }
2642}
2643
30562ab7
RH
2644void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
2645{
2646 intptr_t i, j, opr_sz = simd_oprsz(desc);
2647 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2648 uint64_t f = *(uint64_t *)(vn + i);
2649 uint64_t b = *(uint64_t *)(vn + j);
2650 *(uint64_t *)(vd + i) = hswap64(b);
2651 *(uint64_t *)(vd + j) = hswap64(f);
2652 }
2653}
2654
2655void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
2656{
2657 intptr_t i, j, opr_sz = simd_oprsz(desc);
2658 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2659 uint64_t f = *(uint64_t *)(vn + i);
2660 uint64_t b = *(uint64_t *)(vn + j);
2661 *(uint64_t *)(vd + i) = rol64(b, 32);
2662 *(uint64_t *)(vd + j) = rol64(f, 32);
2663 }
2664}
2665
2666void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
2667{
2668 intptr_t i, j, opr_sz = simd_oprsz(desc);
2669 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2670 uint64_t f = *(uint64_t *)(vn + i);
2671 uint64_t b = *(uint64_t *)(vn + j);
2672 *(uint64_t *)(vd + i) = b;
2673 *(uint64_t *)(vd + j) = f;
2674 }
2675}
2676
2677#define DO_TBL(NAME, TYPE, H) \
2678void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2679{ \
2680 intptr_t i, opr_sz = simd_oprsz(desc); \
2681 uintptr_t elem = opr_sz / sizeof(TYPE); \
2682 TYPE *d = vd, *n = vn, *m = vm; \
2683 ARMVectorReg tmp; \
2684 if (unlikely(vd == vn)) { \
2685 n = memcpy(&tmp, vn, opr_sz); \
2686 } \
2687 for (i = 0; i < elem; i++) { \
2688 TYPE j = m[H(i)]; \
2689 d[H(i)] = j < elem ? n[H(j)] : 0; \
2690 } \
2691}
2692
2693DO_TBL(sve_tbl_b, uint8_t, H1)
2694DO_TBL(sve_tbl_h, uint16_t, H2)
2695DO_TBL(sve_tbl_s, uint32_t, H4)
2696DO_TBL(sve_tbl_d, uint64_t, )
2697
2698#undef DO_TBL
2699
2700#define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
2701void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2702{ \
2703 intptr_t i, opr_sz = simd_oprsz(desc); \
2704 TYPED *d = vd; \
2705 TYPES *n = vn; \
2706 ARMVectorReg tmp; \
2707 if (unlikely(vn - vd < opr_sz)) { \
2708 n = memcpy(&tmp, n, opr_sz / 2); \
2709 } \
2710 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
2711 d[HD(i)] = n[HS(i)]; \
2712 } \
2713}
2714
2715DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
2716DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
2717DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)
2718
2719DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
2720DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
2721DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
2722
2723#undef DO_UNPK
d731d8cb
RH
2724
2725/* Mask of bits included in the even numbered predicates of width esz.
2726 * We also use this for expand_bits/compress_bits, and so extend the
2727 * same pattern out to 16-bit units.
2728 */
2729static const uint64_t even_bit_esz_masks[5] = {
2730 0x5555555555555555ull,
2731 0x3333333333333333ull,
2732 0x0f0f0f0f0f0f0f0full,
2733 0x00ff00ff00ff00ffull,
2734 0x0000ffff0000ffffull,
2735};
2736
2737/* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
2738 * For N==0, this corresponds to the operation that in qemu/bitops.h
2739 * we call half_shuffle64; this algorithm is from Hacker's Delight,
2740 * section 7-2 Shuffling Bits.
2741 */
2742static uint64_t expand_bits(uint64_t x, int n)
2743{
2744 int i;
2745
2746 x &= 0xffffffffu;
2747 for (i = 4; i >= n; i--) {
2748 int sh = 1 << i;
2749 x = ((x << sh) | x) & even_bit_esz_masks[i];
2750 }
2751 return x;
2752}
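/* E.g. expand_bits(0b1011, 0) == 0b01000101: each bit i of the input moves
   to bit 2*i, interleaving the input bits with zeros. */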
2753
2754/* Compress units of 2**(N+1) bits to units of 2**N bits.
2755 * For N==0, this corresponds to the operation that in qemu/bitops.h
2756 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
2757 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
2758 */
2759static uint64_t compress_bits(uint64_t x, int n)
2760{
2761 int i;
2762
2763 for (i = n; i <= 4; i++) {
2764 int sh = 1 << i;
2765 x &= even_bit_esz_masks[i];
2766 x = (x >> sh) | x;
2767 }
2768 return x & 0xffffffffu;
2769}
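/* E.g. compress_bits(0b01000101, 0) == 0b1011: the even-numbered bits are
   packed back down and the odd-numbered bits are discarded. */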
2770
2771void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
2772{
f9b0fcce
RH
2773 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2774 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
2775 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
8e7fefed 2776 int esize = 1 << esz;
d731d8cb
RH
2777 uint64_t *d = vd;
2778 intptr_t i;
2779
2780 if (oprsz <= 8) {
2781 uint64_t nn = *(uint64_t *)vn;
2782 uint64_t mm = *(uint64_t *)vm;
2783 int half = 4 * oprsz;
2784
2785 nn = extract64(nn, high * half, half);
2786 mm = extract64(mm, high * half, half);
2787 nn = expand_bits(nn, esz);
2788 mm = expand_bits(mm, esz);
8e7fefed 2789 d[0] = nn | (mm << esize);
d731d8cb 2790 } else {
8e7fefed 2791 ARMPredicateReg tmp;
d731d8cb
RH
2792
2793 /* We produce output faster than we consume input.
2794 Therefore we must be mindful of possible overlap. */
8e7fefed
RH
2795 if (vd == vn) {
2796 vn = memcpy(&tmp, vn, oprsz);
2797 if (vd == vm) {
2798 vm = vn;
2799 }
2800 } else if (vd == vm) {
2801 vm = memcpy(&tmp, vm, oprsz);
d731d8cb
RH
2802 }
2803 if (high) {
2804 high = oprsz >> 1;
2805 }
2806
8e7fefed 2807 if ((oprsz & 7) == 0) {
d731d8cb
RH
2808 uint32_t *n = vn, *m = vm;
2809 high >>= 2;
2810
8e7fefed 2811 for (i = 0; i < oprsz / 8; i++) {
d731d8cb
RH
2812 uint64_t nn = n[H4(high + i)];
2813 uint64_t mm = m[H4(high + i)];
2814
2815 nn = expand_bits(nn, esz);
2816 mm = expand_bits(mm, esz);
8e7fefed 2817 d[i] = nn | (mm << esize);
d731d8cb
RH
2818 }
2819 } else {
2820 uint8_t *n = vn, *m = vm;
2821 uint16_t *d16 = vd;
2822
2823 for (i = 0; i < oprsz / 2; i++) {
2824 uint16_t nn = n[H1(high + i)];
2825 uint16_t mm = m[H1(high + i)];
2826
2827 nn = expand_bits(nn, esz);
2828 mm = expand_bits(mm, esz);
8e7fefed 2829 d16[H2(i)] = nn | (mm << esize);
d731d8cb
RH
2830 }
2831 }
2832 }
2833}
2834
2835void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
2836{
f9b0fcce
RH
2837 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2838 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
2839 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
d731d8cb
RH
2840 uint64_t *d = vd, *n = vn, *m = vm;
2841 uint64_t l, h;
2842 intptr_t i;
2843
2844 if (oprsz <= 8) {
2845 l = compress_bits(n[0] >> odd, esz);
2846 h = compress_bits(m[0] >> odd, esz);
226e6c04 2847 d[0] = l | (h << (4 * oprsz));
d731d8cb
RH
2848 } else {
2849 ARMPredicateReg tmp_m;
2850 intptr_t oprsz_16 = oprsz / 16;
2851
2852 if ((vm - vd) < (uintptr_t)oprsz) {
2853 m = memcpy(&tmp_m, vm, oprsz);
2854 }
2855
2856 for (i = 0; i < oprsz_16; i++) {
2857 l = n[2 * i + 0];
2858 h = n[2 * i + 1];
2859 l = compress_bits(l >> odd, esz);
2860 h = compress_bits(h >> odd, esz);
226e6c04 2861 d[i] = l | (h << 32);
d731d8cb
RH
2862 }
2863
226e6c04
RH
2864 /*
2865 * For VL which is not a multiple of 512, the results from M do not
2866 * align nicely with the uint64_t for D. Put the aligned results
2867 * from M into TMP_M and then copy it into place afterward.
2868 */
d731d8cb 2869 if (oprsz & 15) {
226e6c04
RH
2870 int final_shift = (oprsz & 15) * 2;
2871
2872 l = n[2 * i + 0];
2873 h = n[2 * i + 1];
2874 l = compress_bits(l >> odd, esz);
2875 h = compress_bits(h >> odd, esz);
2876 d[i] = l | (h << final_shift);
d731d8cb
RH
2877
2878 for (i = 0; i < oprsz_16; i++) {
2879 l = m[2 * i + 0];
2880 h = m[2 * i + 1];
2881 l = compress_bits(l >> odd, esz);
2882 h = compress_bits(h >> odd, esz);
226e6c04 2883 tmp_m.p[i] = l | (h << 32);
d731d8cb 2884 }
226e6c04
RH
2885 l = m[2 * i + 0];
2886 h = m[2 * i + 1];
2887 l = compress_bits(l >> odd, esz);
2888 h = compress_bits(h >> odd, esz);
2889 tmp_m.p[i] = l | (h << final_shift);
d731d8cb
RH
2890
2891 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
2892 } else {
2893 for (i = 0; i < oprsz_16; i++) {
2894 l = m[2 * i + 0];
2895 h = m[2 * i + 1];
2896 l = compress_bits(l >> odd, esz);
2897 h = compress_bits(h >> odd, esz);
226e6c04 2898 d[oprsz_16 + i] = l | (h << 32);
d731d8cb
RH
2899 }
2900 }
2901 }
2902}
2903
2904void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
2905{
f9b0fcce
RH
2906 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2907 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
2908 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
d731d8cb
RH
2909 uint64_t *d = vd, *n = vn, *m = vm;
2910 uint64_t mask;
2911 int shr, shl;
2912 intptr_t i;
2913
2914 shl = 1 << esz;
2915 shr = 0;
2916 mask = even_bit_esz_masks[esz];
2917 if (odd) {
2918 mask <<= shl;
2919 shr = shl;
2920 shl = 0;
2921 }
2922
2923 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2924 uint64_t nn = (n[i] & mask) >> shr;
2925 uint64_t mm = (m[i] & mask) << shl;
2926 d[i] = nn + mm;
2927 }
2928}
2929
2930/* Reverse units of 2**N bits. */
2931static uint64_t reverse_bits_64(uint64_t x, int n)
2932{
2933 int i, sh;
2934
2935 x = bswap64(x);
2936 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2937 uint64_t mask = even_bit_esz_masks[i];
2938 x = ((x & mask) << sh) | ((x >> sh) & mask);
2939 }
2940 return x;
2941}
2942
2943static uint8_t reverse_bits_8(uint8_t x, int n)
2944{
2945 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
2946 int i, sh;
2947
2948 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2949 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
2950 }
2951 return x;
2952}
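/*
 * E.g. reverse_bits_8(0b00000110, 0) == 0b01100000 (a full bit reversal),
 * while reverse_bits_64(x, 3) reduces to bswap64(x) alone, reversing
 * byte-sized units.
 */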
2953
2954void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
2955{
70acaafe
RH
2956 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2957 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
d731d8cb
RH
2958 intptr_t i, oprsz_2 = oprsz / 2;
2959
2960 if (oprsz <= 8) {
2961 uint64_t l = *(uint64_t *)vn;
2962 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
2963 *(uint64_t *)vd = l;
2964 } else if ((oprsz & 15) == 0) {
2965 for (i = 0; i < oprsz_2; i += 8) {
2966 intptr_t ih = oprsz - 8 - i;
2967 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
2968 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
2969 *(uint64_t *)(vd + i) = h;
2970 *(uint64_t *)(vd + ih) = l;
2971 }
2972 } else {
2973 for (i = 0; i < oprsz_2; i += 1) {
2974 intptr_t il = H1(i);
2975 intptr_t ih = H1(oprsz - 1 - i);
2976 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
2977 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
2978 *(uint8_t *)(vd + il) = h;
2979 *(uint8_t *)(vd + ih) = l;
2980 }
2981 }
2982}
2983
2984void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
2985{
70acaafe
RH
2986 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2987 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
d731d8cb
RH
2988 uint64_t *d = vd;
2989 intptr_t i;
2990
2991 if (oprsz <= 8) {
2992 uint64_t nn = *(uint64_t *)vn;
2993 int half = 4 * oprsz;
2994
2995 nn = extract64(nn, high * half, half);
2996 nn = expand_bits(nn, 0);
2997 d[0] = nn;
2998 } else {
2999 ARMPredicateReg tmp_n;
3000
3001 /* We produce output faster than we consume input.
3002 Therefore we must be mindful of possible overlap. */
3003 if ((vn - vd) < (uintptr_t)oprsz) {
3004 vn = memcpy(&tmp_n, vn, oprsz);
3005 }
3006 if (high) {
3007 high = oprsz >> 1;
3008 }
3009
fd911a21 3010 if ((oprsz & 7) == 0) {
d731d8cb
RH
3011 uint32_t *n = vn;
3012 high >>= 2;
3013
fd911a21 3014 for (i = 0; i < oprsz / 8; i++) {
d731d8cb
RH
3015 uint64_t nn = n[H4(high + i)];
3016 d[i] = expand_bits(nn, 0);
3017 }
3018 } else {
3019 uint16_t *d16 = vd;
3020 uint8_t *n = vn;
3021
3022 for (i = 0; i < oprsz / 2; i++) {
3023 uint16_t nn = n[H1(high + i)];
3024 d16[H2(i)] = expand_bits(nn, 0);
3025 }
3026 }
3027 }
3028}
234b48e9
RH
3029
3030#define DO_ZIP(NAME, TYPE, H) \
3031void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3032{ \
3033 intptr_t oprsz = simd_oprsz(desc); \
3034 intptr_t i, oprsz_2 = oprsz / 2; \
3035 ARMVectorReg tmp_n, tmp_m; \
3036 /* We produce output faster than we consume input. \
3037 Therefore we must be mindful of possible overlap. */ \
3038 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
3039 vn = memcpy(&tmp_n, vn, oprsz_2); \
3040 } \
3041 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3042 vm = memcpy(&tmp_m, vm, oprsz_2); \
3043 } \
3044 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
3045 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
3046 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
3047 } \
3048}
3049
3050DO_ZIP(sve_zip_b, uint8_t, H1)
3051DO_ZIP(sve_zip_h, uint16_t, H1_2)
3052DO_ZIP(sve_zip_s, uint32_t, H1_4)
3053DO_ZIP(sve_zip_d, uint64_t, )
3054
3055#define DO_UZP(NAME, TYPE, H) \
3056void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3057{ \
3058 intptr_t oprsz = simd_oprsz(desc); \
3059 intptr_t oprsz_2 = oprsz / 2; \
3060 intptr_t odd_ofs = simd_data(desc); \
3061 intptr_t i; \
3062 ARMVectorReg tmp_m; \
3063 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3064 vm = memcpy(&tmp_m, vm, oprsz); \
3065 } \
3066 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
3067 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs)); \
3068 } \
3069 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
3070 *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
3071 } \
3072}
3073
3074DO_UZP(sve_uzp_b, uint8_t, H1)
3075DO_UZP(sve_uzp_h, uint16_t, H1_2)
3076DO_UZP(sve_uzp_s, uint32_t, H1_4)
3077DO_UZP(sve_uzp_d, uint64_t, )
3078
3079#define DO_TRN(NAME, TYPE, H) \
3080void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3081{ \
3082 intptr_t oprsz = simd_oprsz(desc); \
3083 intptr_t odd_ofs = simd_data(desc); \
3084 intptr_t i; \
3085 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
3086 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
3087 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
3088 *(TYPE *)(vd + H(i + 0)) = ae; \
3089 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
3090 } \
3091}
3092
3093DO_TRN(sve_trn_b, uint8_t, H1)
3094DO_TRN(sve_trn_h, uint16_t, H1_2)
3095DO_TRN(sve_trn_s, uint32_t, H1_4)
3096DO_TRN(sve_trn_d, uint64_t, )
3097
3098#undef DO_ZIP
3099#undef DO_UZP
3100#undef DO_TRN
3ca879ae
RH
3101
3102void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
3103{
3104 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
3105 uint32_t *d = vd, *n = vn;
3106 uint8_t *pg = vg;
3107
3108 for (i = j = 0; i < opr_sz; i++) {
3109 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
3110 d[H4(j)] = n[H4(i)];
3111 j++;
3112 }
3113 }
3114 for (; j < opr_sz; j++) {
3115 d[H4(j)] = 0;
3116 }
3117}
3118
3119void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
3120{
3121 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
3122 uint64_t *d = vd, *n = vn;
3123 uint8_t *pg = vg;
3124
3125 for (i = j = 0; i < opr_sz; i++) {
3126 if (pg[H1(i)] & 1) {
3127 d[j] = n[i];
3128 j++;
3129 }
3130 }
3131 for (; j < opr_sz; j++) {
3132 d[j] = 0;
3133 }
3134}
ef23cb72
RH
3135
3136/* Similar to the ARM LastActiveElement pseudocode function, except the
3137 * result is multiplied by the element size. This includes the not found
3138 * indication; e.g. not found for esz=3 is -8.
3139 */
3140int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
3141{
2acbfbe4
RH
3142 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
3143 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
ef23cb72 3144
2acbfbe4 3145 return last_active_element(vg, words, esz);
ef23cb72 3146}
b48ff240
RH
3147
3148void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
3149{
3150 intptr_t opr_sz = simd_oprsz(desc) / 8;
3151 int esz = simd_data(desc);
3152 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
3153 intptr_t i, first_i, last_i;
3154 ARMVectorReg tmp;
3155
3156 first_i = last_i = 0;
3157 first_g = last_g = 0;
3158
3159 /* Find the extent of the active elements within VG. */
3160 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
3161 pg = *(uint64_t *)(vg + i) & mask;
3162 if (pg) {
3163 if (last_g == 0) {
3164 last_g = pg;
3165 last_i = i;
3166 }
3167 first_g = pg;
3168 first_i = i;
3169 }
3170 }
3171
3172 len = 0;
3173 if (first_g != 0) {
3174 first_i = first_i * 8 + ctz64(first_g);
3175 last_i = last_i * 8 + 63 - clz64(last_g);
3176 len = last_i - first_i + (1 << esz);
3177 if (vd == vm) {
3178 vm = memcpy(&tmp, vm, opr_sz * 8);
3179 }
3180 swap_memmove(vd, vn + first_i, len);
3181 }
3182 swap_memmove(vd + len, vm, opr_sz * 8 - len);
3183}
d3fe4a29
RH
3184
3185void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
3186 void *vg, uint32_t desc)
3187{
3188 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3189 uint64_t *d = vd, *n = vn, *m = vm;
3190 uint8_t *pg = vg;
3191
3192 for (i = 0; i < opr_sz; i += 1) {
3193 uint64_t nn = n[i], mm = m[i];
3194 uint64_t pp = expand_pred_b(pg[H1(i)]);
3195 d[i] = (nn & pp) | (mm & ~pp);
3196 }
3197}
3198
3199void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
3200 void *vg, uint32_t desc)
3201{
3202 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3203 uint64_t *d = vd, *n = vn, *m = vm;
3204 uint8_t *pg = vg;
3205
3206 for (i = 0; i < opr_sz; i += 1) {
3207 uint64_t nn = n[i], mm = m[i];
3208 uint64_t pp = expand_pred_h(pg[H1(i)]);
3209 d[i] = (nn & pp) | (mm & ~pp);
3210 }
3211}
3212
3213void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
3214 void *vg, uint32_t desc)
3215{
3216 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3217 uint64_t *d = vd, *n = vn, *m = vm;
3218 uint8_t *pg = vg;
3219
3220 for (i = 0; i < opr_sz; i += 1) {
3221 uint64_t nn = n[i], mm = m[i];
3222 uint64_t pp = expand_pred_s(pg[H1(i)]);
3223 d[i] = (nn & pp) | (mm & ~pp);
3224 }
3225}
3226
3227void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
3228 void *vg, uint32_t desc)
3229{
3230 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3231 uint64_t *d = vd, *n = vn, *m = vm;
3232 uint8_t *pg = vg;
3233
3234 for (i = 0; i < opr_sz; i += 1) {
3235 uint64_t nn = n[i], mm = m[i];
3236 d[i] = (pg[H1(i)] & 1 ? nn : mm);
3237 }
3238}
757f9cff
RH
3239
3240/* Two operand comparison controlled by a predicate.
3241 * ??? It is very tempting to want to be able to expand this inline
3242 * with x86 instructions, e.g.
3243 *
3244 * vcmpeqw zm, zn, %ymm0
3245 * vpmovmskb %ymm0, %eax
3246 * and $0x5555, %eax
3247 * and pg, %eax
3248 *
3249 * or even aarch64, e.g.
3250 *
3251 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
3252 * cmeq v0.8h, zn, zm
3253 * and v0.8h, v0.8h, mask
3254 * addv h0, v0.8h
3255 * and v0.8b, pg
3256 *
3257 * However, coming up with an abstraction that allows vector inputs and
3258 * a scalar output, and also handles the byte-ordering of sub-uint64_t
3259 * scalar outputs, is tricky.
3260 */
3261#define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
3262uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3263{ \
3264 intptr_t opr_sz = simd_oprsz(desc); \
3265 uint32_t flags = PREDTEST_INIT; \
3266 intptr_t i = opr_sz; \
3267 do { \
3268 uint64_t out = 0, pg; \
3269 do { \
3270 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3271 TYPE nn = *(TYPE *)(vn + H(i)); \
3272 TYPE mm = *(TYPE *)(vm + H(i)); \
3273 out |= nn OP mm; \
3274 } while (i & 63); \
3275 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3276 out &= pg; \
3277 *(uint64_t *)(vd + (i >> 3)) = out; \
3278 flags = iter_predtest_bwd(out, pg, flags); \
3279 } while (i > 0); \
3280 return flags; \
3281}
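/*
 * Within each 64-byte block the inner loop shifts OUT left by sizeof(TYPE)
 * per element, so the compare result for the element at byte offset i lands
 * at predicate bit i (one bit per byte, as SVE requires); MASK then keeps
 * only the canonical bit for each element size.
 */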
3282
3283#define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
3284 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3285#define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
3286 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3287#define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
3288 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3289#define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
3290 DO_CMP_PPZZ(NAME, TYPE, OP, , 0x0101010101010101ull)
3291
3292DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
3293DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
3294DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
3295DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
3296
3297DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
3298DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
3299DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
3300DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
3301
3302DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
3303DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
3304DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
3305DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
3306
3307DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
3308DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
3309DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
3310DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
3311
3312DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
3313DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
3314DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
3315DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
3316
3317DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
3318DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
3319DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
3320DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
3321
3322#undef DO_CMP_PPZZ_B
3323#undef DO_CMP_PPZZ_H
3324#undef DO_CMP_PPZZ_S
3325#undef DO_CMP_PPZZ_D
3326#undef DO_CMP_PPZZ
3327
3328/* Similar, but the second source is "wide". */
3329#define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
3330uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3331{ \
3332 intptr_t opr_sz = simd_oprsz(desc); \
3333 uint32_t flags = PREDTEST_INIT; \
3334 intptr_t i = opr_sz; \
3335 do { \
3336 uint64_t out = 0, pg; \
3337 do { \
3338 TYPEW mm = *(TYPEW *)(vm + i - 8); \
3339 do { \
3340 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3341 TYPE nn = *(TYPE *)(vn + H(i)); \
3342 out |= nn OP mm; \
3343 } while (i & 7); \
3344 } while (i & 63); \
3345 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3346 out &= pg; \
3347 *(uint64_t *)(vd + (i >> 3)) = out; \
3348 flags = iter_predtest_bwd(out, pg, flags); \
3349 } while (i > 0); \
3350 return flags; \
3351}
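/* In the wide forms the second operand is always a vector of 64-bit
 * elements: one TYPEW value is loaded per 8 bytes of the first operand
 * (the middle loop), and the inner loop (while (i & 7)) compares each
 * narrow element within those 8 bytes against that single wide value.
 */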
3352
3353#define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
3354 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
3355#define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
3356 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
3357#define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
3358 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
3359
df4e0010
RH
3360DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
3361DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
3362DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
757f9cff 3363
df4e0010
RH
3364DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
3365DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
3366DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
757f9cff
RH
3367
3368DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
3369DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
3370DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
3371
3372DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
3373DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
3374DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
3375
3376DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
3377DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
3378DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
3379
3380DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
3381DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
3382DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
3383
3384DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
3385DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
3386DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
3387
3388DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
3389DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
3390DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
3391
3392DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
3393DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
3394DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
3395
3396DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
3397DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
3398DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
3399
3400#undef DO_CMP_PPZW_B
3401#undef DO_CMP_PPZW_H
3402#undef DO_CMP_PPZW_S
3403#undef DO_CMP_PPZW
38cadeba
RH
3404
3405/* Similar, but the second source is immediate. */
3406#define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
3407uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
3408{ \
3409 intptr_t opr_sz = simd_oprsz(desc); \
3410 uint32_t flags = PREDTEST_INIT; \
3411 TYPE mm = simd_data(desc); \
3412 intptr_t i = opr_sz; \
3413 do { \
3414 uint64_t out = 0, pg; \
3415 do { \
3416 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3417 TYPE nn = *(TYPE *)(vn + H(i)); \
3418 out |= nn OP mm; \
3419 } while (i & 63); \
3420 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3421 out &= pg; \
3422 *(uint64_t *)(vd + (i >> 3)) = out; \
3423 flags = iter_predtest_bwd(out, pg, flags); \
3424 } while (i > 0); \
3425 return flags; \
3426}
3427
3428#define DO_CMP_PPZI_B(NAME, TYPE, OP) \
3429 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3430#define DO_CMP_PPZI_H(NAME, TYPE, OP) \
3431 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3432#define DO_CMP_PPZI_S(NAME, TYPE, OP) \
3433 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3434#define DO_CMP_PPZI_D(NAME, TYPE, OP) \
3435 DO_CMP_PPZI(NAME, TYPE, OP, , 0x0101010101010101ull)
3436
3437DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
3438DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
3439DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
3440DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
3441
3442DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
3443DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
3444DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
3445DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
3446
3447DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
3448DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
3449DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
3450DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
3451
3452DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
3453DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
3454DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
3455DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
3456
3457DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
3458DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
3459DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
3460DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
3461
3462DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
3463DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
3464DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
3465DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
3466
3467DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
3468DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
3469DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
3470DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
3471
3472DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
3473DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
3474DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
3475DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
3476
3477DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
3478DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
3479DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
3480DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
3481
3482DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
3483DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
3484DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
3485DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
3486
3487#undef DO_CMP_PPZI_B
3488#undef DO_CMP_PPZI_H
3489#undef DO_CMP_PPZI_S
3490#undef DO_CMP_PPZI_D
3491#undef DO_CMP_PPZI
35da316f
RH
3492
3493/* Similar to the ARM LastActive pseudocode function. */
3494static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
3495{
3496 intptr_t i;
3497
3498 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
3499 uint64_t pg = *(uint64_t *)(vg + i);
3500 if (pg) {
3501 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
3502 }
3503 }
3504 return 0;
3505}
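/* The scan runs from the top of the predicate in 64-bit words: the
 * highest non-zero guard word contains the last active element,
 * pow2floor(pg) isolates its most significant set bit, and the result
 * says whether that element is also set in VD.
 */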
3506
3507/* Compute a mask into RETB that is true for all G, up to and including
3508 * (if after) or excluding (if !after) the first G & N.
3509 * Return true if BRK found.
3510 */
3511static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
3512 bool brk, bool after)
3513{
3514 uint64_t b;
3515
3516 if (brk) {
3517 b = 0;
3518 } else if ((g & n) == 0) {
3519 /* For all G, no N are set; break not found. */
3520 b = g;
3521 } else {
3522 /* Break somewhere in N. Locate it. */
3523 b = g & n; /* guard true, pred true */
3524 b = b & -b; /* first such */
3525 if (after) {
3526 b = b | (b - 1); /* break after same */
3527 } else {
3528 b = b - 1; /* break before same */
3529 }
3530 brk = true;
3531 }
3532
3533 *retb = b;
3534 return brk;
3535}
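/* Worked example: with g == 0xff and n == 0x10 (first true element at
 * bit 4), b & -b == 0x10; for after=true we return 0x1f (active up to
 * and including that element), for after=false we return 0x0f (active
 * strictly before it).
 */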
3536
3537/* Compute a zeroing BRK. */
3538static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
3539 intptr_t oprsz, bool after)
3540{
3541 bool brk = false;
3542 intptr_t i;
3543
3544 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3545 uint64_t this_b, this_g = g[i];
3546
3547 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3548 d[i] = this_b & this_g;
3549 }
3550}
3551
3552/* Likewise, but also compute flags. */
3553static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
3554 intptr_t oprsz, bool after)
3555{
3556 uint32_t flags = PREDTEST_INIT;
3557 bool brk = false;
3558 intptr_t i;
3559
3560 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3561 uint64_t this_b, this_d, this_g = g[i];
3562
3563 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3564 d[i] = this_d = this_b & this_g;
3565 flags = iter_predtest_fwd(this_d, this_g, flags);
3566 }
3567 return flags;
3568}
3569
3570/* Compute a merging BRK. */
3571static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
3572 intptr_t oprsz, bool after)
3573{
3574 bool brk = false;
3575 intptr_t i;
3576
3577 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3578 uint64_t this_b, this_g = g[i];
3579
3580 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3581 d[i] = (this_b & this_g) | (d[i] & ~this_g);
3582 }
3583}
3584
3585/* Likewise, but also compute flags. */
3586static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
3587 intptr_t oprsz, bool after)
3588{
3589 uint32_t flags = PREDTEST_INIT;
3590 bool brk = false;
3591 intptr_t i;
3592
3593 for (i = 0; i < oprsz / 8; ++i) {
3594 uint64_t this_b, this_d = d[i], this_g = g[i];
3595
3596 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3597 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
3598 flags = iter_predtest_fwd(this_d, this_g, flags);
3599 }
3600 return flags;
3601}
3602
3603static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
3604{
3605 /* It is quicker to zero the whole predicate than loop on OPRSZ.
3606 * The compiler should turn this into 4 64-bit integer stores.
3607 */
3608 memset(d, 0, sizeof(ARMPredicateReg));
3609 return PREDTEST_INIT;
3610}
3611
3612void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
3613 uint32_t pred_desc)
3614{
04c774a2 3615 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3616 if (last_active_pred(vn, vg, oprsz)) {
3617 compute_brk_z(vd, vm, vg, oprsz, true);
3618 } else {
3619 do_zero(vd, oprsz);
3620 }
3621}
3622
3623uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
3624 uint32_t pred_desc)
3625{
04c774a2 3626 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3627 if (last_active_pred(vn, vg, oprsz)) {
3628 return compute_brks_z(vd, vm, vg, oprsz, true);
3629 } else {
3630 return do_zero(vd, oprsz);
3631 }
3632}
3633
3634void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
3635 uint32_t pred_desc)
3636{
04c774a2 3637 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3638 if (last_active_pred(vn, vg, oprsz)) {
3639 compute_brk_z(vd, vm, vg, oprsz, false);
3640 } else {
3641 do_zero(vd, oprsz);
3642 }
3643}
3644
3645uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
3646 uint32_t pred_desc)
3647{
04c774a2 3648 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3649 if (last_active_pred(vn, vg, oprsz)) {
3650 return compute_brks_z(vd, vm, vg, oprsz, false);
3651 } else {
3652 return do_zero(vd, oprsz);
3653 }
3654}
3655
3656void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3657{
04c774a2 3658 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3659 compute_brk_z(vd, vn, vg, oprsz, true);
3660}
3661
3662uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3663{
04c774a2 3664 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3665 return compute_brks_z(vd, vn, vg, oprsz, true);
3666}
3667
3668void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3669{
04c774a2 3670 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3671 compute_brk_z(vd, vn, vg, oprsz, false);
3672}
3673
3674uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3675{
04c774a2 3676 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3677 return compute_brks_z(vd, vn, vg, oprsz, false);
3678}
3679
3680void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3681{
04c774a2 3682 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3683 compute_brk_m(vd, vn, vg, oprsz, true);
3684}
3685
3686uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3687{
04c774a2 3688 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3689 return compute_brks_m(vd, vn, vg, oprsz, true);
3690}
3691
3692void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3693{
04c774a2 3694 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3695 compute_brk_m(vd, vn, vg, oprsz, false);
3696}
3697
3698uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3699{
04c774a2 3700 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3701 return compute_brks_m(vd, vn, vg, oprsz, false);
3702}
3703
3704void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3705{
04c774a2 3706 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3707 if (!last_active_pred(vn, vg, oprsz)) {
3708 do_zero(vd, oprsz);
3709 }
3710}
3711
3712/* As if PredTest(Ones(PL), D, esz). */
3713static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
3714 uint64_t esz_mask)
3715{
3716 uint32_t flags = PREDTEST_INIT;
3717 intptr_t i;
3718
3719 for (i = 0; i < oprsz / 8; i++) {
3720 flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
3721 }
3722 if (oprsz & 7) {
3723 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
3724 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
3725 }
3726 return flags;
3727}
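/* ESZ_MASK stands in for Ones(PL): it has one set bit per element of
 * the given size, so every element is treated as governed and active.
 */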
3728
3729uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3730{
04c774a2 3731 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3732 if (last_active_pred(vn, vg, oprsz)) {
3733 return predtest_ones(vd, oprsz, -1);
3734 } else {
3735 return do_zero(vd, oprsz);
3736 }
3737}
9ee3a611
RH
3738
3739uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
3740{
f556a201
RH
3741 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
3742 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
9ee3a611
RH
3743 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
3744 intptr_t i;
3745
f556a201 3746 for (i = 0; i < words; ++i) {
9ee3a611
RH
3747 uint64_t t = n[i] & g[i] & mask;
3748 sum += ctpop64(t);
3749 }
3750 return sum;
3751}
caf1cefc
RH
3752
3753uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc)
3754{
e610906c
RH
3755 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3756 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
caf1cefc
RH
3757 uint64_t esz_mask = pred_esz_masks[esz];
3758 ARMPredicateReg *d = vd;
3759 uint32_t flags;
3760 intptr_t i;
3761
3762 /* Begin with a zero predicate register. */
3763 flags = do_zero(d, oprsz);
3764 if (count == 0) {
3765 return flags;
3766 }
3767
caf1cefc
RH
3768 /* Set all of the requested bits. */
3769 for (i = 0; i < count / 64; ++i) {
3770 d->p[i] = esz_mask;
3771 }
3772 if (count & 63) {
3773 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
3774 }
3775
3776 return predtest_ones(d, oprsz, esz_mask);
3777}
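/* Note that COUNT is expressed in predicate bits, i.e. one bit per byte
 * of vector data (active elements scaled by the element size); the
 * per-element masking is applied via esz_mask, and the flags are then
 * computed as if the result were tested against an all-true predicate.
 */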
c4e7c493 3778
23fbe79f
RH
3779/* Recursive reduction of a predicated vector via a binary function;
3780 * cf. the ARM ARM function ReducePredicated.
3781 *
3782 * While it would be possible to write this without the DATA temporary,
3783 * it is much simpler to process the predicate register this way.
3784 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
3785 * little to gain with a more complex non-recursive form.
3786 */
3787#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
3788static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
3789{ \
3790 if (n == 1) { \
3791 return *data; \
3792 } else { \
3793 uintptr_t half = n / 2; \
3794 TYPE lo = NAME##_reduce(data, status, half); \
3795 TYPE hi = NAME##_reduce(data + half, status, half); \
3796 return TYPE##_##FUNC(lo, hi, status); \
3797 } \
3798} \
3799uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
3800{ \
c648c9b7 3801 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \
23fbe79f
RH
3802 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
3803 for (i = 0; i < oprsz; ) { \
3804 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
3805 do { \
3806 TYPE nn = *(TYPE *)(vn + H(i)); \
3807 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
3808 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
3809 } while (i & 15); \
3810 } \
3811 for (; i < maxsz; i += sizeof(TYPE)) { \
3812 *(TYPE *)((void *)data + i) = IDENT; \
3813 } \
3814 return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
3815}
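/* The macro stages the input into DATA: active elements are copied,
 * inactive elements are replaced by IDENT, and the tail up to MAXSZ is
 * padded with IDENT.  The recursion halves the element count at each
 * step, so MAXSZ is expected to be a power-of-two multiple of the
 * element size; the identity padding keeps the extra lanes neutral.
 */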
3816
3817DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
3818DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
3819DO_REDUCE(sve_faddv_d, float64, , add, float64_zero)
3820
3821/* Identity is floatN_default_nan, without the function call. */
3822DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
3823DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
3824DO_REDUCE(sve_fminnmv_d, float64, , minnum, 0x7FF8000000000000ULL)
3825
3826DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
3827DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
3828DO_REDUCE(sve_fmaxnmv_d, float64, , maxnum, 0x7FF8000000000000ULL)
3829
3830DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
3831DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
3832DO_REDUCE(sve_fminv_d, float64, , min, float64_infinity)
3833
3834DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
3835DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
3836DO_REDUCE(sve_fmaxv_d, float64, , max, float64_chs(float64_infinity))
3837
3838#undef DO_REDUCE
3839
7f9ddf64
RH
3840uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
3841 void *status, uint32_t desc)
3842{
3843 intptr_t i = 0, opr_sz = simd_oprsz(desc);
3844 float16 result = nn;
3845
3846 do {
3847 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
3848 do {
3849 if (pg & 1) {
3850 float16 mm = *(float16 *)(vm + H1_2(i));
3851 result = float16_add(result, mm, status);
3852 }
3853 i += sizeof(float16), pg >>= sizeof(float16);
3854 } while (i & 15);
3855 } while (i < opr_sz);
3856
3857 return result;
3858}
3859
3860uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
3861 void *status, uint32_t desc)
3862{
3863 intptr_t i = 0, opr_sz = simd_oprsz(desc);
3864 float32 result = nn;
3865
3866 do {
3867 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
3868 do {
3869 if (pg & 1) {
3870 float32 mm = *(float32 *)(vm + H1_2(i));
3871 result = float32_add(result, mm, status);
3872 }
3873 i += sizeof(float32), pg >>= sizeof(float32);
3874 } while (i & 15);
3875 } while (i < opr_sz);
3876
3877 return result;
3878}
3879
3880uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
3881 void *status, uint32_t desc)
3882{
3883 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
3884 uint64_t *m = vm;
3885 uint8_t *pg = vg;
3886
3887 for (i = 0; i < opr_sz; i++) {
3888 if (pg[H1(i)] & 1) {
3889 nn = float64_add(nn, m[i], status);
3890 }
3891 }
3892
3893 return nn;
3894}
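/* FADDA is an ordered reduction: the running total in NN accumulates
 * the active elements strictly in element order, so no tree reduction
 * (as used for FADDV above) is possible here.
 */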
3895
ec3b87c2
RH
3896/* Fully general three-operand expander, controlled by a predicate,
3897 * with the extra float_status parameter.
3898 */
3899#define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
3900void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
3901 void *status, uint32_t desc) \
3902{ \
3903 intptr_t i = simd_oprsz(desc); \
3904 uint64_t *g = vg; \
3905 do { \
3906 uint64_t pg = g[(i - 1) >> 6]; \
3907 do { \
3908 i -= sizeof(TYPE); \
3909 if (likely((pg >> (i & 63)) & 1)) { \
3910 TYPE nn = *(TYPE *)(vn + H(i)); \
3911 TYPE mm = *(TYPE *)(vm + H(i)); \
3912 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
3913 } \
3914 } while (i & 63); \
3915 } while (i != 0); \
3916}
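/* The expanders walk the vector backwards from oprsz: g[(i - 1) >> 6]
 * is the predicate word covering the current 64-byte chunk, and
 * (pg >> (i & 63)) & 1 tests the bit belonging to the element's first
 * byte (one predicate bit per vector byte).  Inactive elements are not
 * written, giving merging predication.  The same pattern is used by the
 * scalar and unary expanders below.
 */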
3917
3918DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
3919DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
3920DO_ZPZZ_FP(sve_fadd_d, uint64_t, , float64_add)
3921
3922DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
3923DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
3924DO_ZPZZ_FP(sve_fsub_d, uint64_t, , float64_sub)
3925
3926DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
3927DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
3928DO_ZPZZ_FP(sve_fmul_d, uint64_t, , float64_mul)
3929
3930DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
3931DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
3932DO_ZPZZ_FP(sve_fdiv_d, uint64_t, , float64_div)
3933
3934DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
3935DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
3936DO_ZPZZ_FP(sve_fmin_d, uint64_t, , float64_min)
3937
3938DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
3939DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
3940DO_ZPZZ_FP(sve_fmax_d, uint64_t, , float64_max)
3941
3942DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
3943DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
3944DO_ZPZZ_FP(sve_fminnum_d, uint64_t, , float64_minnum)
3945
3946DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
3947DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
3948DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, , float64_maxnum)
3949
3950static inline float16 abd_h(float16 a, float16 b, float_status *s)
3951{
3952 return float16_abs(float16_sub(a, b, s));
3953}
3954
3955static inline float32 abd_s(float32 a, float32 b, float_status *s)
3956{
3957 return float32_abs(float32_sub(a, b, s));
3958}
3959
3960static inline float64 abd_d(float64 a, float64 b, float_status *s)
3961{
3962 return float64_abs(float64_sub(a, b, s));
3963}
3964
3965DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
3966DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
3967DO_ZPZZ_FP(sve_fabd_d, uint64_t, , abd_d)
3968
3969static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
3970{
3971 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
3972 return float64_scalbn(a, b_int, s);
3973}
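/* The second operand of FSCALE is a full 64-bit integer but
 * float64_scalbn takes an int, so clamp it; any magnitude beyond
 * INT_MAX already saturates the result, so the clamp is lossless.
 */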
3974
3975DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
3976DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
3977DO_ZPZZ_FP(sve_fscalbn_d, int64_t, , scalbn_d)
3978
3979DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
3980DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
3981DO_ZPZZ_FP(sve_fmulx_d, uint64_t, , helper_vfp_mulxd)
3982
3983#undef DO_ZPZZ_FP
3984
cc48affe
RH
3985/* Three-operand expander, with one scalar operand, controlled by
3986 * a predicate, with the extra float_status parameter.
3987 */
3988#define DO_ZPZS_FP(NAME, TYPE, H, OP) \
3989void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
3990 void *status, uint32_t desc) \
3991{ \
3992 intptr_t i = simd_oprsz(desc); \
3993 uint64_t *g = vg; \
3994 TYPE mm = scalar; \
3995 do { \
3996 uint64_t pg = g[(i - 1) >> 6]; \
3997 do { \
3998 i -= sizeof(TYPE); \
3999 if (likely((pg >> (i & 63)) & 1)) { \
4000 TYPE nn = *(TYPE *)(vn + H(i)); \
4001 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
4002 } \
4003 } while (i & 63); \
4004 } while (i != 0); \
4005}
4006
4007DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
4008DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
4009DO_ZPZS_FP(sve_fadds_d, float64, , float64_add)
4010
4011DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
4012DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
4013DO_ZPZS_FP(sve_fsubs_d, float64, , float64_sub)
4014
4015DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
4016DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
4017DO_ZPZS_FP(sve_fmuls_d, float64, , float64_mul)
4018
4019static inline float16 subr_h(float16 a, float16 b, float_status *s)
4020{
4021 return float16_sub(b, a, s);
4022}
4023
4024static inline float32 subr_s(float32 a, float32 b, float_status *s)
4025{
4026 return float32_sub(b, a, s);
4027}
4028
4029static inline float64 subr_d(float64 a, float64 b, float_status *s)
4030{
4031 return float64_sub(b, a, s);
4032}
4033
4034DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
4035DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
4036DO_ZPZS_FP(sve_fsubrs_d, float64, , subr_d)
4037
4038DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
4039DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
4040DO_ZPZS_FP(sve_fmaxnms_d, float64, , float64_maxnum)
4041
4042DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
4043DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
4044DO_ZPZS_FP(sve_fminnms_d, float64, , float64_minnum)
4045
4046DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
4047DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
4048DO_ZPZS_FP(sve_fmaxs_d, float64, , float64_max)
4049
4050DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
4051DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
4052DO_ZPZS_FP(sve_fmins_d, float64, , float64_min)
4053
8092c6a3
RH
4054/* Fully general two-operand expander, controlled by a predicate,
4055 * with the extra float_status parameter.
4056 */
4057#define DO_ZPZ_FP(NAME, TYPE, H, OP) \
4058void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
4059{ \
4060 intptr_t i = simd_oprsz(desc); \
4061 uint64_t *g = vg; \
4062 do { \
4063 uint64_t pg = g[(i - 1) >> 6]; \
4064 do { \
4065 i -= sizeof(TYPE); \
4066 if (likely((pg >> (i & 63)) & 1)) { \
4067 TYPE nn = *(TYPE *)(vn + H(i)); \
4068 *(TYPE *)(vd + H(i)) = OP(nn, status); \
4069 } \
4070 } while (i & 63); \
4071 } while (i != 0); \
4072}
4073
46d33d1e
RH
4074/* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
4075 * FZ16. When converting from fp16, this affects flushing input denormals;
4076 * when converting to fp16, this affects flushing output denormals.
4077 */
4078static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
4079{
c120391c 4080 bool save = get_flush_inputs_to_zero(fpst);
46d33d1e
RH
4081 float32 ret;
4082
4083 set_flush_inputs_to_zero(false, fpst);
4084 ret = float16_to_float32(f, true, fpst);
4085 set_flush_inputs_to_zero(save, fpst);
4086 return ret;
4087}
4088
4089static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
4090{
c120391c 4091 bool save = get_flush_inputs_to_zero(fpst);
46d33d1e
RH
4092 float64 ret;
4093
4094 set_flush_inputs_to_zero(false, fpst);
4095 ret = float16_to_float64(f, true, fpst);
4096 set_flush_inputs_to_zero(save, fpst);
4097 return ret;
4098}
4099
4100static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
4101{
c120391c 4102 bool save = get_flush_to_zero(fpst);
46d33d1e
RH
4103 float16 ret;
4104
4105 set_flush_to_zero(false, fpst);
4106 ret = float32_to_float16(f, true, fpst);
4107 set_flush_to_zero(save, fpst);
4108 return ret;
4109}
4110
4111static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
4112{
c120391c 4113 bool save = get_flush_to_zero(fpst);
46d33d1e
RH
4114 float16 ret;
4115
4116 set_flush_to_zero(false, fpst);
4117 ret = float64_to_float16(f, true, fpst);
4118 set_flush_to_zero(save, fpst);
4119 return ret;
4120}
4121
df4de1af
RH
4122static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
4123{
4124 if (float16_is_any_nan(f)) {
4125 float_raise(float_flag_invalid, s);
4126 return 0;
4127 }
4128 return float16_to_int16_round_to_zero(f, s);
4129}
4130
4131static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
4132{
4133 if (float16_is_any_nan(f)) {
4134 float_raise(float_flag_invalid, s);
4135 return 0;
4136 }
4137 return float16_to_int64_round_to_zero(f, s);
4138}
4139
4140static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
4141{
4142 if (float32_is_any_nan(f)) {
4143 float_raise(float_flag_invalid, s);
4144 return 0;
4145 }
4146 return float32_to_int64_round_to_zero(f, s);
4147}
4148
4149static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
4150{
4151 if (float64_is_any_nan(f)) {
4152 float_raise(float_flag_invalid, s);
4153 return 0;
4154 }
4155 return float64_to_int64_round_to_zero(f, s);
4156}
4157
4158static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
4159{
4160 if (float16_is_any_nan(f)) {
4161 float_raise(float_flag_invalid, s);
4162 return 0;
4163 }
4164 return float16_to_uint16_round_to_zero(f, s);
4165}
4166
4167static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
4168{
4169 if (float16_is_any_nan(f)) {
4170 float_raise(float_flag_invalid, s);
4171 return 0;
4172 }
4173 return float16_to_uint64_round_to_zero(f, s);
4174}
4175
4176static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
4177{
4178 if (float32_is_any_nan(f)) {
4179 float_raise(float_flag_invalid, s);
4180 return 0;
4181 }
4182 return float32_to_uint64_round_to_zero(f, s);
4183}
4184
4185static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
4186{
4187 if (float64_is_any_nan(f)) {
4188 float_raise(float_flag_invalid, s);
4189 return 0;
4190 }
4191 return float64_to_uint64_round_to_zero(f, s);
4192}
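/* The Arm FCVTZS/FCVTZU instructions convert a NaN input to zero while
 * raising Invalid Operation.  These wrappers enforce that behaviour
 * before delegating to the generic round-to-zero conversions.
 */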
4193
46d33d1e
RH
4194DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
4195DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
4196DO_ZPZ_FP(sve_fcvt_dh, uint64_t, , sve_f64_to_f16)
4197DO_ZPZ_FP(sve_fcvt_hd, uint64_t, , sve_f16_to_f64)
4198DO_ZPZ_FP(sve_fcvt_ds, uint64_t, , float64_to_float32)
4199DO_ZPZ_FP(sve_fcvt_sd, uint64_t, , float32_to_float64)
4200
df4de1af
RH
4201DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
4202DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
4203DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
4204DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, , vfp_float16_to_int64_rtz)
4205DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, , vfp_float32_to_int64_rtz)
4206DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, , helper_vfp_tosizd)
4207DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, , vfp_float64_to_int64_rtz)
4208
4209DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
4210DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
4211DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
4212DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, , vfp_float16_to_uint64_rtz)
4213DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, , vfp_float32_to_uint64_rtz)
4214DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, , helper_vfp_touizd)
4215DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, , vfp_float64_to_uint64_rtz)
4216
cda3c753
RH
4217DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
4218DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
4219DO_ZPZ_FP(sve_frint_d, uint64_t, , helper_rintd)
4220
4221DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
4222DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
4223DO_ZPZ_FP(sve_frintx_d, uint64_t, , float64_round_to_int)
4224
ec5b375b
RH
4225DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
4226DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
4227DO_ZPZ_FP(sve_frecpx_d, uint64_t, , helper_frecpx_f64)
4228
4229DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
4230DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
4231DO_ZPZ_FP(sve_fsqrt_d, uint64_t, , float64_sqrt)
4232
8092c6a3
RH
4233DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
4234DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
4235DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
4236DO_ZPZ_FP(sve_scvt_sd, uint64_t, , int32_to_float64)
4237DO_ZPZ_FP(sve_scvt_dh, uint64_t, , int64_to_float16)
4238DO_ZPZ_FP(sve_scvt_ds, uint64_t, , int64_to_float32)
4239DO_ZPZ_FP(sve_scvt_dd, uint64_t, , int64_to_float64)
4240
4241DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
4242DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
4243DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
4244DO_ZPZ_FP(sve_ucvt_sd, uint64_t, , uint32_to_float64)
4245DO_ZPZ_FP(sve_ucvt_dh, uint64_t, , uint64_to_float16)
4246DO_ZPZ_FP(sve_ucvt_ds, uint64_t, , uint64_to_float32)
4247DO_ZPZ_FP(sve_ucvt_dd, uint64_t, , uint64_to_float64)
4248
4249#undef DO_ZPZ_FP
4250
08975da9
RH
4251static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
4252 float_status *status, uint32_t desc,
6ceabaad
RH
4253 uint16_t neg1, uint16_t neg3)
4254{
4255 intptr_t i = simd_oprsz(desc);
6ceabaad
RH
4256 uint64_t *g = vg;
4257
4258 do {
4259 uint64_t pg = g[(i - 1) >> 6];
4260 do {
4261 i -= 2;
4262 if (likely((pg >> (i & 63)) & 1)) {
4263 float16 e1, e2, e3, r;
4264
4265 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
4266 e2 = *(uint16_t *)(vm + H1_2(i));
4267 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
08975da9 4268 r = float16_muladd(e1, e2, e3, 0, status);
6ceabaad
RH
4269 *(uint16_t *)(vd + H1_2(i)) = r;
4270 }
4271 } while (i & 63);
4272 } while (i != 0);
4273}
4274
08975da9
RH
4275void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4276 void *vg, void *status, uint32_t desc)
6ceabaad 4277{
08975da9 4278 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
6ceabaad
RH
4279}
4280
08975da9
RH
4281void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4282 void *vg, void *status, uint32_t desc)
6ceabaad 4283{
08975da9 4284 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
6ceabaad
RH
4285}
4286
08975da9
RH
4287void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4288 void *vg, void *status, uint32_t desc)
6ceabaad 4289{
08975da9 4290 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
6ceabaad
RH
4291}
4292
08975da9
RH
4293void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4294 void *vg, void *status, uint32_t desc)
6ceabaad 4295{
08975da9 4296 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
6ceabaad
RH
4297}
4298
08975da9
RH
4299static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
4300 float_status *status, uint32_t desc,
6ceabaad
RH
4301 uint32_t neg1, uint32_t neg3)
4302{
4303 intptr_t i = simd_oprsz(desc);
6ceabaad
RH
4304 uint64_t *g = vg;
4305
4306 do {
4307 uint64_t pg = g[(i - 1) >> 6];
4308 do {
4309 i -= 4;
4310 if (likely((pg >> (i & 63)) & 1)) {
4311 float32 e1, e2, e3, r;
4312
4313 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
4314 e2 = *(uint32_t *)(vm + H1_4(i));
4315 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
08975da9 4316 r = float32_muladd(e1, e2, e3, 0, status);
6ceabaad
RH
4317 *(uint32_t *)(vd + H1_4(i)) = r;
4318 }
4319 } while (i & 63);
4320 } while (i != 0);
4321}
4322
08975da9
RH
4323void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4324 void *vg, void *status, uint32_t desc)
6ceabaad 4325{
08975da9 4326 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
6ceabaad
RH
4327}
4328
08975da9
RH
4329void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4330 void *vg, void *status, uint32_t desc)
6ceabaad 4331{
08975da9 4332 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
6ceabaad
RH
4333}
4334
08975da9
RH
4335void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4336 void *vg, void *status, uint32_t desc)
6ceabaad 4337{
08975da9 4338 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
6ceabaad
RH
4339}
4340
08975da9
RH
4341void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4342 void *vg, void *status, uint32_t desc)
6ceabaad 4343{
08975da9 4344 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
6ceabaad
RH
4345}
4346
08975da9
RH
4347static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
4348 float_status *status, uint32_t desc,
6ceabaad
RH
4349 uint64_t neg1, uint64_t neg3)
4350{
4351 intptr_t i = simd_oprsz(desc);
6ceabaad
RH
4352 uint64_t *g = vg;
4353
4354 do {
4355 uint64_t pg = g[(i - 1) >> 6];
4356 do {
4357 i -= 8;
4358 if (likely((pg >> (i & 63)) & 1)) {
4359 float64 e1, e2, e3, r;
4360
4361 e1 = *(uint64_t *)(vn + i) ^ neg1;
4362 e2 = *(uint64_t *)(vm + i);
4363 e3 = *(uint64_t *)(va + i) ^ neg3;
08975da9 4364 r = float64_muladd(e1, e2, e3, 0, status);
6ceabaad
RH
4365 *(uint64_t *)(vd + i) = r;
4366 }
4367 } while (i & 63);
4368 } while (i != 0);
4369}
4370
08975da9
RH
4371void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4372 void *vg, void *status, uint32_t desc)
6ceabaad 4373{
08975da9 4374 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
6ceabaad
RH
4375}
4376
08975da9
RH
4377void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4378 void *vg, void *status, uint32_t desc)
6ceabaad 4379{
08975da9 4380 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
6ceabaad
RH
4381}
4382
08975da9
RH
4383void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4384 void *vg, void *status, uint32_t desc)
6ceabaad 4385{
08975da9 4386 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
6ceabaad
RH
4387}
4388
08975da9
RH
4389void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4390 void *vg, void *status, uint32_t desc)
6ceabaad 4391{
08975da9 4392 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
6ceabaad
RH
4393}
4394
abfdefd5
RH
4395/* Two-operand floating-point comparison controlled by a predicate.
4396 * Unlike the integer version, we are not allowed to optimistically
4397 * compare operands, since the comparison may have side effects with
4398 * respect to the FPSR.
4399 */
4400#define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
4401void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
4402 void *status, uint32_t desc) \
4403{ \
4404 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
4405 uint64_t *d = vd, *g = vg; \
4406 do { \
4407 uint64_t out = 0, pg = g[j]; \
4408 do { \
4409 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
4410 if (likely((pg >> (i & 63)) & 1)) { \
4411 TYPE nn = *(TYPE *)(vn + H(i)); \
4412 TYPE mm = *(TYPE *)(vm + H(i)); \
4413 out |= OP(TYPE, nn, mm, status); \
4414 } \
4415 } while (i & 63); \
4416 d[j--] = out; \
4417 } while (i > 0); \
4418}
4419
4420#define DO_FPCMP_PPZZ_H(NAME, OP) \
4421 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
4422#define DO_FPCMP_PPZZ_S(NAME, OP) \
4423 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
4424#define DO_FPCMP_PPZZ_D(NAME, OP) \
4425 DO_FPCMP_PPZZ(NAME##_d, float64, , OP)
4426
4427#define DO_FPCMP_PPZZ_ALL(NAME, OP) \
4428 DO_FPCMP_PPZZ_H(NAME, OP) \
4429 DO_FPCMP_PPZZ_S(NAME, OP) \
4430 DO_FPCMP_PPZZ_D(NAME, OP)
4431
4432#define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
4433#define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
4d2e2a03
RH
4434#define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
4435#define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
abfdefd5
RH
4436#define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
4437#define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
4438#define DO_FCMUO(TYPE, X, Y, ST) \
4439 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
4440#define DO_FACGE(TYPE, X, Y, ST) \
4441 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
4442#define DO_FACGT(TYPE, X, Y, ST) \
4443 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
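/* float##_compare returns float_relation_{less,equal,greater,unordered}
 * (-1, 0, 1, 2).  DO_FCMGE(X, Y) therefore tests compare(Y, X) <= 0,
 * which is true for X >= Y and false for an unordered result, as the
 * architecture requires.  FACGE/FACGT apply the same trick to absolute
 * values, and the EQ/NE/UO forms use the quiet compare so that quiet
 * NaN operands do not raise Invalid Operation.
 */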
4444
4445DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
4446DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
4447DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
4448DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
4449DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
4450DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
4451DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
4452
4453#undef DO_FPCMP_PPZZ_ALL
4454#undef DO_FPCMP_PPZZ_D
4455#undef DO_FPCMP_PPZZ_S
4456#undef DO_FPCMP_PPZZ_H
4457#undef DO_FPCMP_PPZZ
4458
4d2e2a03
RH
4459/* One-operand floating-point comparison against zero, controlled
4460 * by a predicate.
4461 */
4462#define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
4463void HELPER(NAME)(void *vd, void *vn, void *vg, \
4464 void *status, uint32_t desc) \
4465{ \
4466 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
4467 uint64_t *d = vd, *g = vg; \
4468 do { \
4469 uint64_t out = 0, pg = g[j]; \
4470 do { \
4471 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
4472 if ((pg >> (i & 63)) & 1) { \
4473 TYPE nn = *(TYPE *)(vn + H(i)); \
4474 out |= OP(TYPE, nn, 0, status); \
4475 } \
4476 } while (i & 63); \
4477 d[j--] = out; \
4478 } while (i > 0); \
4479}
4480
4481#define DO_FPCMP_PPZ0_H(NAME, OP) \
4482 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
4483#define DO_FPCMP_PPZ0_S(NAME, OP) \
4484 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
4485#define DO_FPCMP_PPZ0_D(NAME, OP) \
4486 DO_FPCMP_PPZ0(NAME##_d, float64, , OP)
4487
4488#define DO_FPCMP_PPZ0_ALL(NAME, OP) \
4489 DO_FPCMP_PPZ0_H(NAME, OP) \
4490 DO_FPCMP_PPZ0_S(NAME, OP) \
4491 DO_FPCMP_PPZ0_D(NAME, OP)
4492
4493DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
4494DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
4495DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
4496DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
4497DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
4498DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
4499
67fcd9ad
RH
4500/* FP Trig Multiply-Add. */
4501
4502void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
4503{
4504 static const float16 coeff[16] = {
4505 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4506 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4507 };
4508 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
4509 intptr_t x = simd_data(desc);
4510 float16 *d = vd, *n = vn, *m = vm;
4511 for (i = 0; i < opr_sz; i++) {
4512 float16 mm = m[i];
4513 intptr_t xx = x;
4514 if (float16_is_neg(mm)) {
4515 mm = float16_abs(mm);
4516 xx += 8;
4517 }
4518 d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
4519 }
4520}
4521
4522void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
4523{
4524 static const float32 coeff[16] = {
4525 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
4526 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
4527 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
4528 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
4529 };
4530 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
4531 intptr_t x = simd_data(desc);
4532 float32 *d = vd, *n = vn, *m = vm;
4533 for (i = 0; i < opr_sz; i++) {
4534 float32 mm = m[i];
4535 intptr_t xx = x;
4536 if (float32_is_neg(mm)) {
4537 mm = float32_abs(mm);
4538 xx += 8;
4539 }
4540 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
4541 }
4542}
4543
4544void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
4545{
4546 static const float64 coeff[16] = {
4547 0x3ff0000000000000ull, 0xbfc5555555555543ull,
4548 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
4549 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
4550 0x3de5d8408868552full, 0x0000000000000000ull,
4551 0x3ff0000000000000ull, 0xbfe0000000000000ull,
4552 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
4553 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
4554 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
4555 };
4556 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
4557 intptr_t x = simd_data(desc);
4558 float64 *d = vd, *n = vn, *m = vm;
4559 for (i = 0; i < opr_sz; i++) {
4560 float64 mm = m[i];
4561 intptr_t xx = x;
4562 if (float64_is_neg(mm)) {
4563 mm = float64_abs(mm);
4564 xx += 8;
4565 }
4566 d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
4567 }
4568}
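/* The immediate selects one of eight coefficients; a negative
 * multiplicand contributes its absolute value and selects the second
 * half of the table instead (xx += 8).  The two halves appear to be the
 * series coefficients for sine (1, -1/6, 1/120, ...) and cosine
 * (1, -1/2, 1/24, ...), which is how FTMAD builds up sin/cos.
 */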
4569
76a9d9cd
RH
4570/*
4571 * FP Complex Add
4572 */
4573
4574void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
4575 void *vs, uint32_t desc)
4576{
4577 intptr_t j, i = simd_oprsz(desc);
4578 uint64_t *g = vg;
4579 float16 neg_imag = float16_set_sign(0, simd_data(desc));
4580 float16 neg_real = float16_chs(neg_imag);
4581
4582 do {
4583 uint64_t pg = g[(i - 1) >> 6];
4584 do {
4585 float16 e0, e1, e2, e3;
4586
4587 /* I holds the real index; J holds the imag index. */
4588 j = i - sizeof(float16);
4589 i -= 2 * sizeof(float16);
4590
4591 e0 = *(float16 *)(vn + H1_2(i));
4592 e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
4593 e2 = *(float16 *)(vn + H1_2(j));
4594 e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
4595
4596 if (likely((pg >> (i & 63)) & 1)) {
4597 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
4598 }
4599 if (likely((pg >> (j & 63)) & 1)) {
4600 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
4601 }
4602 } while (i & 63);
4603 } while (i != 0);
4604}
4605
4606void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
4607 void *vs, uint32_t desc)
4608{
4609 intptr_t j, i = simd_oprsz(desc);
4610 uint64_t *g = vg;
4611 float32 neg_imag = float32_set_sign(0, simd_data(desc));
4612 float32 neg_real = float32_chs(neg_imag);
4613
4614 do {
4615 uint64_t pg = g[(i - 1) >> 6];
4616 do {
4617 float32 e0, e1, e2, e3;
4618
4619 /* I holds the real index; J holds the imag index. */
4620 j = i - sizeof(float32);
4621 i -= 2 * sizeof(float32);
4622
4623 e0 = *(float32 *)(vn + H1_2(i));
4624 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
4625 e2 = *(float32 *)(vn + H1_2(j));
4626 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
4627
4628 if (likely((pg >> (i & 63)) & 1)) {
4629 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
4630 }
4631 if (likely((pg >> (j & 63)) & 1)) {
4632 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
4633 }
4634 } while (i & 63);
4635 } while (i != 0);
4636}
4637
4638void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
4639 void *vs, uint32_t desc)
4640{
4641 intptr_t j, i = simd_oprsz(desc);
4642 uint64_t *g = vg;
4643 float64 neg_imag = float64_set_sign(0, simd_data(desc));
4644 float64 neg_real = float64_chs(neg_imag);
4645
4646 do {
4647 uint64_t pg = g[(i - 1) >> 6];
4648 do {
4649 float64 e0, e1, e2, e3;
4650
4651 /* I holds the real index; J holds the imag index. */
4652 j = i - sizeof(float64);
4653 i -= 2 * sizeof(float64);
4654
4655 e0 = *(float64 *)(vn + H1_2(i));
4656 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
4657 e2 = *(float64 *)(vn + H1_2(j));
4658 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
4659
4660 if (likely((pg >> (i & 63)) & 1)) {
4661 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
4662 }
4663 if (likely((pg >> (j & 63)) & 1)) {
4664 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
4665 }
4666 } while (i & 63);
4667 } while (i != 0);
4668}
4669
05f48bab
RH
4670/*
4671 * FP Complex Multiply
4672 */
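/* ROT encodes the FCMLA rotation in multiples of 90 degrees (0..3).
 * FLIP selects whether the real or imaginary half of each Zn pair feeds
 * both products; neg_real is set for rotations of 90 and 180 degrees
 * and neg_imag for 180 and 270, giving the four sign patterns of the
 * architectural definition.  The real and imaginary predicate bits are
 * tested separately, so each half of a pair merges independently.
 */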
4673
08975da9
RH
4674void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4675 void *vg, void *status, uint32_t desc)
05f48bab
RH
4676{
4677 intptr_t j, i = simd_oprsz(desc);
08975da9 4678 unsigned rot = simd_data(desc);
05f48bab
RH
4679 bool flip = rot & 1;
4680 float16 neg_imag, neg_real;
05f48bab
RH
4681 uint64_t *g = vg;
4682
4683 neg_imag = float16_set_sign(0, (rot & 2) != 0);
4684 neg_real = float16_set_sign(0, rot == 1 || rot == 2);
4685
4686 do {
4687 uint64_t pg = g[(i - 1) >> 6];
4688 do {
4689 float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
4690
4691 /* I holds the real index; J holds the imag index. */
4692 j = i - sizeof(float16);
4693 i -= 2 * sizeof(float16);
4694
4695 nr = *(float16 *)(vn + H1_2(i));
4696 ni = *(float16 *)(vn + H1_2(j));
4697 mr = *(float16 *)(vm + H1_2(i));
4698 mi = *(float16 *)(vm + H1_2(j));
4699
4700 e2 = (flip ? ni : nr);
4701 e1 = (flip ? mi : mr) ^ neg_real;
4702 e4 = e2;
4703 e3 = (flip ? mr : mi) ^ neg_imag;
4704
4705 if (likely((pg >> (i & 63)) & 1)) {
4706 d = *(float16 *)(va + H1_2(i));
08975da9 4707 d = float16_muladd(e2, e1, d, 0, status);
05f48bab
RH
4708 *(float16 *)(vd + H1_2(i)) = d;
4709 }
4710 if (likely((pg >> (j & 63)) & 1)) {
4711 d = *(float16 *)(va + H1_2(j));
08975da9 4712 d = float16_muladd(e4, e3, d, 0, status);
05f48bab
RH
4713 *(float16 *)(vd + H1_2(j)) = d;
4714 }
4715 } while (i & 63);
4716 } while (i != 0);
4717}
4718
08975da9
RH
4719void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4720 void *vg, void *status, uint32_t desc)
05f48bab
RH
4721{
4722 intptr_t j, i = simd_oprsz(desc);
08975da9 4723 unsigned rot = simd_data(desc);
05f48bab
RH
4724 bool flip = rot & 1;
4725 float32 neg_imag, neg_real;
05f48bab
RH
4726 uint64_t *g = vg;
4727
4728 neg_imag = float32_set_sign(0, (rot & 2) != 0);
4729 neg_real = float32_set_sign(0, rot == 1 || rot == 2);
4730
4731 do {
4732 uint64_t pg = g[(i - 1) >> 6];
4733 do {
4734 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
4735
4736 /* I holds the real index; J holds the imag index. */
4737 j = i - sizeof(float32);
4738 i -= 2 * sizeof(float32);
4739
4740 nr = *(float32 *)(vn + H1_2(i));
4741 ni = *(float32 *)(vn + H1_2(j));
4742 mr = *(float32 *)(vm + H1_2(i));
4743 mi = *(float32 *)(vm + H1_2(j));
4744
4745 e2 = (flip ? ni : nr);
4746 e1 = (flip ? mi : mr) ^ neg_real;
4747 e4 = e2;
4748 e3 = (flip ? mr : mi) ^ neg_imag;
4749
4750 if (likely((pg >> (i & 63)) & 1)) {
4751 d = *(float32 *)(va + H1_2(i));
08975da9 4752 d = float32_muladd(e2, e1, d, 0, status);
05f48bab
RH
4753 *(float32 *)(vd + H1_2(i)) = d;
4754 }
4755 if (likely((pg >> (j & 63)) & 1)) {
4756 d = *(float32 *)(va + H1_2(j));
08975da9 4757 d = float32_muladd(e4, e3, d, 0, status);
05f48bab
RH
4758 *(float32 *)(vd + H1_2(j)) = d;
4759 }
4760 } while (i & 63);
4761 } while (i != 0);
4762}
4763
08975da9
RH
4764void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4765 void *vg, void *status, uint32_t desc)
05f48bab
RH
4766{
4767 intptr_t j, i = simd_oprsz(desc);
08975da9 4768 unsigned rot = simd_data(desc);
05f48bab
RH
4769 bool flip = rot & 1;
4770 float64 neg_imag, neg_real;
05f48bab
RH
4771 uint64_t *g = vg;
4772
4773 neg_imag = float64_set_sign(0, (rot & 2) != 0);
4774 neg_real = float64_set_sign(0, rot == 1 || rot == 2);
4775
4776 do {
4777 uint64_t pg = g[(i - 1) >> 6];
4778 do {
4779 float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
4780
4781 /* I holds the real index; J holds the imag index. */
4782 j = i - sizeof(float64);
4783 i -= 2 * sizeof(float64);
4784
4785 nr = *(float64 *)(vn + H1_2(i));
4786 ni = *(float64 *)(vn + H1_2(j));
4787 mr = *(float64 *)(vm + H1_2(i));
4788 mi = *(float64 *)(vm + H1_2(j));
4789
4790 e2 = (flip ? ni : nr);
4791 e1 = (flip ? mi : mr) ^ neg_real;
4792 e4 = e2;
4793 e3 = (flip ? mr : mi) ^ neg_imag;
4794
4795 if (likely((pg >> (i & 63)) & 1)) {
4796 d = *(float64 *)(va + H1_2(i));
08975da9 4797 d = float64_muladd(e2, e1, d, 0, status);
05f48bab
RH
4798 *(float64 *)(vd + H1_2(i)) = d;
4799 }
4800 if (likely((pg >> (j & 63)) & 1)) {
4801 d = *(float64 *)(va + H1_2(j));
08975da9 4802 d = float64_muladd(e4, e3, d, 0, status);
05f48bab
RH
4803 *(float64 *)(vd + H1_2(j)) = d;
4804 }
4805 } while (i & 63);
4806 } while (i != 0);
4807}
4808
c4e7c493
RH
4809/*
4810 * Load contiguous data, protected by a governing predicate.
4811 */
9123aeb6
RH
4812
4813/*
cf4a49b7
RH
4814 * Load one element into @vd + @reg_off from @host.
4815 * The controlling predicate is known to be true.
9123aeb6 4816 */
cf4a49b7 4817typedef void sve_ldst1_host_fn(void *vd, intptr_t reg_off, void *host);
9123aeb6
RH
4818
4819/*
4820 * Load one element into @vd + @reg_off from (@env, @vaddr, @ra).
4821 * The controlling predicate is known to be true.
4822 */
6799ce7b
RH
4823typedef void sve_ldst1_tlb_fn(CPUARMState *env, void *vd, intptr_t reg_off,
4824 target_ulong vaddr, uintptr_t retaddr);
9123aeb6
RH
4825
4826/*
4827 * Generate the above primitives.
4828 */
4829
4830#define DO_LD_HOST(NAME, H, TYPEE, TYPEM, HOST) \
cf4a49b7
RH
4831static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
4832{ \
4833 TYPEM val = HOST(host); \
4834 *(TYPEE *)(vd + H(reg_off)) = val; \
9123aeb6
RH
4835}
4836
0fa476c1
RH
4837#define DO_ST_HOST(NAME, H, TYPEE, TYPEM, HOST) \
4838static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
4839{ HOST(host, (TYPEM)*(TYPEE *)(vd + H(reg_off))); }
4840
6799ce7b 4841#define DO_LD_TLB(NAME, H, TYPEE, TYPEM, TLB) \
9123aeb6 4842static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
6799ce7b 4843 target_ulong addr, uintptr_t ra) \
9123aeb6 4844{ \
c4af8ba1
RH
4845 *(TYPEE *)(vd + H(reg_off)) = \
4846 (TYPEM)TLB(env, useronly_clean_ptr(addr), ra); \
9123aeb6 4847}
6799ce7b
RH
4848
4849#define DO_ST_TLB(NAME, H, TYPEE, TYPEM, TLB) \
9123aeb6 4850static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
6799ce7b 4851 target_ulong addr, uintptr_t ra) \
9123aeb6 4852{ \
c4af8ba1
RH
4853 TLB(env, useronly_clean_ptr(addr), \
4854 (TYPEM)*(TYPEE *)(vd + H(reg_off)), ra); \
9123aeb6 4855}
9123aeb6
RH
4856
4857#define DO_LD_PRIM_1(NAME, H, TE, TM) \
4858 DO_LD_HOST(NAME, H, TE, TM, ldub_p) \
6799ce7b 4859 DO_LD_TLB(NAME, H, TE, TM, cpu_ldub_data_ra)
9123aeb6
RH
4860
4861DO_LD_PRIM_1(ld1bb, H1, uint8_t, uint8_t)
4862DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t)
4863DO_LD_PRIM_1(ld1bhs, H1_2, uint16_t, int8_t)
4864DO_LD_PRIM_1(ld1bsu, H1_4, uint32_t, uint8_t)
4865DO_LD_PRIM_1(ld1bss, H1_4, uint32_t, int8_t)
4866DO_LD_PRIM_1(ld1bdu, , uint64_t, uint8_t)
4867DO_LD_PRIM_1(ld1bds, , uint64_t, int8_t)
4868
6799ce7b 4869#define DO_ST_PRIM_1(NAME, H, TE, TM) \
0fa476c1 4870 DO_ST_HOST(st1##NAME, H, TE, TM, stb_p) \
6799ce7b
RH
4871 DO_ST_TLB(st1##NAME, H, TE, TM, cpu_stb_data_ra)
4872
4873DO_ST_PRIM_1(bb, H1, uint8_t, uint8_t)
4874DO_ST_PRIM_1(bh, H1_2, uint16_t, uint8_t)
4875DO_ST_PRIM_1(bs, H1_4, uint32_t, uint8_t)
4876DO_ST_PRIM_1(bd, , uint64_t, uint8_t)
9123aeb6 4877
6799ce7b
RH
4878#define DO_LD_PRIM_2(NAME, H, TE, TM, LD) \
4879 DO_LD_HOST(ld1##NAME##_be, H, TE, TM, LD##_be_p) \
4880 DO_LD_HOST(ld1##NAME##_le, H, TE, TM, LD##_le_p) \
4881 DO_LD_TLB(ld1##NAME##_be, H, TE, TM, cpu_##LD##_be_data_ra) \
4882 DO_LD_TLB(ld1##NAME##_le, H, TE, TM, cpu_##LD##_le_data_ra)
9123aeb6 4883
6799ce7b 4884#define DO_ST_PRIM_2(NAME, H, TE, TM, ST) \
0fa476c1
RH
4885 DO_ST_HOST(st1##NAME##_be, H, TE, TM, ST##_be_p) \
4886 DO_ST_HOST(st1##NAME##_le, H, TE, TM, ST##_le_p) \
6799ce7b
RH
4887 DO_ST_TLB(st1##NAME##_be, H, TE, TM, cpu_##ST##_be_data_ra) \
4888 DO_ST_TLB(st1##NAME##_le, H, TE, TM, cpu_##ST##_le_data_ra)
9123aeb6 4889
6799ce7b
RH
4890DO_LD_PRIM_2(hh, H1_2, uint16_t, uint16_t, lduw)
4891DO_LD_PRIM_2(hsu, H1_4, uint32_t, uint16_t, lduw)
4892DO_LD_PRIM_2(hss, H1_4, uint32_t, int16_t, lduw)
4893DO_LD_PRIM_2(hdu, , uint64_t, uint16_t, lduw)
4894DO_LD_PRIM_2(hds, , uint64_t, int16_t, lduw)
9123aeb6 4895
6799ce7b
RH
4896DO_ST_PRIM_2(hh, H1_2, uint16_t, uint16_t, stw)
4897DO_ST_PRIM_2(hs, H1_4, uint32_t, uint16_t, stw)
4898DO_ST_PRIM_2(hd, , uint64_t, uint16_t, stw)
9123aeb6 4899
6799ce7b
RH
4900DO_LD_PRIM_2(ss, H1_4, uint32_t, uint32_t, ldl)
4901DO_LD_PRIM_2(sdu, , uint64_t, uint32_t, ldl)
4902DO_LD_PRIM_2(sds, , uint64_t, int32_t, ldl)
9123aeb6 4903
6799ce7b
RH
4904DO_ST_PRIM_2(ss, H1_4, uint32_t, uint32_t, stl)
4905DO_ST_PRIM_2(sd, , uint64_t, uint32_t, stl)
4906
4907DO_LD_PRIM_2(dd, , uint64_t, uint64_t, ldq)
4908DO_ST_PRIM_2(dd, , uint64_t, uint64_t, stq)
9123aeb6
RH
4909
4910#undef DO_LD_TLB
6799ce7b 4911#undef DO_ST_TLB
9123aeb6
RH
4912#undef DO_LD_HOST
4913#undef DO_LD_PRIM_1
6799ce7b 4914#undef DO_ST_PRIM_1
9123aeb6 4915#undef DO_LD_PRIM_2
6799ce7b 4916#undef DO_ST_PRIM_2
9123aeb6
RH
4917
4918/*
4919 * Skip through a sequence of inactive elements in the guarding predicate @vg,
4920 * beginning at @reg_off bounded by @reg_max. Return the offset of the first
4921 * active element >= @reg_off, or @reg_max if there were no active elements.
4922 */
4923static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
4924 intptr_t reg_max, int esz)
4925{
4926 uint64_t pg_mask = pred_esz_masks[esz];
4927 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
4928
4929 /* In normal usage, the first element is active. */
4930 if (likely(pg & 1)) {
4931 return reg_off;
4932 }
4933
4934 if (pg == 0) {
4935 reg_off &= -64;
4936 do {
4937 reg_off += 64;
4938 if (unlikely(reg_off >= reg_max)) {
4939 /* The entire predicate was false. */
4940 return reg_max;
4941 }
4942 pg = vg[reg_off >> 6] & pg_mask;
4943 } while (pg == 0);
4944 }
4945 reg_off += ctz64(pg);
4946
4947 /* We should never see an out of range predicate bit set. */
4948 tcg_debug_assert(reg_off < reg_max);
4949 return reg_off;
4950}
4951
b4cd95d2
RH
4952/*
4953 * Resolve the guest virtual address to info->host and info->flags.
4954 * If @nofault, return false if the page is invalid, otherwise
4955 * exit via page fault exception.
4956 */
4957
4958typedef struct {
4959 void *host;
4960 int flags;
4961 MemTxAttrs attrs;
4962} SVEHostPage;
4963
4964static bool sve_probe_page(SVEHostPage *info, bool nofault,
4965 CPUARMState *env, target_ulong addr,
4966 int mem_off, MMUAccessType access_type,
4967 int mmu_idx, uintptr_t retaddr)
4968{
4969 int flags;
4970
4971 addr += mem_off;
c4af8ba1
RH
4972
4973 /*
4974 * User-only currently always issues with TBI. See the comment
4975 * above useronly_clean_ptr. Usually we clean this top byte away
4976 * during translation, but we can't do that for e.g. vector + imm
4977 * addressing modes.
4978 *
4979 * We currently always enable TBI for user-only, and do not provide
4980 * a way to turn it off. So clean the pointer unconditionally here,
4981 * rather than look it up here, or pass it down from above.
4982 */
4983 addr = useronly_clean_ptr(addr);
4984
b4cd95d2
RH
4985 flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault,
4986 &info->host, retaddr);
4987 info->flags = flags;
4988
4989 if (flags & TLB_INVALID_MASK) {
4990 g_assert(nofault);
4991 return false;
4992 }
4993
4994 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
4995 info->host -= mem_off;
4996
4997#ifdef CONFIG_USER_ONLY
4998 memset(&info->attrs, 0, sizeof(info->attrs));
4999#else
5000 /*
5001 * Find the iotlbentry for addr and return the transaction attributes.
5002 * This *must* be present in the TLB because we just found the mapping.
5003 */
5004 {
5005 uintptr_t index = tlb_index(env, mmu_idx, addr);
5006
5007# ifdef CONFIG_DEBUG_TCG
5008 CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
5009 target_ulong comparator = (access_type == MMU_DATA_LOAD
5010 ? entry->addr_read
5011 : tlb_addr_write(entry));
5012 g_assert(tlb_hit(comparator, addr));
5013# endif
5014
5015 CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
5016 info->attrs = iotlbentry->attrs;
5017 }
5018#endif
5019
5020 return true;
5021}
5022
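/*
 * Illustrative sketch (hypothetical caller, not part of the original
 * source): a non-faulting probe followed by a direct host access when the
 * page is plain RAM.  Any non-zero TLB_* flag (MMIO, watchpoint, ...)
 * means the caller must fall back to the tlb/slow path instead.
 */
static inline bool probe_for_host(CPUARMState *env, target_ulong addr,
                                  int mmu_idx, uintptr_t ra, void **host)
{
    SVEHostPage p;

    if (!sve_probe_page(&p, true, env, addr, 0, MMU_DATA_LOAD,
                        mmu_idx, ra)) {
        return false;   /* page invalid; no exception raised (nofault) */
    }
    if (p.flags != 0) {
        return false;   /* page valid, but not directly addressable */
    }
    *host = p.host;     /* host pointer corresponding to addr */
    return true;
}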
5023
5024/*
5025 * Analyse contiguous data, protected by a governing predicate.
5026 */
5027
5028typedef enum {
5029 FAULT_NO,
5030 FAULT_FIRST,
5031 FAULT_ALL,
5032} SVEContFault;
5033
5034typedef struct {
5035 /*
5036 * First and last element wholly contained within the two pages.
5037 * mem_off_first[0] and reg_off_first[0] are always set >= 0.
5038 * reg_off_last[0] may be < 0 if the first element crosses pages.
5039 * All of mem_off_first[1], reg_off_first[1] and reg_off_last[1]
5040 * are set >= 0 only if there are complete elements on a second page.
5041 *
5042 * The reg_off_* offsets are relative to the internal vector register.
5043 * The mem_off_first offset is relative to the memory address; the
5044 * two offsets are different when a load operation extends, a store
5045 * operation truncates, or for multi-register operations.
5046 */
5047 int16_t mem_off_first[2];
5048 int16_t reg_off_first[2];
5049 int16_t reg_off_last[2];
5050
5051 /*
5052 * One element that is misaligned and spans both pages,
5053 * or -1 if there is no such active element.
5054 */
5055 int16_t mem_off_split;
5056 int16_t reg_off_split;
5057
5058 /*
5059 * The byte offset at which the entire operation crosses a page boundary.
5060 * Set >= 0 if and only if the entire operation spans two pages.
5061 */
5062 int16_t page_split;
5063
5064 /* TLB data for the two pages. */
5065 SVEHostPage page[2];
5066} SVEContLdSt;
5067
5068/*
5069 * Find first active element on each page, and a loose bound for the
5070 * final element on each page. Identify any single element that spans
5071 * the page boundary. Return true if there are any active elements.
5072 */
b854fd06
RH
5073static bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr,
5074 uint64_t *vg, intptr_t reg_max,
5075 int esz, int msize)
b4cd95d2
RH
5076{
5077 const int esize = 1 << esz;
5078 const uint64_t pg_mask = pred_esz_masks[esz];
5079 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
5080 intptr_t mem_off_last, mem_off_split;
5081 intptr_t page_split, elt_split;
5082 intptr_t i;
5083
5084 /* Set all of the element indices to -1, and the TLB data to 0. */
5085 memset(info, -1, offsetof(SVEContLdSt, page));
5086 memset(info->page, 0, sizeof(info->page));
5087
5088 /* Gross scan over the entire predicate to find bounds. */
5089 i = 0;
5090 do {
5091 uint64_t pg = vg[i] & pg_mask;
5092 if (pg) {
5093 reg_off_last = i * 64 + 63 - clz64(pg);
5094 if (reg_off_first < 0) {
5095 reg_off_first = i * 64 + ctz64(pg);
5096 }
5097 }
5098 } while (++i * 64 < reg_max);
5099
5100 if (unlikely(reg_off_first < 0)) {
5101 /* No active elements, no pages touched. */
5102 return false;
5103 }
5104 tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
5105
5106 info->reg_off_first[0] = reg_off_first;
5107 info->mem_off_first[0] = (reg_off_first >> esz) * msize;
5108 mem_off_last = (reg_off_last >> esz) * msize;
5109
5110 page_split = -(addr | TARGET_PAGE_MASK);
5111 if (likely(mem_off_last + msize <= page_split)) {
5112 /* The entire operation fits within a single page. */
5113 info->reg_off_last[0] = reg_off_last;
5114 return true;
5115 }
5116
5117 info->page_split = page_split;
5118 elt_split = page_split / msize;
5119 reg_off_split = elt_split << esz;
5120 mem_off_split = elt_split * msize;
5121
5122 /*
5123 * This is the last full element on the first page, but it is not
5124 * necessarily active. If there is no full element, i.e. the first
5125 * active element is the one that's split, this value remains -1.
 5126 * It is useful as an iteration bound.
5127 */
5128 if (elt_split != 0) {
5129 info->reg_off_last[0] = reg_off_split - esize;
5130 }
5131
5132 /* Determine if an unaligned element spans the pages. */
5133 if (page_split % msize != 0) {
5134 /* It is helpful to know if the split element is active. */
5135 if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
5136 info->reg_off_split = reg_off_split;
5137 info->mem_off_split = mem_off_split;
5138
5139 if (reg_off_split == reg_off_last) {
5140 /* The page crossing element is last. */
5141 return true;
5142 }
5143 }
5144 reg_off_split += esize;
5145 mem_off_split += msize;
5146 }
5147
5148 /*
5149 * We do want the first active element on the second page, because
5150 * this may affect the address reported in an exception.
5151 */
5152 reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
5153 tcg_debug_assert(reg_off_split <= reg_off_last);
5154 info->reg_off_first[1] = reg_off_split;
5155 info->mem_off_first[1] = (reg_off_split >> esz) * msize;
5156 info->reg_off_last[1] = reg_off_last;
5157 return true;
5158}
5159
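/*
 * Worked example (illustrative only, not part of the original source):
 * with esz == MO_32 (esize 4), msize == 4, reg_max == 32, every predicate
 * bit set, and @addr 10 bytes below a page boundary (page_split == 10),
 * sve_cont_ldst_elements() fills SVEContLdSt as
 *     reg_off_first[0] = mem_off_first[0] = 0
 *     reg_off_last[0]  = 4                  (last whole element on page 0)
 *     reg_off_split    = mem_off_split = 8  (bytes 8..11 straddle the pages)
 *     page_split       = 10
 *     reg_off_first[1] = mem_off_first[1] = 12
 *     reg_off_last[1]  = 28
 */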
5160/*
5161 * Resolve the guest virtual addresses to info->page[].
5162 * Control the generation of page faults with @fault. Return false if
5163 * there is no work to do, which can only happen with @fault == FAULT_NO.
5164 */
b854fd06
RH
5165static bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
5166 CPUARMState *env, target_ulong addr,
5167 MMUAccessType access_type, uintptr_t retaddr)
b4cd95d2
RH
5168{
5169 int mmu_idx = cpu_mmu_index(env, false);
5170 int mem_off = info->mem_off_first[0];
5171 bool nofault = fault == FAULT_NO;
5172 bool have_work = true;
5173
5174 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
5175 access_type, mmu_idx, retaddr)) {
5176 /* No work to be done. */
5177 return false;
5178 }
5179
5180 if (likely(info->page_split < 0)) {
5181 /* The entire operation was on the one page. */
5182 return true;
5183 }
5184
5185 /*
5186 * If the second page is invalid, then we want the fault address to be
5187 * the first byte on that page which is accessed.
5188 */
5189 if (info->mem_off_split >= 0) {
5190 /*
5191 * There is an element split across the pages. The fault address
5192 * should be the first byte of the second page.
5193 */
5194 mem_off = info->page_split;
5195 /*
5196 * If the split element is also the first active element
5197 * of the vector, then: For first-fault we should continue
5198 * to generate faults for the second page. For no-fault,
5199 * we have work only if the second page is valid.
5200 */
5201 if (info->mem_off_first[0] < info->mem_off_split) {
5202 nofault = FAULT_FIRST;
5203 have_work = false;
5204 }
5205 } else {
5206 /*
5207 * There is no element split across the pages. The fault address
5208 * should be the first active element on the second page.
5209 */
5210 mem_off = info->mem_off_first[1];
5211 /*
5212 * There must have been one active element on the first page,
5213 * so we're out of first-fault territory.
5214 */
5215 nofault = fault != FAULT_ALL;
5216 }
5217
5218 have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
5219 access_type, mmu_idx, retaddr);
5220 return have_work;
5221}
5222
4bcc3f0f
RH
5223static void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
5224 uint64_t *vg, target_ulong addr,
5225 int esize, int msize, int wp_access,
5226 uintptr_t retaddr)
5227{
5228#ifndef CONFIG_USER_ONLY
5229 intptr_t mem_off, reg_off, reg_last;
5230 int flags0 = info->page[0].flags;
5231 int flags1 = info->page[1].flags;
5232
5233 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
5234 return;
5235 }
5236
5237 /* Indicate that watchpoints are handled. */
5238 info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
5239 info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
5240
5241 if (flags0 & TLB_WATCHPOINT) {
5242 mem_off = info->mem_off_first[0];
5243 reg_off = info->reg_off_first[0];
5244 reg_last = info->reg_off_last[0];
5245
5246 while (reg_off <= reg_last) {
5247 uint64_t pg = vg[reg_off >> 6];
5248 do {
5249 if ((pg >> (reg_off & 63)) & 1) {
5250 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5251 msize, info->page[0].attrs,
5252 wp_access, retaddr);
5253 }
5254 reg_off += esize;
5255 mem_off += msize;
5256 } while (reg_off <= reg_last && (reg_off & 63));
5257 }
5258 }
5259
5260 mem_off = info->mem_off_split;
5261 if (mem_off >= 0) {
5262 cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
5263 info->page[0].attrs, wp_access, retaddr);
5264 }
5265
5266 mem_off = info->mem_off_first[1];
5267 if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
5268 reg_off = info->reg_off_first[1];
5269 reg_last = info->reg_off_last[1];
5270
5271 do {
5272 uint64_t pg = vg[reg_off >> 6];
5273 do {
5274 if ((pg >> (reg_off & 63)) & 1) {
5275 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5276 msize, info->page[1].attrs,
5277 wp_access, retaddr);
5278 }
5279 reg_off += esize;
5280 mem_off += msize;
5281 } while (reg_off & 63);
5282 } while (reg_off <= reg_last);
5283 }
5284#endif
5285}
5286
4c3310c7
RH
5287static void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
5288 uint64_t *vg, target_ulong addr, int esize,
5289 int msize, uint32_t mtedesc, uintptr_t ra)
206adacf
RH
5290{
5291 intptr_t mem_off, reg_off, reg_last;
5292
5293 /* Process the page only if MemAttr == Tagged. */
5294 if (arm_tlb_mte_tagged(&info->page[0].attrs)) {
5295 mem_off = info->mem_off_first[0];
5296 reg_off = info->reg_off_first[0];
5297 reg_last = info->reg_off_split;
5298 if (reg_last < 0) {
5299 reg_last = info->reg_off_last[0];
5300 }
5301
5302 do {
5303 uint64_t pg = vg[reg_off >> 6];
5304 do {
5305 if ((pg >> (reg_off & 63)) & 1) {
4c3310c7 5306 mte_check(env, mtedesc, addr, ra);
206adacf
RH
5307 }
5308 reg_off += esize;
5309 mem_off += msize;
5310 } while (reg_off <= reg_last && (reg_off & 63));
5311 } while (reg_off <= reg_last);
5312 }
5313
5314 mem_off = info->mem_off_first[1];
5315 if (mem_off >= 0 && arm_tlb_mte_tagged(&info->page[1].attrs)) {
5316 reg_off = info->reg_off_first[1];
5317 reg_last = info->reg_off_last[1];
5318
5319 do {
5320 uint64_t pg = vg[reg_off >> 6];
5321 do {
5322 if ((pg >> (reg_off & 63)) & 1) {
4c3310c7 5323 mte_check(env, mtedesc, addr, ra);
206adacf
RH
5324 }
5325 reg_off += esize;
5326 mem_off += msize;
5327 } while (reg_off & 63);
5328 } while (reg_off <= reg_last);
5329 }
5330}
5331
9123aeb6 5332/*
5c9b8458 5333 * Common helper for all contiguous 1,2,3,4-register predicated loads.
9123aeb6 5334 */
b854fd06 5335static inline QEMU_ALWAYS_INLINE
5c9b8458 5336void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
b854fd06 5337 uint32_t desc, const uintptr_t retaddr,
206adacf 5338 const int esz, const int msz, const int N, uint32_t mtedesc,
b854fd06 5339 sve_ldst1_host_fn *host_fn,
4c3310c7 5340 sve_ldst1_tlb_fn *tlb_fn)
b854fd06 5341{
ba080b86 5342 const unsigned rd = simd_data(desc);
9123aeb6 5343 const intptr_t reg_max = simd_oprsz(desc);
b854fd06
RH
5344 intptr_t reg_off, reg_last, mem_off;
5345 SVEContLdSt info;
9123aeb6 5346 void *host;
5c9b8458 5347 int flags, i;
9123aeb6 5348
b854fd06 5349 /* Find the active elements. */
5c9b8458 5350 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
9123aeb6 5351 /* The entire predicate was false; no load occurs. */
5c9b8458
RH
5352 for (i = 0; i < N; ++i) {
5353 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5354 }
9123aeb6
RH
5355 return;
5356 }
9123aeb6 5357
b854fd06
RH
5358 /* Probe the page(s). Exit with exception for any invalid page. */
5359 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
9123aeb6 5360
4bcc3f0f 5361 /* Handle watchpoints for all active elements. */
5c9b8458 5362 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
4bcc3f0f
RH
5363 BP_MEM_READ, retaddr);
5364
206adacf
RH
5365 /*
5366 * Handle mte checks for all active elements.
5367 * Since TBI must be set for MTE, !mtedesc => !mte_active.
5368 */
4c3310c7
RH
5369 if (mtedesc) {
5370 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
5371 mtedesc, retaddr);
206adacf 5372 }
4bcc3f0f 5373
b854fd06
RH
5374 flags = info.page[0].flags | info.page[1].flags;
5375 if (unlikely(flags != 0)) {
9123aeb6 5376#ifdef CONFIG_USER_ONLY
b854fd06 5377 g_assert_not_reached();
9123aeb6 5378#else
b854fd06 5379 /*
4bcc3f0f 5380 * At least one page includes MMIO.
b854fd06
RH
5381 * Any bus operation can fail with cpu_transaction_failed,
5382 * which for ARM will raise SyncExternal. Perform the load
5383 * into scratch memory to preserve register state until the end.
5384 */
5c9b8458 5385 ARMVectorReg scratch[4] = { };
b854fd06 5386
b854fd06
RH
5387 mem_off = info.mem_off_first[0];
5388 reg_off = info.reg_off_first[0];
5389 reg_last = info.reg_off_last[1];
5390 if (reg_last < 0) {
5391 reg_last = info.reg_off_split;
5392 if (reg_last < 0) {
5393 reg_last = info.reg_off_last[0];
9123aeb6
RH
5394 }
5395 }
5396
b854fd06
RH
5397 do {
5398 uint64_t pg = vg[reg_off >> 6];
5399 do {
5400 if ((pg >> (reg_off & 63)) & 1) {
5c9b8458
RH
5401 for (i = 0; i < N; ++i) {
5402 tlb_fn(env, &scratch[i], reg_off,
5403 addr + mem_off + (i << msz), retaddr);
5404 }
b854fd06
RH
5405 }
5406 reg_off += 1 << esz;
5c9b8458 5407 mem_off += N << msz;
b854fd06
RH
5408 } while (reg_off & 63);
5409 } while (reg_off <= reg_last);
5410
5c9b8458
RH
5411 for (i = 0; i < N; ++i) {
5412 memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
5413 }
b854fd06 5414 return;
9123aeb6 5415#endif
b854fd06
RH
5416 }
5417
5418 /* The entire operation is in RAM, on valid pages. */
5419
5c9b8458
RH
5420 for (i = 0; i < N; ++i) {
5421 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5422 }
5423
b854fd06
RH
5424 mem_off = info.mem_off_first[0];
5425 reg_off = info.reg_off_first[0];
5426 reg_last = info.reg_off_last[0];
5427 host = info.page[0].host;
5428
5429 while (reg_off <= reg_last) {
5430 uint64_t pg = vg[reg_off >> 6];
5431 do {
5432 if ((pg >> (reg_off & 63)) & 1) {
5c9b8458
RH
5433 for (i = 0; i < N; ++i) {
5434 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5435 host + mem_off + (i << msz));
5436 }
b854fd06
RH
5437 }
5438 reg_off += 1 << esz;
5c9b8458 5439 mem_off += N << msz;
b854fd06
RH
5440 } while (reg_off <= reg_last && (reg_off & 63));
5441 }
9123aeb6 5442
b854fd06
RH
5443 /*
5444 * Use the slow path to manage the cross-page misalignment.
5445 * But we know this is RAM and cannot trap.
5446 */
5447 mem_off = info.mem_off_split;
5448 if (unlikely(mem_off >= 0)) {
5c9b8458
RH
5449 reg_off = info.reg_off_split;
5450 for (i = 0; i < N; ++i) {
5451 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5452 addr + mem_off + (i << msz), retaddr);
5453 }
b854fd06
RH
5454 }
5455
5456 mem_off = info.mem_off_first[1];
5457 if (unlikely(mem_off >= 0)) {
5458 reg_off = info.reg_off_first[1];
5459 reg_last = info.reg_off_last[1];
5460 host = info.page[1].host;
5461
5462 do {
5463 uint64_t pg = vg[reg_off >> 6];
5464 do {
5465 if ((pg >> (reg_off & 63)) & 1) {
5c9b8458
RH
5466 for (i = 0; i < N; ++i) {
5467 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5468 host + mem_off + (i << msz));
5469 }
b854fd06
RH
5470 }
5471 reg_off += 1 << esz;
5c9b8458 5472 mem_off += N << msz;
b854fd06
RH
5473 } while (reg_off & 63);
5474 } while (reg_off <= reg_last);
5475 }
c4e7c493
RH
5476}
5477
206adacf
RH
5478static inline QEMU_ALWAYS_INLINE
5479void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
5480 uint32_t desc, const uintptr_t ra,
5481 const int esz, const int msz, const int N,
5482 sve_ldst1_host_fn *host_fn,
5483 sve_ldst1_tlb_fn *tlb_fn)
5484{
5485 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5486 int bit55 = extract64(addr, 55, 1);
5487
5488 /* Remove mtedesc from the normal sve descriptor. */
5489 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5490
5491 /* Perform gross MTE suppression early. */
5492 if (!tbi_check(desc, bit55) ||
5493 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
5494 mtedesc = 0;
5495 }
5496
4c3310c7 5497 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
9123aeb6
RH
5498}
5499
206adacf
RH
5500#define DO_LD1_1(NAME, ESZ) \
5501void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \
5502 target_ulong addr, uint32_t desc) \
5503{ \
5504 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \
4c3310c7 5505 sve_##NAME##_host, sve_##NAME##_tlb); \
206adacf
RH
5506} \
5507void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \
5508 target_ulong addr, uint32_t desc) \
5509{ \
5510 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \
5511 sve_##NAME##_host, sve_##NAME##_tlb); \
5512}
5513
5514#define DO_LD1_2(NAME, ESZ, MSZ) \
5515void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \
5516 target_ulong addr, uint32_t desc) \
5517{ \
5518 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
4c3310c7 5519 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
206adacf
RH
5520} \
5521void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \
5522 target_ulong addr, uint32_t desc) \
5523{ \
5524 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
4c3310c7 5525 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
206adacf
RH
5526} \
5527void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
4c3310c7 5528 target_ulong addr, uint32_t desc) \
206adacf
RH
5529{ \
5530 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
5531 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
5532} \
5533void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
4c3310c7 5534 target_ulong addr, uint32_t desc) \
206adacf
RH
5535{ \
5536 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
5537 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
9123aeb6
RH
5538}
5539
5c9b8458
RH
5540DO_LD1_1(ld1bb, MO_8)
5541DO_LD1_1(ld1bhu, MO_16)
5542DO_LD1_1(ld1bhs, MO_16)
5543DO_LD1_1(ld1bsu, MO_32)
5544DO_LD1_1(ld1bss, MO_32)
5545DO_LD1_1(ld1bdu, MO_64)
5546DO_LD1_1(ld1bds, MO_64)
9123aeb6 5547
5c9b8458
RH
5548DO_LD1_2(ld1hh, MO_16, MO_16)
5549DO_LD1_2(ld1hsu, MO_32, MO_16)
5550DO_LD1_2(ld1hss, MO_32, MO_16)
5551DO_LD1_2(ld1hdu, MO_64, MO_16)
5552DO_LD1_2(ld1hds, MO_64, MO_16)
9123aeb6 5553
5c9b8458
RH
5554DO_LD1_2(ld1ss, MO_32, MO_32)
5555DO_LD1_2(ld1sdu, MO_64, MO_32)
5556DO_LD1_2(ld1sds, MO_64, MO_32)
9123aeb6 5557
5c9b8458 5558DO_LD1_2(ld1dd, MO_64, MO_64)
9123aeb6
RH
5559
5560#undef DO_LD1_1
5561#undef DO_LD1_2
5562
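/*
 * For reference (hand-expanded, illustrative only -- not part of the
 * original source): DO_LD1_2(ld1hh, MO_16, MO_16) above generates, among
 * its four helpers, roughly:
 */
#if 0
void HELPER(sve_ld1hh_le_r)(CPUARMState *env, void *vg,
                            target_ulong addr, uint32_t desc)
{
    sve_ldN_r(env, vg, addr, desc, GETPC(), MO_16, MO_16, 1, 0,
              sve_ld1hh_le_host, sve_ld1hh_le_tlb);
}
#endif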
206adacf
RH
5563#define DO_LDN_1(N) \
5564void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \
5565 target_ulong addr, uint32_t desc) \
5566{ \
5567 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \
4c3310c7 5568 sve_ld1bb_host, sve_ld1bb_tlb); \
206adacf
RH
5569} \
5570void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \
5571 target_ulong addr, uint32_t desc) \
5572{ \
5573 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \
5574 sve_ld1bb_host, sve_ld1bb_tlb); \
f27d4dc2
RH
5575}
5576
206adacf
RH
5577#define DO_LDN_2(N, SUFF, ESZ) \
5578void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \
5579 target_ulong addr, uint32_t desc) \
5580{ \
5581 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
4c3310c7 5582 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
206adacf
RH
5583} \
5584void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \
5585 target_ulong addr, uint32_t desc) \
5586{ \
5587 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
4c3310c7 5588 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
206adacf
RH
5589} \
5590void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \
5591 target_ulong addr, uint32_t desc) \
5592{ \
5593 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
5594 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
5595} \
5596void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \
5597 target_ulong addr, uint32_t desc) \
5598{ \
5599 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
5600 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
c4e7c493
RH
5601}
5602
f27d4dc2
RH
5603DO_LDN_1(2)
5604DO_LDN_1(3)
5605DO_LDN_1(4)
c4e7c493 5606
5c9b8458
RH
5607DO_LDN_2(2, hh, MO_16)
5608DO_LDN_2(3, hh, MO_16)
5609DO_LDN_2(4, hh, MO_16)
c4e7c493 5610
5c9b8458
RH
5611DO_LDN_2(2, ss, MO_32)
5612DO_LDN_2(3, ss, MO_32)
5613DO_LDN_2(4, ss, MO_32)
c4e7c493 5614
5c9b8458
RH
5615DO_LDN_2(2, dd, MO_64)
5616DO_LDN_2(3, dd, MO_64)
5617DO_LDN_2(4, dd, MO_64)
c4e7c493 5618
f27d4dc2
RH
5619#undef DO_LDN_1
5620#undef DO_LDN_2
e2654d75
RH
5621
5622/*
5623 * Load contiguous data, first-fault and no-fault.
9123aeb6
RH
5624 *
5625 * For user-only, one could argue that we should hold the mmap_lock during
5626 * the operation so that there is no race between page_check_range and the
5627 * load operation. However, unmapping pages out from under a running thread
5628 * is extraordinarily unlikely. This theoretical race condition also affects
5629 * linux-user/ in its get_user/put_user macros.
5630 *
5631 * TODO: Construct some helpers, written in assembly, that interact with
5632 * handle_cpu_signal to produce memory ops which can properly report errors
5633 * without racing.
e2654d75
RH
5634 */
5635
e2654d75
RH
5636/* Fault on byte I. All bits in FFR from I are cleared. The vector
5637 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
5638 * option, which leaves subsequent data unchanged.
5639 */
5640static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
5641{
5642 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
5643
5644 if (i & 63) {
5645 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
5646 i = ROUND_UP(i, 64);
5647 }
5648 for (; i < oprsz; i += 64) {
5649 ffr[i / 64] = 0;
5650 }
5651}
5652
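/*
 * Illustrative example (not part of the original source):
 * record_fault(env, 20, 128) keeps FFR bits 0..19 via
 * MAKE_64BIT_MASK(0, 20), clears bits 20..63 of the first word, and
 * zeroes the second word, so every element from byte offset 20 upward
 * is reported as not having been loaded.
 */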
9123aeb6 5653/*
c647673c 5654 * Common helper for all contiguous no-fault and first-fault loads.
9123aeb6 5655 */
c647673c
RH
5656static inline QEMU_ALWAYS_INLINE
5657void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
aa13f7c3 5658 uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
c647673c
RH
5659 const int esz, const int msz, const SVEContFault fault,
5660 sve_ldst1_host_fn *host_fn,
5661 sve_ldst1_tlb_fn *tlb_fn)
5662{
ba080b86 5663 const unsigned rd = simd_data(desc);
500d0484 5664 void *vd = &env->vfp.zregs[rd];
9123aeb6 5665 const intptr_t reg_max = simd_oprsz(desc);
c647673c
RH
5666 intptr_t reg_off, mem_off, reg_last;
5667 SVEContLdSt info;
5668 int flags;
9123aeb6
RH
5669 void *host;
5670
c647673c
RH
5671 /* Find the active elements. */
5672 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
9123aeb6
RH
5673 /* The entire predicate was false; no load occurs. */
5674 memset(vd, 0, reg_max);
5675 return;
5676 }
c647673c 5677 reg_off = info.reg_off_first[0];
9123aeb6 5678
c647673c
RH
5679 /* Probe the page(s). */
5680 if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
5681 /* Fault on first element. */
5682 tcg_debug_assert(fault == FAULT_NO);
5683 memset(vd, 0, reg_max);
5684 goto do_fault;
5685 }
5686
5687 mem_off = info.mem_off_first[0];
5688 flags = info.page[0].flags;
5689
aa13f7c3
RH
5690 /*
5691 * Disable MTE checking if the Tagged bit is not set. Since TBI must
5692 * be set within MTEDESC for MTE, !mtedesc => !mte_active.
5693 */
5694 if (arm_tlb_mte_tagged(&info.page[0].attrs)) {
5695 mtedesc = 0;
5696 }
5697
c647673c 5698 if (fault == FAULT_FIRST) {
aa13f7c3
RH
5699 /* Trapping mte check for the first-fault element. */
5700 if (mtedesc) {
bd47b61c 5701 mte_check(env, mtedesc, addr + mem_off, retaddr);
aa13f7c3
RH
5702 }
5703
c647673c
RH
5704 /*
5705 * Special handling of the first active element,
5706 * if it crosses a page boundary or is MMIO.
5707 */
5708 bool is_split = mem_off == info.mem_off_split;
c647673c
RH
5709 if (unlikely(flags != 0) || unlikely(is_split)) {
5710 /*
5711 * Use the slow path for cross-page handling.
5712 * Might trap for MMIO or watchpoints.
5713 */
5714 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
5715
5716 /* After any fault, zero the other elements. */
9123aeb6 5717 swap_memzero(vd, reg_off);
c647673c
RH
5718 reg_off += 1 << esz;
5719 mem_off += 1 << msz;
5720 swap_memzero(vd + reg_off, reg_max - reg_off);
5721
5722 if (is_split) {
5723 goto second_page;
5724 }
5725 } else {
5726 memset(vd, 0, reg_max);
5727 }
5728 } else {
5729 memset(vd, 0, reg_max);
5730 if (unlikely(mem_off == info.mem_off_split)) {
5731 /* The first active element crosses a page boundary. */
5732 flags |= info.page[1].flags;
5733 if (unlikely(flags & TLB_MMIO)) {
5734 /* Some page is MMIO, see below. */
5735 goto do_fault;
5736 }
5737 if (unlikely(flags & TLB_WATCHPOINT) &&
5738 (cpu_watchpoint_address_matches
5739 (env_cpu(env), addr + mem_off, 1 << msz)
5740 & BP_MEM_READ)) {
5741 /* Watchpoint hit, see below. */
5742 goto do_fault;
5743 }
d304d280 5744 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
aa13f7c3
RH
5745 goto do_fault;
5746 }
c647673c
RH
5747 /*
5748 * Use the slow path for cross-page handling.
5749 * This is RAM, without a watchpoint, and will not trap.
5750 */
5751 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
5752 goto second_page;
9123aeb6
RH
5753 }
5754 }
5755
9123aeb6 5756 /*
c647673c
RH
5757 * From this point on, all memory operations are MemSingleNF.
5758 *
5759 * Per the MemSingleNF pseudocode, a no-fault load from Device memory
5760 * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
5761 *
 5762 * Unfortunately we do not have access to the memory attributes from the
5763 * PTE to tell Device memory from Normal memory. So we make a mostly
5764 * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
5765 * This gives the right answer for the common cases of "Normal memory,
5766 * backed by host RAM" and "Device memory, backed by MMIO".
5767 * The architecture allows us to suppress an NF load and return
5768 * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
5769 * case of "Normal memory, backed by MMIO" is permitted. The case we
5770 * get wrong is "Device memory, backed by host RAM", for which we
5771 * should return (UNKNOWN, FAULT) for but do not.
5772 *
5773 * Similarly, CPU_BP breakpoints would raise exceptions, and so
5774 * return (UNKNOWN, FAULT). For simplicity, we consider gdb and
5775 * architectural breakpoints the same.
9123aeb6 5776 */
c647673c
RH
5777 if (unlikely(flags & TLB_MMIO)) {
5778 goto do_fault;
9123aeb6 5779 }
9123aeb6 5780
c647673c
RH
5781 reg_last = info.reg_off_last[0];
5782 host = info.page[0].host;
9123aeb6 5783
c647673c
RH
5784 do {
5785 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
cf4a49b7 5786 do {
c647673c
RH
5787 if ((pg >> (reg_off & 63)) & 1) {
5788 if (unlikely(flags & TLB_WATCHPOINT) &&
5789 (cpu_watchpoint_address_matches
5790 (env_cpu(env), addr + mem_off, 1 << msz)
5791 & BP_MEM_READ)) {
5792 goto do_fault;
5793 }
d304d280 5794 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
aa13f7c3
RH
5795 goto do_fault;
5796 }
c647673c
RH
5797 host_fn(vd, reg_off, host + mem_off);
5798 }
cf4a49b7 5799 reg_off += 1 << esz;
c647673c
RH
5800 mem_off += 1 << msz;
5801 } while (reg_off <= reg_last && (reg_off & 63));
5802 } while (reg_off <= reg_last);
9123aeb6 5803
c647673c
RH
5804 /*
5805 * MemSingleNF is allowed to fail for any reason. We have special
5806 * code above to handle the first element crossing a page boundary.
5807 * As an implementation choice, decline to handle a cross-page element
5808 * in any other position.
5809 */
5810 reg_off = info.reg_off_split;
5811 if (reg_off >= 0) {
5812 goto do_fault;
5813 }
9123aeb6 5814
c647673c
RH
5815 second_page:
5816 reg_off = info.reg_off_first[1];
5817 if (likely(reg_off < 0)) {
5818 /* No active elements on the second page. All done. */
9123aeb6
RH
5819 return;
5820 }
9123aeb6 5821
9123aeb6 5822 /*
c647673c
RH
5823 * MemSingleNF is allowed to fail for any reason. As an implementation
5824 * choice, decline to handle elements on the second page. This should
5825 * be low frequency as the guest walks through memory -- the next
5826 * iteration of the guest's loop should be aligned on the page boundary,
5827 * and then all following iterations will stay aligned.
9123aeb6 5828 */
9123aeb6 5829
c647673c 5830 do_fault:
9123aeb6
RH
5831 record_fault(env, reg_off, reg_max);
5832}
5833
aa13f7c3
RH
5834static inline QEMU_ALWAYS_INLINE
5835void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
5836 uint32_t desc, const uintptr_t retaddr,
5837 const int esz, const int msz, const SVEContFault fault,
5838 sve_ldst1_host_fn *host_fn,
5839 sve_ldst1_tlb_fn *tlb_fn)
5840{
5841 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5842 int bit55 = extract64(addr, 55, 1);
5843
5844 /* Remove mtedesc from the normal sve descriptor. */
5845 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5846
5847 /* Perform gross MTE suppression early. */
5848 if (!tbi_check(desc, bit55) ||
5849 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
5850 mtedesc = 0;
5851 }
5852
5853 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
5854 esz, msz, fault, host_fn, tlb_fn);
5855}
5856
5857#define DO_LDFF1_LDNF1_1(PART, ESZ) \
9123aeb6
RH
5858void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \
5859 target_ulong addr, uint32_t desc) \
e2654d75 5860{ \
aa13f7c3 5861 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
c647673c 5862 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
e2654d75 5863} \
9123aeb6
RH
5864void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \
5865 target_ulong addr, uint32_t desc) \
e2654d75 5866{ \
aa13f7c3
RH
5867 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
5868 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
5869} \
5870void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \
5871 target_ulong addr, uint32_t desc) \
5872{ \
5873 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
5874 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
5875} \
5876void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \
5877 target_ulong addr, uint32_t desc) \
5878{ \
5879 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
c647673c 5880 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
e2654d75
RH
5881}
5882
aa13f7c3 5883#define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
7d0a57a2
RH
5884void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \
5885 target_ulong addr, uint32_t desc) \
e2654d75 5886{ \
aa13f7c3 5887 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
c647673c 5888 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
9123aeb6 5889} \
7d0a57a2
RH
5890void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \
5891 target_ulong addr, uint32_t desc) \
9123aeb6 5892{ \
aa13f7c3 5893 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
c647673c 5894 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
7d0a57a2
RH
5895} \
5896void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \
5897 target_ulong addr, uint32_t desc) \
5898{ \
aa13f7c3 5899 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
c647673c 5900 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
7d0a57a2
RH
5901} \
5902void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \
5903 target_ulong addr, uint32_t desc) \
5904{ \
aa13f7c3 5905 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
c647673c 5906 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
aa13f7c3
RH
5907} \
5908void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
5909 target_ulong addr, uint32_t desc) \
5910{ \
5911 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
5912 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
5913} \
5914void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
5915 target_ulong addr, uint32_t desc) \
5916{ \
5917 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
5918 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
5919} \
5920void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
5921 target_ulong addr, uint32_t desc) \
5922{ \
5923 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
5924 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
5925} \
5926void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
5927 target_ulong addr, uint32_t desc) \
5928{ \
5929 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
5930 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
e2654d75
RH
5931}
5932
c647673c
RH
5933DO_LDFF1_LDNF1_1(bb, MO_8)
5934DO_LDFF1_LDNF1_1(bhu, MO_16)
5935DO_LDFF1_LDNF1_1(bhs, MO_16)
5936DO_LDFF1_LDNF1_1(bsu, MO_32)
5937DO_LDFF1_LDNF1_1(bss, MO_32)
5938DO_LDFF1_LDNF1_1(bdu, MO_64)
5939DO_LDFF1_LDNF1_1(bds, MO_64)
e2654d75 5940
c647673c
RH
5941DO_LDFF1_LDNF1_2(hh, MO_16, MO_16)
5942DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
5943DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
5944DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
5945DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
e2654d75 5946
c647673c
RH
5947DO_LDFF1_LDNF1_2(ss, MO_32, MO_32)
5948DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
5949DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
e2654d75 5950
c647673c 5951DO_LDFF1_LDNF1_2(dd, MO_64, MO_64)
e2654d75 5952
9123aeb6
RH
5953#undef DO_LDFF1_LDNF1_1
5954#undef DO_LDFF1_LDNF1_2
1a039c7e 5955
9fd46c83 5956/*
0fa476c1 5957 * Common helper for all contiguous 1,2,3,4-register predicated stores.
9fd46c83 5958 */
0fa476c1
RH
5959
5960static inline QEMU_ALWAYS_INLINE
71b9f394
RH
5961void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
5962 uint32_t desc, const uintptr_t retaddr,
5963 const int esz, const int msz, const int N, uint32_t mtedesc,
0fa476c1 5964 sve_ldst1_host_fn *host_fn,
4c3310c7 5965 sve_ldst1_tlb_fn *tlb_fn)
9fd46c83 5966{
ba080b86 5967 const unsigned rd = simd_data(desc);
0fa476c1
RH
5968 const intptr_t reg_max = simd_oprsz(desc);
5969 intptr_t reg_off, reg_last, mem_off;
5970 SVEContLdSt info;
5971 void *host;
5972 int i, flags;
1a039c7e 5973
0fa476c1
RH
5974 /* Find the active elements. */
5975 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
5976 /* The entire predicate was false; no store occurs. */
5977 return;
9fd46c83 5978 }
1a039c7e 5979
0fa476c1
RH
5980 /* Probe the page(s). Exit with exception for any invalid page. */
5981 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
1a039c7e 5982
0fa476c1
RH
5983 /* Handle watchpoints for all active elements. */
5984 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
5985 BP_MEM_WRITE, retaddr);
5986
71b9f394
RH
5987 /*
5988 * Handle mte checks for all active elements.
5989 * Since TBI must be set for MTE, !mtedesc => !mte_active.
5990 */
4c3310c7
RH
5991 if (mtedesc) {
5992 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
5993 mtedesc, retaddr);
71b9f394 5994 }
0fa476c1
RH
5995
5996 flags = info.page[0].flags | info.page[1].flags;
5997 if (unlikely(flags != 0)) {
5998#ifdef CONFIG_USER_ONLY
5999 g_assert_not_reached();
6000#else
6001 /*
6002 * At least one page includes MMIO.
6003 * Any bus operation can fail with cpu_transaction_failed,
6004 * which for ARM will raise SyncExternal. We cannot avoid
6005 * this fault and will leave with the store incomplete.
6006 */
6007 mem_off = info.mem_off_first[0];
6008 reg_off = info.reg_off_first[0];
6009 reg_last = info.reg_off_last[1];
6010 if (reg_last < 0) {
6011 reg_last = info.reg_off_split;
6012 if (reg_last < 0) {
6013 reg_last = info.reg_off_last[0];
9fd46c83 6014 }
0fa476c1
RH
6015 }
6016
6017 do {
6018 uint64_t pg = vg[reg_off >> 6];
6019 do {
6020 if ((pg >> (reg_off & 63)) & 1) {
6021 for (i = 0; i < N; ++i) {
6022 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6023 addr + mem_off + (i << msz), retaddr);
6024 }
6025 }
6026 reg_off += 1 << esz;
6027 mem_off += N << msz;
6028 } while (reg_off & 63);
6029 } while (reg_off <= reg_last);
6030 return;
6031#endif
1a039c7e 6032 }
1a039c7e 6033
0fa476c1
RH
6034 mem_off = info.mem_off_first[0];
6035 reg_off = info.reg_off_first[0];
6036 reg_last = info.reg_off_last[0];
6037 host = info.page[0].host;
1a039c7e 6038
0fa476c1
RH
6039 while (reg_off <= reg_last) {
6040 uint64_t pg = vg[reg_off >> 6];
9fd46c83 6041 do {
0fa476c1
RH
6042 if ((pg >> (reg_off & 63)) & 1) {
6043 for (i = 0; i < N; ++i) {
6044 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6045 host + mem_off + (i << msz));
6046 }
9fd46c83 6047 }
0fa476c1
RH
6048 reg_off += 1 << esz;
6049 mem_off += N << msz;
6050 } while (reg_off <= reg_last && (reg_off & 63));
1a039c7e 6051 }
1a039c7e 6052
0fa476c1
RH
6053 /*
6054 * Use the slow path to manage the cross-page misalignment.
6055 * But we know this is RAM and cannot trap.
6056 */
6057 mem_off = info.mem_off_split;
6058 if (unlikely(mem_off >= 0)) {
6059 reg_off = info.reg_off_split;
6060 for (i = 0; i < N; ++i) {
6061 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6062 addr + mem_off + (i << msz), retaddr);
6063 }
6064 }
6065
6066 mem_off = info.mem_off_first[1];
6067 if (unlikely(mem_off >= 0)) {
6068 reg_off = info.reg_off_first[1];
6069 reg_last = info.reg_off_last[1];
6070 host = info.page[1].host;
1a039c7e 6071
9fd46c83 6072 do {
0fa476c1
RH
6073 uint64_t pg = vg[reg_off >> 6];
6074 do {
6075 if ((pg >> (reg_off & 63)) & 1) {
6076 for (i = 0; i < N; ++i) {
6077 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6078 host + mem_off + (i << msz));
6079 }
6080 }
6081 reg_off += 1 << esz;
6082 mem_off += N << msz;
6083 } while (reg_off & 63);
6084 } while (reg_off <= reg_last);
1a039c7e 6085 }
9fd46c83
RH
6086}
6087
71b9f394
RH
6088static inline QEMU_ALWAYS_INLINE
6089void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6090 uint32_t desc, const uintptr_t ra,
6091 const int esz, const int msz, const int N,
6092 sve_ldst1_host_fn *host_fn,
6093 sve_ldst1_tlb_fn *tlb_fn)
6094{
6095 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6096 int bit55 = extract64(addr, 55, 1);
6097
6098 /* Remove mtedesc from the normal sve descriptor. */
6099 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6100
6101 /* Perform gross MTE suppression early. */
6102 if (!tbi_check(desc, bit55) ||
6103 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
6104 mtedesc = 0;
6105 }
6106
4c3310c7 6107 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
1a039c7e 6108}
f6dbf62a 6109
71b9f394
RH
6110#define DO_STN_1(N, NAME, ESZ) \
6111void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \
6112 target_ulong addr, uint32_t desc) \
6113{ \
6114 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \
4c3310c7 6115 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
71b9f394
RH
6116} \
6117void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \
6118 target_ulong addr, uint32_t desc) \
6119{ \
6120 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \
6121 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
6122}
6123
6124#define DO_STN_2(N, NAME, ESZ, MSZ) \
6125void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \
6126 target_ulong addr, uint32_t desc) \
6127{ \
6128 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
4c3310c7 6129 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
71b9f394
RH
6130} \
6131void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \
6132 target_ulong addr, uint32_t desc) \
6133{ \
6134 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
4c3310c7 6135 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
71b9f394
RH
6136} \
6137void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
6138 target_ulong addr, uint32_t desc) \
6139{ \
6140 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
6141 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
6142} \
6143void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
6144 target_ulong addr, uint32_t desc) \
6145{ \
6146 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
6147 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
0fa476c1
RH
6148}
6149
6150DO_STN_1(1, bb, MO_8)
6151DO_STN_1(1, bh, MO_16)
6152DO_STN_1(1, bs, MO_32)
6153DO_STN_1(1, bd, MO_64)
6154DO_STN_1(2, bb, MO_8)
6155DO_STN_1(3, bb, MO_8)
6156DO_STN_1(4, bb, MO_8)
6157
6158DO_STN_2(1, hh, MO_16, MO_16)
6159DO_STN_2(1, hs, MO_32, MO_16)
6160DO_STN_2(1, hd, MO_64, MO_16)
6161DO_STN_2(2, hh, MO_16, MO_16)
6162DO_STN_2(3, hh, MO_16, MO_16)
6163DO_STN_2(4, hh, MO_16, MO_16)
6164
6165DO_STN_2(1, ss, MO_32, MO_32)
6166DO_STN_2(1, sd, MO_64, MO_32)
6167DO_STN_2(2, ss, MO_32, MO_32)
6168DO_STN_2(3, ss, MO_32, MO_32)
6169DO_STN_2(4, ss, MO_32, MO_32)
6170
6171DO_STN_2(1, dd, MO_64, MO_64)
6172DO_STN_2(2, dd, MO_64, MO_64)
6173DO_STN_2(3, dd, MO_64, MO_64)
6174DO_STN_2(4, dd, MO_64, MO_64)
9fd46c83
RH
6175
6176#undef DO_STN_1
6177#undef DO_STN_2
6178
d4f75f25
RH
6179/*
6180 * Loads with a vector index.
6181 */
673e9fa6 6182
d4f75f25
RH
6183/*
6184 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
6185 */
6186typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
6187
6188static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
6189{
6190 return *(uint32_t *)(reg + H1_4(reg_ofs));
673e9fa6
RH
6191}
6192
d4f75f25
RH
6193static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
6194{
6195 return *(int32_t *)(reg + H1_4(reg_ofs));
6196}
6197
6198static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
6199{
6200 return (uint32_t)*(uint64_t *)(reg + reg_ofs);
6201}
6202
6203static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
6204{
6205 return (int32_t)*(uint64_t *)(reg + reg_ofs);
6206}
6207
6208static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
6209{
6210 return *(uint64_t *)(reg + reg_ofs);
673e9fa6
RH
6211}
6212
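/*
 * Illustrative note (not part of the original source): the gather/scatter
 * helpers below compute each element's address as
 *     addr = base + (off_fn(vm, reg_off) << scale);
 * where scale comes from simd_data(desc).  With off_zss_d and scale == 3,
 * for example, each 64-bit vector element supplies a sign-extended 32-bit
 * index that is multiplied by the 8-byte memory element size.
 */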
10a85e2c
RH
6213static inline QEMU_ALWAYS_INLINE
6214void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6215 target_ulong base, uint32_t desc, uintptr_t retaddr,
d28d12f0
RH
6216 uint32_t mtedesc, int esize, int msize,
6217 zreg_off_fn *off_fn,
10a85e2c
RH
6218 sve_ldst1_host_fn *host_fn,
6219 sve_ldst1_tlb_fn *tlb_fn)
d4f75f25 6220{
10a85e2c
RH
6221 const int mmu_idx = cpu_mmu_index(env, false);
6222 const intptr_t reg_max = simd_oprsz(desc);
ba080b86 6223 const int scale = simd_data(desc);
10a85e2c
RH
6224 ARMVectorReg scratch;
6225 intptr_t reg_off;
6226 SVEHostPage info, info2;
d4f75f25 6227
10a85e2c
RH
6228 memset(&scratch, 0, reg_max);
6229 reg_off = 0;
6230 do {
6231 uint64_t pg = vg[reg_off >> 6];
d4f75f25
RH
6232 do {
6233 if (likely(pg & 1)) {
10a85e2c
RH
6234 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6235 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6236
6237 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
6238 mmu_idx, retaddr);
6239
6240 if (likely(in_page >= msize)) {
6241 if (unlikely(info.flags & TLB_WATCHPOINT)) {
6242 cpu_check_watchpoint(env_cpu(env), addr, msize,
6243 info.attrs, BP_MEM_READ, retaddr);
6244 }
d28d12f0 6245 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
bd47b61c 6246 mte_check(env, mtedesc, addr, retaddr);
d28d12f0 6247 }
10a85e2c
RH
6248 host_fn(&scratch, reg_off, info.host);
6249 } else {
6250 /* Element crosses the page boundary. */
6251 sve_probe_page(&info2, false, env, addr + in_page, 0,
6252 MMU_DATA_LOAD, mmu_idx, retaddr);
6253 if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
6254 cpu_check_watchpoint(env_cpu(env), addr,
6255 msize, info.attrs,
6256 BP_MEM_READ, retaddr);
6257 }
d28d12f0 6258 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
bd47b61c 6259 mte_check(env, mtedesc, addr, retaddr);
d28d12f0 6260 }
10a85e2c
RH
6261 tlb_fn(env, &scratch, reg_off, addr, retaddr);
6262 }
d4f75f25 6263 }
10a85e2c
RH
6264 reg_off += esize;
6265 pg >>= esize;
6266 } while (reg_off & 63);
6267 } while (reg_off < reg_max);
d4f75f25
RH
6268
6269 /* Wait until all exceptions have been raised to write back. */
10a85e2c 6270 memcpy(vd, &scratch, reg_max);
d4f75f25
RH
6271}
6272
d28d12f0
RH
6273static inline QEMU_ALWAYS_INLINE
6274void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6275 target_ulong base, uint32_t desc, uintptr_t retaddr,
6276 int esize, int msize, zreg_off_fn *off_fn,
6277 sve_ldst1_host_fn *host_fn,
6278 sve_ldst1_tlb_fn *tlb_fn)
6279{
6280 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6281 /* Remove mtedesc from the normal sve descriptor. */
6282 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6283
6284 /*
6285 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6286 * offset base entirely over the address space hole to change the
6287 * pointer tag, or change the bit55 selector. So we could here
6288 * examine TBI + TCMA like we do for sve_ldN_r_mte().
6289 */
6290 sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6291 esize, msize, off_fn, host_fn, tlb_fn);
6292}
6293
10a85e2c
RH
6294#define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
6295void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
6296 void *vm, target_ulong base, uint32_t desc) \
6297{ \
d28d12f0 6298 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
10a85e2c 6299 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
d28d12f0
RH
6300} \
6301void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6302 void *vm, target_ulong base, uint32_t desc) \
6303{ \
6304 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
6305 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
10a85e2c 6306}
d4f75f25 6307
10a85e2c
RH
6308#define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
6309void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
6310 void *vm, target_ulong base, uint32_t desc) \
6311{ \
d28d12f0 6312 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
10a85e2c 6313 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
d28d12f0
RH
6314} \
6315void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6316 void *vm, target_ulong base, uint32_t desc) \
6317{ \
6318 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
6319 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
10a85e2c
RH
6320}
6321
6322DO_LD1_ZPZ_S(bsu, zsu, MO_8)
6323DO_LD1_ZPZ_S(bsu, zss, MO_8)
6324DO_LD1_ZPZ_D(bdu, zsu, MO_8)
6325DO_LD1_ZPZ_D(bdu, zss, MO_8)
6326DO_LD1_ZPZ_D(bdu, zd, MO_8)
6327
6328DO_LD1_ZPZ_S(bss, zsu, MO_8)
6329DO_LD1_ZPZ_S(bss, zss, MO_8)
6330DO_LD1_ZPZ_D(bds, zsu, MO_8)
6331DO_LD1_ZPZ_D(bds, zss, MO_8)
6332DO_LD1_ZPZ_D(bds, zd, MO_8)
6333
6334DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
6335DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
6336DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
6337DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
6338DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
6339
6340DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
6341DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
6342DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
6343DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
6344DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
6345
6346DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
6347DO_LD1_ZPZ_S(hss_le, zss, MO_16)
6348DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
6349DO_LD1_ZPZ_D(hds_le, zss, MO_16)
6350DO_LD1_ZPZ_D(hds_le, zd, MO_16)
6351
6352DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
6353DO_LD1_ZPZ_S(hss_be, zss, MO_16)
6354DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
6355DO_LD1_ZPZ_D(hds_be, zss, MO_16)
6356DO_LD1_ZPZ_D(hds_be, zd, MO_16)
6357
6358DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
6359DO_LD1_ZPZ_S(ss_le, zss, MO_32)
6360DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
6361DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
6362DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
6363
6364DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
6365DO_LD1_ZPZ_S(ss_be, zss, MO_32)
6366DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
6367DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
6368DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
6369
6370DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
6371DO_LD1_ZPZ_D(sds_le, zss, MO_32)
6372DO_LD1_ZPZ_D(sds_le, zd, MO_32)
6373
6374DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
6375DO_LD1_ZPZ_D(sds_be, zss, MO_32)
6376DO_LD1_ZPZ_D(sds_be, zd, MO_32)
6377
6378DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
6379DO_LD1_ZPZ_D(dd_le, zss, MO_64)
6380DO_LD1_ZPZ_D(dd_le, zd, MO_64)
6381
6382DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
6383DO_LD1_ZPZ_D(dd_be, zss, MO_64)
6384DO_LD1_ZPZ_D(dd_be, zd, MO_64)
d4f75f25
RH
6385
6386#undef DO_LD1_ZPZ_S
6387#undef DO_LD1_ZPZ_D
673e9fa6 6388
ed67eb7f
RH
6389/* First fault loads with a vector index. */
6390
116347ce 6391/*
50de9b78 6392 * Common helpers for all gather first-faulting loads.
116347ce 6393 */
50de9b78
RH
6394
6395static inline QEMU_ALWAYS_INLINE
6396void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6397 target_ulong base, uint32_t desc, uintptr_t retaddr,
d28d12f0
RH
6398 uint32_t mtedesc, const int esz, const int msz,
6399 zreg_off_fn *off_fn,
50de9b78
RH
6400 sve_ldst1_host_fn *host_fn,
6401 sve_ldst1_tlb_fn *tlb_fn)
116347ce 6402{
50de9b78 6403 const int mmu_idx = cpu_mmu_index(env, false);
ba080b86
RH
6404 const intptr_t reg_max = simd_oprsz(desc);
6405 const int scale = simd_data(desc);
50de9b78
RH
6406 const int esize = 1 << esz;
6407 const int msize = 1 << msz;
50de9b78
RH
6408 intptr_t reg_off;
6409 SVEHostPage info;
6410 target_ulong addr, in_page;
116347ce
RH
6411
6412 /* Skip to the first true predicate. */
50de9b78
RH
6413 reg_off = find_next_active(vg, 0, reg_max, esz);
6414 if (unlikely(reg_off >= reg_max)) {
6415 /* The entire predicate was false; no load occurs. */
6416 memset(vd, 0, reg_max);
6417 return;
116347ce
RH
6418 }
6419
50de9b78
RH
6420 /*
6421 * Probe the first element, allowing faults.
6422 */
6423 addr = base + (off_fn(vm, reg_off) << scale);
d28d12f0 6424 if (mtedesc) {
bd47b61c 6425 mte_check(env, mtedesc, addr, retaddr);
d28d12f0 6426 }
50de9b78 6427 tlb_fn(env, vd, reg_off, addr, retaddr);
ed67eb7f 6428
50de9b78
RH
6429 /* After any fault, zero the other elements. */
6430 swap_memzero(vd, reg_off);
6431 reg_off += esize;
6432 swap_memzero(vd + reg_off, reg_max - reg_off);
116347ce 6433
50de9b78
RH
6434 /*
6435 * Probe the remaining elements, not allowing faults.
6436 */
6437 while (reg_off < reg_max) {
6438 uint64_t pg = vg[reg_off >> 6];
6439 do {
6440 if (likely((pg >> (reg_off & 63)) & 1)) {
6441 addr = base + (off_fn(vm, reg_off) << scale);
6442 in_page = -(addr | TARGET_PAGE_MASK);
116347ce 6443
50de9b78
RH
6444 if (unlikely(in_page < msize)) {
6445 /* Stop if the element crosses a page boundary. */
6446 goto fault;
6447 }
ed67eb7f 6448
50de9b78
RH
6449 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
6450 mmu_idx, retaddr);
6451 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
6452 goto fault;
6453 }
6454 if (unlikely(info.flags & TLB_WATCHPOINT) &&
6455 (cpu_watchpoint_address_matches
6456 (env_cpu(env), addr, msize) & BP_MEM_READ)) {
6457 goto fault;
6458 }
d28d12f0
RH
6459 if (mtedesc &&
6460 arm_tlb_mte_tagged(&info.attrs) &&
d304d280 6461 !mte_probe(env, mtedesc, addr)) {
d28d12f0
RH
6462 goto fault;
6463 }
116347ce 6464
50de9b78 6465 host_fn(vd, reg_off, info.host);
116347ce 6466 }
50de9b78
RH
6467 reg_off += esize;
6468 } while (reg_off & 63);
116347ce 6469 }
50de9b78 6470 return;
116347ce 6471
50de9b78
RH
6472 fault:
6473 record_fault(env, reg_off, reg_max);
116347ce
RH
6474}
6475
d28d12f0
RH
6476static inline QEMU_ALWAYS_INLINE
6477void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6478 target_ulong base, uint32_t desc, uintptr_t retaddr,
6479 const int esz, const int msz,
6480 zreg_off_fn *off_fn,
6481 sve_ldst1_host_fn *host_fn,
6482 sve_ldst1_tlb_fn *tlb_fn)
6483{
6484 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6485 /* Remove mtedesc from the normal sve descriptor. */
6486 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6487
6488 /*
6489 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6490 * offset base entirely over the address space hole to change the
6491 * pointer tag, or change the bit55 selector. So we could here
6492 * examine TBI + TCMA like we do for sve_ldN_r_mte().
6493 */
6494 sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6495 esz, msz, off_fn, host_fn, tlb_fn);
50de9b78
RH
6496}
6497
d28d12f0
RH
6498#define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \
6499void HELPER(sve_ldff##MEM##_##OFS) \
6500 (CPUARMState *env, void *vd, void *vg, \
6501 void *vm, target_ulong base, uint32_t desc) \
6502{ \
6503 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \
6504 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6505} \
6506void HELPER(sve_ldff##MEM##_##OFS##_mte) \
6507 (CPUARMState *env, void *vd, void *vg, \
6508 void *vm, target_ulong base, uint32_t desc) \
6509{ \
6510 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \
6511 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6512}
6513
6514#define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \
6515void HELPER(sve_ldff##MEM##_##OFS) \
6516 (CPUARMState *env, void *vd, void *vg, \
6517 void *vm, target_ulong base, uint32_t desc) \
6518{ \
6519 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \
6520 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6521} \
6522void HELPER(sve_ldff##MEM##_##OFS##_mte) \
6523 (CPUARMState *env, void *vd, void *vg, \
6524 void *vm, target_ulong base, uint32_t desc) \
6525{ \
6526 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \
6527 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
50de9b78
RH
6528}
6529
6530DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
6531DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
6532DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
6533DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
6534DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
6535
6536DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
6537DO_LDFF1_ZPZ_S(bss, zss, MO_8)
6538DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
6539DO_LDFF1_ZPZ_D(bds, zss, MO_8)
6540DO_LDFF1_ZPZ_D(bds, zd, MO_8)
6541
6542DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
6543DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
6544DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
6545DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
6546DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
6547
6548DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
6549DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
6550DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
6551DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
6552DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
6553
6554DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
6555DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
6556DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
6557DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
6558DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
6559
6560DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
6561DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
6562DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
6563DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
6564DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
6565
6566DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32)
6567DO_LDFF1_ZPZ_S(ss_le, zss, MO_32)
6568DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
6569DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
6570DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
6571
6572DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32)
6573DO_LDFF1_ZPZ_S(ss_be, zss, MO_32)
6574DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
6575DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
6576DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
6577
6578DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
6579DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
6580DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
6581
6582DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
6583DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
6584DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
6585
6586DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
6587DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
6588DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
6589
6590DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
6591DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
6592DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
ed67eb7f 6593
f6dbf62a
RH
6594/* Stores with a vector index. */
6595
static inline QEMU_ALWAYS_INLINE
void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
               target_ulong base, uint32_t desc, uintptr_t retaddr,
               uint32_t mtedesc, int esize, int msize,
               zreg_off_fn *off_fn,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const int mmu_idx = cpu_mmu_index(env, false);
    const intptr_t reg_max = simd_oprsz(desc);
    const int scale = simd_data(desc);
    void *host[ARM_MAX_VQ * 4];
    intptr_t reg_off, i;
    SVEHostPage info, info2;

    /*
     * Probe all of the elements for host addresses and flags.
     */
    i = reg_off = 0;
    do {
        uint64_t pg = vg[reg_off >> 6];
        do {
            target_ulong addr = base + (off_fn(vm, reg_off) << scale);
            target_ulong in_page = -(addr | TARGET_PAGE_MASK);

            host[i] = NULL;
            if (likely((pg >> (reg_off & 63)) & 1)) {
                if (likely(in_page >= msize)) {
                    sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
                                   mmu_idx, retaddr);
                    host[i] = info.host;
                } else {
                    /*
                     * Element crosses the page boundary.
                     * Probe both pages, but do not record the host address,
                     * so that we use the slow path.
                     */
                    sve_probe_page(&info, false, env, addr, 0,
                                   MMU_DATA_STORE, mmu_idx, retaddr);
                    sve_probe_page(&info2, false, env, addr + in_page, 0,
                                   MMU_DATA_STORE, mmu_idx, retaddr);
                    info.flags |= info2.flags;
                }

                if (unlikely(info.flags & TLB_WATCHPOINT)) {
                    cpu_check_watchpoint(env_cpu(env), addr, msize,
                                         info.attrs, BP_MEM_WRITE, retaddr);
                }

                if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
                    mte_check(env, mtedesc, addr, retaddr);
                }
            }
            i += 1;
            reg_off += esize;
        } while (reg_off & 63);
    } while (reg_off < reg_max);

    /*
     * Now that we have recognized all exceptions except SyncExternal
     * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
     *
     * Note for the common case of an element in RAM, not crossing a page
     * boundary, we have stored the host address in host[].  This doubles
     * as a first-level check against the predicate, since only enabled
     * elements have non-null host addresses.
     */
    i = reg_off = 0;
    do {
        void *h = host[i];
        if (likely(h != NULL)) {
            host_fn(vd, reg_off, h);
        } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
            target_ulong addr = base + (off_fn(vm, reg_off) << scale);
            tlb_fn(env, vd, reg_off, addr, retaddr);
        }
        i += 1;
        reg_off += esize;
    } while (reg_off < reg_max);
}
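
/*
 * A minimal sketch of the control flow above (illustrative pseudo-code only;
 * "probe"/"store" stand in for the sve_probe_page/host_fn/tlb_fn machinery):
 *
 *   for (each element) {            // pass 1: no data written yet
 *       if (predicate bit set) {
 *           probe(addr);            // faults, watchpoints, MTE raised here
 *       }
 *   }
 *   for (each element) {            // pass 2: all checks already passed
 *       if (predicate bit set) {
 *           store(addr, data);
 *       }
 *   }
 *
 * Splitting the work this way means that, apart from the SyncExternal case
 * noted in the comment above, any exception is taken before the first byte
 * of the scatter store is written.
 */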

static inline QEMU_ALWAYS_INLINE
void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                   target_ulong base, uint32_t desc, uintptr_t retaddr,
                   int esize, int msize, zreg_off_fn *off_fn,
                   sve_ldst1_host_fn *host_fn,
                   sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /*
     * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
     * offset base entirely over the address space hole to change the
     * pointer tag, or change the bit55 selector.  So we could here
     * examine TBI + TCMA like we do for sve_ldN_r_mte().
     */
    sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
              esize, msize, off_fn, host_fn, tlb_fn);
}
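
/*
 * Sketch of the descriptor handling above, assuming the SIMD_DATA_SHIFT and
 * SVE_MTEDESC_SHIFT layout defined elsewhere in the tree (illustration only):
 *
 *   uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
 *   uint32_t svedesc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
 *
 * The MTE material lives in the bits above the normal SVE descriptor; a zero
 * mtedesc disables the tag check, which is why the non-_mte helpers below
 * simply pass 0 and why sve_st1_z() guards mte_check() with "if (mtedesc ...)".
 */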

#define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
                                 void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
              off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
} \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
                  off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
}
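
/*
 * In the _S form above, the literal 4 is the element step (esize) for 32-bit
 * vector elements and 1 << MSZ is the memory access size in bytes (msize), as
 * consumed by sve_st1_z(); e.g. DO_ST1_ZPZ_S(hs_le, zsu, MO_16) passes esize 4
 * and msize 2.  The _D form below passes 8 for its 64-bit elements.
 */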

#define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
                                 void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
              off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
} \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
                  off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
}
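
/*
 * As with the loads, each DO_ST1_ZPZ_* line below expands mechanically.  For
 * instance DO_ST1_ZPZ_D(dd_be, zd, MO_64) yields (shown only as a comment):
 *
 *   void HELPER(sve_stdd_be_zd)(CPUARMState *env, void *vd, void *vg,
 *                               void *vm, target_ulong base, uint32_t desc)
 *   {
 *       sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MO_64,
 *                 off_zd_d, sve_st1dd_be_host, sve_st1dd_be_tlb);
 *   }
 *
 * together with HELPER(sve_stdd_be_zd_mte), which routes through
 * sve_st1_z_mte() instead.
 */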

DO_ST1_ZPZ_S(bs, zsu, MO_8)
DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
DO_ST1_ZPZ_S(ss_be, zsu, MO_32)

DO_ST1_ZPZ_S(bs, zss, MO_8)
DO_ST1_ZPZ_S(hs_le, zss, MO_16)
DO_ST1_ZPZ_S(hs_be, zss, MO_16)
DO_ST1_ZPZ_S(ss_le, zss, MO_32)
DO_ST1_ZPZ_S(ss_be, zss, MO_32)

DO_ST1_ZPZ_D(bd, zsu, MO_8)
DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
DO_ST1_ZPZ_D(dd_be, zsu, MO_64)

DO_ST1_ZPZ_D(bd, zss, MO_8)
DO_ST1_ZPZ_D(hd_le, zss, MO_16)
DO_ST1_ZPZ_D(hd_be, zss, MO_16)
DO_ST1_ZPZ_D(sd_le, zss, MO_32)
DO_ST1_ZPZ_D(sd_be, zss, MO_32)
DO_ST1_ZPZ_D(dd_le, zss, MO_64)
DO_ST1_ZPZ_D(dd_be, zss, MO_64)

DO_ST1_ZPZ_D(bd, zd, MO_8)
DO_ST1_ZPZ_D(hd_le, zd, MO_16)
DO_ST1_ZPZ_D(hd_be, zd, MO_16)
DO_ST1_ZPZ_D(sd_le, zd, MO_32)
DO_ST1_ZPZ_D(sd_be, zd, MO_32)
DO_ST1_ZPZ_D(dd_le, zd, MO_64)
DO_ST1_ZPZ_D(dd_be, zd, MO_64)

#undef DO_ST1_ZPZ_S
#undef DO_ST1_ZPZ_D