1/*
2 * ARM SVE Operations
3 *
4 * Copyright (c) 2018 Linaro, Ltd.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18 */
19
20#include "qemu/osdep.h"
21#include "cpu.h"
22#include "internals.h"
23#include "exec/exec-all.h"
24#include "exec/cpu_ldst.h"
25#include "exec/helper-proto.h"
26#include "tcg/tcg-gvec-desc.h"
27#include "fpu/softfloat.h"
28#include "tcg/tcg.h"
29#include "vec_internal.h"
30
31
32/* Note that vector data is stored in host-endian 64-bit chunks,
33 so addressing units smaller than that need a host-endian fixup. */
34#ifdef HOST_WORDS_BIGENDIAN
35#define H1(x) ((x) ^ 7)
36#define H1_2(x) ((x) ^ 6)
37#define H1_4(x) ((x) ^ 4)
38#define H2(x) ((x) ^ 3)
39#define H4(x) ((x) ^ 1)
40#else
41#define H1(x) (x)
42#define H1_2(x) (x)
43#define H1_4(x) (x)
44#define H2(x) (x)
45#define H4(x) (x)
46#endif
47
48/* Return a value for NZCV as per the ARM PredTest pseudofunction.
49 *
50 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
51 * and bit 0 set if C is set. Compare the definitions of these variables
52 * within CPUARMState.
53 */
54
55/* For no G bits set, NZCV = C. */
56#define PREDTEST_INIT 1
57
58/* This is an iterative function, called for each Pd and Pg word
59 * moving forward.
60 */
61static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
62{
63 if (likely(g)) {
64 /* Compute N from first D & G.
65 Use bit 2 to signal first G bit seen. */
66 if (!(flags & 4)) {
67 flags |= ((d & (g & -g)) != 0) << 31;
68 flags |= 4;
69 }
70
71 /* Accumulate Z from each D & G. */
72 flags |= ((d & g) != 0) << 1;
73
74 /* Compute C from last !(D & G). Replace previous. */
75 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
76 }
77 return flags;
78}
79
80/* This is an iterative function, called for each Pd and Pg word
81 * moving backward.
82 */
83static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
84{
85 if (likely(g)) {
86 /* Compute C from first (i.e. last) !(D & G).
87 Use bit 2 to signal first G bit seen. */
88 if (!(flags & 4)) {
89 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
90 flags |= (d & pow2floor(g)) == 0;
91 }
92
93 /* Accumulate Z from each D & G. */
94 flags |= ((d & g) != 0) << 1;
95
96 /* Compute N from last (i.e. first) D & G. Replace previous. */
97 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
98 }
99 return flags;
100}
101
102/* The same for a single word predicate. */
103uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
104{
105 return iter_predtest_fwd(d, g, PREDTEST_INIT);
106}
107
108/* The same for a multi-word predicate. */
109uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
110{
111 uint32_t flags = PREDTEST_INIT;
112 uint64_t *d = vd, *g = vg;
113 uintptr_t i = 0;
114
115 do {
116 flags = iter_predtest_fwd(d[i], g[i], flags);
117 } while (++i < words);
118
119 return flags;
120}
121
122/* Expand active predicate bits to bytes, for byte elements.
123 * for (i = 0; i < 256; ++i) {
124 * unsigned long m = 0;
125 * for (j = 0; j < 8; j++) {
126 * if ((i >> j) & 1) {
127 * m |= 0xfful << (j << 3);
128 * }
129 * }
130 * printf("0x%016lx,\n", m);
131 * }
132 */
133static inline uint64_t expand_pred_b(uint8_t byte)
134{
135 static const uint64_t word[256] = {
136 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
137 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
138 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
139 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
140 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
141 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
142 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
143 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
144 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
145 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
146 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
147 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
148 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
149 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
150 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
151 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
152 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
153 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
154 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
155 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
156 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
157 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
158 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
159 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
160 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
161 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
162 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
163 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
164 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
165 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
166 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
167 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
168 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
169 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
170 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
171 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
172 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
173 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
174 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
175 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
176 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
177 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
178 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
179 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
180 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
181 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
182 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
183 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
184 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
185 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
186 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
187 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
188 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
189 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
190 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
191 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
192 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
193 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
194 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
195 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
196 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
197 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
198 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
199 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
200 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
201 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
202 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
203 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
204 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
205 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
206 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
207 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
208 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
209 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
210 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
211 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
212 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
213 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
214 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
215 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
216 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
217 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
218 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
219 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
220 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
221 0xffffffffffffffff,
222 };
223 return word[byte];
224}
225
226/* Similarly for half-word elements.
227 * for (i = 0; i < 256; ++i) {
228 * unsigned long m = 0;
229 * if (i & 0xaa) {
230 * continue;
231 * }
232 * for (j = 0; j < 8; j += 2) {
233 * if ((i >> j) & 1) {
234 * m |= 0xfffful << (j << 3);
235 * }
236 * }
237 * printf("[0x%x] = 0x%016lx,\n", i, m);
238 * }
239 */
240static inline uint64_t expand_pred_h(uint8_t byte)
241{
242 static const uint64_t word[] = {
243 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
244 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
245 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
246 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
247 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
248 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
249 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
250 [0x55] = 0xffffffffffffffff,
251 };
252 return word[byte & 0x55];
253}
254
255/* Similarly for single word elements. */
256static inline uint64_t expand_pred_s(uint8_t byte)
257{
258 static const uint64_t word[] = {
259 [0x01] = 0x00000000ffffffffull,
260 [0x10] = 0xffffffff00000000ull,
261 [0x11] = 0xffffffffffffffffull,
262 };
263 return word[byte & 0x11];
264}
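/*
 * A few illustrative values, derived from the tables above: each set
 * predicate bit expands to an all-ones element of the given size.
 *   expand_pred_b(0x21) == 0x0000ff00000000ff   (bits 0 and 5 set)
 *   expand_pred_h(0x05) == 0x00000000ffffffff   (bits 0 and 2 set)
 *   expand_pred_s(0x11) == 0xffffffffffffffff   (bits 0 and 4 set)
 */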
265
266/* Swap 16-bit words within a 32-bit word. */
267static inline uint32_t hswap32(uint32_t h)
268{
269 return rol32(h, 16);
270}
271
272/* Swap 16-bit words within a 64-bit word. */
273static inline uint64_t hswap64(uint64_t h)
274{
275 uint64_t m = 0x0000ffff0000ffffull;
276 h = rol64(h, 32);
277 return ((h & m) << 16) | ((h >> 16) & m);
278}
279
280/* Swap 32-bit words within a 64-bit word. */
281static inline uint64_t wswap64(uint64_t h)
282{
283 return rol64(h, 32);
284}
285
286#define LOGICAL_PPPP(NAME, FUNC) \
287void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
288{ \
289 uintptr_t opr_sz = simd_oprsz(desc); \
290 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
291 uintptr_t i; \
292 for (i = 0; i < opr_sz / 8; ++i) { \
293 d[i] = FUNC(n[i], m[i], g[i]); \
294 } \
295}
296
297#define DO_AND(N, M, G) (((N) & (M)) & (G))
298#define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
299#define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
300#define DO_ORR(N, M, G) (((N) | (M)) & (G))
301#define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
302#define DO_NOR(N, M, G) (~((N) | (M)) & (G))
303#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
304#define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
305
306LOGICAL_PPPP(sve_and_pppp, DO_AND)
307LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
308LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
309LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
310LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
311LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
312LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
313LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
314
315#undef DO_AND
316#undef DO_BIC
317#undef DO_EOR
318#undef DO_ORR
319#undef DO_ORN
320#undef DO_NOR
321#undef DO_NAND
322#undef DO_SEL
323#undef LOGICAL_PPPP
324
325/* Fully general three-operand expander, controlled by a predicate.
326 * This is complicated by the host-endian storage of the register file.
327 */
328/* ??? I don't expect the compiler could ever vectorize this itself.
329 * With some tables we can convert bit masks to byte masks, and with
330 * extra care wrt byte/word ordering we could use gcc generic vectors
331 * and do 16 bytes at a time.
332 */
333#define DO_ZPZZ(NAME, TYPE, H, OP) \
334void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
335{ \
336 intptr_t i, opr_sz = simd_oprsz(desc); \
337 for (i = 0; i < opr_sz; ) { \
338 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
339 do { \
340 if (pg & 1) { \
341 TYPE nn = *(TYPE *)(vn + H(i)); \
342 TYPE mm = *(TYPE *)(vm + H(i)); \
343 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
344 } \
345 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
346 } while (i & 15); \
347 } \
348}
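/*
 * For illustration, DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD) expands
 * to roughly the following (a sketch, with the host-endian H fixups
 * elided for clarity):
 *
 *   void HELPER(sve_add_zpzz_b)(void *vd, void *vn, void *vm,
 *                               void *vg, uint32_t desc)
 *   {
 *       intptr_t i, opr_sz = simd_oprsz(desc);
 *       for (i = 0; i < opr_sz; ) {
 *           uint16_t pg = *(uint16_t *)(vg + (i >> 3));
 *           do {
 *               if (pg & 1) {
 *                   uint8_t nn = *(uint8_t *)(vn + i);
 *                   uint8_t mm = *(uint8_t *)(vm + i);
 *                   *(uint8_t *)(vd + i) = nn + mm;
 *               }
 *               i += 1, pg >>= 1;
 *           } while (i & 15);
 *       }
 *   }
 *
 * i.e. one predicate bit governs each byte of vector data, and 16 byte
 * elements are processed per 16-bit chunk of the governing predicate.
 */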
349
350/* Similarly, specialized for 64-bit operands. */
351#define DO_ZPZZ_D(NAME, TYPE, OP) \
352void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
353{ \
354 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
355 TYPE *d = vd, *n = vn, *m = vm; \
356 uint8_t *pg = vg; \
357 for (i = 0; i < opr_sz; i += 1) { \
358 if (pg[H1(i)] & 1) { \
359 TYPE nn = n[i], mm = m[i]; \
360 d[i] = OP(nn, mm); \
361 } \
362 } \
363}
364
365#define DO_AND(N, M) (N & M)
366#define DO_EOR(N, M) (N ^ M)
367#define DO_ORR(N, M) (N | M)
368#define DO_BIC(N, M) (N & ~M)
369#define DO_ADD(N, M) (N + M)
370#define DO_SUB(N, M) (N - M)
371#define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
372#define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
373#define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
374#define DO_MUL(N, M) (N * M)
375
376
377/*
378 * We must avoid the C undefined behaviour cases: division by
379 * zero and signed division of INT_MIN by -1. Both of these
380 * have architecturally defined required results for Arm.
381 * We special case all signed divisions by -1 to avoid having
382 * to deduce the minimum integer for the type involved.
383 */
384#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
385#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
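/*
 * Specifically, division by zero yields 0, and the one overflowing case,
 * INT_MIN / -1, is required to wrap back to INT_MIN; returning -N for
 * M == -1 produces that result without evaluating the undefined division.
 */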
386
387DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
388DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
389DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
390DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
391
392DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
393DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
394DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
395DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
396
397DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
398DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
399DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
400DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
401
402DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
403DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
404DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
405DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
406
407DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
408DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
409DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
410DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
411
412DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
413DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
414DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
415DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
416
417DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
418DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
419DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
420DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
421
422DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
423DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
424DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
425DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
426
427DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
428DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
429DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
430DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
431
432DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
433DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
434DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
435DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
436
437DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
438DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
439DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
440DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
441
442DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
443DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
444DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
445DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
446
447/* Because the computation type is at least twice as large as required,
448 these work for both signed and unsigned source types. */
449static inline uint8_t do_mulh_b(int32_t n, int32_t m)
450{
451 return (n * m) >> 8;
452}
453
454static inline uint16_t do_mulh_h(int32_t n, int32_t m)
455{
456 return (n * m) >> 16;
457}
458
459static inline uint32_t do_mulh_s(int64_t n, int64_t m)
460{
461 return (n * m) >> 32;
462}
463
464static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
465{
466 uint64_t lo, hi;
467 muls64(&lo, &hi, n, m);
468 return hi;
469}
470
471static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
472{
473 uint64_t lo, hi;
474 mulu64(&lo, &hi, n, m);
475 return hi;
476}
477
478DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
479DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
480DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
481DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
482
483DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
484DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
485DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
486DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
487
488DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
489DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
490DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
491DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
492
493DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
494DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
495
496DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
497DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
498
499/* Note that all bits of the shift are significant
500 and not modulo the element size. */
501#define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
502#define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
503#define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
504
505DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
506DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
507DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)
508
509DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
510DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
511DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)
512
513DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
514DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
515DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
516
517DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
518DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
519DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
520
521static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
522{
523 int8_t n1 = n, n2 = n >> 8;
524 return m + n1 + n2;
525}
526
527static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
528{
529 int16_t n1 = n, n2 = n >> 16;
530 return m + n1 + n2;
531}
532
533static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
534{
535 int32_t n1 = n, n2 = n >> 32;
536 return m + n1 + n2;
537}
538
539DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
540DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
541DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)
542
543static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
544{
545 uint8_t n1 = n, n2 = n >> 8;
546 return m + n1 + n2;
547}
548
549static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
550{
551 uint16_t n1 = n, n2 = n >> 16;
552 return m + n1 + n2;
553}
554
555static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
556{
557 uint32_t n1 = n, n2 = n >> 32;
558 return m + n1 + n2;
559}
560
561DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
562DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
563DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)
564
565#define do_srshl_b(n, m) do_sqrshl_bhs(n, m, 8, true, NULL)
566#define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL)
567#define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL)
568#define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL)
569
570DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
571DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
572DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
573DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)
574
575#define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
576#define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
577#define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL)
578#define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL)
579
580DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
581DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
582DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
583DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)
584
585/*
586 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
587 * We pass in a pointer to a dummy saturation field to trigger
588 * the saturating arithmetic but discard the information about
589 * whether it has occurred.
590 */
591#define do_sqshl_b(n, m) \
592 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
593#define do_sqshl_h(n, m) \
594 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
595#define do_sqshl_s(n, m) \
596 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
597#define do_sqshl_d(n, m) \
598 ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
599
600DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b)
601DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
602DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
603DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)
604
605#define do_uqshl_b(n, m) \
606 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
607#define do_uqshl_h(n, m) \
608 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
609#define do_uqshl_s(n, m) \
610 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
611#define do_uqshl_d(n, m) \
612 ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
613
614DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b)
615DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
616DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
617DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)
618
619#define do_sqrshl_b(n, m) \
620 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
621#define do_sqrshl_h(n, m) \
622 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
623#define do_sqrshl_s(n, m) \
624 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
625#define do_sqrshl_d(n, m) \
626 ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
627
628DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b)
629DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
630DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
631DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)
632
633#undef do_sqrshl_d
634
635#define do_uqrshl_b(n, m) \
636 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
637#define do_uqrshl_h(n, m) \
638 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
639#define do_uqrshl_s(n, m) \
640 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
641#define do_uqrshl_d(n, m) \
642 ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
643
644DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b)
645DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
646DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
647DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
648
649#undef do_uqrshl_d
650
651#define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1)
652#define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1))
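/*
 * The _BHS forms can simply widen to 64 bits before averaging.  The _D
 * forms cannot widen, so they halve each operand first and then add back
 * the carry out of the discarded low bits: (n & m & 1) here for the
 * truncating form, ((n | m) & 1) below for the rounding form.
 */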
653
654DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
655DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
656DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
657DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)
658
659DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
660DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
661DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
662DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)
663
664#define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1)
665#define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1))
666
667DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
668DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
669DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
670DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)
671
672DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
673DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
674DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
675DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)
676
677#define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1)
678#define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1))
679
680DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
681DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
682DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
683DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)
684
685DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
686DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
687DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
688DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
689
690static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
691{
692 return val >= max ? max : val <= min ? min : val;
693}
694
695#define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
696#define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
697#define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)
698
699static inline int64_t do_sqadd_d(int64_t n, int64_t m)
700{
701 int64_t r = n + m;
702 if (((r ^ n) & ~(n ^ m)) < 0) {
703 /* Signed overflow. */
704 return r < 0 ? INT64_MAX : INT64_MIN;
705 }
706 return r;
707}
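/*
 * The test above reads the sign bits: ~(n ^ m) is negative when the
 * addends have the same sign, and (r ^ n) is negative when the result's
 * sign differs from theirs; together that is exactly the signed-overflow
 * case.
 */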
708
709DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
710DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
711DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
712DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)
713
714#define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
715#define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
716#define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)
717
718static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
719{
720 uint64_t r = n + m;
721 return r < n ? UINT64_MAX : r;
722}
723
724DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
725DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
726DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
727DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)
728
729#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
730#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
731#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)
732
733static inline int64_t do_sqsub_d(int64_t n, int64_t m)
734{
735 int64_t r = n - m;
736 if (((r ^ n) & (n ^ m)) < 0) {
737 /* Signed overflow. */
738 return r < 0 ? INT64_MAX : INT64_MIN;
739 }
740 return r;
741}
742
743DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
744DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
745DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
746DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)
747
748#define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
749#define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
750#define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)
751
752static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
753{
754 return n > m ? n - m : 0;
755}
756
757DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
758DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
759DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
760DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)
761
762#define DO_SUQADD_B(n, m) \
763 do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
764#define DO_SUQADD_H(n, m) \
765 do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
766#define DO_SUQADD_S(n, m) \
767 do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)
768
769static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
770{
771 uint64_t r = n + m;
772
773 if (n < 0) {
774 /* Note that m - abs(n) cannot underflow. */
775 if (r > INT64_MAX) {
776 /* Result is either very large positive or negative. */
777 if (m > -n) {
778 /* m > abs(n), so r is a very large positive. */
779 return INT64_MAX;
780 }
781 /* Result is negative. */
782 }
783 } else {
784 /* Both inputs are positive: check for overflow. */
785 if (r < m || r > INT64_MAX) {
786 return INT64_MAX;
787 }
788 }
789 return r;
790}
791
792DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
793DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
794DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
795DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)
796
797#define DO_USQADD_B(n, m) \
798 do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
799#define DO_USQADD_H(n, m) \
800 do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
801#define DO_USQADD_S(n, m) \
802 do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)
803
804static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
805{
806 uint64_t r = n + m;
807
808 if (m < 0) {
809 return n < -m ? 0 : r;
810 }
811 return r < n ? UINT64_MAX : r;
812}
813
814DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
815DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
816DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
817DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)
818
819#undef DO_ZPZZ
820#undef DO_ZPZZ_D
821
822/*
823 * Three operand expander, operating on element pairs.
824 * If the slot I is even, the elements are from VN {I, I+1}.
825 * If the slot I is odd, the elements are from VM {I-1, I}.
826 * Load all of the input elements in each pair before overwriting output.
827 */
828#define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
829void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
830{ \
831 intptr_t i, opr_sz = simd_oprsz(desc); \
832 for (i = 0; i < opr_sz; ) { \
833 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
834 do { \
835 TYPE n0 = *(TYPE *)(vn + H(i)); \
836 TYPE m0 = *(TYPE *)(vm + H(i)); \
837 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
838 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
839 if (pg & 1) { \
840 *(TYPE *)(vd + H(i)) = OP(n0, n1); \
841 } \
842 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
843 if (pg & 1) { \
844 *(TYPE *)(vd + H(i)) = OP(m0, m1); \
845 } \
846 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
847 } while (i & 15); \
848 } \
849}
850
851/* Similarly, specialized for 64-bit operands. */
852#define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
853void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
854{ \
855 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
856 TYPE *d = vd, *n = vn, *m = vm; \
857 uint8_t *pg = vg; \
858 for (i = 0; i < opr_sz; i += 2) { \
859 TYPE n0 = n[i], n1 = n[i + 1]; \
860 TYPE m0 = m[i], m1 = m[i + 1]; \
861 if (pg[H1(i)] & 1) { \
862 d[i] = OP(n0, n1); \
863 } \
864 if (pg[H1(i + 1)] & 1) { \
865 d[i + 1] = OP(m0, m1); \
866 } \
867 } \
868}
869
870DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
871DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
872DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
873DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)
874
875DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
876DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
877DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
878DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)
879
880DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
881DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
882DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
883DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)
884
885DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
886DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
887DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
888DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)
889
890DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
891DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
892DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
893DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)
894
895#undef DO_ZPZZ_PAIR
896#undef DO_ZPZZ_PAIR_D
897
898/* Three-operand expander, controlled by a predicate, in which the
899 * third operand is "wide". That is, for D = N op M, the same 64-bit
900 * value of M is used with all of the narrower values of N.
901 */
902#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
903void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
904{ \
905 intptr_t i, opr_sz = simd_oprsz(desc); \
906 for (i = 0; i < opr_sz; ) { \
907 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
908 TYPEW mm = *(TYPEW *)(vm + i); \
909 do { \
910 if (pg & 1) { \
911 TYPE nn = *(TYPE *)(vn + H(i)); \
912 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
913 } \
914 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
915 } while (i & 7); \
916 } \
917}
918
919DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
920DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
921DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
922
923DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
924DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
925DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
926
927DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
928DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
929DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
930
931#undef DO_ZPZW
932
933/* Fully general two-operand expander, controlled by a predicate.
934 */
935#define DO_ZPZ(NAME, TYPE, H, OP) \
936void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
937{ \
938 intptr_t i, opr_sz = simd_oprsz(desc); \
939 for (i = 0; i < opr_sz; ) { \
940 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
941 do { \
942 if (pg & 1) { \
943 TYPE nn = *(TYPE *)(vn + H(i)); \
944 *(TYPE *)(vd + H(i)) = OP(nn); \
945 } \
946 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
947 } while (i & 15); \
948 } \
949}
950
951/* Similarly, specialized for 64-bit operands. */
952#define DO_ZPZ_D(NAME, TYPE, OP) \
953void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
954{ \
955 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
956 TYPE *d = vd, *n = vn; \
957 uint8_t *pg = vg; \
958 for (i = 0; i < opr_sz; i += 1) { \
959 if (pg[H1(i)] & 1) { \
960 TYPE nn = n[i]; \
961 d[i] = OP(nn); \
962 } \
963 } \
964}
965
966#define DO_CLS_B(N) (clrsb32(N) - 24)
967#define DO_CLS_H(N) (clrsb32(N) - 16)
968
969DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
970DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
971DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
972DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
973
974#define DO_CLZ_B(N) (clz32(N) - 24)
975#define DO_CLZ_H(N) (clz32(N) - 16)
976
977DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
978DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
979DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
980DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
981
982DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
983DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
984DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
985DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
986
987#define DO_CNOT(N) (N == 0)
988
989DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
990DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
991DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
992DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
993
994#define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
995
996DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
997DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
998DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
999
1000#define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
1001
1002DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
1003DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
1004DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
1005
1006#define DO_NOT(N) (~N)
1007
1008DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
1009DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
1010DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
1011DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
1012
1013#define DO_SXTB(N) ((int8_t)N)
1014#define DO_SXTH(N) ((int16_t)N)
1015#define DO_SXTS(N) ((int32_t)N)
1016#define DO_UXTB(N) ((uint8_t)N)
1017#define DO_UXTH(N) ((uint16_t)N)
1018#define DO_UXTS(N) ((uint32_t)N)
1019
1020DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
1021DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
1022DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
1023DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
1024DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
1025DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
1026
1027DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
1028DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
1029DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
1030DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
1031DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
1032DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
1033
1034#define DO_ABS(N) (N < 0 ? -N : N)
1035
1036DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
1037DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
1038DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
1039DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
1040
1041#define DO_NEG(N) (-N)
1042
1043DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
1044DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
1045DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
1046DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
1047
1048DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
1049DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
1050DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
1051
1052DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
1053DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
1054
1055DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
1056
1057DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
1058DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
1059DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
1060DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
1061
1062#define DO_SQABS(X) \
1063 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
1064 x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })
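/* Only the most negative input needs saturating: it cannot be negated,
   so it yields the maximum positive value instead. */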
1065
1066DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
1067DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
1068DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
1069DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)
1070
1071#define DO_SQNEG(X) \
1072 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
1073 x_ == min_ ? -min_ - 1 : -x_; })
1074
1075DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
1076DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
1077DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
1078DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)
1079
1080DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
1081DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
1082
1083/* Three-operand expander, unpredicated, in which the third operand is "wide".
1084 */
1085#define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
1086void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1087{ \
1088 intptr_t i, opr_sz = simd_oprsz(desc); \
1089 for (i = 0; i < opr_sz; ) { \
1090 TYPEW mm = *(TYPEW *)(vm + i); \
1091 do { \
1092 TYPE nn = *(TYPE *)(vn + H(i)); \
1093 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
1094 i += sizeof(TYPE); \
1095 } while (i & 7); \
1096 } \
1097}
1098
1099DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
1100DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
1101DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
1102
1103DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
1104DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
1105DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
1106
1107DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
1108DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
1109DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
1110
1111#undef DO_ZZW
1112
1113#undef DO_CLS_B
1114#undef DO_CLS_H
1115#undef DO_CLZ_B
1116#undef DO_CLZ_H
1117#undef DO_CNOT
1118#undef DO_FABS
1119#undef DO_FNEG
1120#undef DO_ABS
1121#undef DO_NEG
1122#undef DO_ZPZ
1123#undef DO_ZPZ_D
1124
1125/*
1126 * Three-operand expander, unpredicated, in which the two inputs are
1127 * selected from the top or bottom half of the wide column.
1128 */
1129#define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1130void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1131{ \
1132 intptr_t i, opr_sz = simd_oprsz(desc); \
1133 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1134 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1135 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1136 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1137 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1138 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
1139 } \
1140}
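/*
 * The two desc bits select, independently for each input, whether the
 * even-numbered (bottom) or odd-numbered (top) narrow elements of the
 * wide columns are used, so the *B (bottom) and *T (top) forms of the
 * widening instructions can share one helper per element size.
 */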
1141
1142DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1143DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1144DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, , H1_4, DO_ADD)
1145
1146DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1147DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1148DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, , H1_4, DO_SUB)
1149
1150DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1151DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1152DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, , H1_4, DO_ABD)
1153
1154DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1155DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1156DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, , H1_4, DO_ADD)
1157
1158DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1159DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1160DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, , H1_4, DO_SUB)
1161
1162DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1163DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1164DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, , H1_4, DO_ABD)
1165
1166DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1167DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1168DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, , H1_4, DO_MUL)
1169
1170DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1171DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1172DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, , H1_4, DO_MUL)
1173
1174/* Note that the multiply cannot overflow, but the doubling can. */
1175static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
1176{
1177 int16_t val = n * m;
1178 return DO_SQADD_H(val, val);
1179}
1180
1181static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
1182{
1183 int32_t val = n * m;
1184 return DO_SQADD_S(val, val);
1185}
1186
1187static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
1188{
1189 int64_t val = n * m;
1190 return do_sqadd_d(val, val);
1191}
1192
1193DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
1194DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1195DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, , H1_4, do_sqdmull_d)
1196
1197#undef DO_ZZZ_TB
1198
1199#define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1200void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1201{ \
1202 intptr_t i, opr_sz = simd_oprsz(desc); \
1203 int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1204 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1205 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
1206 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1207 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
1208 } \
1209}
1210
1211DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1212DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1213DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, , H1_4, DO_ADD)
1214
1215DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1216DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1217DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, , H1_4, DO_SUB)
1218
1219DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1220DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1221DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, , H1_4, DO_ADD)
1222
1223DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1224DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1225DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, , H1_4, DO_SUB)
1226
1227#undef DO_ZZZ_WTB
1228
1229#define DO_ZZZ_NTB(NAME, TYPE, H, OP) \
1230void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1231{ \
1232 intptr_t i, opr_sz = simd_oprsz(desc); \
1233 intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
1234 intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
1235 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1236 TYPE nn = *(TYPE *)(vn + H(i + sel1)); \
1237 TYPE mm = *(TYPE *)(vm + H(i + sel2)); \
1238 *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \
1239 } \
1240}
1241
1242DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
1243DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
1244DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
1245DO_ZZZ_NTB(sve2_eoril_d, uint64_t, , DO_EOR)
1246
1247#undef DO_ZZZ_NTB
1248
1249#define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
1250void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1251{ \
1252 intptr_t i, opr_sz = simd_oprsz(desc); \
1253 intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \
1254 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1255 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1256 TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \
1257 TYPEW aa = *(TYPEW *)(va + HW(i)); \
1258 *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \
1259 } \
1260}
1261
1262DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1263DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1264DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, , H1_4, DO_ABD)
1265
1266DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1267DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1268DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, , H1_4, DO_ABD)
1269
1270#undef DO_ZZZW_ACC
1271
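/*
 * Carry-chain add/subtract long (the ADCLB/ADCLT and SBCLB/SBCLT forms).
 * The first desc data bit selects the bottom or top narrow element of VN
 * and the second complements it, turning the carrying add into a subtract.
 * The incoming carry is bit 0 of the odd element of VM, and the result is
 * stored wide enough (33 or 65 bits) that the outgoing carry lands in the
 * odd half of the destination.
 */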
1272void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1273{
1274 intptr_t i, opr_sz = simd_oprsz(desc);
1275 int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
1276 uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1277 uint32_t *a = va, *n = vn;
1278 uint64_t *d = vd, *m = vm;
1279
1280 for (i = 0; i < opr_sz / 8; ++i) {
1281 uint32_t e1 = a[2 * i + H4(0)];
1282 uint32_t e2 = n[2 * i + sel] ^ inv;
1283 uint64_t c = extract64(m[i], 32, 1);
1284 /* Compute and store the entire 33-bit result at once. */
1285 d[i] = c + e1 + e2;
1286 }
1287}
1288
1289void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1290{
1291 intptr_t i, opr_sz = simd_oprsz(desc);
1292 int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
1293 uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1294 uint64_t *d = vd, *a = va, *n = vn, *m = vm;
1295
1296 for (i = 0; i < opr_sz / 8; i += 2) {
1297 Int128 e1 = int128_make64(a[i]);
1298 Int128 e2 = int128_make64(n[i + sel] ^ inv);
1299 Int128 c = int128_make64(m[i + 1] & 1);
1300 Int128 r = int128_add(int128_add(e1, e2), c);
1301 d[i + 0] = int128_getlo(r);
1302 d[i + 1] = int128_gethi(r);
1303 }
1304}
1305
1306#define DO_BITPERM(NAME, TYPE, OP) \
1307void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1308{ \
1309 intptr_t i, opr_sz = simd_oprsz(desc); \
1310 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1311 TYPE nn = *(TYPE *)(vn + i); \
1312 TYPE mm = *(TYPE *)(vm + i); \
1313 *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \
1314 } \
1315}
1316
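/*
 * BEXT gathers the data bits selected by the mask and packs them at the
 * bottom of the result; BDEP does the inverse, scattering the low data
 * bits into the bit positions selected by the mask; BGRP packs the
 * selected bits at the bottom with the unselected bits above them.
 */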
1317static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
1318{
1319 uint64_t res = 0;
1320 int db, rb = 0;
1321
1322 for (db = 0; db < n; ++db) {
1323 if ((mask >> db) & 1) {
1324 res |= ((data >> db) & 1) << rb;
1325 ++rb;
1326 }
1327 }
1328 return res;
1329}
1330
1331DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
1332DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
1333DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
1334DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
1335
1336static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
1337{
1338 uint64_t res = 0;
1339 int rb, db = 0;
1340
1341 for (rb = 0; rb < n; ++rb) {
1342 if ((mask >> rb) & 1) {
1343 res |= ((data >> db) & 1) << rb;
1344 ++db;
1345 }
1346 }
1347 return res;
1348}
1349
1350DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
1351DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
1352DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
1353DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
1354
1355static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
1356{
1357 uint64_t resm = 0, resu = 0;
1358 int db, rbm = 0, rbu = 0;
1359
1360 for (db = 0; db < n; ++db) {
1361 uint64_t val = (data >> db) & 1;
1362 if ((mask >> db) & 1) {
1363 resm |= val << rbm++;
1364 } else {
1365 resu |= val << rbu++;
1366 }
1367 }
1368
1369 return resm | (resu << rbm);
1370}
1371
1372DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
1373DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
1374DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
1375DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)
1376
1377#undef DO_BITPERM
1378
1379#define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \
1380void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1381{ \
1382 intptr_t i, opr_sz = simd_oprsz(desc); \
1383 int sub_r = simd_data(desc); \
1384 if (sub_r) { \
1385 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1386 TYPE acc_r = *(TYPE *)(vn + H(i)); \
1387 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
1388 TYPE el2_r = *(TYPE *)(vm + H(i)); \
1389 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
1390 acc_r = ADD_OP(acc_r, el2_i); \
1391 acc_i = SUB_OP(acc_i, el2_r); \
1392 *(TYPE *)(vd + H(i)) = acc_r; \
1393 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
1394 } \
1395 } else { \
1396 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1397 TYPE acc_r = *(TYPE *)(vn + H(i)); \
1398 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
1399 TYPE el2_r = *(TYPE *)(vm + H(i)); \
1400 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
1401 acc_r = SUB_OP(acc_r, el2_i); \
1402 acc_i = ADD_OP(acc_i, el2_r); \
1403 *(TYPE *)(vd + H(i)) = acc_r; \
1404 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
1405 } \
1406 } \
1407}
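/*
 * With sub_r clear, the imaginary half of VM is subtracted from the real
 * accumulator and the real half added to the imaginary accumulator; with
 * sub_r set the roles are swapped.  This corresponds to adding the second
 * operand rotated by 90 or 270 degrees in the complex plane.
 */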
1408
1409DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
1410DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
1411DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
1412DO_CADD(sve2_cadd_d, int64_t, , DO_ADD, DO_SUB)
1413
1414DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
1415DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
1416DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
1417DO_CADD(sve2_sqcadd_d, int64_t, , do_sqadd_d, do_sqsub_d)
1418
1419#undef DO_CADD
1420
1421#define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
1422void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1423{ \
1424 intptr_t i, opr_sz = simd_oprsz(desc); \
1425 intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \
1426 int shift = simd_data(desc) >> 1; \
1427 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1428 TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \
1429 *(TYPEW *)(vd + HW(i)) = nn << shift; \
1430 } \
1431}
1432
1433DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
1434DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
1435DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, , H1_4)
1436
1437DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
1438DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
1439DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, , H1_4)
1440
1441#undef DO_ZZI_SHLL
1442
1443/* Two-operand reduction expander, controlled by a predicate.
1444 * The difference between TYPERED and TYPERET has to do with
1445 * sign-extension. E.g. for SMAX, TYPERED must be signed,
1446 * but TYPERET must be unsigned so that e.g. a 32-bit value
1447 * is not sign-extended to the ABI uint64_t return type.
1448 */
1449/* ??? If we were to vectorize this by hand the reduction ordering
1450 * would change. For integer operands, this is perfectly fine.
1451 */
1452#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
1453uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
1454{ \
1455 intptr_t i, opr_sz = simd_oprsz(desc); \
1456 TYPERED ret = INIT; \
1457 for (i = 0; i < opr_sz; ) { \
1458 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1459 do { \
1460 if (pg & 1) { \
1461 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
1462 ret = OP(ret, nn); \
1463 } \
1464 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
1465 } while (i & 15); \
1466 } \
1467 return (TYPERET)ret; \
1468}
1469
1470#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
1471uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
1472{ \
1473 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1474 TYPEE *n = vn; \
1475 uint8_t *pg = vg; \
1476 TYPER ret = INIT; \
1477 for (i = 0; i < opr_sz; i += 1) { \
1478 if (pg[H1(i)] & 1) { \
1479 TYPEE nn = n[i]; \
1480 ret = OP(ret, nn); \
1481 } \
1482 } \
1483 return ret; \
1484}
1485
1486DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
1487DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
1488DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
1489DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
1490
1491DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
1492DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
1493DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
1494DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
1495
1496DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
1497DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
1498DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
1499DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
1500
1501DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1502DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1503DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1504
1505DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1506DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1507DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1508DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
1509
1510DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
1511DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
1512DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
1513DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
1514
1515DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
1516DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
1517DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
1518DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
1519
1520DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
1521DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
1522DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
1523DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
1524
1525DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
1526DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
1527DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
1528DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
1529
1530#undef DO_VPZ
1531#undef DO_VPZ_D
1532
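/*
 * A minimal standalone sketch (not one of the helpers above) of the same
 * predicated reduction for SMAXV on 32-bit elements: accumulate in the
 * signed element type, then return the value zero-extended, which is the
 * TYPERED/TYPERET distinction DO_VPZ encodes.  Assumes <stdint.h> and a
 * plain little-endian layout with one predicate bit per vector byte; the
 * real helpers use simd_oprsz() and the H*() macros instead.
 */
static uint64_t smaxv_s_sketch(const int32_t *n, const uint8_t *pg,
                               intptr_t opr_sz)
{
    int32_t ret = INT32_MIN;    /* the identity element for signed max */
    intptr_t i;

    for (i = 0; i < opr_sz; i += 4) {
        if (pg[i >> 3] & (1 << (i & 7))) {     /* element's leading bit */
            int32_t nn = n[i >> 2];
            ret = nn >= ret ? nn : ret;
        }
    }
    /* Zero-extend: a result of -1 comes back as 0xffffffff, not -1ull. */
    return (uint32_t)ret;
}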
6e6a157d
RH
 1533/* Two vector operands, one scalar operand, unpredicated. */
1534#define DO_ZZI(NAME, TYPE, OP) \
1535void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
1536{ \
1537 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
1538 TYPE s = s64, *d = vd, *n = vn; \
1539 for (i = 0; i < opr_sz; ++i) { \
1540 d[i] = OP(n[i], s); \
1541 } \
1542}
1543
1544#define DO_SUBR(X, Y) (Y - X)
1545
1546DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
1547DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
1548DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
1549DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
1550
1551DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
1552DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
1553DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
1554DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
1555
1556DO_ZZI(sve_smini_b, int8_t, DO_MIN)
1557DO_ZZI(sve_smini_h, int16_t, DO_MIN)
1558DO_ZZI(sve_smini_s, int32_t, DO_MIN)
1559DO_ZZI(sve_smini_d, int64_t, DO_MIN)
1560
1561DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
1562DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
1563DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
1564DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
1565
1566DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
1567DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
1568DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
1569DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
1570
1571#undef DO_ZZI
1572
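/*
 * What DO_ZZI(sve_subri_b, uint8_t, DO_SUBR) above expands to, written out
 * as a sketch: the 64-bit scalar is truncated to the element type and the
 * reversed subtract computes (immediate - element), matching the SUBR
 * immediate form.  Assumes <stdint.h>; the real expansion derives the
 * element count from simd_oprsz(desc).
 */
static void subri_b_sketch(uint8_t *d, const uint8_t *n,
                           uint64_t s64, intptr_t opr_sz)
{
    uint8_t s = s64;      /* truncation is the only "conversion" needed */
    intptr_t i;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = s - n[i];  /* DO_SUBR(X, Y) is (Y - X) */
    }
}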
f97cfd59
RH
1573#undef DO_AND
1574#undef DO_ORR
1575#undef DO_EOR
1576#undef DO_BIC
1577#undef DO_ADD
1578#undef DO_SUB
1579#undef DO_MAX
1580#undef DO_MIN
1581#undef DO_ABD
1582#undef DO_MUL
1583#undef DO_DIV
27721dbb
RH
1584#undef DO_ASR
1585#undef DO_LSR
1586#undef DO_LSL
6e6a157d 1587#undef DO_SUBR
f97cfd59 1588
028e2a7b
RH
1589/* Similar to the ARM LastActiveElement pseudocode function, except the
1590 result is multiplied by the element size. This includes the not found
1591 indication; e.g. not found for esz=3 is -8. */
1592static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
1593{
1594 uint64_t mask = pred_esz_masks[esz];
1595 intptr_t i = words;
1596
1597 do {
1598 uint64_t this_g = g[--i] & mask;
1599 if (this_g) {
1600 return i * 64 + (63 - clz64(this_g));
1601 }
1602 } while (i > 0);
1603 return (intptr_t)-1 << esz;
1604}
1605
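/*
 * A worked example of the encoding last_active_element() returns: the value
 * is the bit number of the last active predicate bit, which for element
 * size esz is also the byte offset of that element, and "not found" is
 * minus one element.  Illustrative only; assumes <assert.h>.
 */
static void last_active_element_example(void)
{
    /* One predicate word, 32-bit elements (esz = 2): elements 1 and 2 set. */
    uint64_t g[1] = { 0x0000000000000110ull };

    assert(last_active_element(g, 1, 2) == 8);    /* element 2, byte 8 */

    g[0] = 0;
    assert(last_active_element(g, 1, 2) == -4);   /* not found: -(1 << 2) */
}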
86300b5d 1606uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
028e2a7b 1607{
86300b5d 1608 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
028e2a7b
RH
1609 uint32_t flags = PREDTEST_INIT;
1610 uint64_t *d = vd, *g = vg;
1611 intptr_t i = 0;
1612
1613 do {
1614 uint64_t this_d = d[i];
1615 uint64_t this_g = g[i];
1616
1617 if (this_g) {
1618 if (!(flags & 4)) {
1619 /* Set in D the first bit of G. */
1620 this_d |= this_g & -this_g;
1621 d[i] = this_d;
1622 }
1623 flags = iter_predtest_fwd(this_d, this_g, flags);
1624 }
1625 } while (++i < words);
1626
1627 return flags;
1628}
1629
1630uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
1631{
86300b5d
RH
1632 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1633 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
028e2a7b
RH
1634 uint32_t flags = PREDTEST_INIT;
1635 uint64_t *d = vd, *g = vg, esz_mask;
1636 intptr_t i, next;
1637
1638 next = last_active_element(vd, words, esz) + (1 << esz);
1639 esz_mask = pred_esz_masks[esz];
1640
1641 /* Similar to the pseudocode for pnext, but scaled by ESZ
1642 so that we find the correct bit. */
1643 if (next < words * 64) {
1644 uint64_t mask = -1;
1645
1646 if (next & 63) {
1647 mask = ~((1ull << (next & 63)) - 1);
1648 next &= -64;
1649 }
1650 do {
1651 uint64_t this_g = g[next / 64] & esz_mask & mask;
1652 if (this_g != 0) {
1653 next = (next & -64) + ctz64(this_g);
1654 break;
1655 }
1656 next += 64;
1657 mask = -1;
1658 } while (next < words * 64);
1659 }
1660
1661 i = 0;
1662 do {
1663 uint64_t this_d = 0;
1664 if (i == next / 64) {
1665 this_d = 1ull << (next & 63);
1666 }
1667 d[i] = this_d;
1668 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
1669 } while (++i < words);
1670
1671 return flags;
1672}
ccd841c3 1673
60245996
RH
1674/*
1675 * Copy Zn into Zd, and store zero into inactive elements.
1676 * If inv, store zeros into the active elements.
ccd841c3 1677 */
68459864
RH
1678void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1679{
1680 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
60245996 1681 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
68459864
RH
1682 uint64_t *d = vd, *n = vn;
1683 uint8_t *pg = vg;
60245996 1684
68459864 1685 for (i = 0; i < opr_sz; i += 1) {
60245996 1686 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
68459864
RH
1687 }
1688}
1689
1690void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1691{
1692 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
60245996 1693 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
68459864
RH
1694 uint64_t *d = vd, *n = vn;
1695 uint8_t *pg = vg;
60245996 1696
68459864 1697 for (i = 0; i < opr_sz; i += 1) {
60245996 1698 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
68459864
RH
1699 }
1700}
1701
1702void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
1703{
1704 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
60245996 1705 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
68459864
RH
1706 uint64_t *d = vd, *n = vn;
1707 uint8_t *pg = vg;
60245996 1708
68459864 1709 for (i = 0; i < opr_sz; i += 1) {
60245996 1710 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
68459864
RH
1711 }
1712}
1713
1714void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1715{
1716 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1717 uint64_t *d = vd, *n = vn;
1718 uint8_t *pg = vg;
60245996
RH
1719 uint8_t inv = simd_data(desc);
1720
68459864 1721 for (i = 0; i < opr_sz; i += 1) {
60245996 1722 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
68459864
RH
1723 }
1724}
1725
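/*
 * The movz helpers above use expand_pred_b/h/s to turn one byte of the
 * predicate into a 64-bit lane mask.  A naive sketch of the byte-granular
 * expansion: predicate bit i becomes byte i of the mask, either 0x00 or
 * 0xff.  The real helper computes the same mask, though typically via a
 * lookup table rather than a loop.
 */
static inline uint64_t expand_pred_b_sketch(uint8_t byte)
{
    uint64_t r = 0;
    int i;

    for (i = 0; i < 8; i++) {
        if (byte & (1 << i)) {
            r |= 0xffull << (i * 8);
        }
    }
    return r;
}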
ccd841c3
RH
1726/* Three-operand expander, immediate operand, controlled by a predicate.
1727 */
1728#define DO_ZPZI(NAME, TYPE, H, OP) \
1729void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1730{ \
1731 intptr_t i, opr_sz = simd_oprsz(desc); \
1732 TYPE imm = simd_data(desc); \
1733 for (i = 0; i < opr_sz; ) { \
1734 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1735 do { \
1736 if (pg & 1) { \
1737 TYPE nn = *(TYPE *)(vn + H(i)); \
1738 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
1739 } \
1740 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1741 } while (i & 15); \
1742 } \
1743}
1744
1745/* Similarly, specialized for 64-bit operands. */
1746#define DO_ZPZI_D(NAME, TYPE, OP) \
1747void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1748{ \
1749 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1750 TYPE *d = vd, *n = vn; \
1751 TYPE imm = simd_data(desc); \
1752 uint8_t *pg = vg; \
1753 for (i = 0; i < opr_sz; i += 1) { \
1754 if (pg[H1(i)] & 1) { \
1755 TYPE nn = n[i]; \
1756 d[i] = OP(nn, imm); \
1757 } \
1758 } \
1759}
1760
1761#define DO_SHR(N, M) (N >> M)
1762#define DO_SHL(N, M) (N << M)
1763
1764/* Arithmetic shift right for division. This rounds negative numbers
1765 toward zero as per signed division. Therefore before shifting,
1766 when N is negative, add 2**M-1. */
1767#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
1768
1769DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
1770DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
1771DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
1772DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
1773
1774DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
1775DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
1776DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
1777DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
1778
1779DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
1780DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
1781DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
1782DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
1783
1784DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
1785DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
1786DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
1787DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
1788
1789#undef DO_SHR
1790#undef DO_SHL
1791#undef DO_ASRD
1792#undef DO_ZPZI
1793#undef DO_ZPZI_D
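/*
 * A worked example of the DO_ASRD rounding defined above (divide by 2**M,
 * rounding toward zero): a plain arithmetic shift rounds toward minus
 * infinity, so -7 >> 2 is -2, while adding 2**M - 1 first gives -7 / 4 = -1.
 * Illustrative only; assumes <assert.h> and an arithmetic right shift of
 * negative values, as the helpers above do.
 */
static void asrd_example(void)
{
    int32_t n = -7;
    int32_t plain = n >> 2;                                  /* -2 */
    int32_t asrd  = (n + (n < 0 ? (1 << 2) - 1 : 0)) >> 2;   /* -1 */

    assert(plain == -2);
    assert(asrd == n / 4);    /* C division truncates toward zero */
}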
96a36e4a
RH
1794
1795/* Fully general four-operand expander, controlled by a predicate.
1796 */
1797#define DO_ZPZZZ(NAME, TYPE, H, OP) \
1798void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1799 void *vg, uint32_t desc) \
1800{ \
1801 intptr_t i, opr_sz = simd_oprsz(desc); \
1802 for (i = 0; i < opr_sz; ) { \
1803 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1804 do { \
1805 if (pg & 1) { \
1806 TYPE nn = *(TYPE *)(vn + H(i)); \
1807 TYPE mm = *(TYPE *)(vm + H(i)); \
1808 TYPE aa = *(TYPE *)(va + H(i)); \
1809 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
1810 } \
1811 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1812 } while (i & 15); \
1813 } \
1814}
1815
1816/* Similarly, specialized for 64-bit operands. */
1817#define DO_ZPZZZ_D(NAME, TYPE, OP) \
1818void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1819 void *vg, uint32_t desc) \
1820{ \
1821 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1822 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
1823 uint8_t *pg = vg; \
1824 for (i = 0; i < opr_sz; i += 1) { \
1825 if (pg[H1(i)] & 1) { \
1826 TYPE aa = a[i], nn = n[i], mm = m[i]; \
1827 d[i] = OP(aa, nn, mm); \
1828 } \
1829 } \
1830}
1831
1832#define DO_MLA(A, N, M) (A + N * M)
1833#define DO_MLS(A, N, M) (A - N * M)
1834
1835DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
1836DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
1837
1838DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
1839DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
1840
1841DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
1842DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
1843
1844DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
1845DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
1846
1847#undef DO_MLA
1848#undef DO_MLS
1849#undef DO_ZPZZZ
1850#undef DO_ZPZZZ_D
9a56c9c3
RH
1851
1852void HELPER(sve_index_b)(void *vd, uint32_t start,
1853 uint32_t incr, uint32_t desc)
1854{
1855 intptr_t i, opr_sz = simd_oprsz(desc);
1856 uint8_t *d = vd;
1857 for (i = 0; i < opr_sz; i += 1) {
1858 d[H1(i)] = start + i * incr;
1859 }
1860}
1861
1862void HELPER(sve_index_h)(void *vd, uint32_t start,
1863 uint32_t incr, uint32_t desc)
1864{
1865 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1866 uint16_t *d = vd;
1867 for (i = 0; i < opr_sz; i += 1) {
1868 d[H2(i)] = start + i * incr;
1869 }
1870}
1871
1872void HELPER(sve_index_s)(void *vd, uint32_t start,
1873 uint32_t incr, uint32_t desc)
1874{
1875 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1876 uint32_t *d = vd;
1877 for (i = 0; i < opr_sz; i += 1) {
1878 d[H4(i)] = start + i * incr;
1879 }
1880}
1881
1882void HELPER(sve_index_d)(void *vd, uint64_t start,
1883 uint64_t incr, uint32_t desc)
1884{
1885 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1886 uint64_t *d = vd;
1887 for (i = 0; i < opr_sz; i += 1) {
1888 d[i] = start + i * incr;
1889 }
1890}
4b242d9c
RH
1891
1892void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
1893{
1894 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1895 uint32_t sh = simd_data(desc);
1896 uint32_t *d = vd, *n = vn, *m = vm;
1897 for (i = 0; i < opr_sz; i += 1) {
1898 d[i] = n[i] + (m[i] << sh);
1899 }
1900}
1901
1902void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
1903{
1904 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1905 uint64_t sh = simd_data(desc);
1906 uint64_t *d = vd, *n = vn, *m = vm;
1907 for (i = 0; i < opr_sz; i += 1) {
1908 d[i] = n[i] + (m[i] << sh);
1909 }
1910}
1911
1912void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
1913{
1914 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1915 uint64_t sh = simd_data(desc);
1916 uint64_t *d = vd, *n = vn, *m = vm;
1917 for (i = 0; i < opr_sz; i += 1) {
1918 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
1919 }
1920}
1921
1922void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
1923{
1924 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1925 uint64_t sh = simd_data(desc);
1926 uint64_t *d = vd, *n = vn, *m = vm;
1927 for (i = 0; i < opr_sz; i += 1) {
1928 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
1929 }
1930}
0762cd42
RH
1931
1932void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
1933{
 1934 /* These constants are copied directly from the ARM pseudocode. */
1935 static const uint16_t coeff[] = {
1936 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
1937 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
1938 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
1939 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
1940 };
1941 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1942 uint16_t *d = vd, *n = vn;
1943
1944 for (i = 0; i < opr_sz; i++) {
1945 uint16_t nn = n[i];
1946 intptr_t idx = extract32(nn, 0, 5);
1947 uint16_t exp = extract32(nn, 5, 5);
1948 d[i] = coeff[idx] | (exp << 10);
1949 }
1950}
1951
1952void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
1953{
 1954 /* These constants are copied directly from the ARM pseudocode. */
1955 static const uint32_t coeff[] = {
1956 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
1957 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
1958 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
1959 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
1960 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
1961 0x1ef532, 0x20b051, 0x227043, 0x243516,
1962 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
1963 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
1964 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
1965 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
1966 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
1967 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
1968 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
1969 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
1970 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
1971 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
1972 };
1973 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1974 uint32_t *d = vd, *n = vn;
1975
1976 for (i = 0; i < opr_sz; i++) {
1977 uint32_t nn = n[i];
1978 intptr_t idx = extract32(nn, 0, 6);
1979 uint32_t exp = extract32(nn, 6, 8);
1980 d[i] = coeff[idx] | (exp << 23);
1981 }
1982}
1983
1984void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
1985{
 1986 /* These constants are copied directly from the ARM pseudocode. */
1987 static const uint64_t coeff[] = {
1988 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
1989 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
1990 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
1991 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
1992 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
1993 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
1994 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
1995 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
1996 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
1997 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
1998 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
1999 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
2000 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
2001 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
2002 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
2003 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
2004 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
2005 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
2006 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
2007 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
2008 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
2009 0xFA7C1819E90D8ull,
2010 };
2011 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2012 uint64_t *d = vd, *n = vn;
2013
2014 for (i = 0; i < opr_sz; i++) {
2015 uint64_t nn = n[i];
2016 intptr_t idx = extract32(nn, 0, 6);
2017 uint64_t exp = extract32(nn, 6, 11);
2018 d[i] = coeff[idx] | (exp << 52);
2019 }
2020}
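/*
 * Sketch of how an sve_fexpa_s result above decodes: coeff[idx] supplies the
 * 23 fraction bits of 2**(idx/64) and input bits [13:6] land in the exponent
 * field, so for a normal exponent the constructed value is
 * 2**(exp - 127) * (1 + coeff[idx] / 2**23), roughly 2**(exp - 127 + idx/64).
 * The helpers never reinterpret the bits themselves; this is illustration
 * only and assumes <string.h>.
 */
static float fexpa_s_decode_sketch(uint32_t result_bits)
{
    float f;

    /* Reinterpret the constructed IEEE single-precision bit pattern. */
    memcpy(&f, &result_bits, sizeof(f));
    return f;
}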
a1f233f2
RH
2021
2022void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
2023{
2024 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2025 uint16_t *d = vd, *n = vn, *m = vm;
2026 for (i = 0; i < opr_sz; i += 1) {
2027 uint16_t nn = n[i];
2028 uint16_t mm = m[i];
2029 if (mm & 1) {
2030 nn = float16_one;
2031 }
2032 d[i] = nn ^ (mm & 2) << 14;
2033 }
2034}
2035
2036void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
2037{
2038 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2039 uint32_t *d = vd, *n = vn, *m = vm;
2040 for (i = 0; i < opr_sz; i += 1) {
2041 uint32_t nn = n[i];
2042 uint32_t mm = m[i];
2043 if (mm & 1) {
2044 nn = float32_one;
2045 }
2046 d[i] = nn ^ (mm & 2) << 30;
2047 }
2048}
2049
2050void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
2051{
2052 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2053 uint64_t *d = vd, *n = vn, *m = vm;
2054 for (i = 0; i < opr_sz; i += 1) {
2055 uint64_t nn = n[i];
2056 uint64_t mm = m[i];
2057 if (mm & 1) {
2058 nn = float64_one;
2059 }
2060 d[i] = nn ^ (mm & 2) << 62;
2061 }
2062}
24e82e68
RH
2063
2064/*
2065 * Signed saturating addition with scalar operand.
2066 */
2067
2068void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2069{
2070 intptr_t i, oprsz = simd_oprsz(desc);
2071
2072 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
4f07fbeb 2073 *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
24e82e68
RH
2074 }
2075}
2076
2077void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2078{
2079 intptr_t i, oprsz = simd_oprsz(desc);
2080
2081 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
4f07fbeb 2082 *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
24e82e68
RH
2083 }
2084}
2085
2086void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2087{
2088 intptr_t i, oprsz = simd_oprsz(desc);
2089
2090 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
4f07fbeb 2091 *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
24e82e68
RH
2092 }
2093}
2094
2095void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
2096{
2097 intptr_t i, oprsz = simd_oprsz(desc);
2098
2099 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
4f07fbeb 2100 *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
24e82e68
RH
2101 }
2102}
2103
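/*
 * The DO_SQADD_* macros and do_sqadd_d used above are defined outside this
 * excerpt.  A minimal sketch of the byte case: add in a wider type and
 * clamp to the representable range.  This shows only the saturation rule;
 * the real macros may be written differently.  Assumes <stdint.h>.
 */
static inline int8_t sqadd_b_sketch(int32_t a, int32_t b)
{
    int32_t r = a + b;

    if (r > INT8_MAX) {
        r = INT8_MAX;
    } else if (r < INT8_MIN) {
        r = INT8_MIN;
    }
    return r;
}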
2104/*
2105 * Unsigned saturating addition with scalar operand.
2106 */
2107
2108void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2109{
2110 intptr_t i, oprsz = simd_oprsz(desc);
2111
2112 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
4f07fbeb 2113 *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
24e82e68
RH
2114 }
2115}
2116
2117void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2118{
2119 intptr_t i, oprsz = simd_oprsz(desc);
2120
2121 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
4f07fbeb 2122 *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
24e82e68
RH
2123 }
2124}
2125
2126void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2127{
2128 intptr_t i, oprsz = simd_oprsz(desc);
2129
2130 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
4f07fbeb 2131 *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
24e82e68
RH
2132 }
2133}
2134
2135void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2136{
2137 intptr_t i, oprsz = simd_oprsz(desc);
2138
2139 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
4f07fbeb 2140 *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
24e82e68
RH
2141 }
2142}
2143
2144void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2145{
2146 intptr_t i, oprsz = simd_oprsz(desc);
2147
2148 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
4f07fbeb 2149 *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
24e82e68
RH
2150 }
2151}
f25a2361
RH
2152
2153/* Two operand predicated copy immediate with merge. All valid immediates
2154 * can fit within 17 signed bits in the simd_data field.
2155 */
2156void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
2157 uint64_t mm, uint32_t desc)
2158{
2159 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2160 uint64_t *d = vd, *n = vn;
2161 uint8_t *pg = vg;
2162
2163 mm = dup_const(MO_8, mm);
2164 for (i = 0; i < opr_sz; i += 1) {
2165 uint64_t nn = n[i];
2166 uint64_t pp = expand_pred_b(pg[H1(i)]);
2167 d[i] = (mm & pp) | (nn & ~pp);
2168 }
2169}
2170
2171void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
2172 uint64_t mm, uint32_t desc)
2173{
2174 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2175 uint64_t *d = vd, *n = vn;
2176 uint8_t *pg = vg;
2177
2178 mm = dup_const(MO_16, mm);
2179 for (i = 0; i < opr_sz; i += 1) {
2180 uint64_t nn = n[i];
2181 uint64_t pp = expand_pred_h(pg[H1(i)]);
2182 d[i] = (mm & pp) | (nn & ~pp);
2183 }
2184}
2185
2186void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
2187 uint64_t mm, uint32_t desc)
2188{
2189 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2190 uint64_t *d = vd, *n = vn;
2191 uint8_t *pg = vg;
2192
2193 mm = dup_const(MO_32, mm);
2194 for (i = 0; i < opr_sz; i += 1) {
2195 uint64_t nn = n[i];
2196 uint64_t pp = expand_pred_s(pg[H1(i)]);
2197 d[i] = (mm & pp) | (nn & ~pp);
2198 }
2199}
2200
2201void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
2202 uint64_t mm, uint32_t desc)
2203{
2204 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2205 uint64_t *d = vd, *n = vn;
2206 uint8_t *pg = vg;
2207
2208 for (i = 0; i < opr_sz; i += 1) {
2209 uint64_t nn = n[i];
2210 d[i] = (pg[H1(i)] & 1 ? mm : nn);
2211 }
2212}
2213
2214void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
2215{
2216 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2217 uint64_t *d = vd;
2218 uint8_t *pg = vg;
2219
2220 val = dup_const(MO_8, val);
2221 for (i = 0; i < opr_sz; i += 1) {
2222 d[i] = val & expand_pred_b(pg[H1(i)]);
2223 }
2224}
2225
2226void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
2227{
2228 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2229 uint64_t *d = vd;
2230 uint8_t *pg = vg;
2231
2232 val = dup_const(MO_16, val);
2233 for (i = 0; i < opr_sz; i += 1) {
2234 d[i] = val & expand_pred_h(pg[H1(i)]);
2235 }
2236}
2237
2238void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
2239{
2240 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2241 uint64_t *d = vd;
2242 uint8_t *pg = vg;
2243
2244 val = dup_const(MO_32, val);
2245 for (i = 0; i < opr_sz; i += 1) {
2246 d[i] = val & expand_pred_s(pg[H1(i)]);
2247 }
2248}
2249
2250void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
2251{
2252 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2253 uint64_t *d = vd;
2254 uint8_t *pg = vg;
2255
2256 for (i = 0; i < opr_sz; i += 1) {
2257 d[i] = (pg[H1(i)] & 1 ? val : 0);
2258 }
2259}
b94f8f60 2260
b4cd95d2 2261/* Big-endian hosts need to frob the byte indices. If the copy
b94f8f60
RH
 2262 * happens to be 8-byte aligned, then no frobbing is necessary.
2263 */
2264static void swap_memmove(void *vd, void *vs, size_t n)
2265{
2266 uintptr_t d = (uintptr_t)vd;
2267 uintptr_t s = (uintptr_t)vs;
2268 uintptr_t o = (d | s | n) & 7;
2269 size_t i;
2270
2271#ifndef HOST_WORDS_BIGENDIAN
2272 o = 0;
2273#endif
2274 switch (o) {
2275 case 0:
2276 memmove(vd, vs, n);
2277 break;
2278
2279 case 4:
2280 if (d < s || d >= s + n) {
2281 for (i = 0; i < n; i += 4) {
2282 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2283 }
2284 } else {
2285 for (i = n; i > 0; ) {
2286 i -= 4;
2287 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2288 }
2289 }
2290 break;
2291
2292 case 2:
2293 case 6:
2294 if (d < s || d >= s + n) {
2295 for (i = 0; i < n; i += 2) {
2296 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2297 }
2298 } else {
2299 for (i = n; i > 0; ) {
2300 i -= 2;
2301 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2302 }
2303 }
2304 break;
2305
2306 default:
2307 if (d < s || d >= s + n) {
2308 for (i = 0; i < n; i++) {
2309 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2310 }
2311 } else {
2312 for (i = n; i > 0; ) {
2313 i -= 1;
2314 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2315 }
2316 }
2317 break;
2318 }
2319}
2320
9123aeb6
RH
2321/* Similarly for memset of 0. */
2322static void swap_memzero(void *vd, size_t n)
2323{
2324 uintptr_t d = (uintptr_t)vd;
2325 uintptr_t o = (d | n) & 7;
2326 size_t i;
2327
2328 /* Usually, the first bit of a predicate is set, so N is 0. */
2329 if (likely(n == 0)) {
2330 return;
2331 }
2332
2333#ifndef HOST_WORDS_BIGENDIAN
2334 o = 0;
2335#endif
2336 switch (o) {
2337 case 0:
2338 memset(vd, 0, n);
2339 break;
2340
2341 case 4:
2342 for (i = 0; i < n; i += 4) {
2343 *(uint32_t *)H1_4(d + i) = 0;
2344 }
2345 break;
2346
2347 case 2:
2348 case 6:
2349 for (i = 0; i < n; i += 2) {
2350 *(uint16_t *)H1_2(d + i) = 0;
2351 }
2352 break;
2353
2354 default:
2355 for (i = 0; i < n; i++) {
2356 *(uint8_t *)H1(d + i) = 0;
2357 }
2358 break;
2359 }
2360}
2361
b94f8f60
RH
2362void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
2363{
2364 intptr_t opr_sz = simd_oprsz(desc);
2365 size_t n_ofs = simd_data(desc);
2366 size_t n_siz = opr_sz - n_ofs;
2367
2368 if (vd != vm) {
2369 swap_memmove(vd, vn + n_ofs, n_siz);
2370 swap_memmove(vd + n_siz, vm, n_ofs);
2371 } else if (vd != vn) {
2372 swap_memmove(vd + n_siz, vd, n_ofs);
2373 swap_memmove(vd, vn + n_ofs, n_siz);
2374 } else {
2375 /* vd == vn == vm. Need temp space. */
2376 ARMVectorReg tmp;
2377 swap_memmove(&tmp, vm, n_ofs);
2378 swap_memmove(vd, vd + n_ofs, n_siz);
2379 memcpy(vd + n_siz, &tmp, n_ofs);
2380 }
2381}
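/*
 * A scalar sketch of what sve_ext() above produces, ignoring the host
 * byte-swapping that swap_memmove() handles: the result is a vector-length
 * window into the byte-wise concatenation Zn:Zm starting at byte n_ofs.
 * Assumes non-overlapping buffers and <stdint.h>.
 */
static void ext_sketch(uint8_t *d, const uint8_t *n, const uint8_t *m,
                       intptr_t opr_sz, intptr_t n_ofs)
{
    intptr_t i;

    for (i = 0; i < opr_sz; ++i) {
        intptr_t j = n_ofs + i;
        d[i] = j < opr_sz ? n[j] : m[j - opr_sz];
    }
}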
30562ab7
RH
2382
2383#define DO_INSR(NAME, TYPE, H) \
2384void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
2385{ \
2386 intptr_t opr_sz = simd_oprsz(desc); \
2387 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
2388 *(TYPE *)(vd + H(0)) = val; \
2389}
2390
2391DO_INSR(sve_insr_b, uint8_t, H1)
2392DO_INSR(sve_insr_h, uint16_t, H1_2)
2393DO_INSR(sve_insr_s, uint32_t, H1_4)
2394DO_INSR(sve_insr_d, uint64_t, )
2395
2396#undef DO_INSR
2397
2398void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
2399{
2400 intptr_t i, j, opr_sz = simd_oprsz(desc);
2401 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2402 uint64_t f = *(uint64_t *)(vn + i);
2403 uint64_t b = *(uint64_t *)(vn + j);
2404 *(uint64_t *)(vd + i) = bswap64(b);
2405 *(uint64_t *)(vd + j) = bswap64(f);
2406 }
2407}
2408
30562ab7
RH
2409void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
2410{
2411 intptr_t i, j, opr_sz = simd_oprsz(desc);
2412 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2413 uint64_t f = *(uint64_t *)(vn + i);
2414 uint64_t b = *(uint64_t *)(vn + j);
2415 *(uint64_t *)(vd + i) = hswap64(b);
2416 *(uint64_t *)(vd + j) = hswap64(f);
2417 }
2418}
2419
2420void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
2421{
2422 intptr_t i, j, opr_sz = simd_oprsz(desc);
2423 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2424 uint64_t f = *(uint64_t *)(vn + i);
2425 uint64_t b = *(uint64_t *)(vn + j);
2426 *(uint64_t *)(vd + i) = rol64(b, 32);
2427 *(uint64_t *)(vd + j) = rol64(f, 32);
2428 }
2429}
2430
2431void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
2432{
2433 intptr_t i, j, opr_sz = simd_oprsz(desc);
2434 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2435 uint64_t f = *(uint64_t *)(vn + i);
2436 uint64_t b = *(uint64_t *)(vn + j);
2437 *(uint64_t *)(vd + i) = b;
2438 *(uint64_t *)(vd + j) = f;
2439 }
2440}
2441
2442#define DO_TBL(NAME, TYPE, H) \
2443void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2444{ \
2445 intptr_t i, opr_sz = simd_oprsz(desc); \
2446 uintptr_t elem = opr_sz / sizeof(TYPE); \
2447 TYPE *d = vd, *n = vn, *m = vm; \
2448 ARMVectorReg tmp; \
2449 if (unlikely(vd == vn)) { \
2450 n = memcpy(&tmp, vn, opr_sz); \
2451 } \
2452 for (i = 0; i < elem; i++) { \
2453 TYPE j = m[H(i)]; \
2454 d[H(i)] = j < elem ? n[H(j)] : 0; \
2455 } \
2456}
2457
2458DO_TBL(sve_tbl_b, uint8_t, H1)
2459DO_TBL(sve_tbl_h, uint16_t, H2)
2460DO_TBL(sve_tbl_s, uint32_t, H4)
2461DO_TBL(sve_tbl_d, uint64_t, )
2462
 2463#undef DO_TBL
2464
2465#define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
2466void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2467{ \
2468 intptr_t i, opr_sz = simd_oprsz(desc); \
2469 TYPED *d = vd; \
2470 TYPES *n = vn; \
2471 ARMVectorReg tmp; \
2472 if (unlikely(vn - vd < opr_sz)) { \
2473 n = memcpy(&tmp, n, opr_sz / 2); \
2474 } \
2475 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
2476 d[HD(i)] = n[HS(i)]; \
2477 } \
2478}
2479
2480DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
2481DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
2482DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)
2483
2484DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
2485DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
2486DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
2487
2488#undef DO_UNPK
d731d8cb
RH
2489
 2490/* Mask of bits included in the even-numbered predicates of width esz.
2491 * We also use this for expand_bits/compress_bits, and so extend the
2492 * same pattern out to 16-bit units.
2493 */
2494static const uint64_t even_bit_esz_masks[5] = {
2495 0x5555555555555555ull,
2496 0x3333333333333333ull,
2497 0x0f0f0f0f0f0f0f0full,
2498 0x00ff00ff00ff00ffull,
2499 0x0000ffff0000ffffull,
2500};
2501
2502/* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
2503 * For N==0, this corresponds to the operation that in qemu/bitops.h
2504 * we call half_shuffle64; this algorithm is from Hacker's Delight,
2505 * section 7-2 Shuffling Bits.
2506 */
2507static uint64_t expand_bits(uint64_t x, int n)
2508{
2509 int i;
2510
2511 x &= 0xffffffffu;
2512 for (i = 4; i >= n; i--) {
2513 int sh = 1 << i;
2514 x = ((x << sh) | x) & even_bit_esz_masks[i];
2515 }
2516 return x;
2517}
2518
2519/* Compress units of 2**(N+1) bits to units of 2**N bits.
2520 * For N==0, this corresponds to the operation that in qemu/bitops.h
2521 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
2522 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
2523 */
2524static uint64_t compress_bits(uint64_t x, int n)
2525{
2526 int i;
2527
2528 for (i = n; i <= 4; i++) {
2529 int sh = 1 << i;
2530 x &= even_bit_esz_masks[i];
2531 x = (x >> sh) | x;
2532 }
2533 return x & 0xffffffffu;
2534}
2535
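/*
 * Worked example of the two shuffles above for N = 0: expand_bits() doubles
 * the spacing of the low 32 bits, and compress_bits() is its inverse.
 * Illustrative only; assumes <assert.h>.
 */
static void bit_shuffle_example(void)
{
    /* abcd (4 bits) -> 0a0b0c0d (8 bits), reading from bit 0 upward. */
    assert(expand_bits(0xf, 0) == 0x55);
    assert(expand_bits(0x3, 0) == 0x05);
    assert(compress_bits(0x55, 0) == 0xf);

    /* For N = 1 the units are 2 bits wide: 2-bit pairs spread to 4 bits. */
    assert(expand_bits(0xf, 1) == 0x33);
    assert(compress_bits(0x33, 1) == 0xf);
}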
2536void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
2537{
f9b0fcce
RH
2538 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2539 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
2540 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
8e7fefed 2541 int esize = 1 << esz;
d731d8cb
RH
2542 uint64_t *d = vd;
2543 intptr_t i;
2544
2545 if (oprsz <= 8) {
2546 uint64_t nn = *(uint64_t *)vn;
2547 uint64_t mm = *(uint64_t *)vm;
2548 int half = 4 * oprsz;
2549
2550 nn = extract64(nn, high * half, half);
2551 mm = extract64(mm, high * half, half);
2552 nn = expand_bits(nn, esz);
2553 mm = expand_bits(mm, esz);
8e7fefed 2554 d[0] = nn | (mm << esize);
d731d8cb 2555 } else {
8e7fefed 2556 ARMPredicateReg tmp;
d731d8cb
RH
2557
2558 /* We produce output faster than we consume input.
2559 Therefore we must be mindful of possible overlap. */
8e7fefed
RH
2560 if (vd == vn) {
2561 vn = memcpy(&tmp, vn, oprsz);
2562 if (vd == vm) {
2563 vm = vn;
2564 }
2565 } else if (vd == vm) {
2566 vm = memcpy(&tmp, vm, oprsz);
d731d8cb
RH
2567 }
2568 if (high) {
2569 high = oprsz >> 1;
2570 }
2571
8e7fefed 2572 if ((oprsz & 7) == 0) {
d731d8cb
RH
2573 uint32_t *n = vn, *m = vm;
2574 high >>= 2;
2575
8e7fefed 2576 for (i = 0; i < oprsz / 8; i++) {
d731d8cb
RH
2577 uint64_t nn = n[H4(high + i)];
2578 uint64_t mm = m[H4(high + i)];
2579
2580 nn = expand_bits(nn, esz);
2581 mm = expand_bits(mm, esz);
8e7fefed 2582 d[i] = nn | (mm << esize);
d731d8cb
RH
2583 }
2584 } else {
2585 uint8_t *n = vn, *m = vm;
2586 uint16_t *d16 = vd;
2587
2588 for (i = 0; i < oprsz / 2; i++) {
2589 uint16_t nn = n[H1(high + i)];
2590 uint16_t mm = m[H1(high + i)];
2591
2592 nn = expand_bits(nn, esz);
2593 mm = expand_bits(mm, esz);
8e7fefed 2594 d16[H2(i)] = nn | (mm << esize);
d731d8cb
RH
2595 }
2596 }
2597 }
2598}
2599
2600void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
2601{
f9b0fcce
RH
2602 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2603 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
2604 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
d731d8cb
RH
2605 uint64_t *d = vd, *n = vn, *m = vm;
2606 uint64_t l, h;
2607 intptr_t i;
2608
2609 if (oprsz <= 8) {
2610 l = compress_bits(n[0] >> odd, esz);
2611 h = compress_bits(m[0] >> odd, esz);
226e6c04 2612 d[0] = l | (h << (4 * oprsz));
d731d8cb
RH
2613 } else {
2614 ARMPredicateReg tmp_m;
2615 intptr_t oprsz_16 = oprsz / 16;
2616
2617 if ((vm - vd) < (uintptr_t)oprsz) {
2618 m = memcpy(&tmp_m, vm, oprsz);
2619 }
2620
2621 for (i = 0; i < oprsz_16; i++) {
2622 l = n[2 * i + 0];
2623 h = n[2 * i + 1];
2624 l = compress_bits(l >> odd, esz);
2625 h = compress_bits(h >> odd, esz);
226e6c04 2626 d[i] = l | (h << 32);
d731d8cb
RH
2627 }
2628
226e6c04
RH
2629 /*
2630 * For VL which is not a multiple of 512, the results from M do not
2631 * align nicely with the uint64_t for D. Put the aligned results
2632 * from M into TMP_M and then copy it into place afterward.
2633 */
d731d8cb 2634 if (oprsz & 15) {
226e6c04
RH
2635 int final_shift = (oprsz & 15) * 2;
2636
2637 l = n[2 * i + 0];
2638 h = n[2 * i + 1];
2639 l = compress_bits(l >> odd, esz);
2640 h = compress_bits(h >> odd, esz);
2641 d[i] = l | (h << final_shift);
d731d8cb
RH
2642
2643 for (i = 0; i < oprsz_16; i++) {
2644 l = m[2 * i + 0];
2645 h = m[2 * i + 1];
2646 l = compress_bits(l >> odd, esz);
2647 h = compress_bits(h >> odd, esz);
226e6c04 2648 tmp_m.p[i] = l | (h << 32);
d731d8cb 2649 }
226e6c04
RH
2650 l = m[2 * i + 0];
2651 h = m[2 * i + 1];
2652 l = compress_bits(l >> odd, esz);
2653 h = compress_bits(h >> odd, esz);
2654 tmp_m.p[i] = l | (h << final_shift);
d731d8cb
RH
2655
2656 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
2657 } else {
2658 for (i = 0; i < oprsz_16; i++) {
2659 l = m[2 * i + 0];
2660 h = m[2 * i + 1];
2661 l = compress_bits(l >> odd, esz);
2662 h = compress_bits(h >> odd, esz);
226e6c04 2663 d[oprsz_16 + i] = l | (h << 32);
d731d8cb
RH
2664 }
2665 }
2666 }
2667}
2668
2669void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
2670{
f9b0fcce
RH
2671 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2672 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
2673 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
d731d8cb
RH
2674 uint64_t *d = vd, *n = vn, *m = vm;
2675 uint64_t mask;
2676 int shr, shl;
2677 intptr_t i;
2678
2679 shl = 1 << esz;
2680 shr = 0;
2681 mask = even_bit_esz_masks[esz];
2682 if (odd) {
2683 mask <<= shl;
2684 shr = shl;
2685 shl = 0;
2686 }
2687
2688 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2689 uint64_t nn = (n[i] & mask) >> shr;
2690 uint64_t mm = (m[i] & mask) << shl;
2691 d[i] = nn + mm;
2692 }
2693}
2694
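/*
 * Worked example of the predicate interleave above for byte elements
 * (esz = 0, esize = 1): the low halves of Pn and Pm are spread out by
 * expand_bits() and interleaved, so Pn supplies the even result bits and
 * Pm the odd ones.  Illustrative only; assumes <assert.h>.
 */
static void zip_p_example(void)
{
    uint64_t nn = expand_bits(0x0f, 0);    /* 0x55: bits 0, 2, 4, 6 */
    uint64_t mm = expand_bits(0x03, 0);    /* 0x05: bits 0, 2       */

    assert((nn | (mm << 1)) == 0x5f);      /* d[0] as computed above */
}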
2695/* Reverse units of 2**N bits. */
2696static uint64_t reverse_bits_64(uint64_t x, int n)
2697{
2698 int i, sh;
2699
2700 x = bswap64(x);
2701 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2702 uint64_t mask = even_bit_esz_masks[i];
2703 x = ((x & mask) << sh) | ((x >> sh) & mask);
2704 }
2705 return x;
2706}
2707
2708static uint8_t reverse_bits_8(uint8_t x, int n)
2709{
2710 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
2711 int i, sh;
2712
2713 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2714 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
2715 }
2716 return x;
2717}
2718
2719void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
2720{
70acaafe
RH
2721 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2722 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
d731d8cb
RH
2723 intptr_t i, oprsz_2 = oprsz / 2;
2724
2725 if (oprsz <= 8) {
2726 uint64_t l = *(uint64_t *)vn;
2727 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
2728 *(uint64_t *)vd = l;
2729 } else if ((oprsz & 15) == 0) {
2730 for (i = 0; i < oprsz_2; i += 8) {
2731 intptr_t ih = oprsz - 8 - i;
2732 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
2733 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
2734 *(uint64_t *)(vd + i) = h;
2735 *(uint64_t *)(vd + ih) = l;
2736 }
2737 } else {
2738 for (i = 0; i < oprsz_2; i += 1) {
2739 intptr_t il = H1(i);
2740 intptr_t ih = H1(oprsz - 1 - i);
2741 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
2742 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
2743 *(uint8_t *)(vd + il) = h;
2744 *(uint8_t *)(vd + ih) = l;
2745 }
2746 }
2747}
2748
2749void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
2750{
70acaafe
RH
2751 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2752 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
d731d8cb
RH
2753 uint64_t *d = vd;
2754 intptr_t i;
2755
2756 if (oprsz <= 8) {
2757 uint64_t nn = *(uint64_t *)vn;
2758 int half = 4 * oprsz;
2759
2760 nn = extract64(nn, high * half, half);
2761 nn = expand_bits(nn, 0);
2762 d[0] = nn;
2763 } else {
2764 ARMPredicateReg tmp_n;
2765
2766 /* We produce output faster than we consume input.
2767 Therefore we must be mindful of possible overlap. */
2768 if ((vn - vd) < (uintptr_t)oprsz) {
2769 vn = memcpy(&tmp_n, vn, oprsz);
2770 }
2771 if (high) {
2772 high = oprsz >> 1;
2773 }
2774
fd911a21 2775 if ((oprsz & 7) == 0) {
d731d8cb
RH
2776 uint32_t *n = vn;
2777 high >>= 2;
2778
fd911a21 2779 for (i = 0; i < oprsz / 8; i++) {
d731d8cb
RH
2780 uint64_t nn = n[H4(high + i)];
2781 d[i] = expand_bits(nn, 0);
2782 }
2783 } else {
2784 uint16_t *d16 = vd;
2785 uint8_t *n = vn;
2786
2787 for (i = 0; i < oprsz / 2; i++) {
2788 uint16_t nn = n[H1(high + i)];
2789 d16[H2(i)] = expand_bits(nn, 0);
2790 }
2791 }
2792 }
2793}
234b48e9
RH
2794
2795#define DO_ZIP(NAME, TYPE, H) \
2796void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2797{ \
2798 intptr_t oprsz = simd_oprsz(desc); \
2799 intptr_t i, oprsz_2 = oprsz / 2; \
2800 ARMVectorReg tmp_n, tmp_m; \
2801 /* We produce output faster than we consume input. \
2802 Therefore we must be mindful of possible overlap. */ \
2803 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
2804 vn = memcpy(&tmp_n, vn, oprsz_2); \
2805 } \
2806 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2807 vm = memcpy(&tmp_m, vm, oprsz_2); \
2808 } \
2809 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2810 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
2811 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
2812 } \
2813}
2814
2815DO_ZIP(sve_zip_b, uint8_t, H1)
2816DO_ZIP(sve_zip_h, uint16_t, H1_2)
2817DO_ZIP(sve_zip_s, uint32_t, H1_4)
2818DO_ZIP(sve_zip_d, uint64_t, )
2819
2820#define DO_UZP(NAME, TYPE, H) \
2821void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2822{ \
2823 intptr_t oprsz = simd_oprsz(desc); \
2824 intptr_t oprsz_2 = oprsz / 2; \
2825 intptr_t odd_ofs = simd_data(desc); \
2826 intptr_t i; \
2827 ARMVectorReg tmp_m; \
2828 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2829 vm = memcpy(&tmp_m, vm, oprsz); \
2830 } \
2831 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2832 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs)); \
2833 } \
2834 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2835 *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
2836 } \
2837}
2838
2839DO_UZP(sve_uzp_b, uint8_t, H1)
2840DO_UZP(sve_uzp_h, uint16_t, H1_2)
2841DO_UZP(sve_uzp_s, uint32_t, H1_4)
2842DO_UZP(sve_uzp_d, uint64_t, )
2843
2844#define DO_TRN(NAME, TYPE, H) \
2845void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2846{ \
2847 intptr_t oprsz = simd_oprsz(desc); \
2848 intptr_t odd_ofs = simd_data(desc); \
2849 intptr_t i; \
2850 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
2851 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
2852 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
2853 *(TYPE *)(vd + H(i + 0)) = ae; \
2854 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
2855 } \
2856}
2857
2858DO_TRN(sve_trn_b, uint8_t, H1)
2859DO_TRN(sve_trn_h, uint16_t, H1_2)
2860DO_TRN(sve_trn_s, uint32_t, H1_4)
2861DO_TRN(sve_trn_d, uint64_t, )
2862
2863#undef DO_ZIP
2864#undef DO_UZP
2865#undef DO_TRN
3ca879ae
RH
2866
2867void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
2868{
2869 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
2870 uint32_t *d = vd, *n = vn;
2871 uint8_t *pg = vg;
2872
2873 for (i = j = 0; i < opr_sz; i++) {
2874 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
2875 d[H4(j)] = n[H4(i)];
2876 j++;
2877 }
2878 }
2879 for (; j < opr_sz; j++) {
2880 d[H4(j)] = 0;
2881 }
2882}
2883
2884void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
2885{
2886 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
2887 uint64_t *d = vd, *n = vn;
2888 uint8_t *pg = vg;
2889
2890 for (i = j = 0; i < opr_sz; i++) {
2891 if (pg[H1(i)] & 1) {
2892 d[j] = n[i];
2893 j++;
2894 }
2895 }
2896 for (; j < opr_sz; j++) {
2897 d[j] = 0;
2898 }
2899}
ef23cb72
RH
2900
2901/* Similar to the ARM LastActiveElement pseudocode function, except the
2902 * result is multiplied by the element size. This includes the not found
2903 * indication; e.g. not found for esz=3 is -8.
2904 */
2905int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
2906{
2acbfbe4
RH
2907 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
2908 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
ef23cb72 2909
2acbfbe4 2910 return last_active_element(vg, words, esz);
ef23cb72 2911}
b48ff240
RH
2912
2913void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
2914{
2915 intptr_t opr_sz = simd_oprsz(desc) / 8;
2916 int esz = simd_data(desc);
2917 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
2918 intptr_t i, first_i, last_i;
2919 ARMVectorReg tmp;
2920
2921 first_i = last_i = 0;
2922 first_g = last_g = 0;
2923
2924 /* Find the extent of the active elements within VG. */
2925 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
2926 pg = *(uint64_t *)(vg + i) & mask;
2927 if (pg) {
2928 if (last_g == 0) {
2929 last_g = pg;
2930 last_i = i;
2931 }
2932 first_g = pg;
2933 first_i = i;
2934 }
2935 }
2936
2937 len = 0;
2938 if (first_g != 0) {
2939 first_i = first_i * 8 + ctz64(first_g);
2940 last_i = last_i * 8 + 63 - clz64(last_g);
2941 len = last_i - first_i + (1 << esz);
2942 if (vd == vm) {
2943 vm = memcpy(&tmp, vm, opr_sz * 8);
2944 }
2945 swap_memmove(vd, vn + first_i, len);
2946 }
2947 swap_memmove(vd + len, vm, opr_sz * 8 - len);
2948}
d3fe4a29
RH
2949
2950void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
2951 void *vg, uint32_t desc)
2952{
2953 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2954 uint64_t *d = vd, *n = vn, *m = vm;
2955 uint8_t *pg = vg;
2956
2957 for (i = 0; i < opr_sz; i += 1) {
2958 uint64_t nn = n[i], mm = m[i];
2959 uint64_t pp = expand_pred_b(pg[H1(i)]);
2960 d[i] = (nn & pp) | (mm & ~pp);
2961 }
2962}
2963
2964void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
2965 void *vg, uint32_t desc)
2966{
2967 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2968 uint64_t *d = vd, *n = vn, *m = vm;
2969 uint8_t *pg = vg;
2970
2971 for (i = 0; i < opr_sz; i += 1) {
2972 uint64_t nn = n[i], mm = m[i];
2973 uint64_t pp = expand_pred_h(pg[H1(i)]);
2974 d[i] = (nn & pp) | (mm & ~pp);
2975 }
2976}
2977
2978void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
2979 void *vg, uint32_t desc)
2980{
2981 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2982 uint64_t *d = vd, *n = vn, *m = vm;
2983 uint8_t *pg = vg;
2984
2985 for (i = 0; i < opr_sz; i += 1) {
2986 uint64_t nn = n[i], mm = m[i];
2987 uint64_t pp = expand_pred_s(pg[H1(i)]);
2988 d[i] = (nn & pp) | (mm & ~pp);
2989 }
2990}
2991
2992void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
2993 void *vg, uint32_t desc)
2994{
2995 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2996 uint64_t *d = vd, *n = vn, *m = vm;
2997 uint8_t *pg = vg;
2998
2999 for (i = 0; i < opr_sz; i += 1) {
3000 uint64_t nn = n[i], mm = m[i];
3001 d[i] = (pg[H1(i)] & 1 ? nn : mm);
3002 }
3003}
757f9cff
RH
3004
3005/* Two operand comparison controlled by a predicate.
3006 * ??? It is very tempting to want to be able to expand this inline
3007 * with x86 instructions, e.g.
3008 *
3009 * vcmpeqw zm, zn, %ymm0
3010 * vpmovmskb %ymm0, %eax
3011 * and $0x5555, %eax
3012 * and pg, %eax
3013 *
3014 * or even aarch64, e.g.
3015 *
3016 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
3017 * cmeq v0.8h, zn, zm
3018 * and v0.8h, v0.8h, mask
3019 * addv h0, v0.8h
3020 * and v0.8b, pg
3021 *
3022 * However, coming up with an abstraction that allows vector inputs and
3023 * a scalar output, and also handles the byte-ordering of sub-uint64_t
3024 * scalar outputs, is tricky.
3025 */
3026#define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
3027uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3028{ \
3029 intptr_t opr_sz = simd_oprsz(desc); \
3030 uint32_t flags = PREDTEST_INIT; \
3031 intptr_t i = opr_sz; \
3032 do { \
3033 uint64_t out = 0, pg; \
3034 do { \
3035 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3036 TYPE nn = *(TYPE *)(vn + H(i)); \
3037 TYPE mm = *(TYPE *)(vm + H(i)); \
3038 out |= nn OP mm; \
3039 } while (i & 63); \
3040 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3041 out &= pg; \
3042 *(uint64_t *)(vd + (i >> 3)) = out; \
3043 flags = iter_predtest_bwd(out, pg, flags); \
3044 } while (i > 0); \
3045 return flags; \
3046}
3047
3048#define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
3049 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3050#define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
3051 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3052#define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
3053 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3054#define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
3055 DO_CMP_PPZZ(NAME, TYPE, OP, , 0x0101010101010101ull)
3056
3057DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
3058DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
3059DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
3060DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
3061
3062DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
3063DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
3064DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
3065DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
3066
3067DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
3068DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
3069DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
3070DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
3071
3072DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
3073DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
3074DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
3075DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
3076
3077DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
3078DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
3079DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
3080DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
3081
3082DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
3083DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
3084DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
3085DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
3086
3087#undef DO_CMP_PPZZ_B
3088#undef DO_CMP_PPZZ_H
3089#undef DO_CMP_PPZZ_S
3090#undef DO_CMP_PPZZ_D
3091#undef DO_CMP_PPZZ
3092
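/*
 * A scalar sketch of the predicate layout the _h comparisons above produce:
 * each 16-bit element yields one result bit at the bit index of its first
 * byte, which is why DO_CMP_PPZZ_H masks the guard with 0x5555...  The
 * PREDTEST flag computation is omitted here.  Assumes <stdint.h> and
 * <string.h>, and ignores the host-endian H*() adjustments.
 */
static void cmpgt_h_sketch(uint8_t *pd, const int16_t *n, const int16_t *m,
                           const uint8_t *pg, intptr_t opr_sz)
{
    intptr_t i;

    memset(pd, 0, (opr_sz + 7) / 8);        /* false and inactive lanes -> 0 */
    for (i = 0; i < opr_sz; i += 2) {
        if ((pg[i >> 3] & (1 << (i & 7))) && n[i >> 1] > m[i >> 1]) {
            pd[i >> 3] |= 1 << (i & 7);     /* one bit per 2-byte element */
        }
    }
}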
3093/* Similar, but the second source is "wide". */
3094#define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
3095uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3096{ \
3097 intptr_t opr_sz = simd_oprsz(desc); \
3098 uint32_t flags = PREDTEST_INIT; \
3099 intptr_t i = opr_sz; \
3100 do { \
3101 uint64_t out = 0, pg; \
3102 do { \
3103 TYPEW mm = *(TYPEW *)(vm + i - 8); \
3104 do { \
3105 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3106 TYPE nn = *(TYPE *)(vn + H(i)); \
3107 out |= nn OP mm; \
3108 } while (i & 7); \
3109 } while (i & 63); \
3110 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3111 out &= pg; \
3112 *(uint64_t *)(vd + (i >> 3)) = out; \
3113 flags = iter_predtest_bwd(out, pg, flags); \
3114 } while (i > 0); \
3115 return flags; \
3116}
3117
3118#define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
3119 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
3120#define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
3121 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
3122#define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
3123 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
3124
df4e0010
RH
3125DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
3126DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
3127DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
757f9cff 3128
df4e0010
RH
3129DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
3130DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
3131DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
757f9cff
RH
3132
3133DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
3134DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
3135DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
3136
3137DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
3138DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
3139DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
3140
3141DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
3142DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
3143DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
3144
3145DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
3146DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
3147DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
3148
3149DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
3150DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
3151DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
3152
3153DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
3154DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
3155DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
3156
3157DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
3158DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
3159DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
3160
3161DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
3162DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
3163DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
3164
3165#undef DO_CMP_PPZW_B
3166#undef DO_CMP_PPZW_H
3167#undef DO_CMP_PPZW_S
3168#undef DO_CMP_PPZW
38cadeba
RH
3169
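/*
 * Scalar sketch of the "wide" comparisons above, for byte elements: every
 * byte of Zn within a 64-bit chunk is compared against the one 64-bit
 * element of Zm covering that chunk, sign-extending for the signed forms.
 * Flags are omitted as in the previous sketch; assumes <stdint.h> and
 * <string.h>.
 */
static void cmpgt_wide_b_sketch(uint8_t *pd, const int8_t *n, const int64_t *m,
                                const uint8_t *pg, intptr_t opr_sz)
{
    intptr_t i;

    memset(pd, 0, (opr_sz + 7) / 8);
    for (i = 0; i < opr_sz; ++i) {
        if ((pg[i >> 3] & (1 << (i & 7))) && (int64_t)n[i] > m[i >> 3]) {
            pd[i >> 3] |= 1 << (i & 7);
        }
    }
}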
3170/* Similar, but the second source is immediate. */
3171#define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
3172uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
3173{ \
3174 intptr_t opr_sz = simd_oprsz(desc); \
3175 uint32_t flags = PREDTEST_INIT; \
3176 TYPE mm = simd_data(desc); \
3177 intptr_t i = opr_sz; \
3178 do { \
3179 uint64_t out = 0, pg; \
3180 do { \
3181 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3182 TYPE nn = *(TYPE *)(vn + H(i)); \
3183 out |= nn OP mm; \
3184 } while (i & 63); \
3185 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3186 out &= pg; \
3187 *(uint64_t *)(vd + (i >> 3)) = out; \
3188 flags = iter_predtest_bwd(out, pg, flags); \
3189 } while (i > 0); \
3190 return flags; \
3191}
3192
3193#define DO_CMP_PPZI_B(NAME, TYPE, OP) \
3194 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3195#define DO_CMP_PPZI_H(NAME, TYPE, OP) \
3196 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3197#define DO_CMP_PPZI_S(NAME, TYPE, OP) \
3198 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3199#define DO_CMP_PPZI_D(NAME, TYPE, OP) \
3200 DO_CMP_PPZI(NAME, TYPE, OP, , 0x0101010101010101ull)
3201
3202DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
3203DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
3204DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
3205DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
3206
3207DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
3208DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
3209DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
3210DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
3211
3212DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
3213DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
3214DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
3215DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
3216
3217DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
3218DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
3219DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
3220DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
3221
3222DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
3223DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
3224DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
3225DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
3226
3227DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
3228DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
3229DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
3230DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
3231
3232DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
3233DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
3234DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
3235DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
3236
3237DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
3238DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
3239DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
3240DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
3241
3242DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
3243DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
3244DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
3245DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
3246
3247DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
3248DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
3249DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
3250DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
3251
3252#undef DO_CMP_PPZI_B
3253#undef DO_CMP_PPZI_H
3254#undef DO_CMP_PPZI_S
3255#undef DO_CMP_PPZI_D
3256#undef DO_CMP_PPZI
35da316f
RH
3257
3258/* Similar to the ARM LastActive pseudocode function. */
3259static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
3260{
3261 intptr_t i;
3262
3263 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
3264 uint64_t pg = *(uint64_t *)(vg + i);
3265 if (pg) {
3266 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
3267 }
3268 }
3269 return 0;
3270}
3271
3272/* Compute a mask into RETB that is true for all G, up to and including
3273 * (if after) or excluding (if !after) the first G & N.
3274 * Return true if BRK found.
3275 */
3276static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
3277 bool brk, bool after)
3278{
3279 uint64_t b;
3280
3281 if (brk) {
3282 b = 0;
3283 } else if ((g & n) == 0) {
3284 /* For all G, no N are set; break not found. */
3285 b = g;
3286 } else {
3287 /* Break somewhere in N. Locate it. */
3288 b = g & n; /* guard true, pred true */
3289 b = b & -b; /* first such */
3290 if (after) {
3291 b = b | (b - 1); /* break after same */
3292 } else {
3293 b = b - 1; /* break before same */
3294 }
3295 brk = true;
3296 }
3297
3298 *retb = b;
3299 return brk;
3300}
3301
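/*
 * Worked example of compute_brk() above with an 8-bit slice of guard and
 * flag bits: guard g = 0xff, first N bit at bit 4.  "Break after" keeps
 * bits 0-4 of the guard, "break before" keeps bits 0-3, and once a break
 * has been found every later word produces 0.  Illustrative only; assumes
 * <assert.h>.
 */
static void compute_brk_example(void)
{
    uint64_t b;

    assert(compute_brk(&b, 0x10, 0xff, false, true) && b == 0x1f);   /* BRKA */
    assert(compute_brk(&b, 0x10, 0xff, false, false) && b == 0x0f);  /* BRKB */
    assert(compute_brk(&b, 0x10, 0xff, true, true) && b == 0);       /* already broken */

    /* No N bit set within the guard: no break, the guard passes through. */
    assert(!compute_brk(&b, 0x00, 0xff, false, true) && b == 0xff);
}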
3302/* Compute a zeroing BRK. */
3303static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
3304 intptr_t oprsz, bool after)
3305{
3306 bool brk = false;
3307 intptr_t i;
3308
3309 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3310 uint64_t this_b, this_g = g[i];
3311
3312 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3313 d[i] = this_b & this_g;
3314 }
3315}
3316
3317/* Likewise, but also compute flags. */
3318static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
3319 intptr_t oprsz, bool after)
3320{
3321 uint32_t flags = PREDTEST_INIT;
3322 bool brk = false;
3323 intptr_t i;
3324
3325 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3326 uint64_t this_b, this_d, this_g = g[i];
3327
3328 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3329 d[i] = this_d = this_b & this_g;
3330 flags = iter_predtest_fwd(this_d, this_g, flags);
3331 }
3332 return flags;
3333}
3334
3335/* Compute a merging BRK. */
3336static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
3337 intptr_t oprsz, bool after)
3338{
3339 bool brk = false;
3340 intptr_t i;
3341
3342 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3343 uint64_t this_b, this_g = g[i];
3344
3345 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3346 d[i] = (this_b & this_g) | (d[i] & ~this_g);
3347 }
3348}
3349
3350/* Likewise, but also compute flags. */
3351static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
3352 intptr_t oprsz, bool after)
3353{
3354 uint32_t flags = PREDTEST_INIT;
3355 bool brk = false;
3356 intptr_t i;
3357
 3358 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3359 uint64_t this_b, this_d = d[i], this_g = g[i];
3360
3361 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3362 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
3363 flags = iter_predtest_fwd(this_d, this_g, flags);
3364 }
3365 return flags;
3366}
3367
3368static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
3369{
 3370 /* It is quicker to zero the whole predicate than to loop on OPRSZ.
3371 * The compiler should turn this into 4 64-bit integer stores.
3372 */
3373 memset(d, 0, sizeof(ARMPredicateReg));
3374 return PREDTEST_INIT;
3375}
3376
3377void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
3378 uint32_t pred_desc)
3379{
04c774a2 3380 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3381 if (last_active_pred(vn, vg, oprsz)) {
3382 compute_brk_z(vd, vm, vg, oprsz, true);
3383 } else {
3384 do_zero(vd, oprsz);
3385 }
3386}
3387
3388uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
3389 uint32_t pred_desc)
3390{
04c774a2 3391 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3392 if (last_active_pred(vn, vg, oprsz)) {
3393 return compute_brks_z(vd, vm, vg, oprsz, true);
3394 } else {
3395 return do_zero(vd, oprsz);
3396 }
3397}
3398
3399void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
3400 uint32_t pred_desc)
3401{
04c774a2 3402 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3403 if (last_active_pred(vn, vg, oprsz)) {
3404 compute_brk_z(vd, vm, vg, oprsz, false);
3405 } else {
3406 do_zero(vd, oprsz);
3407 }
3408}
3409
3410uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
3411 uint32_t pred_desc)
3412{
04c774a2 3413 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3414 if (last_active_pred(vn, vg, oprsz)) {
3415 return compute_brks_z(vd, vm, vg, oprsz, false);
3416 } else {
3417 return do_zero(vd, oprsz);
3418 }
3419}
3420
3421void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3422{
04c774a2 3423 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3424 compute_brk_z(vd, vn, vg, oprsz, true);
3425}
3426
3427uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3428{
04c774a2 3429 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3430 return compute_brks_z(vd, vn, vg, oprsz, true);
3431}
3432
3433void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3434{
04c774a2 3435 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3436 compute_brk_z(vd, vn, vg, oprsz, false);
3437}
3438
3439uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3440{
04c774a2 3441 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3442 return compute_brks_z(vd, vn, vg, oprsz, false);
3443}
3444
3445void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3446{
04c774a2 3447 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3448 compute_brk_m(vd, vn, vg, oprsz, true);
3449}
3450
3451uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3452{
04c774a2 3453 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3454 return compute_brks_m(vd, vn, vg, oprsz, true);
3455}
3456
3457void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3458{
04c774a2 3459 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3460 compute_brk_m(vd, vn, vg, oprsz, false);
3461}
3462
3463uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3464{
04c774a2 3465 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3466 return compute_brks_m(vd, vn, vg, oprsz, false);
3467}
3468
3469void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3470{
04c774a2 3471 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3472 if (!last_active_pred(vn, vg, oprsz)) {
3473 do_zero(vd, oprsz);
3474 }
3475}
3476
3477/* As if PredTest(Ones(PL), D, esz). */
3478static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
3479 uint64_t esz_mask)
3480{
3481 uint32_t flags = PREDTEST_INIT;
3482 intptr_t i;
3483
3484 for (i = 0; i < oprsz / 8; i++) {
3485 flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
3486 }
3487 if (oprsz & 7) {
3488 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
3489 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
3490 }
3491 return flags;
3492}
3493
3494uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3495{
04c774a2 3496 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3497 if (last_active_pred(vn, vg, oprsz)) {
3498 return predtest_ones(vd, oprsz, -1);
3499 } else {
3500 return do_zero(vd, oprsz);
3501 }
3502}
9ee3a611
RH
3503
3504uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
3505{
f556a201
RH
3506 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
3507 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
9ee3a611
RH
3508 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
3509 intptr_t i;
3510
f556a201 3511 for (i = 0; i < words; ++i) {
9ee3a611
RH
3512 uint64_t t = n[i] & g[i] & mask;
3513 sum += ctpop64(t);
3514 }
3515 return sum;
3516}
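/*
 * pred_esz_masks[esz] keeps only one controlling bit per element, so each
 * active element contributes exactly one set bit to the ctpop64 sum;
 * e.g. for esz == 3 the mask is 0x0101010101010101.
 */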
caf1cefc
RH
3517
3518uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc)
3519{
e610906c
RH
3520 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3521 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
caf1cefc
RH
3522 uint64_t esz_mask = pred_esz_masks[esz];
3523 ARMPredicateReg *d = vd;
3524 uint32_t flags;
3525 intptr_t i;
3526
3527 /* Begin with a zero predicate register. */
3528 flags = do_zero(d, oprsz);
3529 if (count == 0) {
3530 return flags;
3531 }
3532
caf1cefc
RH
3533 /* Set all of the requested bits. */
3534 for (i = 0; i < count / 64; ++i) {
3535 d->p[i] = esz_mask;
3536 }
3537 if (count & 63) {
3538 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
3539 }
3540
3541 return predtest_ones(d, oprsz, esz_mask);
3542}
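/*
 * For example, count == 12 with esz == 2 (esz_mask 0x1111111111111111)
 * sets d->p[0] = 0x111, i.e. the controlling bit of the first three
 * 32-bit elements, and the PredTest flags are then computed over that
 * result.
 */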
c4e7c493 3543
23fbe79f
RH
3544/* Recursive reduction on a function;
3545 * Cf. the ARM ARM function ReducePredicated.
3546 *
3547 * While it would be possible to write this without the DATA temporary,
3548 * it is much simpler to process the predicate register this way.
3549 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
3550 * little to gain with a more complex non-recursive form.
3551 */
3552#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
3553static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
3554{ \
3555 if (n == 1) { \
3556 return *data; \
3557 } else { \
3558 uintptr_t half = n / 2; \
3559 TYPE lo = NAME##_reduce(data, status, half); \
3560 TYPE hi = NAME##_reduce(data + half, status, half); \
3561 return TYPE##_##FUNC(lo, hi, status); \
3562 } \
3563} \
3564uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
3565{ \
c648c9b7 3566 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \
23fbe79f
RH
3567 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
3568 for (i = 0; i < oprsz; ) { \
3569 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
3570 do { \
3571 TYPE nn = *(TYPE *)(vn + H(i)); \
3572 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
3573 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
3574 } while (i & 15); \
3575 } \
3576 for (; i < maxsz; i += sizeof(TYPE)) { \
3577 *(TYPE *)((void *)data + i) = IDENT; \
3578 } \
3579 return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
3580}
3581
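/*
 * MAXSZ is expected to be a power of two here, so the recursion forms a
 * balanced binary tree: for eight lanes the result is
 * ((e0 op e1) op (e2 op e3)) op ((e4 op e5) op (e6 op e7)), with inactive
 * lanes and the tail beyond OPRSZ replaced by IDENT so they do not
 * perturb the result.
 */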
3582DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
3583DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
3584DO_REDUCE(sve_faddv_d, float64, , add, float64_zero)
3585
3586/* Identity is floatN_default_nan, without the function call. */
3587DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
3588DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
3589DO_REDUCE(sve_fminnmv_d, float64, , minnum, 0x7FF8000000000000ULL)
3590
3591DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
3592DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
3593DO_REDUCE(sve_fmaxnmv_d, float64, , maxnum, 0x7FF8000000000000ULL)
3594
3595DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
3596DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
3597DO_REDUCE(sve_fminv_d, float64, , min, float64_infinity)
3598
3599DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
3600DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
3601DO_REDUCE(sve_fmaxv_d, float64, , max, float64_chs(float64_infinity))
3602
3603#undef DO_REDUCE
3604
7f9ddf64
RH
3605uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
3606 void *status, uint32_t desc)
3607{
3608 intptr_t i = 0, opr_sz = simd_oprsz(desc);
3609 float16 result = nn;
3610
3611 do {
3612 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
3613 do {
3614 if (pg & 1) {
3615 float16 mm = *(float16 *)(vm + H1_2(i));
3616 result = float16_add(result, mm, status);
3617 }
3618 i += sizeof(float16), pg >>= sizeof(float16);
3619 } while (i & 15);
3620 } while (i < opr_sz);
3621
3622 return result;
3623}
3624
3625uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
3626 void *status, uint32_t desc)
3627{
3628 intptr_t i = 0, opr_sz = simd_oprsz(desc);
3629 float32 result = nn;
3630
3631 do {
3632 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
3633 do {
3634 if (pg & 1) {
3635 float32 mm = *(float32 *)(vm + H1_2(i));
3636 result = float32_add(result, mm, status);
3637 }
3638 i += sizeof(float32), pg >>= sizeof(float32);
3639 } while (i & 15);
3640 } while (i < opr_sz);
3641
3642 return result;
3643}
3644
3645uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
3646 void *status, uint32_t desc)
3647{
3648 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
3649 uint64_t *m = vm;
3650 uint8_t *pg = vg;
3651
3652 for (i = 0; i < opr_sz; i++) {
3653 if (pg[H1(i)] & 1) {
3654 nn = float64_add(nn, m[i], status);
3655 }
3656 }
3657
3658 return nn;
3659}
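/*
 * Unlike the FADDV reduction above, FADDA accumulates strictly in element
 * order, so for non-associative floating-point addition the result can
 * differ from a tree reduction over the same operands.
 */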
3660
ec3b87c2
RH
3661/* Fully general three-operand expander, controlled by a predicate,
3662 * with the extra float_status parameter.
3663 */
3664#define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
3665void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
3666 void *status, uint32_t desc) \
3667{ \
3668 intptr_t i = simd_oprsz(desc); \
3669 uint64_t *g = vg; \
3670 do { \
3671 uint64_t pg = g[(i - 1) >> 6]; \
3672 do { \
3673 i -= sizeof(TYPE); \
3674 if (likely((pg >> (i & 63)) & 1)) { \
3675 TYPE nn = *(TYPE *)(vn + H(i)); \
3676 TYPE mm = *(TYPE *)(vm + H(i)); \
3677 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
3678 } \
3679 } while (i & 63); \
3680 } while (i != 0); \
3681}
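/*
 * The governing predicate supplies one bit per vector byte, so byte
 * offset I is controlled by bit (I & 63) of predicate word I / 64; the
 * loop above therefore consumes one 64-bit predicate word (64 vector
 * bytes) per outer iteration, walking from the top of the vector down.
 */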
3682
3683DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
3684DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
3685DO_ZPZZ_FP(sve_fadd_d, uint64_t, , float64_add)
3686
3687DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
3688DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
3689DO_ZPZZ_FP(sve_fsub_d, uint64_t, , float64_sub)
3690
3691DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
3692DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
3693DO_ZPZZ_FP(sve_fmul_d, uint64_t, , float64_mul)
3694
3695DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
3696DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
3697DO_ZPZZ_FP(sve_fdiv_d, uint64_t, , float64_div)
3698
3699DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
3700DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
3701DO_ZPZZ_FP(sve_fmin_d, uint64_t, , float64_min)
3702
3703DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
3704DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
3705DO_ZPZZ_FP(sve_fmax_d, uint64_t, , float64_max)
3706
3707DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
3708DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
3709DO_ZPZZ_FP(sve_fminnum_d, uint64_t, , float64_minnum)
3710
3711DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
3712DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
3713DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, , float64_maxnum)
3714
3715static inline float16 abd_h(float16 a, float16 b, float_status *s)
3716{
3717 return float16_abs(float16_sub(a, b, s));
3718}
3719
3720static inline float32 abd_s(float32 a, float32 b, float_status *s)
3721{
3722 return float32_abs(float32_sub(a, b, s));
3723}
3724
3725static inline float64 abd_d(float64 a, float64 b, float_status *s)
3726{
3727 return float64_abs(float64_sub(a, b, s));
3728}
3729
3730DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
3731DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
3732DO_ZPZZ_FP(sve_fabd_d, uint64_t, , abd_d)
3733
3734static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
3735{
3736 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
3737 return float64_scalbn(a, b_int, s);
3738}
3739
3740DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
3741DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
3742DO_ZPZZ_FP(sve_fscalbn_d, int64_t, , scalbn_d)
3743
3744DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
3745DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
3746DO_ZPZZ_FP(sve_fmulx_d, uint64_t, , helper_vfp_mulxd)
3747
3748#undef DO_ZPZZ_FP
3749
cc48affe
RH
3750/* Three-operand expander, with one scalar operand, controlled by
3751 * a predicate, with the extra float_status parameter.
3752 */
3753#define DO_ZPZS_FP(NAME, TYPE, H, OP) \
3754void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
3755 void *status, uint32_t desc) \
3756{ \
3757 intptr_t i = simd_oprsz(desc); \
3758 uint64_t *g = vg; \
3759 TYPE mm = scalar; \
3760 do { \
3761 uint64_t pg = g[(i - 1) >> 6]; \
3762 do { \
3763 i -= sizeof(TYPE); \
3764 if (likely((pg >> (i & 63)) & 1)) { \
3765 TYPE nn = *(TYPE *)(vn + H(i)); \
3766 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
3767 } \
3768 } while (i & 63); \
3769 } while (i != 0); \
3770}
3771
3772DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
3773DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
3774DO_ZPZS_FP(sve_fadds_d, float64, , float64_add)
3775
3776DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
3777DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
3778DO_ZPZS_FP(sve_fsubs_d, float64, , float64_sub)
3779
3780DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
3781DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
3782DO_ZPZS_FP(sve_fmuls_d, float64, , float64_mul)
3783
3784static inline float16 subr_h(float16 a, float16 b, float_status *s)
3785{
3786 return float16_sub(b, a, s);
3787}
3788
3789static inline float32 subr_s(float32 a, float32 b, float_status *s)
3790{
3791 return float32_sub(b, a, s);
3792}
3793
3794static inline float64 subr_d(float64 a, float64 b, float_status *s)
3795{
3796 return float64_sub(b, a, s);
3797}
3798
3799DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
3800DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
3801DO_ZPZS_FP(sve_fsubrs_d, float64, , subr_d)
3802
3803DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
3804DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
3805DO_ZPZS_FP(sve_fmaxnms_d, float64, , float64_maxnum)
3806
3807DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
3808DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
3809DO_ZPZS_FP(sve_fminnms_d, float64, , float64_minnum)
3810
3811DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
3812DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
3813DO_ZPZS_FP(sve_fmaxs_d, float64, , float64_max)
3814
3815DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
3816DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
3817DO_ZPZS_FP(sve_fmins_d, float64, , float64_min)
3818
8092c6a3
RH
3819/* Fully general two-operand expander, controlled by a predicate,
3820 * with the extra float_status parameter.
3821 */
3822#define DO_ZPZ_FP(NAME, TYPE, H, OP) \
3823void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
3824{ \
3825 intptr_t i = simd_oprsz(desc); \
3826 uint64_t *g = vg; \
3827 do { \
3828 uint64_t pg = g[(i - 1) >> 6]; \
3829 do { \
3830 i -= sizeof(TYPE); \
3831 if (likely((pg >> (i & 63)) & 1)) { \
3832 TYPE nn = *(TYPE *)(vn + H(i)); \
3833 *(TYPE *)(vd + H(i)) = OP(nn, status); \
3834 } \
3835 } while (i & 63); \
3836 } while (i != 0); \
3837}
3838
46d33d1e
RH
3839/* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
3840 * FZ16. When converting from fp16, this affects flushing input denormals;
3841 * when converting to fp16, this affects flushing output denormals.
3842 */
3843static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
3844{
c120391c 3845 bool save = get_flush_inputs_to_zero(fpst);
46d33d1e
RH
3846 float32 ret;
3847
3848 set_flush_inputs_to_zero(false, fpst);
3849 ret = float16_to_float32(f, true, fpst);
3850 set_flush_inputs_to_zero(save, fpst);
3851 return ret;
3852}
3853
3854static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
3855{
c120391c 3856 bool save = get_flush_inputs_to_zero(fpst);
46d33d1e
RH
3857 float64 ret;
3858
3859 set_flush_inputs_to_zero(false, fpst);
3860 ret = float16_to_float64(f, true, fpst);
3861 set_flush_inputs_to_zero(save, fpst);
3862 return ret;
3863}
3864
3865static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
3866{
c120391c 3867 bool save = get_flush_to_zero(fpst);
46d33d1e
RH
3868 float16 ret;
3869
3870 set_flush_to_zero(false, fpst);
3871 ret = float32_to_float16(f, true, fpst);
3872 set_flush_to_zero(save, fpst);
3873 return ret;
3874}
3875
3876static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
3877{
c120391c 3878 bool save = get_flush_to_zero(fpst);
46d33d1e
RH
3879 float16 ret;
3880
3881 set_flush_to_zero(false, fpst);
3882 ret = float64_to_float16(f, true, fpst);
3883 set_flush_to_zero(save, fpst);
3884 return ret;
3885}
3886
df4de1af
RH
3887static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
3888{
3889 if (float16_is_any_nan(f)) {
3890 float_raise(float_flag_invalid, s);
3891 return 0;
3892 }
3893 return float16_to_int16_round_to_zero(f, s);
3894}
3895
3896static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
3897{
3898 if (float16_is_any_nan(f)) {
3899 float_raise(float_flag_invalid, s);
3900 return 0;
3901 }
3902 return float16_to_int64_round_to_zero(f, s);
3903}
3904
3905static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
3906{
3907 if (float32_is_any_nan(f)) {
3908 float_raise(float_flag_invalid, s);
3909 return 0;
3910 }
3911 return float32_to_int64_round_to_zero(f, s);
3912}
3913
3914static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
3915{
3916 if (float64_is_any_nan(f)) {
3917 float_raise(float_flag_invalid, s);
3918 return 0;
3919 }
3920 return float64_to_int64_round_to_zero(f, s);
3921}
3922
3923static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
3924{
3925 if (float16_is_any_nan(f)) {
3926 float_raise(float_flag_invalid, s);
3927 return 0;
3928 }
3929 return float16_to_uint16_round_to_zero(f, s);
3930}
3931
3932static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
3933{
3934 if (float16_is_any_nan(f)) {
3935 float_raise(float_flag_invalid, s);
3936 return 0;
3937 }
3938 return float16_to_uint64_round_to_zero(f, s);
3939}
3940
3941static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
3942{
3943 if (float32_is_any_nan(f)) {
3944 float_raise(float_flag_invalid, s);
3945 return 0;
3946 }
3947 return float32_to_uint64_round_to_zero(f, s);
3948}
3949
3950static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
3951{
3952 if (float64_is_any_nan(f)) {
3953 float_raise(float_flag_invalid, s);
3954 return 0;
3955 }
3956 return float64_to_uint64_round_to_zero(f, s);
3957}
3958
46d33d1e
RH
3959DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
3960DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
3961DO_ZPZ_FP(sve_fcvt_dh, uint64_t, , sve_f64_to_f16)
3962DO_ZPZ_FP(sve_fcvt_hd, uint64_t, , sve_f16_to_f64)
3963DO_ZPZ_FP(sve_fcvt_ds, uint64_t, , float64_to_float32)
3964DO_ZPZ_FP(sve_fcvt_sd, uint64_t, , float32_to_float64)
3965
df4de1af
RH
3966DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
3967DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
3968DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
3969DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, , vfp_float16_to_int64_rtz)
3970DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, , vfp_float32_to_int64_rtz)
3971DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, , helper_vfp_tosizd)
3972DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, , vfp_float64_to_int64_rtz)
3973
3974DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
3975DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
3976DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
3977DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, , vfp_float16_to_uint64_rtz)
3978DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, , vfp_float32_to_uint64_rtz)
3979DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, , helper_vfp_touizd)
3980DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, , vfp_float64_to_uint64_rtz)
3981
cda3c753
RH
3982DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
3983DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
3984DO_ZPZ_FP(sve_frint_d, uint64_t, , helper_rintd)
3985
3986DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
3987DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
3988DO_ZPZ_FP(sve_frintx_d, uint64_t, , float64_round_to_int)
3989
ec5b375b
RH
3990DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
3991DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
3992DO_ZPZ_FP(sve_frecpx_d, uint64_t, , helper_frecpx_f64)
3993
3994DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
3995DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
3996DO_ZPZ_FP(sve_fsqrt_d, uint64_t, , float64_sqrt)
3997
8092c6a3
RH
3998DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
3999DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
4000DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
4001DO_ZPZ_FP(sve_scvt_sd, uint64_t, , int32_to_float64)
4002DO_ZPZ_FP(sve_scvt_dh, uint64_t, , int64_to_float16)
4003DO_ZPZ_FP(sve_scvt_ds, uint64_t, , int64_to_float32)
4004DO_ZPZ_FP(sve_scvt_dd, uint64_t, , int64_to_float64)
4005
4006DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
4007DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
4008DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
4009DO_ZPZ_FP(sve_ucvt_sd, uint64_t, , uint32_to_float64)
4010DO_ZPZ_FP(sve_ucvt_dh, uint64_t, , uint64_to_float16)
4011DO_ZPZ_FP(sve_ucvt_ds, uint64_t, , uint64_to_float32)
4012DO_ZPZ_FP(sve_ucvt_dd, uint64_t, , uint64_to_float64)
4013
4014#undef DO_ZPZ_FP
4015
08975da9
RH
4016static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
4017 float_status *status, uint32_t desc,
6ceabaad
RH
4018 uint16_t neg1, uint16_t neg3)
4019{
4020 intptr_t i = simd_oprsz(desc);
6ceabaad
RH
4021 uint64_t *g = vg;
4022
4023 do {
4024 uint64_t pg = g[(i - 1) >> 6];
4025 do {
4026 i -= 2;
4027 if (likely((pg >> (i & 63)) & 1)) {
4028 float16 e1, e2, e3, r;
4029
4030 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
4031 e2 = *(uint16_t *)(vm + H1_2(i));
4032 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
08975da9 4033 r = float16_muladd(e1, e2, e3, 0, status);
6ceabaad
RH
4034 *(uint16_t *)(vd + H1_2(i)) = r;
4035 }
4036 } while (i & 63);
4037 } while (i != 0);
4038}
4039
08975da9
RH
4040void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4041 void *vg, void *status, uint32_t desc)
6ceabaad 4042{
08975da9 4043 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
6ceabaad
RH
4044}
4045
08975da9
RH
4046void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4047 void *vg, void *status, uint32_t desc)
6ceabaad 4048{
08975da9 4049 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
6ceabaad
RH
4050}
4051
08975da9
RH
4052void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4053 void *vg, void *status, uint32_t desc)
6ceabaad 4054{
08975da9 4055 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
6ceabaad
RH
4056}
4057
08975da9
RH
4058void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4059 void *vg, void *status, uint32_t desc)
6ceabaad 4060{
08975da9 4061 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
6ceabaad
RH
4062}
4063
08975da9
RH
4064static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
4065 float_status *status, uint32_t desc,
6ceabaad
RH
4066 uint32_t neg1, uint32_t neg3)
4067{
4068 intptr_t i = simd_oprsz(desc);
6ceabaad
RH
4069 uint64_t *g = vg;
4070
4071 do {
4072 uint64_t pg = g[(i - 1) >> 6];
4073 do {
4074 i -= 4;
4075 if (likely((pg >> (i & 63)) & 1)) {
4076 float32 e1, e2, e3, r;
4077
4078 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
4079 e2 = *(uint32_t *)(vm + H1_4(i));
4080 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
08975da9 4081 r = float32_muladd(e1, e2, e3, 0, status);
6ceabaad
RH
4082 *(uint32_t *)(vd + H1_4(i)) = r;
4083 }
4084 } while (i & 63);
4085 } while (i != 0);
4086}
4087
08975da9
RH
4088void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4089 void *vg, void *status, uint32_t desc)
6ceabaad 4090{
08975da9 4091 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
6ceabaad
RH
4092}
4093
08975da9
RH
4094void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4095 void *vg, void *status, uint32_t desc)
6ceabaad 4096{
08975da9 4097 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
6ceabaad
RH
4098}
4099
08975da9
RH
4100void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4101 void *vg, void *status, uint32_t desc)
6ceabaad 4102{
08975da9 4103 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
6ceabaad
RH
4104}
4105
08975da9
RH
4106void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4107 void *vg, void *status, uint32_t desc)
6ceabaad 4108{
08975da9 4109 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
6ceabaad
RH
4110}
4111
08975da9
RH
4112static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
4113 float_status *status, uint32_t desc,
6ceabaad
RH
4114 uint64_t neg1, uint64_t neg3)
4115{
4116 intptr_t i = simd_oprsz(desc);
6ceabaad
RH
4117 uint64_t *g = vg;
4118
4119 do {
4120 uint64_t pg = g[(i - 1) >> 6];
4121 do {
4122 i -= 8;
4123 if (likely((pg >> (i & 63)) & 1)) {
4124 float64 e1, e2, e3, r;
4125
4126 e1 = *(uint64_t *)(vn + i) ^ neg1;
4127 e2 = *(uint64_t *)(vm + i);
4128 e3 = *(uint64_t *)(va + i) ^ neg3;
08975da9 4129 r = float64_muladd(e1, e2, e3, 0, status);
6ceabaad
RH
4130 *(uint64_t *)(vd + i) = r;
4131 }
4132 } while (i & 63);
4133 } while (i != 0);
4134}
4135
08975da9
RH
4136void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4137 void *vg, void *status, uint32_t desc)
6ceabaad 4138{
08975da9 4139 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
6ceabaad
RH
4140}
4141
08975da9
RH
4142void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4143 void *vg, void *status, uint32_t desc)
6ceabaad 4144{
08975da9 4145 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
6ceabaad
RH
4146}
4147
08975da9
RH
4148void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4149 void *vg, void *status, uint32_t desc)
6ceabaad 4150{
08975da9 4151 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
6ceabaad
RH
4152}
4153
08975da9
RH
4154void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4155 void *vg, void *status, uint32_t desc)
6ceabaad 4156{
08975da9 4157 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
6ceabaad
RH
4158}
4159
abfdefd5
RH
4160/* Two operand floating-point comparison controlled by a predicate.
4161 * Unlike the integer version, we are not allowed to optimistically
4162 * compare operands, since the comparison may have side effects wrt
4163 * the FPSR.
4164 */
4165#define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
4166void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
4167 void *status, uint32_t desc) \
4168{ \
4169 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
4170 uint64_t *d = vd, *g = vg; \
4171 do { \
4172 uint64_t out = 0, pg = g[j]; \
4173 do { \
4174 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
4175 if (likely((pg >> (i & 63)) & 1)) { \
4176 TYPE nn = *(TYPE *)(vn + H(i)); \
4177 TYPE mm = *(TYPE *)(vm + H(i)); \
4178 out |= OP(TYPE, nn, mm, status); \
4179 } \
4180 } while (i & 63); \
4181 d[j--] = out; \
4182 } while (i > 0); \
4183}
4184
4185#define DO_FPCMP_PPZZ_H(NAME, OP) \
4186 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
4187#define DO_FPCMP_PPZZ_S(NAME, OP) \
4188 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
4189#define DO_FPCMP_PPZZ_D(NAME, OP) \
4190 DO_FPCMP_PPZZ(NAME##_d, float64, , OP)
4191
4192#define DO_FPCMP_PPZZ_ALL(NAME, OP) \
4193 DO_FPCMP_PPZZ_H(NAME, OP) \
4194 DO_FPCMP_PPZZ_S(NAME, OP) \
4195 DO_FPCMP_PPZZ_D(NAME, OP)
4196
4197#define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
4198#define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
4d2e2a03
RH
4199#define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
4200#define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
abfdefd5
RH
4201#define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
4202#define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
4203#define DO_FCMUO(TYPE, X, Y, ST) \
4204 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
4205#define DO_FACGE(TYPE, X, Y, ST) \
4206 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
4207#define DO_FACGT(TYPE, X, Y, ST) \
4208 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
4209
4210DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
4211DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
4212DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
4213DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
4214DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
4215DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
4216DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
4217
4218#undef DO_FPCMP_PPZZ_ALL
4219#undef DO_FPCMP_PPZZ_D
4220#undef DO_FPCMP_PPZZ_S
4221#undef DO_FPCMP_PPZZ_H
4222#undef DO_FPCMP_PPZZ
4223
4d2e2a03
RH
4224/* One operand floating-point comparison against zero, controlled
4225 * by a predicate.
4226 */
4227#define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
4228void HELPER(NAME)(void *vd, void *vn, void *vg, \
4229 void *status, uint32_t desc) \
4230{ \
4231 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
4232 uint64_t *d = vd, *g = vg; \
4233 do { \
4234 uint64_t out = 0, pg = g[j]; \
4235 do { \
4236 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
4237 if ((pg >> (i & 63)) & 1) { \
4238 TYPE nn = *(TYPE *)(vn + H(i)); \
4239 out |= OP(TYPE, nn, 0, status); \
4240 } \
4241 } while (i & 63); \
4242 d[j--] = out; \
4243 } while (i > 0); \
4244}
4245
4246#define DO_FPCMP_PPZ0_H(NAME, OP) \
4247 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
4248#define DO_FPCMP_PPZ0_S(NAME, OP) \
4249 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
4250#define DO_FPCMP_PPZ0_D(NAME, OP) \
4251 DO_FPCMP_PPZ0(NAME##_d, float64, , OP)
4252
4253#define DO_FPCMP_PPZ0_ALL(NAME, OP) \
4254 DO_FPCMP_PPZ0_H(NAME, OP) \
4255 DO_FPCMP_PPZ0_S(NAME, OP) \
4256 DO_FPCMP_PPZ0_D(NAME, OP)
4257
4258DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
4259DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
4260DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
4261DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
4262DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
4263DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
4264
67fcd9ad
RH
4265/* FP Trig Multiply-Add. */
4266
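/*
 * The coefficient tables below appear to hold the Taylor-series terms
 * used by FTMAD: entries 0-7 the sine coefficients (1, -1/3!, 1/5!, ...)
 * and entries 8-15 the cosine coefficients (1, -1/2!, 1/4!, ...), with a
 * negative multiplicand selecting the second half via xx += 8.
 */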
4267void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
4268{
4269 static const float16 coeff[16] = {
4270 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4271 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4272 };
4273 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
4274 intptr_t x = simd_data(desc);
4275 float16 *d = vd, *n = vn, *m = vm;
4276 for (i = 0; i < opr_sz; i++) {
4277 float16 mm = m[i];
4278 intptr_t xx = x;
4279 if (float16_is_neg(mm)) {
4280 mm = float16_abs(mm);
4281 xx += 8;
4282 }
4283 d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
4284 }
4285}
4286
4287void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
4288{
4289 static const float32 coeff[16] = {
4290 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
4291 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
4292 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
4293 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
4294 };
4295 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
4296 intptr_t x = simd_data(desc);
4297 float32 *d = vd, *n = vn, *m = vm;
4298 for (i = 0; i < opr_sz; i++) {
4299 float32 mm = m[i];
4300 intptr_t xx = x;
4301 if (float32_is_neg(mm)) {
4302 mm = float32_abs(mm);
4303 xx += 8;
4304 }
4305 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
4306 }
4307}
4308
4309void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
4310{
4311 static const float64 coeff[16] = {
4312 0x3ff0000000000000ull, 0xbfc5555555555543ull,
4313 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
4314 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
4315 0x3de5d8408868552full, 0x0000000000000000ull,
4316 0x3ff0000000000000ull, 0xbfe0000000000000ull,
4317 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
4318 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
4319 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
4320 };
4321 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
4322 intptr_t x = simd_data(desc);
4323 float64 *d = vd, *n = vn, *m = vm;
4324 for (i = 0; i < opr_sz; i++) {
4325 float64 mm = m[i];
4326 intptr_t xx = x;
4327 if (float64_is_neg(mm)) {
4328 mm = float64_abs(mm);
4329 xx += 8;
4330 }
4331 d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
4332 }
4333}
4334
76a9d9cd
RH
4335/*
4336 * FP Complex Add
4337 */
4338
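/*
 * simd_data(desc) selects the rotation applied to the second operand:
 * 0 gives the 90-degree FCADD form (d_re = n_re - m_im, d_im = n_im + m_re)
 * and 1 the 270-degree form (d_re = n_re + m_im, d_im = n_im - m_re).
 */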
4339void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
4340 void *vs, uint32_t desc)
4341{
4342 intptr_t j, i = simd_oprsz(desc);
4343 uint64_t *g = vg;
4344 float16 neg_imag = float16_set_sign(0, simd_data(desc));
4345 float16 neg_real = float16_chs(neg_imag);
4346
4347 do {
4348 uint64_t pg = g[(i - 1) >> 6];
4349 do {
4350 float16 e0, e1, e2, e3;
4351
4352 /* I holds the real index; J holds the imag index. */
4353 j = i - sizeof(float16);
4354 i -= 2 * sizeof(float16);
4355
4356 e0 = *(float16 *)(vn + H1_2(i));
4357 e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
4358 e2 = *(float16 *)(vn + H1_2(j));
4359 e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
4360
4361 if (likely((pg >> (i & 63)) & 1)) {
4362 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
4363 }
4364 if (likely((pg >> (j & 63)) & 1)) {
4365 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
4366 }
4367 } while (i & 63);
4368 } while (i != 0);
4369}
4370
4371void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
4372 void *vs, uint32_t desc)
4373{
4374 intptr_t j, i = simd_oprsz(desc);
4375 uint64_t *g = vg;
4376 float32 neg_imag = float32_set_sign(0, simd_data(desc));
4377 float32 neg_real = float32_chs(neg_imag);
4378
4379 do {
4380 uint64_t pg = g[(i - 1) >> 6];
4381 do {
4382 float32 e0, e1, e2, e3;
4383
4384 /* I holds the real index; J holds the imag index. */
4385 j = i - sizeof(float32);
4386 i -= 2 * sizeof(float32);
4387
4388 e0 = *(float32 *)(vn + H1_2(i));
4389 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
4390 e2 = *(float32 *)(vn + H1_2(j));
4391 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
4392
4393 if (likely((pg >> (i & 63)) & 1)) {
4394 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
4395 }
4396 if (likely((pg >> (j & 63)) & 1)) {
4397 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
4398 }
4399 } while (i & 63);
4400 } while (i != 0);
4401}
4402
4403void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
4404 void *vs, uint32_t desc)
4405{
4406 intptr_t j, i = simd_oprsz(desc);
4407 uint64_t *g = vg;
4408 float64 neg_imag = float64_set_sign(0, simd_data(desc));
4409 float64 neg_real = float64_chs(neg_imag);
4410
4411 do {
4412 uint64_t pg = g[(i - 1) >> 6];
4413 do {
4414 float64 e0, e1, e2, e3;
4415
4416 /* I holds the real index; J holds the imag index. */
4417 j = i - sizeof(float64);
4418 i -= 2 * sizeof(float64);
4419
4420 e0 = *(float64 *)(vn + H1_2(i));
4421 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
4422 e2 = *(float64 *)(vn + H1_2(j));
4423 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
4424
4425 if (likely((pg >> (i & 63)) & 1)) {
4426 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
4427 }
4428 if (likely((pg >> (j & 63)) & 1)) {
4429 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
4430 }
4431 } while (i & 63);
4432 } while (i != 0);
4433}
4434
05f48bab
RH
4435/*
4436 * FP Complex Multiply
4437 */
4438
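/*
 * simd_data(desc) carries the FCMLA rotation (0, 90, 180, 270 degrees
 * encoded as 0-3): FLIP selects whether the real or imaginary part of N
 * feeds both products, and NEG_REAL/NEG_IMAG apply the sign inversions
 * required for the 90/180/270 cases.
 */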
08975da9
RH
4439void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4440 void *vg, void *status, uint32_t desc)
05f48bab
RH
4441{
4442 intptr_t j, i = simd_oprsz(desc);
08975da9 4443 unsigned rot = simd_data(desc);
05f48bab
RH
4444 bool flip = rot & 1;
4445 float16 neg_imag, neg_real;
05f48bab
RH
4446 uint64_t *g = vg;
4447
4448 neg_imag = float16_set_sign(0, (rot & 2) != 0);
4449 neg_real = float16_set_sign(0, rot == 1 || rot == 2);
4450
4451 do {
4452 uint64_t pg = g[(i - 1) >> 6];
4453 do {
4454 float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
4455
4456 /* I holds the real index; J holds the imag index. */
4457 j = i - sizeof(float16);
4458 i -= 2 * sizeof(float16);
4459
4460 nr = *(float16 *)(vn + H1_2(i));
4461 ni = *(float16 *)(vn + H1_2(j));
4462 mr = *(float16 *)(vm + H1_2(i));
4463 mi = *(float16 *)(vm + H1_2(j));
4464
4465 e2 = (flip ? ni : nr);
4466 e1 = (flip ? mi : mr) ^ neg_real;
4467 e4 = e2;
4468 e3 = (flip ? mr : mi) ^ neg_imag;
4469
4470 if (likely((pg >> (i & 63)) & 1)) {
4471 d = *(float16 *)(va + H1_2(i));
08975da9 4472 d = float16_muladd(e2, e1, d, 0, status);
05f48bab
RH
4473 *(float16 *)(vd + H1_2(i)) = d;
4474 }
4475 if (likely((pg >> (j & 63)) & 1)) {
4476 d = *(float16 *)(va + H1_2(j));
08975da9 4477 d = float16_muladd(e4, e3, d, 0, status);
05f48bab
RH
4478 *(float16 *)(vd + H1_2(j)) = d;
4479 }
4480 } while (i & 63);
4481 } while (i != 0);
4482}
4483
08975da9
RH
4484void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4485 void *vg, void *status, uint32_t desc)
05f48bab
RH
4486{
4487 intptr_t j, i = simd_oprsz(desc);
08975da9 4488 unsigned rot = simd_data(desc);
05f48bab
RH
4489 bool flip = rot & 1;
4490 float32 neg_imag, neg_real;
05f48bab
RH
4491 uint64_t *g = vg;
4492
4493 neg_imag = float32_set_sign(0, (rot & 2) != 0);
4494 neg_real = float32_set_sign(0, rot == 1 || rot == 2);
4495
4496 do {
4497 uint64_t pg = g[(i - 1) >> 6];
4498 do {
4499 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
4500
4501 /* I holds the real index; J holds the imag index. */
4502 j = i - sizeof(float32);
4503 i -= 2 * sizeof(float32);
4504
4505 nr = *(float32 *)(vn + H1_2(i));
4506 ni = *(float32 *)(vn + H1_2(j));
4507 mr = *(float32 *)(vm + H1_2(i));
4508 mi = *(float32 *)(vm + H1_2(j));
4509
4510 e2 = (flip ? ni : nr);
4511 e1 = (flip ? mi : mr) ^ neg_real;
4512 e4 = e2;
4513 e3 = (flip ? mr : mi) ^ neg_imag;
4514
4515 if (likely((pg >> (i & 63)) & 1)) {
4516 d = *(float32 *)(va + H1_2(i));
08975da9 4517 d = float32_muladd(e2, e1, d, 0, status);
05f48bab
RH
4518 *(float32 *)(vd + H1_2(i)) = d;
4519 }
4520 if (likely((pg >> (j & 63)) & 1)) {
4521 d = *(float32 *)(va + H1_2(j));
08975da9 4522 d = float32_muladd(e4, e3, d, 0, status);
05f48bab
RH
4523 *(float32 *)(vd + H1_2(j)) = d;
4524 }
4525 } while (i & 63);
4526 } while (i != 0);
4527}
4528
08975da9
RH
4529void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4530 void *vg, void *status, uint32_t desc)
05f48bab
RH
4531{
4532 intptr_t j, i = simd_oprsz(desc);
08975da9 4533 unsigned rot = simd_data(desc);
05f48bab
RH
4534 bool flip = rot & 1;
4535 float64 neg_imag, neg_real;
05f48bab
RH
4536 uint64_t *g = vg;
4537
4538 neg_imag = float64_set_sign(0, (rot & 2) != 0);
4539 neg_real = float64_set_sign(0, rot == 1 || rot == 2);
4540
4541 do {
4542 uint64_t pg = g[(i - 1) >> 6];
4543 do {
4544 float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
4545
4546 /* I holds the real index; J holds the imag index. */
4547 j = i - sizeof(float64);
4548 i -= 2 * sizeof(float64);
4549
4550 nr = *(float64 *)(vn + H1_2(i));
4551 ni = *(float64 *)(vn + H1_2(j));
4552 mr = *(float64 *)(vm + H1_2(i));
4553 mi = *(float64 *)(vm + H1_2(j));
4554
4555 e2 = (flip ? ni : nr);
4556 e1 = (flip ? mi : mr) ^ neg_real;
4557 e4 = e2;
4558 e3 = (flip ? mr : mi) ^ neg_imag;
4559
4560 if (likely((pg >> (i & 63)) & 1)) {
4561 d = *(float64 *)(va + H1_2(i));
08975da9 4562 d = float64_muladd(e2, e1, d, 0, status);
05f48bab
RH
4563 *(float64 *)(vd + H1_2(i)) = d;
4564 }
4565 if (likely((pg >> (j & 63)) & 1)) {
4566 d = *(float64 *)(va + H1_2(j));
08975da9 4567 d = float64_muladd(e4, e3, d, 0, status);
05f48bab
RH
4568 *(float64 *)(vd + H1_2(j)) = d;
4569 }
4570 } while (i & 63);
4571 } while (i != 0);
4572}
4573
c4e7c493
RH
4574/*
4575 * Load contiguous data, protected by a governing predicate.
4576 */
9123aeb6
RH
4577
4578/*
cf4a49b7
RH
4579 * Load one element into @vd + @reg_off from @host.
4580 * The controlling predicate is known to be true.
9123aeb6 4581 */
cf4a49b7 4582typedef void sve_ldst1_host_fn(void *vd, intptr_t reg_off, void *host);
9123aeb6
RH
4583
4584/*
4585 * Load one element into @vd + @reg_off from (@env, @vaddr, @ra).
4586 * The controlling predicate is known to be true.
4587 */
6799ce7b
RH
4588typedef void sve_ldst1_tlb_fn(CPUARMState *env, void *vd, intptr_t reg_off,
4589 target_ulong vaddr, uintptr_t retaddr);
9123aeb6
RH
4590
4591/*
4592 * Generate the above primitives.
4593 */
4594
4595#define DO_LD_HOST(NAME, H, TYPEE, TYPEM, HOST) \
cf4a49b7
RH
4596static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
4597{ \
4598 TYPEM val = HOST(host); \
4599 *(TYPEE *)(vd + H(reg_off)) = val; \
9123aeb6
RH
4600}
4601
0fa476c1
RH
4602#define DO_ST_HOST(NAME, H, TYPEE, TYPEM, HOST) \
4603static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
4604{ HOST(host, (TYPEM)*(TYPEE *)(vd + H(reg_off))); }
4605
6799ce7b 4606#define DO_LD_TLB(NAME, H, TYPEE, TYPEM, TLB) \
9123aeb6 4607static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
6799ce7b 4608 target_ulong addr, uintptr_t ra) \
9123aeb6 4609{ \
c4af8ba1
RH
4610 *(TYPEE *)(vd + H(reg_off)) = \
4611 (TYPEM)TLB(env, useronly_clean_ptr(addr), ra); \
9123aeb6 4612}
6799ce7b
RH
4613
4614#define DO_ST_TLB(NAME, H, TYPEE, TYPEM, TLB) \
9123aeb6 4615static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
6799ce7b 4616 target_ulong addr, uintptr_t ra) \
9123aeb6 4617{ \
c4af8ba1
RH
4618 TLB(env, useronly_clean_ptr(addr), \
4619 (TYPEM)*(TYPEE *)(vd + H(reg_off)), ra); \
9123aeb6 4620}
9123aeb6
RH
4621
4622#define DO_LD_PRIM_1(NAME, H, TE, TM) \
4623 DO_LD_HOST(NAME, H, TE, TM, ldub_p) \
6799ce7b 4624 DO_LD_TLB(NAME, H, TE, TM, cpu_ldub_data_ra)
9123aeb6
RH
4625
4626DO_LD_PRIM_1(ld1bb, H1, uint8_t, uint8_t)
4627DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t)
4628DO_LD_PRIM_1(ld1bhs, H1_2, uint16_t, int8_t)
4629DO_LD_PRIM_1(ld1bsu, H1_4, uint32_t, uint8_t)
4630DO_LD_PRIM_1(ld1bss, H1_4, uint32_t, int8_t)
4631DO_LD_PRIM_1(ld1bdu, , uint64_t, uint8_t)
4632DO_LD_PRIM_1(ld1bds, , uint64_t, int8_t)
4633
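/*
 * As a rough guide to the expansion, DO_LD_PRIM_1(ld1bhu, H1_2,
 * uint16_t, uint8_t) produces:
 *
 *   static void sve_ld1bhu_host(void *vd, intptr_t reg_off, void *host)
 *   {
 *       uint8_t val = ldub_p(host);
 *       *(uint16_t *)(vd + H1_2(reg_off)) = val;
 *   }
 *
 * plus an sve_ld1bhu_tlb variant that loads via cpu_ldub_data_ra instead.
 */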
6799ce7b 4634#define DO_ST_PRIM_1(NAME, H, TE, TM) \
0fa476c1 4635 DO_ST_HOST(st1##NAME, H, TE, TM, stb_p) \
6799ce7b
RH
4636 DO_ST_TLB(st1##NAME, H, TE, TM, cpu_stb_data_ra)
4637
4638DO_ST_PRIM_1(bb, H1, uint8_t, uint8_t)
4639DO_ST_PRIM_1(bh, H1_2, uint16_t, uint8_t)
4640DO_ST_PRIM_1(bs, H1_4, uint32_t, uint8_t)
4641DO_ST_PRIM_1(bd, , uint64_t, uint8_t)
9123aeb6 4642
6799ce7b
RH
4643#define DO_LD_PRIM_2(NAME, H, TE, TM, LD) \
4644 DO_LD_HOST(ld1##NAME##_be, H, TE, TM, LD##_be_p) \
4645 DO_LD_HOST(ld1##NAME##_le, H, TE, TM, LD##_le_p) \
4646 DO_LD_TLB(ld1##NAME##_be, H, TE, TM, cpu_##LD##_be_data_ra) \
4647 DO_LD_TLB(ld1##NAME##_le, H, TE, TM, cpu_##LD##_le_data_ra)
9123aeb6 4648
6799ce7b 4649#define DO_ST_PRIM_2(NAME, H, TE, TM, ST) \
0fa476c1
RH
4650 DO_ST_HOST(st1##NAME##_be, H, TE, TM, ST##_be_p) \
4651 DO_ST_HOST(st1##NAME##_le, H, TE, TM, ST##_le_p) \
6799ce7b
RH
4652 DO_ST_TLB(st1##NAME##_be, H, TE, TM, cpu_##ST##_be_data_ra) \
4653 DO_ST_TLB(st1##NAME##_le, H, TE, TM, cpu_##ST##_le_data_ra)
9123aeb6 4654
6799ce7b
RH
4655DO_LD_PRIM_2(hh, H1_2, uint16_t, uint16_t, lduw)
4656DO_LD_PRIM_2(hsu, H1_4, uint32_t, uint16_t, lduw)
4657DO_LD_PRIM_2(hss, H1_4, uint32_t, int16_t, lduw)
4658DO_LD_PRIM_2(hdu, , uint64_t, uint16_t, lduw)
4659DO_LD_PRIM_2(hds, , uint64_t, int16_t, lduw)
9123aeb6 4660
6799ce7b
RH
4661DO_ST_PRIM_2(hh, H1_2, uint16_t, uint16_t, stw)
4662DO_ST_PRIM_2(hs, H1_4, uint32_t, uint16_t, stw)
4663DO_ST_PRIM_2(hd, , uint64_t, uint16_t, stw)
9123aeb6 4664
6799ce7b
RH
4665DO_LD_PRIM_2(ss, H1_4, uint32_t, uint32_t, ldl)
4666DO_LD_PRIM_2(sdu, , uint64_t, uint32_t, ldl)
4667DO_LD_PRIM_2(sds, , uint64_t, int32_t, ldl)
9123aeb6 4668
6799ce7b
RH
4669DO_ST_PRIM_2(ss, H1_4, uint32_t, uint32_t, stl)
4670DO_ST_PRIM_2(sd, , uint64_t, uint32_t, stl)
4671
4672DO_LD_PRIM_2(dd, , uint64_t, uint64_t, ldq)
4673DO_ST_PRIM_2(dd, , uint64_t, uint64_t, stq)
9123aeb6
RH
4674
4675#undef DO_LD_TLB
6799ce7b 4676#undef DO_ST_TLB
9123aeb6
RH
4677#undef DO_LD_HOST
4678#undef DO_LD_PRIM_1
6799ce7b 4679#undef DO_ST_PRIM_1
9123aeb6 4680#undef DO_LD_PRIM_2
6799ce7b 4681#undef DO_ST_PRIM_2
9123aeb6
RH
4682
4683/*
4684 * Skip through a sequence of inactive elements in the guarding predicate @vg,
4685 * beginning at @reg_off bounded by @reg_max. Return the offset of the active
4686 * element >= @reg_off, or @reg_max if there were no active elements at all.
4687 */
4688static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
4689 intptr_t reg_max, int esz)
4690{
4691 uint64_t pg_mask = pred_esz_masks[esz];
4692 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
4693
4694 /* In normal usage, the first element is active. */
4695 if (likely(pg & 1)) {
4696 return reg_off;
4697 }
4698
4699 if (pg == 0) {
4700 reg_off &= -64;
4701 do {
4702 reg_off += 64;
4703 if (unlikely(reg_off >= reg_max)) {
4704 /* The entire predicate was false. */
4705 return reg_max;
4706 }
4707 pg = vg[reg_off >> 6] & pg_mask;
4708 } while (pg == 0);
4709 }
4710 reg_off += ctz64(pg);
4711
4712 /* We should never see an out of range predicate bit set. */
4713 tcg_debug_assert(reg_off < reg_max);
4714 return reg_off;
4715}
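/*
 * For example, with esz == 2 (one controlling bit per 4-byte element),
 * reg_off == 0 and vg[0] == 0x100, the masked predicate is non-zero but
 * bit 0 is clear, so ctz64 advances reg_off to 8: the third 32-bit
 * element is the next active one.
 */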
4716
b4cd95d2
RH
4717/*
4718 * Resolve the guest virtual address to info->host and info->flags.
4719 * If @nofault, return false if the page is invalid, otherwise
4720 * exit via page fault exception.
4721 */
4722
4723typedef struct {
4724 void *host;
4725 int flags;
4726 MemTxAttrs attrs;
4727} SVEHostPage;
4728
4729static bool sve_probe_page(SVEHostPage *info, bool nofault,
4730 CPUARMState *env, target_ulong addr,
4731 int mem_off, MMUAccessType access_type,
4732 int mmu_idx, uintptr_t retaddr)
4733{
4734 int flags;
4735
4736 addr += mem_off;
c4af8ba1
RH
4737
4738 /*
4739 * User-only currently always issues with TBI. See the comment
4740 * above useronly_clean_ptr. Usually we clean this top byte away
4741 * during translation, but we can't do that for e.g. vector + imm
4742 * addressing modes.
4743 *
4744 * We currently always enable TBI for user-only, and do not provide
4745 * a way to turn it off. So clean the pointer unconditionally here,
4746 * rather than look it up here, or pass it down from above.
4747 */
4748 addr = useronly_clean_ptr(addr);
4749
b4cd95d2
RH
4750 flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault,
4751 &info->host, retaddr);
4752 info->flags = flags;
4753
4754 if (flags & TLB_INVALID_MASK) {
4755 g_assert(nofault);
4756 return false;
4757 }
4758
4759 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
4760 info->host -= mem_off;
4761
4762#ifdef CONFIG_USER_ONLY
4763 memset(&info->attrs, 0, sizeof(info->attrs));
4764#else
4765 /*
4766 * Find the iotlbentry for addr and return the transaction attributes.
4767 * This *must* be present in the TLB because we just found the mapping.
4768 */
4769 {
4770 uintptr_t index = tlb_index(env, mmu_idx, addr);
4771
4772# ifdef CONFIG_DEBUG_TCG
4773 CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
4774 target_ulong comparator = (access_type == MMU_DATA_LOAD
4775 ? entry->addr_read
4776 : tlb_addr_write(entry));
4777 g_assert(tlb_hit(comparator, addr));
4778# endif
4779
4780 CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
4781 info->attrs = iotlbentry->attrs;
4782 }
4783#endif
4784
4785 return true;
4786}
4787
4788
4789/*
4790 * Analyse contiguous data, protected by a governing predicate.
4791 */
4792
4793typedef enum {
4794 FAULT_NO,
4795 FAULT_FIRST,
4796 FAULT_ALL,
4797} SVEContFault;
4798
4799typedef struct {
4800 /*
4801 * First and last element wholly contained within the two pages.
4802 * mem_off_first[0] and reg_off_first[0] are always set >= 0.
4803 * reg_off_last[0] may be < 0 if the first element crosses pages.
4804 * All of mem_off_first[1], reg_off_first[1] and reg_off_last[1]
4805 * are set >= 0 only if there are complete elements on a second page.
4806 *
4807 * The reg_off_* offsets are relative to the internal vector register.
4808 * The mem_off_first offset is relative to the memory address; the
4809 * two offsets are different when a load operation extends, a store
4810 * operation truncates, or for multi-register operations.
4811 */
4812 int16_t mem_off_first[2];
4813 int16_t reg_off_first[2];
4814 int16_t reg_off_last[2];
4815
4816 /*
4817 * One element that is misaligned and spans both pages,
4818 * or -1 if there is no such active element.
4819 */
4820 int16_t mem_off_split;
4821 int16_t reg_off_split;
4822
4823 /*
4824 * The byte offset at which the entire operation crosses a page boundary.
4825 * Set >= 0 if and only if the entire operation spans two pages.
4826 */
4827 int16_t page_split;
4828
4829 /* TLB data for the two pages. */
4830 SVEHostPage page[2];
4831} SVEContLdSt;
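/*
 * As an illustration: a 16-byte access of two 8-byte elements under an
 * all-true predicate, starting 12 bytes before a page boundary, is
 * analysed (by sve_cont_ldst_elements below) as page_split = 12,
 * reg_off_first[0] = reg_off_last[0] = 0 (element 0 is wholly on the
 * first page), reg_off_split = mem_off_split = 8 (element 1 straddles
 * the boundary), and the [1] fields stay -1 because no element lies
 * wholly on the second page.
 */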
4832
4833/*
4834 * Find first active element on each page, and a loose bound for the
4835 * final element on each page. Identify any single element that spans
4836 * the page boundary. Return true if there are any active elements.
4837 */
b854fd06
RH
4838static bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr,
4839 uint64_t *vg, intptr_t reg_max,
4840 int esz, int msize)
b4cd95d2
RH
4841{
4842 const int esize = 1 << esz;
4843 const uint64_t pg_mask = pred_esz_masks[esz];
4844 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
4845 intptr_t mem_off_last, mem_off_split;
4846 intptr_t page_split, elt_split;
4847 intptr_t i;
4848
4849 /* Set all of the element indices to -1, and the TLB data to 0. */
4850 memset(info, -1, offsetof(SVEContLdSt, page));
4851 memset(info->page, 0, sizeof(info->page));
4852
4853 /* Gross scan over the entire predicate to find bounds. */
4854 i = 0;
4855 do {
4856 uint64_t pg = vg[i] & pg_mask;
4857 if (pg) {
4858 reg_off_last = i * 64 + 63 - clz64(pg);
4859 if (reg_off_first < 0) {
4860 reg_off_first = i * 64 + ctz64(pg);
4861 }
4862 }
4863 } while (++i * 64 < reg_max);
4864
4865 if (unlikely(reg_off_first < 0)) {
4866 /* No active elements, no pages touched. */
4867 return false;
4868 }
4869 tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
4870
4871 info->reg_off_first[0] = reg_off_first;
4872 info->mem_off_first[0] = (reg_off_first >> esz) * msize;
4873 mem_off_last = (reg_off_last >> esz) * msize;
4874
4875 page_split = -(addr | TARGET_PAGE_MASK);
4876 if (likely(mem_off_last + msize <= page_split)) {
4877 /* The entire operation fits within a single page. */
4878 info->reg_off_last[0] = reg_off_last;
4879 return true;
4880 }
4881
4882 info->page_split = page_split;
4883 elt_split = page_split / msize;
4884 reg_off_split = elt_split << esz;
4885 mem_off_split = elt_split * msize;
4886
4887 /*
4888 * This is the last full element on the first page, but it is not
4889 * necessarily active. If there is no full element, i.e. the first
4890 * active element is the one that's split, this value remains -1.
4891 * It is useful as iteration bounds.
4892 */
4893 if (elt_split != 0) {
4894 info->reg_off_last[0] = reg_off_split - esize;
4895 }
4896
4897 /* Determine if an unaligned element spans the pages. */
4898 if (page_split % msize != 0) {
4899 /* It is helpful to know if the split element is active. */
4900 if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
4901 info->reg_off_split = reg_off_split;
4902 info->mem_off_split = mem_off_split;
4903
4904 if (reg_off_split == reg_off_last) {
4905 /* The page crossing element is last. */
4906 return true;
4907 }
4908 }
4909 reg_off_split += esize;
4910 mem_off_split += msize;
4911 }
4912
4913 /*
4914 * We do want the first active element on the second page, because
4915 * this may affect the address reported in an exception.
4916 */
4917 reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
4918 tcg_debug_assert(reg_off_split <= reg_off_last);
4919 info->reg_off_first[1] = reg_off_split;
4920 info->mem_off_first[1] = (reg_off_split >> esz) * msize;
4921 info->reg_off_last[1] = reg_off_last;
4922 return true;
4923}
4924
4925/*
4926 * Resolve the guest virtual addresses to info->page[].
4927 * Control the generation of page faults with @fault. Return false if
4928 * there is no work to do, which can only happen with @fault == FAULT_NO.
4929 */
b854fd06
RH
4930static bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
4931 CPUARMState *env, target_ulong addr,
4932 MMUAccessType access_type, uintptr_t retaddr)
b4cd95d2
RH
4933{
4934 int mmu_idx = cpu_mmu_index(env, false);
4935 int mem_off = info->mem_off_first[0];
4936 bool nofault = fault == FAULT_NO;
4937 bool have_work = true;
4938
4939 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
4940 access_type, mmu_idx, retaddr)) {
4941 /* No work to be done. */
4942 return false;
4943 }
4944
4945 if (likely(info->page_split < 0)) {
4946 /* The entire operation was on the one page. */
4947 return true;
4948 }
4949
4950 /*
4951 * If the second page is invalid, then we want the fault address to be
4952 * the first byte on that page which is accessed.
4953 */
4954 if (info->mem_off_split >= 0) {
4955 /*
4956 * There is an element split across the pages. The fault address
4957 * should be the first byte of the second page.
4958 */
4959 mem_off = info->page_split;
4960 /*
4961 * If the split element is also the first active element
4962 * of the vector, then: For first-fault we should continue
4963 * to generate faults for the second page. For no-fault,
4964 * we have work only if the second page is valid.
4965 */
4966 if (info->mem_off_first[0] < info->mem_off_split) {
4967 nofault = FAULT_FIRST;
4968 have_work = false;
4969 }
4970 } else {
4971 /*
4972 * There is no element split across the pages. The fault address
4973 * should be the first active element on the second page.
4974 */
4975 mem_off = info->mem_off_first[1];
4976 /*
4977 * There must have been one active element on the first page,
4978 * so we're out of first-fault territory.
4979 */
4980 nofault = fault != FAULT_ALL;
4981 }
4982
4983 have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
4984 access_type, mmu_idx, retaddr);
4985 return have_work;
4986}
4987
4bcc3f0f
RH
4988static void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
4989 uint64_t *vg, target_ulong addr,
4990 int esize, int msize, int wp_access,
4991 uintptr_t retaddr)
4992{
4993#ifndef CONFIG_USER_ONLY
4994 intptr_t mem_off, reg_off, reg_last;
4995 int flags0 = info->page[0].flags;
4996 int flags1 = info->page[1].flags;
4997
4998 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
4999 return;
5000 }
5001
5002 /* Indicate that watchpoints are handled. */
5003 info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
5004 info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
5005
5006 if (flags0 & TLB_WATCHPOINT) {
5007 mem_off = info->mem_off_first[0];
5008 reg_off = info->reg_off_first[0];
5009 reg_last = info->reg_off_last[0];
5010
5011 while (reg_off <= reg_last) {
5012 uint64_t pg = vg[reg_off >> 6];
5013 do {
5014 if ((pg >> (reg_off & 63)) & 1) {
5015 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5016 msize, info->page[0].attrs,
5017 wp_access, retaddr);
5018 }
5019 reg_off += esize;
5020 mem_off += msize;
5021 } while (reg_off <= reg_last && (reg_off & 63));
5022 }
5023 }
5024
5025 mem_off = info->mem_off_split;
5026 if (mem_off >= 0) {
5027 cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
5028 info->page[0].attrs, wp_access, retaddr);
5029 }
5030
5031 mem_off = info->mem_off_first[1];
5032 if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
5033 reg_off = info->reg_off_first[1];
5034 reg_last = info->reg_off_last[1];
5035
5036 do {
5037 uint64_t pg = vg[reg_off >> 6];
5038 do {
5039 if ((pg >> (reg_off & 63)) & 1) {
5040 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5041 msize, info->page[1].attrs,
5042 wp_access, retaddr);
5043 }
5044 reg_off += esize;
5045 mem_off += msize;
5046 } while (reg_off & 63);
5047 } while (reg_off <= reg_last);
5048 }
5049#endif
5050}
5051
5052static void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
5053 uint64_t *vg, target_ulong addr, int esize,
5054 int msize, uint32_t mtedesc, uintptr_t ra)
5055{
5056 intptr_t mem_off, reg_off, reg_last;
5057
5058 /* Process the page only if MemAttr == Tagged. */
5059 if (arm_tlb_mte_tagged(&info->page[0].attrs)) {
5060 mem_off = info->mem_off_first[0];
5061 reg_off = info->reg_off_first[0];
5062 reg_last = info->reg_off_split;
5063 if (reg_last < 0) {
5064 reg_last = info->reg_off_last[0];
5065 }
5066
5067 do {
5068 uint64_t pg = vg[reg_off >> 6];
5069 do {
5070 if ((pg >> (reg_off & 63)) & 1) {
4c3310c7 5071 mte_check(env, mtedesc, addr + mem_off, ra);
5072 }
5073 reg_off += esize;
5074 mem_off += msize;
5075 } while (reg_off <= reg_last && (reg_off & 63));
5076 } while (reg_off <= reg_last);
5077 }
5078
5079 mem_off = info->mem_off_first[1];
5080 if (mem_off >= 0 && arm_tlb_mte_tagged(&info->page[1].attrs)) {
5081 reg_off = info->reg_off_first[1];
5082 reg_last = info->reg_off_last[1];
5083
5084 do {
5085 uint64_t pg = vg[reg_off >> 6];
5086 do {
5087 if ((pg >> (reg_off & 63)) & 1) {
4c3310c7 5088 mte_check(env, mtedesc, addr + mem_off, ra);
5089 }
5090 reg_off += esize;
5091 mem_off += msize;
5092 } while (reg_off & 63);
5093 } while (reg_off <= reg_last);
5094 }
5095}
5096
9123aeb6 5097/*
5c9b8458 5098 * Common helper for all contiguous 1,2,3,4-register predicated loads.
9123aeb6 5099 */
b854fd06 5100static inline QEMU_ALWAYS_INLINE
5c9b8458 5101void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
b854fd06 5102 uint32_t desc, const uintptr_t retaddr,
206adacf 5103 const int esz, const int msz, const int N, uint32_t mtedesc,
b854fd06 5104 sve_ldst1_host_fn *host_fn,
4c3310c7 5105 sve_ldst1_tlb_fn *tlb_fn)
b854fd06 5106{
ba080b86 5107 const unsigned rd = simd_data(desc);
9123aeb6 5108 const intptr_t reg_max = simd_oprsz(desc);
5109 intptr_t reg_off, reg_last, mem_off;
5110 SVEContLdSt info;
9123aeb6 5111 void *host;
5c9b8458 5112 int flags, i;
9123aeb6 5113
b854fd06 5114 /* Find the active elements. */
5c9b8458 5115 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
9123aeb6 5116 /* The entire predicate was false; no load occurs. */
5117 for (i = 0; i < N; ++i) {
5118 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5119 }
5120 return;
5121 }
9123aeb6 5122
5123 /* Probe the page(s). Exit with exception for any invalid page. */
5124 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
9123aeb6 5125
4bcc3f0f 5126 /* Handle watchpoints for all active elements. */
5c9b8458 5127 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
5128 BP_MEM_READ, retaddr);
5129
5130 /*
5131 * Handle mte checks for all active elements.
5132 * Since TBI must be set for MTE, !mtedesc => !mte_active.
5133 */
5134 if (mtedesc) {
5135 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
5136 mtedesc, retaddr);
206adacf 5137 }
4bcc3f0f 5138
5139 flags = info.page[0].flags | info.page[1].flags;
5140 if (unlikely(flags != 0)) {
9123aeb6 5141#ifdef CONFIG_USER_ONLY
b854fd06 5142 g_assert_not_reached();
9123aeb6 5143#else
b854fd06 5144 /*
4bcc3f0f 5145 * At least one page includes MMIO.
5146 * Any bus operation can fail with cpu_transaction_failed,
5147 * which for ARM will raise SyncExternal. Perform the load
5148 * into scratch memory to preserve register state until the end.
5149 */
5c9b8458 5150 ARMVectorReg scratch[4] = { };
b854fd06 5151
5152 mem_off = info.mem_off_first[0];
5153 reg_off = info.reg_off_first[0];
5154 reg_last = info.reg_off_last[1];
5155 if (reg_last < 0) {
5156 reg_last = info.reg_off_split;
5157 if (reg_last < 0) {
5158 reg_last = info.reg_off_last[0];
5159 }
5160 }
5161
5162 do {
5163 uint64_t pg = vg[reg_off >> 6];
5164 do {
5165 if ((pg >> (reg_off & 63)) & 1) {
5166 for (i = 0; i < N; ++i) {
5167 tlb_fn(env, &scratch[i], reg_off,
5168 addr + mem_off + (i << msz), retaddr);
5169 }
5170 }
5171 reg_off += 1 << esz;
5c9b8458 5172 mem_off += N << msz;
5173 } while (reg_off & 63);
5174 } while (reg_off <= reg_last);
5175
5176 for (i = 0; i < N; ++i) {
5177 memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
5178 }
b854fd06 5179 return;
9123aeb6 5180#endif
5181 }
5182
5183 /* The entire operation is in RAM, on valid pages. */
5184
5185 for (i = 0; i < N; ++i) {
5186 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5187 }
5188
5189 mem_off = info.mem_off_first[0];
5190 reg_off = info.reg_off_first[0];
5191 reg_last = info.reg_off_last[0];
5192 host = info.page[0].host;
5193
5194 while (reg_off <= reg_last) {
5195 uint64_t pg = vg[reg_off >> 6];
5196 do {
5197 if ((pg >> (reg_off & 63)) & 1) {
5198 for (i = 0; i < N; ++i) {
5199 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5200 host + mem_off + (i << msz));
5201 }
5202 }
5203 reg_off += 1 << esz;
5c9b8458 5204 mem_off += N << msz;
5205 } while (reg_off <= reg_last && (reg_off & 63));
5206 }
9123aeb6 5207
5208 /*
5209 * Use the slow path to manage the cross-page misalignment.
5210 * But we know this is RAM and cannot trap.
5211 */
5212 mem_off = info.mem_off_split;
5213 if (unlikely(mem_off >= 0)) {
5214 reg_off = info.reg_off_split;
5215 for (i = 0; i < N; ++i) {
5216 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5217 addr + mem_off + (i << msz), retaddr);
5218 }
5219 }
5220
5221 mem_off = info.mem_off_first[1];
5222 if (unlikely(mem_off >= 0)) {
5223 reg_off = info.reg_off_first[1];
5224 reg_last = info.reg_off_last[1];
5225 host = info.page[1].host;
5226
5227 do {
5228 uint64_t pg = vg[reg_off >> 6];
5229 do {
5230 if ((pg >> (reg_off & 63)) & 1) {
5231 for (i = 0; i < N; ++i) {
5232 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5233 host + mem_off + (i << msz));
5234 }
5235 }
5236 reg_off += 1 << esz;
5c9b8458 5237 mem_off += N << msz;
5238 } while (reg_off & 63);
5239 } while (reg_off <= reg_last);
5240 }
5241}
5242
5243static inline QEMU_ALWAYS_INLINE
5244void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
5245 uint32_t desc, const uintptr_t ra,
5246 const int esz, const int msz, const int N,
5247 sve_ldst1_host_fn *host_fn,
5248 sve_ldst1_tlb_fn *tlb_fn)
5249{
5250 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5251 int bit55 = extract64(addr, 55, 1);
5252
5253 /* Remove mtedesc from the normal sve descriptor. */
5254 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5255
5256 /* Perform gross MTE suppression early. */
5257 if (!tbi_check(desc, bit55) ||
5258 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
5259 mtedesc = 0;
5260 }
5261
4c3310c7 5262 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
5263}
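/*
 * Illustrative note (not part of the original source): the shift and
 * extract32() above undo a packing of the form
 *
 *   desc = (mtedesc << (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT)) | sve_desc;
 *
 * i.e. the low SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT bits carry the ordinary
 * SVE descriptor and everything above them carries the MTE descriptor.
 */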
5264
5265#define DO_LD1_1(NAME, ESZ) \
5266void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \
5267 target_ulong addr, uint32_t desc) \
5268{ \
5269 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \
4c3310c7 5270 sve_##NAME##_host, sve_##NAME##_tlb); \
5271} \
5272void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \
5273 target_ulong addr, uint32_t desc) \
5274{ \
5275 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \
5276 sve_##NAME##_host, sve_##NAME##_tlb); \
5277}
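/*
 * For illustration only (a sketch, not part of the original source):
 * DO_LD1_1(ld1bb, MO_8), used below, expands to roughly
 *
 *   void HELPER(sve_ld1bb_r)(CPUARMState *env, void *vg,
 *                            target_ulong addr, uint32_t desc)
 *   {
 *       sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, 1, 0,
 *                 sve_ld1bb_host, sve_ld1bb_tlb);
 *   }
 *
 * plus the matching sve_ld1bb_r_mte helper, which goes through
 * sve_ldN_r_mte so the MTE descriptor is peeled off first.
 */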
5278
5279#define DO_LD1_2(NAME, ESZ, MSZ) \
5280void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \
5281 target_ulong addr, uint32_t desc) \
5282{ \
5283 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
4c3310c7 5284 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
5285} \
5286void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \
5287 target_ulong addr, uint32_t desc) \
5288{ \
5289 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
4c3310c7 5290 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
5291} \
5292void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
4c3310c7 5293 target_ulong addr, uint32_t desc) \
5294{ \
5295 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
5296 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
5297} \
5298void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
4c3310c7 5299 target_ulong addr, uint32_t desc) \
5300{ \
5301 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
5302 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
5303}
5304
5305DO_LD1_1(ld1bb, MO_8)
5306DO_LD1_1(ld1bhu, MO_16)
5307DO_LD1_1(ld1bhs, MO_16)
5308DO_LD1_1(ld1bsu, MO_32)
5309DO_LD1_1(ld1bss, MO_32)
5310DO_LD1_1(ld1bdu, MO_64)
5311DO_LD1_1(ld1bds, MO_64)
9123aeb6 5312
5313DO_LD1_2(ld1hh, MO_16, MO_16)
5314DO_LD1_2(ld1hsu, MO_32, MO_16)
5315DO_LD1_2(ld1hss, MO_32, MO_16)
5316DO_LD1_2(ld1hdu, MO_64, MO_16)
5317DO_LD1_2(ld1hds, MO_64, MO_16)
9123aeb6 5318
5319DO_LD1_2(ld1ss, MO_32, MO_32)
5320DO_LD1_2(ld1sdu, MO_64, MO_32)
5321DO_LD1_2(ld1sds, MO_64, MO_32)
9123aeb6 5322
5c9b8458 5323DO_LD1_2(ld1dd, MO_64, MO_64)
5324
5325#undef DO_LD1_1
5326#undef DO_LD1_2
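/*
 * Naming reminder (illustration only): in the helper names above, the first
 * letter after "ld1" is the memory access size and the second the register
 * element size, with a trailing u/s selecting zero- or sign-extension when
 * they differ. For example, ld1bhu loads one byte per element and
 * zero-extends it into each 16-bit element, ld1bhs sign-extends it, and
 * ld1hh loads halfwords into halfword elements with no extension.
 */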
5327
5328#define DO_LDN_1(N) \
5329void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \
5330 target_ulong addr, uint32_t desc) \
5331{ \
5332 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \
4c3310c7 5333 sve_ld1bb_host, sve_ld1bb_tlb); \
5334} \
5335void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \
5336 target_ulong addr, uint32_t desc) \
5337{ \
5338 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \
5339 sve_ld1bb_host, sve_ld1bb_tlb); \
5340}
5341
5342#define DO_LDN_2(N, SUFF, ESZ) \
5343void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \
5344 target_ulong addr, uint32_t desc) \
5345{ \
5346 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
4c3310c7 5347 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
5348} \
5349void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \
5350 target_ulong addr, uint32_t desc) \
5351{ \
5352 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
4c3310c7 5353 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
5354} \
5355void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \
5356 target_ulong addr, uint32_t desc) \
5357{ \
5358 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
5359 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
5360} \
5361void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \
5362 target_ulong addr, uint32_t desc) \
5363{ \
5364 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
5365 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
5366}
5367
5368DO_LDN_1(2)
5369DO_LDN_1(3)
5370DO_LDN_1(4)
c4e7c493 5371
5372DO_LDN_2(2, hh, MO_16)
5373DO_LDN_2(3, hh, MO_16)
5374DO_LDN_2(4, hh, MO_16)
c4e7c493 5375
5376DO_LDN_2(2, ss, MO_32)
5377DO_LDN_2(3, ss, MO_32)
5378DO_LDN_2(4, ss, MO_32)
c4e7c493 5379
5380DO_LDN_2(2, dd, MO_64)
5381DO_LDN_2(3, dd, MO_64)
5382DO_LDN_2(4, dd, MO_64)
c4e7c493 5383
5384#undef DO_LDN_1
5385#undef DO_LDN_2
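/*
 * Worked example (illustration only): for sve_ld2bb_r with an all-true
 * predicate, the common loop above advances mem_off by N << msz == 2 bytes
 * per element while register slice i reads from addr + mem_off + (i << msz).
 * Element j of zregs[rd] therefore comes from addr + 2*j and element j of
 * zregs[rd + 1] from addr + 2*j + 1, i.e. the usual LD2B de-interleave.
 */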
5386
5387/*
5388 * Load contiguous data, first-fault and no-fault.
5389 *
5390 * For user-only, one could argue that we should hold the mmap_lock during
5391 * the operation so that there is no race between page_check_range and the
5392 * load operation. However, unmapping pages out from under a running thread
5393 * is extraordinarily unlikely. This theoretical race condition also affects
5394 * linux-user/ in its get_user/put_user macros.
5395 *
5396 * TODO: Construct some helpers, written in assembly, that interact with
5397 * handle_cpu_signal to produce memory ops which can properly report errors
5398 * without racing.
5399 */
5400
5401/* Fault on byte I. All bits in FFR from I are cleared. The vector
5402 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
5403 * option, which leaves subsequent data unchanged.
5404 */
5405static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
5406{
5407 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
5408
5409 if (i & 63) {
5410 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
5411 i = ROUND_UP(i, 64);
5412 }
5413 for (; i < oprsz; i += 64) {
5414 ffr[i / 64] = 0;
5415 }
5416}
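/*
 * Example (illustration only): with a 512-bit vector (oprsz == 64 bytes)
 * and a fault recorded at byte offset i == 20, the code above keeps FFR
 * bits 0..19 via MAKE_64BIT_MASK(0, 20) and clears bits 20..63; for larger
 * vectors the loop then zeroes every following FFR word as well.
 */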
5417
9123aeb6 5418/*
c647673c 5419 * Common helper for all contiguous no-fault and first-fault loads.
9123aeb6 5420 */
5421static inline QEMU_ALWAYS_INLINE
5422void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
aa13f7c3 5423 uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
5424 const int esz, const int msz, const SVEContFault fault,
5425 sve_ldst1_host_fn *host_fn,
5426 sve_ldst1_tlb_fn *tlb_fn)
5427{
ba080b86 5428 const unsigned rd = simd_data(desc);
500d0484 5429 void *vd = &env->vfp.zregs[rd];
9123aeb6 5430 const intptr_t reg_max = simd_oprsz(desc);
5431 intptr_t reg_off, mem_off, reg_last;
5432 SVEContLdSt info;
5433 int flags;
5434 void *host;
5435
5436 /* Find the active elements. */
5437 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
5438 /* The entire predicate was false; no load occurs. */
5439 memset(vd, 0, reg_max);
5440 return;
5441 }
c647673c 5442 reg_off = info.reg_off_first[0];
9123aeb6 5443
5444 /* Probe the page(s). */
5445 if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
5446 /* Fault on first element. */
5447 tcg_debug_assert(fault == FAULT_NO);
5448 memset(vd, 0, reg_max);
5449 goto do_fault;
5450 }
5451
5452 mem_off = info.mem_off_first[0];
5453 flags = info.page[0].flags;
5454
5455 /*
5456 * Disable MTE checking if the Tagged bit is not set. Since TBI must
5457 * be set within MTEDESC for MTE, !mtedesc => !mte_active.
5458 */
5459 if (!arm_tlb_mte_tagged(&info.page[0].attrs)) {
5460 mtedesc = 0;
5461 }
5462
c647673c 5463 if (fault == FAULT_FIRST) {
5464 /* Trapping mte check for the first-fault element. */
5465 if (mtedesc) {
bd47b61c 5466 mte_check(env, mtedesc, addr + mem_off, retaddr);
5467 }
5468
5469 /*
5470 * Special handling of the first active element,
5471 * if it crosses a page boundary or is MMIO.
5472 */
5473 bool is_split = mem_off == info.mem_off_split;
5474 if (unlikely(flags != 0) || unlikely(is_split)) {
5475 /*
5476 * Use the slow path for cross-page handling.
5477 * Might trap for MMIO or watchpoints.
5478 */
5479 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
5480
5481 /* After any fault, zero the other elements. */
9123aeb6 5482 swap_memzero(vd, reg_off);
5483 reg_off += 1 << esz;
5484 mem_off += 1 << msz;
5485 swap_memzero(vd + reg_off, reg_max - reg_off);
5486
5487 if (is_split) {
5488 goto second_page;
5489 }
5490 } else {
5491 memset(vd, 0, reg_max);
5492 }
5493 } else {
5494 memset(vd, 0, reg_max);
5495 if (unlikely(mem_off == info.mem_off_split)) {
5496 /* The first active element crosses a page boundary. */
5497 flags |= info.page[1].flags;
5498 if (unlikely(flags & TLB_MMIO)) {
5499 /* Some page is MMIO, see below. */
5500 goto do_fault;
5501 }
5502 if (unlikely(flags & TLB_WATCHPOINT) &&
5503 (cpu_watchpoint_address_matches
5504 (env_cpu(env), addr + mem_off, 1 << msz)
5505 & BP_MEM_READ)) {
5506 /* Watchpoint hit, see below. */
5507 goto do_fault;
5508 }
d304d280 5509 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
5510 goto do_fault;
5511 }
5512 /*
5513 * Use the slow path for cross-page handling.
5514 * This is RAM, without a watchpoint, and will not trap.
5515 */
5516 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
5517 goto second_page;
5518 }
5519 }
5520
9123aeb6 5521 /*
5522 * From this point on, all memory operations are MemSingleNF.
5523 *
5524 * Per the MemSingleNF pseudocode, a no-fault load from Device memory
5525 * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
5526 *
5527 * Unfortunately we do not have access to the memory attributes from the
5528 * PTE to tell Device memory from Normal memory. So we make a mostly
5529 * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
5530 * This gives the right answer for the common cases of "Normal memory,
5531 * backed by host RAM" and "Device memory, backed by MMIO".
5532 * The architecture allows us to suppress an NF load and return
5533 * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
5534 * case of "Normal memory, backed by MMIO" is permitted. The case we
5535 * get wrong is "Device memory, backed by host RAM", for which we
5536 * should return (UNKNOWN, FAULT) for but do not.
5537 * should return (UNKNOWN, FAULT) but do not.
5538 * Similarly, CPU_BP breakpoints would raise exceptions, and so
5539 * return (UNKNOWN, FAULT). For simplicity, we consider gdb and
5540 * architectural breakpoints the same.
9123aeb6 5541 */
5542 if (unlikely(flags & TLB_MMIO)) {
5543 goto do_fault;
9123aeb6 5544 }
9123aeb6 5545
5546 reg_last = info.reg_off_last[0];
5547 host = info.page[0].host;
9123aeb6 5548
5549 do {
5550 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
cf4a49b7 5551 do {
5552 if ((pg >> (reg_off & 63)) & 1) {
5553 if (unlikely(flags & TLB_WATCHPOINT) &&
5554 (cpu_watchpoint_address_matches
5555 (env_cpu(env), addr + mem_off, 1 << msz)
5556 & BP_MEM_READ)) {
5557 goto do_fault;
5558 }
d304d280 5559 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
5560 goto do_fault;
5561 }
5562 host_fn(vd, reg_off, host + mem_off);
5563 }
cf4a49b7 5564 reg_off += 1 << esz;
5565 mem_off += 1 << msz;
5566 } while (reg_off <= reg_last && (reg_off & 63));
5567 } while (reg_off <= reg_last);
9123aeb6 5568
5569 /*
5570 * MemSingleNF is allowed to fail for any reason. We have special
5571 * code above to handle the first element crossing a page boundary.
5572 * As an implementation choice, decline to handle a cross-page element
5573 * in any other position.
5574 */
5575 reg_off = info.reg_off_split;
5576 if (reg_off >= 0) {
5577 goto do_fault;
5578 }
9123aeb6 5579
5580 second_page:
5581 reg_off = info.reg_off_first[1];
5582 if (likely(reg_off < 0)) {
5583 /* No active elements on the second page. All done. */
5584 return;
5585 }
9123aeb6 5586
9123aeb6 5587 /*
5588 * MemSingleNF is allowed to fail for any reason. As an implementation
5589 * choice, decline to handle elements on the second page. This should
5590 * be low frequency as the guest walks through memory -- the next
5591 * iteration of the guest's loop should be aligned on the page boundary,
5592 * and then all following iterations will stay aligned.
9123aeb6 5593 */
9123aeb6 5594
c647673c 5595 do_fault:
5596 record_fault(env, reg_off, reg_max);
5597}
5598
5599static inline QEMU_ALWAYS_INLINE
5600void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
5601 uint32_t desc, const uintptr_t retaddr,
5602 const int esz, const int msz, const SVEContFault fault,
5603 sve_ldst1_host_fn *host_fn,
5604 sve_ldst1_tlb_fn *tlb_fn)
5605{
5606 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5607 int bit55 = extract64(addr, 55, 1);
5608
5609 /* Remove mtedesc from the normal sve descriptor. */
5610 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5611
5612 /* Perform gross MTE suppression early. */
5613 if (!tbi_check(desc, bit55) ||
5614 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
5615 mtedesc = 0;
5616 }
5617
5618 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
5619 esz, msz, fault, host_fn, tlb_fn);
5620}
5621
5622#define DO_LDFF1_LDNF1_1(PART, ESZ) \
5623void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \
5624 target_ulong addr, uint32_t desc) \
e2654d75 5625{ \
aa13f7c3 5626 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
c647673c 5627 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
e2654d75 5628} \
5629void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \
5630 target_ulong addr, uint32_t desc) \
e2654d75 5631{ \
5632 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
5633 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
5634} \
5635void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \
5636 target_ulong addr, uint32_t desc) \
5637{ \
5638 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
5639 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
5640} \
5641void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \
5642 target_ulong addr, uint32_t desc) \
5643{ \
5644 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
c647673c 5645 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
5646}
5647
aa13f7c3 5648#define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
5649void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \
5650 target_ulong addr, uint32_t desc) \
e2654d75 5651{ \
aa13f7c3 5652 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
c647673c 5653 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
9123aeb6 5654} \
5655void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \
5656 target_ulong addr, uint32_t desc) \
9123aeb6 5657{ \
aa13f7c3 5658 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
c647673c 5659 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
5660} \
5661void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \
5662 target_ulong addr, uint32_t desc) \
5663{ \
aa13f7c3 5664 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
c647673c 5665 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
5666} \
5667void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \
5668 target_ulong addr, uint32_t desc) \
5669{ \
aa13f7c3 5670 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
c647673c 5671 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
5672} \
5673void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
5674 target_ulong addr, uint32_t desc) \
5675{ \
5676 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
5677 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
5678} \
5679void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
5680 target_ulong addr, uint32_t desc) \
5681{ \
5682 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
5683 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
5684} \
5685void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
5686 target_ulong addr, uint32_t desc) \
5687{ \
5688 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
5689 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
5690} \
5691void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
5692 target_ulong addr, uint32_t desc) \
5693{ \
5694 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
5695 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
5696}
5697
5698DO_LDFF1_LDNF1_1(bb, MO_8)
5699DO_LDFF1_LDNF1_1(bhu, MO_16)
5700DO_LDFF1_LDNF1_1(bhs, MO_16)
5701DO_LDFF1_LDNF1_1(bsu, MO_32)
5702DO_LDFF1_LDNF1_1(bss, MO_32)
5703DO_LDFF1_LDNF1_1(bdu, MO_64)
5704DO_LDFF1_LDNF1_1(bds, MO_64)
e2654d75 5705
5706DO_LDFF1_LDNF1_2(hh, MO_16, MO_16)
5707DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
5708DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
5709DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
5710DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
e2654d75 5711
5712DO_LDFF1_LDNF1_2(ss, MO_32, MO_32)
5713DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
5714DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
e2654d75 5715
c647673c 5716DO_LDFF1_LDNF1_2(dd, MO_64, MO_64)
e2654d75 5717
5718#undef DO_LDFF1_LDNF1_1
5719#undef DO_LDFF1_LDNF1_2
1a039c7e 5720
9fd46c83 5721/*
0fa476c1 5722 * Common helper for all contiguous 1,2,3,4-register predicated stores.
9fd46c83 5723 */
5724
5725static inline QEMU_ALWAYS_INLINE
5726void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
5727 uint32_t desc, const uintptr_t retaddr,
5728 const int esz, const int msz, const int N, uint32_t mtedesc,
0fa476c1 5729 sve_ldst1_host_fn *host_fn,
4c3310c7 5730 sve_ldst1_tlb_fn *tlb_fn)
9fd46c83 5731{
ba080b86 5732 const unsigned rd = simd_data(desc);
5733 const intptr_t reg_max = simd_oprsz(desc);
5734 intptr_t reg_off, reg_last, mem_off;
5735 SVEContLdSt info;
5736 void *host;
5737 int i, flags;
1a039c7e 5738
5739 /* Find the active elements. */
5740 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
5741 /* The entire predicate was false; no store occurs. */
5742 return;
9fd46c83 5743 }
1a039c7e 5744
5745 /* Probe the page(s). Exit with exception for any invalid page. */
5746 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
1a039c7e 5747
5748 /* Handle watchpoints for all active elements. */
5749 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
5750 BP_MEM_WRITE, retaddr);
5751
5752 /*
5753 * Handle mte checks for all active elements.
5754 * Since TBI must be set for MTE, !mtedesc => !mte_active.
5755 */
5756 if (mtedesc) {
5757 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
5758 mtedesc, retaddr);
71b9f394 5759 }
5760
5761 flags = info.page[0].flags | info.page[1].flags;
5762 if (unlikely(flags != 0)) {
5763#ifdef CONFIG_USER_ONLY
5764 g_assert_not_reached();
5765#else
5766 /*
5767 * At least one page includes MMIO.
5768 * Any bus operation can fail with cpu_transaction_failed,
5769 * which for ARM will raise SyncExternal. We cannot avoid
5770 * this fault and will leave with the store incomplete.
5771 */
5772 mem_off = info.mem_off_first[0];
5773 reg_off = info.reg_off_first[0];
5774 reg_last = info.reg_off_last[1];
5775 if (reg_last < 0) {
5776 reg_last = info.reg_off_split;
5777 if (reg_last < 0) {
5778 reg_last = info.reg_off_last[0];
9fd46c83 5779 }
5780 }
5781
5782 do {
5783 uint64_t pg = vg[reg_off >> 6];
5784 do {
5785 if ((pg >> (reg_off & 63)) & 1) {
5786 for (i = 0; i < N; ++i) {
5787 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5788 addr + mem_off + (i << msz), retaddr);
5789 }
5790 }
5791 reg_off += 1 << esz;
5792 mem_off += N << msz;
5793 } while (reg_off & 63);
5794 } while (reg_off <= reg_last);
5795 return;
5796#endif
1a039c7e 5797 }
1a039c7e 5798
5799 mem_off = info.mem_off_first[0];
5800 reg_off = info.reg_off_first[0];
5801 reg_last = info.reg_off_last[0];
5802 host = info.page[0].host;
1a039c7e 5803
5804 while (reg_off <= reg_last) {
5805 uint64_t pg = vg[reg_off >> 6];
9fd46c83 5806 do {
5807 if ((pg >> (reg_off & 63)) & 1) {
5808 for (i = 0; i < N; ++i) {
5809 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5810 host + mem_off + (i << msz));
5811 }
9fd46c83 5812 }
5813 reg_off += 1 << esz;
5814 mem_off += N << msz;
5815 } while (reg_off <= reg_last && (reg_off & 63));
1a039c7e 5816 }
1a039c7e 5817
5818 /*
5819 * Use the slow path to manage the cross-page misalignment.
5820 * But we know this is RAM and cannot trap.
5821 */
5822 mem_off = info.mem_off_split;
5823 if (unlikely(mem_off >= 0)) {
5824 reg_off = info.reg_off_split;
5825 for (i = 0; i < N; ++i) {
5826 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5827 addr + mem_off + (i << msz), retaddr);
5828 }
5829 }
5830
5831 mem_off = info.mem_off_first[1];
5832 if (unlikely(mem_off >= 0)) {
5833 reg_off = info.reg_off_first[1];
5834 reg_last = info.reg_off_last[1];
5835 host = info.page[1].host;
1a039c7e 5836
9fd46c83 5837 do {
5838 uint64_t pg = vg[reg_off >> 6];
5839 do {
5840 if ((pg >> (reg_off & 63)) & 1) {
5841 for (i = 0; i < N; ++i) {
5842 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5843 host + mem_off + (i << msz));
5844 }
5845 }
5846 reg_off += 1 << esz;
5847 mem_off += N << msz;
5848 } while (reg_off & 63);
5849 } while (reg_off <= reg_last);
1a039c7e 5850 }
5851}
5852
5853static inline QEMU_ALWAYS_INLINE
5854void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
5855 uint32_t desc, const uintptr_t ra,
5856 const int esz, const int msz, const int N,
5857 sve_ldst1_host_fn *host_fn,
5858 sve_ldst1_tlb_fn *tlb_fn)
5859{
5860 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5861 int bit55 = extract64(addr, 55, 1);
5862
5863 /* Remove mtedesc from the normal sve descriptor. */
5864 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5865
5866 /* Perform gross MTE suppression early. */
5867 if (!tbi_check(desc, bit55) ||
5868 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
5869 mtedesc = 0;
5870 }
5871
4c3310c7 5872 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
1a039c7e 5873}
f6dbf62a 5874
5875#define DO_STN_1(N, NAME, ESZ) \
5876void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \
5877 target_ulong addr, uint32_t desc) \
5878{ \
5879 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \
4c3310c7 5880 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
5881} \
5882void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \
5883 target_ulong addr, uint32_t desc) \
5884{ \
5885 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \
5886 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
5887}
5888
5889#define DO_STN_2(N, NAME, ESZ, MSZ) \
5890void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \
5891 target_ulong addr, uint32_t desc) \
5892{ \
5893 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
4c3310c7 5894 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
5895} \
5896void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \
5897 target_ulong addr, uint32_t desc) \
5898{ \
5899 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
4c3310c7 5900 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
5901} \
5902void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
5903 target_ulong addr, uint32_t desc) \
5904{ \
5905 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
5906 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
5907} \
5908void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
5909 target_ulong addr, uint32_t desc) \
5910{ \
5911 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
5912 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
5913}
5914
5915DO_STN_1(1, bb, MO_8)
5916DO_STN_1(1, bh, MO_16)
5917DO_STN_1(1, bs, MO_32)
5918DO_STN_1(1, bd, MO_64)
5919DO_STN_1(2, bb, MO_8)
5920DO_STN_1(3, bb, MO_8)
5921DO_STN_1(4, bb, MO_8)
5922
5923DO_STN_2(1, hh, MO_16, MO_16)
5924DO_STN_2(1, hs, MO_32, MO_16)
5925DO_STN_2(1, hd, MO_64, MO_16)
5926DO_STN_2(2, hh, MO_16, MO_16)
5927DO_STN_2(3, hh, MO_16, MO_16)
5928DO_STN_2(4, hh, MO_16, MO_16)
5929
5930DO_STN_2(1, ss, MO_32, MO_32)
5931DO_STN_2(1, sd, MO_64, MO_32)
5932DO_STN_2(2, ss, MO_32, MO_32)
5933DO_STN_2(3, ss, MO_32, MO_32)
5934DO_STN_2(4, ss, MO_32, MO_32)
5935
5936DO_STN_2(1, dd, MO_64, MO_64)
5937DO_STN_2(2, dd, MO_64, MO_64)
5938DO_STN_2(3, dd, MO_64, MO_64)
5939DO_STN_2(4, dd, MO_64, MO_64)
5940
5941#undef DO_STN_1
5942#undef DO_STN_2
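/*
 * Naming note with an example (illustration only): in DO_STN_1(1, bs, MO_32)
 * above, the element size is MO_32 while the memory size is fixed at MO_8,
 * so sve_st1bs_r implements the truncating ST1B form that stores only the
 * low byte of each active 32-bit element.
 */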
5943
5944/*
5945 * Loads with a vector index.
5946 */
673e9fa6 5947
5948/*
5949 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
5950 */
5951typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
5952
5953static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
5954{
5955 return *(uint32_t *)(reg + H1_4(reg_ofs));
5956}
5957
5958static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
5959{
5960 return *(int32_t *)(reg + H1_4(reg_ofs));
5961}
5962
5963static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
5964{
5965 return (uint32_t)*(uint64_t *)(reg + reg_ofs);
5966}
5967
5968static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
5969{
5970 return (int32_t)*(uint64_t *)(reg + reg_ofs);
5971}
5972
5973static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
5974{
5975 return *(uint64_t *)(reg + reg_ofs);
5976}
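/*
 * Illustration only: in the gather/scatter helpers below, the effective
 * address of element j is base + (off_fn(zm, offset_of_j) << scale), with
 * scale taken from simd_data(desc). For example, with off_zsu_s and
 * scale == 2 (a word-scaled 32-bit offset form), element j is accessed at
 * base + ((uint32_t)zm.s[j] << 2).
 */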
5977
5978static inline QEMU_ALWAYS_INLINE
5979void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5980 target_ulong base, uint32_t desc, uintptr_t retaddr,
5981 uint32_t mtedesc, int esize, int msize,
5982 zreg_off_fn *off_fn,
5983 sve_ldst1_host_fn *host_fn,
5984 sve_ldst1_tlb_fn *tlb_fn)
d4f75f25 5985{
5986 const int mmu_idx = cpu_mmu_index(env, false);
5987 const intptr_t reg_max = simd_oprsz(desc);
ba080b86 5988 const int scale = simd_data(desc);
5989 ARMVectorReg scratch;
5990 intptr_t reg_off;
5991 SVEHostPage info, info2;
d4f75f25 5992
5993 memset(&scratch, 0, reg_max);
5994 reg_off = 0;
5995 do {
5996 uint64_t pg = vg[reg_off >> 6];
5997 do {
5998 if (likely(pg & 1)) {
5999 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6000 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6001
6002 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
6003 mmu_idx, retaddr);
6004
6005 if (likely(in_page >= msize)) {
6006 if (unlikely(info.flags & TLB_WATCHPOINT)) {
6007 cpu_check_watchpoint(env_cpu(env), addr, msize,
6008 info.attrs, BP_MEM_READ, retaddr);
6009 }
d28d12f0 6010 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
bd47b61c 6011 mte_check(env, mtedesc, addr, retaddr);
d28d12f0 6012 }
6013 host_fn(&scratch, reg_off, info.host);
6014 } else {
6015 /* Element crosses the page boundary. */
6016 sve_probe_page(&info2, false, env, addr + in_page, 0,
6017 MMU_DATA_LOAD, mmu_idx, retaddr);
6018 if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
6019 cpu_check_watchpoint(env_cpu(env), addr,
6020 msize, info.attrs,
6021 BP_MEM_READ, retaddr);
6022 }
d28d12f0 6023 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
bd47b61c 6024 mte_check(env, mtedesc, addr, retaddr);
d28d12f0 6025 }
6026 tlb_fn(env, &scratch, reg_off, addr, retaddr);
6027 }
d4f75f25 6028 }
6029 reg_off += esize;
6030 pg >>= esize;
6031 } while (reg_off & 63);
6032 } while (reg_off < reg_max);
6033
6034 /* Wait until all exceptions have been raised to write back. */
10a85e2c 6035 memcpy(vd, &scratch, reg_max);
6036}
6037
6038static inline QEMU_ALWAYS_INLINE
6039void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6040 target_ulong base, uint32_t desc, uintptr_t retaddr,
6041 int esize, int msize, zreg_off_fn *off_fn,
6042 sve_ldst1_host_fn *host_fn,
6043 sve_ldst1_tlb_fn *tlb_fn)
6044{
6045 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6046 /* Remove mtedesc from the normal sve descriptor. */
6047 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6048
6049 /*
6050 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6051 * offset base entirely over the address space hole to change the
6052 * pointer tag, or change the bit55 selector. So we could here
6053 * examine TBI + TCMA like we do for sve_ldN_r_mte().
6054 */
6055 sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6056 esize, msize, off_fn, host_fn, tlb_fn);
6057}
6058
6059#define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
6060void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
6061 void *vm, target_ulong base, uint32_t desc) \
6062{ \
d28d12f0 6063 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
10a85e2c 6064 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6065} \
6066void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6067 void *vm, target_ulong base, uint32_t desc) \
6068{ \
6069 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
6070 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
10a85e2c 6071}
d4f75f25 6072
6073#define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
6074void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
6075 void *vm, target_ulong base, uint32_t desc) \
6076{ \
d28d12f0 6077 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
10a85e2c 6078 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6079} \
6080void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6081 void *vm, target_ulong base, uint32_t desc) \
6082{ \
6083 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
6084 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6085}
6086
6087DO_LD1_ZPZ_S(bsu, zsu, MO_8)
6088DO_LD1_ZPZ_S(bsu, zss, MO_8)
6089DO_LD1_ZPZ_D(bdu, zsu, MO_8)
6090DO_LD1_ZPZ_D(bdu, zss, MO_8)
6091DO_LD1_ZPZ_D(bdu, zd, MO_8)
6092
6093DO_LD1_ZPZ_S(bss, zsu, MO_8)
6094DO_LD1_ZPZ_S(bss, zss, MO_8)
6095DO_LD1_ZPZ_D(bds, zsu, MO_8)
6096DO_LD1_ZPZ_D(bds, zss, MO_8)
6097DO_LD1_ZPZ_D(bds, zd, MO_8)
6098
6099DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
6100DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
6101DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
6102DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
6103DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
6104
6105DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
6106DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
6107DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
6108DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
6109DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
6110
6111DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
6112DO_LD1_ZPZ_S(hss_le, zss, MO_16)
6113DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
6114DO_LD1_ZPZ_D(hds_le, zss, MO_16)
6115DO_LD1_ZPZ_D(hds_le, zd, MO_16)
6116
6117DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
6118DO_LD1_ZPZ_S(hss_be, zss, MO_16)
6119DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
6120DO_LD1_ZPZ_D(hds_be, zss, MO_16)
6121DO_LD1_ZPZ_D(hds_be, zd, MO_16)
6122
6123DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
6124DO_LD1_ZPZ_S(ss_le, zss, MO_32)
6125DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
6126DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
6127DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
6128
6129DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
6130DO_LD1_ZPZ_S(ss_be, zss, MO_32)
6131DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
6132DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
6133DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
6134
6135DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
6136DO_LD1_ZPZ_D(sds_le, zss, MO_32)
6137DO_LD1_ZPZ_D(sds_le, zd, MO_32)
6138
6139DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
6140DO_LD1_ZPZ_D(sds_be, zss, MO_32)
6141DO_LD1_ZPZ_D(sds_be, zd, MO_32)
6142
6143DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
6144DO_LD1_ZPZ_D(dd_le, zss, MO_64)
6145DO_LD1_ZPZ_D(dd_le, zd, MO_64)
6146
6147DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
6148DO_LD1_ZPZ_D(dd_be, zss, MO_64)
6149DO_LD1_ZPZ_D(dd_be, zd, MO_64)
6150
6151#undef DO_LD1_ZPZ_S
6152#undef DO_LD1_ZPZ_D
673e9fa6 6153
6154/* First fault loads with a vector index. */
6155
116347ce 6156/*
50de9b78 6157 * Common helpers for all gather first-faulting loads.
116347ce 6158 */
6159
6160static inline QEMU_ALWAYS_INLINE
6161void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6162 target_ulong base, uint32_t desc, uintptr_t retaddr,
6163 uint32_t mtedesc, const int esz, const int msz,
6164 zreg_off_fn *off_fn,
6165 sve_ldst1_host_fn *host_fn,
6166 sve_ldst1_tlb_fn *tlb_fn)
116347ce 6167{
50de9b78 6168 const int mmu_idx = cpu_mmu_index(env, false);
6169 const intptr_t reg_max = simd_oprsz(desc);
6170 const int scale = simd_data(desc);
6171 const int esize = 1 << esz;
6172 const int msize = 1 << msz;
6173 intptr_t reg_off;
6174 SVEHostPage info;
6175 target_ulong addr, in_page;
6176
6177 /* Skip to the first true predicate. */
6178 reg_off = find_next_active(vg, 0, reg_max, esz);
6179 if (unlikely(reg_off >= reg_max)) {
6180 /* The entire predicate was false; no load occurs. */
6181 memset(vd, 0, reg_max);
6182 return;
6183 }
6184
6185 /*
6186 * Probe the first element, allowing faults.
6187 */
6188 addr = base + (off_fn(vm, reg_off) << scale);
d28d12f0 6189 if (mtedesc) {
bd47b61c 6190 mte_check(env, mtedesc, addr, retaddr);
d28d12f0 6191 }
50de9b78 6192 tlb_fn(env, vd, reg_off, addr, retaddr);
ed67eb7f 6193
6194 /* After any fault, zero the other elements. */
6195 swap_memzero(vd, reg_off);
6196 reg_off += esize;
6197 swap_memzero(vd + reg_off, reg_max - reg_off);
116347ce 6198
6199 /*
6200 * Probe the remaining elements, not allowing faults.
6201 */
6202 while (reg_off < reg_max) {
6203 uint64_t pg = vg[reg_off >> 6];
6204 do {
6205 if (likely((pg >> (reg_off & 63)) & 1)) {
6206 addr = base + (off_fn(vm, reg_off) << scale);
6207 in_page = -(addr | TARGET_PAGE_MASK);
116347ce 6208
6209 if (unlikely(in_page < msize)) {
6210 /* Stop if the element crosses a page boundary. */
6211 goto fault;
6212 }
ed67eb7f 6213
6214 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
6215 mmu_idx, retaddr);
6216 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
6217 goto fault;
6218 }
6219 if (unlikely(info.flags & TLB_WATCHPOINT) &&
6220 (cpu_watchpoint_address_matches
6221 (env_cpu(env), addr, msize) & BP_MEM_READ)) {
6222 goto fault;
6223 }
6224 if (mtedesc &&
6225 arm_tlb_mte_tagged(&info.attrs) &&
d304d280 6226 !mte_probe(env, mtedesc, addr)) {
6227 goto fault;
6228 }
116347ce 6229
50de9b78 6230 host_fn(vd, reg_off, info.host);
116347ce 6231 }
6232 reg_off += esize;
6233 } while (reg_off & 63);
116347ce 6234 }
50de9b78 6235 return;
116347ce 6236
6237 fault:
6238 record_fault(env, reg_off, reg_max);
6239}
6240
6241static inline QEMU_ALWAYS_INLINE
6242void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6243 target_ulong base, uint32_t desc, uintptr_t retaddr,
6244 const int esz, const int msz,
6245 zreg_off_fn *off_fn,
6246 sve_ldst1_host_fn *host_fn,
6247 sve_ldst1_tlb_fn *tlb_fn)
6248{
6249 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6250 /* Remove mtedesc from the normal sve descriptor. */
6251 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6252
6253 /*
6254 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6255 * offset base entirely over the address space hole to change the
6256 * pointer tag, or change the bit55 selector. So we could here
6257 * examine TBI + TCMA like we do for sve_ldN_r_mte().
6258 */
6259 sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6260 esz, msz, off_fn, host_fn, tlb_fn);
6261}
6262
6263#define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \
6264void HELPER(sve_ldff##MEM##_##OFS) \
6265 (CPUARMState *env, void *vd, void *vg, \
6266 void *vm, target_ulong base, uint32_t desc) \
6267{ \
6268 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \
6269 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6270} \
6271void HELPER(sve_ldff##MEM##_##OFS##_mte) \
6272 (CPUARMState *env, void *vd, void *vg, \
6273 void *vm, target_ulong base, uint32_t desc) \
6274{ \
6275 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \
6276 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6277}
6278
6279#define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \
6280void HELPER(sve_ldff##MEM##_##OFS) \
6281 (CPUARMState *env, void *vd, void *vg, \
6282 void *vm, target_ulong base, uint32_t desc) \
6283{ \
6284 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \
6285 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6286} \
6287void HELPER(sve_ldff##MEM##_##OFS##_mte) \
6288 (CPUARMState *env, void *vd, void *vg, \
6289 void *vm, target_ulong base, uint32_t desc) \
6290{ \
6291 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \
6292 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6293}
6294
6295DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
6296DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
6297DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
6298DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
6299DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
6300
6301DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
6302DO_LDFF1_ZPZ_S(bss, zss, MO_8)
6303DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
6304DO_LDFF1_ZPZ_D(bds, zss, MO_8)
6305DO_LDFF1_ZPZ_D(bds, zd, MO_8)
6306
6307DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
6308DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
6309DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
6310DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
6311DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
6312
6313DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
6314DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
6315DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
6316DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
6317DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
6318
6319DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
6320DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
6321DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
6322DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
6323DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
6324
6325DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
6326DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
6327DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
6328DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
6329DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
6330
6331DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32)
6332DO_LDFF1_ZPZ_S(ss_le, zss, MO_32)
6333DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
6334DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
6335DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
6336
6337DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32)
6338DO_LDFF1_ZPZ_S(ss_be, zss, MO_32)
6339DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
6340DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
6341DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
6342
6343DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
6344DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
6345DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
6346
6347DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
6348DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
6349DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
6350
6351DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
6352DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
6353DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
6354
6355DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
6356DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
6357DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
ed67eb7f 6358
6359/* Stores with a vector index. */
6360
6361static inline QEMU_ALWAYS_INLINE
6362void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6363 target_ulong base, uint32_t desc, uintptr_t retaddr,
6364 uint32_t mtedesc, int esize, int msize,
6365 zreg_off_fn *off_fn,
6366 sve_ldst1_host_fn *host_fn,
6367 sve_ldst1_tlb_fn *tlb_fn)
78cf1b88 6368{
6369 const int mmu_idx = cpu_mmu_index(env, false);
6370 const intptr_t reg_max = simd_oprsz(desc);
ba080b86 6371 const int scale = simd_data(desc);
6372 void *host[ARM_MAX_VQ * 4];
6373 intptr_t reg_off, i;
6374 SVEHostPage info, info2;
f6dbf62a 6375
6376 /*
6377 * Probe all of the elements for host addresses and flags.
6378 */
6379 i = reg_off = 0;
6380 do {
6381 uint64_t pg = vg[reg_off >> 6];
78cf1b88 6382 do {
6383 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6384 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
f6dbf62a 6385
6386 host[i] = NULL;
6387 if (likely((pg >> (reg_off & 63)) & 1)) {
6388 if (likely(in_page >= msize)) {
6389 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
6390 mmu_idx, retaddr);
6391 host[i] = info.host;
6392 } else {
6393 /*
6394 * Element crosses the page boundary.
6395 * Probe both pages, but do not record the host address,
6396 * so that we use the slow path.
6397 */
6398 sve_probe_page(&info, false, env, addr, 0,
6399 MMU_DATA_STORE, mmu_idx, retaddr);
6400 sve_probe_page(&info2, false, env, addr + in_page, 0,
6401 MMU_DATA_STORE, mmu_idx, retaddr);
6402 info.flags |= info2.flags;
6403 }
f6dbf62a 6404
6405 if (unlikely(info.flags & TLB_WATCHPOINT)) {
6406 cpu_check_watchpoint(env_cpu(env), addr, msize,
6407 info.attrs, BP_MEM_WRITE, retaddr);
6408 }
6409
6410 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
bd47b61c 6411 mte_check(env, mtedesc, addr, retaddr);
d28d12f0 6412 }
6413 }
6414 i += 1;
6415 reg_off += esize;
6416 } while (reg_off & 63);
6417 } while (reg_off < reg_max);
6418
6419 /*
6420 * Now that we have recognized all exceptions except SyncExternal
6421 * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
6422 *
6423 * Note for the common case of an element in RAM, not crossing a page
6424 * boundary, we have stored the host address in host[]. This doubles
6425 * as a first-level check against the predicate, since only enabled
6426 * elements have non-null host addresses.
6427 */
6428 i = reg_off = 0;
6429 do {
6430 void *h = host[i];
6431 if (likely(h != NULL)) {
6432 host_fn(vd, reg_off, h);
6433 } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
6434 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6435 tlb_fn(env, vd, reg_off, addr, retaddr);
78cf1b88 6436 }
6437 i += 1;
6438 reg_off += esize;
6439 } while (reg_off < reg_max);
78cf1b88 6440}
f6dbf62a 6441
6442static inline QEMU_ALWAYS_INLINE
6443void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6444 target_ulong base, uint32_t desc, uintptr_t retaddr,
6445 int esize, int msize, zreg_off_fn *off_fn,
6446 sve_ldst1_host_fn *host_fn,
6447 sve_ldst1_tlb_fn *tlb_fn)
6448{
6449 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6450 /* Remove mtedesc from the normal sve descriptor. */
6451 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6452
6453 /*
6454 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6455 * offset base entirely over the address space hole to change the
6456 * pointer tag, or change the bit55 selector. So we could here
6457 * examine TBI + TCMA like we do for sve_ldN_r_mte().
6458 */
6459 sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6460 esize, msize, off_fn, host_fn, tlb_fn);
6461}
6462
6463#define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \
6464void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
88a660a4 6465 void *vm, target_ulong base, uint32_t desc) \
6466{ \
6467 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
6468 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
6469} \
6470void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6471 void *vm, target_ulong base, uint32_t desc) \
6472{ \
6473 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
6474 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
78cf1b88 6475}
f6dbf62a 6476
6477#define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \
6478void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
88a660a4 6479 void *vm, target_ulong base, uint32_t desc) \
6480{ \
6481 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
6482 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
6483} \
6484void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6485 void *vm, target_ulong base, uint32_t desc) \
6486{ \
6487 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
6488 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
6489}
6490
6491DO_ST1_ZPZ_S(bs, zsu, MO_8)
6492DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
6493DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
6494DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
6495DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
6496
6497DO_ST1_ZPZ_S(bs, zss, MO_8)
6498DO_ST1_ZPZ_S(hs_le, zss, MO_16)
6499DO_ST1_ZPZ_S(hs_be, zss, MO_16)
6500DO_ST1_ZPZ_S(ss_le, zss, MO_32)
6501DO_ST1_ZPZ_S(ss_be, zss, MO_32)
6502
6503DO_ST1_ZPZ_D(bd, zsu, MO_8)
6504DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
6505DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
6506DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
6507DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
6508DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
6509DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
6510
6511DO_ST1_ZPZ_D(bd, zss, MO_8)
6512DO_ST1_ZPZ_D(hd_le, zss, MO_16)
6513DO_ST1_ZPZ_D(hd_be, zss, MO_16)
6514DO_ST1_ZPZ_D(sd_le, zss, MO_32)
6515DO_ST1_ZPZ_D(sd_be, zss, MO_32)
6516DO_ST1_ZPZ_D(dd_le, zss, MO_64)
6517DO_ST1_ZPZ_D(dd_be, zss, MO_64)
6518
6519DO_ST1_ZPZ_D(bd, zd, MO_8)
6520DO_ST1_ZPZ_D(hd_le, zd, MO_16)
6521DO_ST1_ZPZ_D(hd_be, zd, MO_16)
6522DO_ST1_ZPZ_D(sd_le, zd, MO_32)
6523DO_ST1_ZPZ_D(sd_be, zd, MO_32)
6524DO_ST1_ZPZ_D(dd_le, zd, MO_64)
6525DO_ST1_ZPZ_D(dd_be, zd, MO_64)
6526
6527#undef DO_ST1_ZPZ_S
6528#undef DO_ST1_ZPZ_D