1/*
2 * ARM SVE Operations
3 *
4 * Copyright (c) 2018 Linaro, Ltd.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18 */
19
20#include "qemu/osdep.h"
21#include "cpu.h"
22#include "internals.h"
23#include "exec/exec-all.h"
24#include "exec/cpu_ldst.h"
25#include "exec/helper-proto.h"
26#include "tcg/tcg-gvec-desc.h"
27#include "fpu/softfloat.h"
28#include "tcg/tcg.h"
29#include "vec_internal.h"
30
31
32/* Note that vector data is stored in host-endian 64-bit chunks,
33 so addressing units smaller than that need a host-endian fixup. */
34#ifdef HOST_WORDS_BIGENDIAN
35#define H1(x) ((x) ^ 7)
36#define H1_2(x) ((x) ^ 6)
37#define H1_4(x) ((x) ^ 4)
38#define H2(x) ((x) ^ 3)
39#define H4(x) ((x) ^ 1)
40#else
41#define H1(x) (x)
42#define H1_2(x) (x)
43#define H1_4(x) (x)
44#define H2(x) (x)
45#define H4(x) (x)
46#endif
47
48/* Return a value for NZCV as per the ARM PredTest pseudofunction.
49 *
50 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
51 * and bit 0 set if C is set. Compare the definitions of these variables
52 * within CPUARMState.
53 */
54
55/* For no G bits set, NZCV = C. */
56#define PREDTEST_INIT 1
57
58/* This is an iterative function, called for each Pd and Pg word
59 * moving forward.
60 */
61static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
62{
63 if (likely(g)) {
64 /* Compute N from first D & G.
65 Use bit 2 to signal first G bit seen. */
66 if (!(flags & 4)) {
67 flags |= ((d & (g & -g)) != 0) << 31;
68 flags |= 4;
69 }
70
71 /* Accumulate Z from each D & G. */
72 flags |= ((d & g) != 0) << 1;
73
74 /* Compute C from last !(D & G). Replace previous. */
75 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
76 }
77 return flags;
78}
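/*
 * Worked example: for a single word with G = 0x11 (predicate bits 0 and
 * 4 active) and D = 0x10, starting from PREDTEST_INIT this returns 6:
 * bit 31 clear (the first active D bit, bit 0, is 0), bit 1 set (some
 * active D bit is set, so Z is clear), and bit 0 clear (the last active
 * D bit, bit 4, is set, so C is clear).
 */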
79
80/* This is an iterative function, called for each Pd and Pg word
81 * moving backward.
82 */
83static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
84{
85 if (likely(g)) {
86 /* Compute C from first (i.e. last) !(D & G).
87 Use bit 2 to signal first G bit seen. */
88 if (!(flags & 4)) {
89 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
90 flags |= (d & pow2floor(g)) == 0;
91 }
92
93 /* Accumulate Z from each D & G. */
94 flags |= ((d & g) != 0) << 1;
95
96 /* Compute N from last (i.e. first) D & G. Replace previous. */
97 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
98 }
99 return flags;
100}
101
102/* The same for a single word predicate. */
103uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
104{
105 return iter_predtest_fwd(d, g, PREDTEST_INIT);
106}
107
108/* The same for a multi-word predicate. */
109uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
110{
111 uint32_t flags = PREDTEST_INIT;
112 uint64_t *d = vd, *g = vg;
113 uintptr_t i = 0;
114
115 do {
116 flags = iter_predtest_fwd(d[i], g[i], flags);
117 } while (++i < words);
118
119 return flags;
120}
121
122/* Expand active predicate bits to bytes, for byte elements.
123 * for (i = 0; i < 256; ++i) {
124 * unsigned long m = 0;
125 * for (j = 0; j < 8; j++) {
126 * if ((i >> j) & 1) {
127 * m |= 0xfful << (j << 3);
128 * }
129 * }
130 * printf("0x%016lx,\n", m);
131 * }
132 */
133static inline uint64_t expand_pred_b(uint8_t byte)
134{
135 static const uint64_t word[256] = {
136 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
137 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
138 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
139 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
140 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
141 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
142 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
143 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
144 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
145 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
146 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
147 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
148 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
149 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
150 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
151 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
152 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
153 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
154 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
155 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
156 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
157 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
158 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
159 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
160 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
161 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
162 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
163 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
164 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
165 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
166 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
167 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
168 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
169 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
170 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
171 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
172 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
173 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
174 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
175 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
176 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
177 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
178 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
179 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
180 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
181 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
182 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
183 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
184 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
185 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
186 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
187 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
188 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
189 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
190 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
191 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
192 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
193 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
194 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
195 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
196 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
197 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
198 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
199 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
200 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
201 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
202 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
203 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
204 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
205 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
206 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
207 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
208 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
209 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
210 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
211 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
212 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
213 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
214 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
215 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
216 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
217 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
218 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
219 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
220 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
221 0xffffffffffffffff,
222 };
223 return word[byte];
224}
225
226/* Similarly for half-word elements.
227 * for (i = 0; i < 256; ++i) {
228 * unsigned long m = 0;
229 * if (i & 0xaa) {
230 * continue;
231 * }
232 * for (j = 0; j < 8; j += 2) {
233 * if ((i >> j) & 1) {
234 * m |= 0xfffful << (j << 3);
235 * }
236 * }
237 * printf("[0x%x] = 0x%016lx,\n", i, m);
238 * }
239 */
240static inline uint64_t expand_pred_h(uint8_t byte)
241{
242 static const uint64_t word[] = {
243 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
244 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
245 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
246 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
247 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
248 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
249 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
250 [0x55] = 0xffffffffffffffff,
251 };
252 return word[byte & 0x55];
253}
254
255/* Similarly for single word elements. */
256static inline uint64_t expand_pred_s(uint8_t byte)
257{
258 static const uint64_t word[] = {
259 [0x01] = 0x00000000ffffffffull,
260 [0x10] = 0xffffffff00000000ull,
261 [0x11] = 0xffffffffffffffffull,
262 };
263 return word[byte & 0x11];
264}
265
266/* Swap 16-bit words within a 32-bit word. */
267static inline uint32_t hswap32(uint32_t h)
268{
269 return rol32(h, 16);
270}
271
272/* Swap 16-bit words within a 64-bit word. */
273static inline uint64_t hswap64(uint64_t h)
274{
275 uint64_t m = 0x0000ffff0000ffffull;
276 h = rol64(h, 32);
277 return ((h & m) << 16) | ((h >> 16) & m);
278}
279
280/* Swap 32-bit words within a 64-bit word. */
281static inline uint64_t wswap64(uint64_t h)
282{
283 return rol64(h, 32);
284}
285
286#define LOGICAL_PPPP(NAME, FUNC) \
287void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
288{ \
289 uintptr_t opr_sz = simd_oprsz(desc); \
290 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
291 uintptr_t i; \
292 for (i = 0; i < opr_sz / 8; ++i) { \
293 d[i] = FUNC(n[i], m[i], g[i]); \
294 } \
295}
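/*
 * As an illustration, LOGICAL_PPPP(sve_and_pppp, DO_AND) expands to a
 * helper computing d[i] = n[i] & m[i] & g[i] for each 64-bit predicate
 * word, i.e. the AND of the two input predicates masked by the
 * governing predicate.
 */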
296
297#define DO_AND(N, M, G) (((N) & (M)) & (G))
298#define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
299#define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
300#define DO_ORR(N, M, G) (((N) | (M)) & (G))
301#define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
302#define DO_NOR(N, M, G) (~((N) | (M)) & (G))
303#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
304#define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
305
306LOGICAL_PPPP(sve_and_pppp, DO_AND)
307LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
308LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
309LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
310LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
311LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
312LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
313LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
314
315#undef DO_AND
316#undef DO_BIC
317#undef DO_EOR
318#undef DO_ORR
319#undef DO_ORN
320#undef DO_NOR
321#undef DO_NAND
322#undef DO_SEL
323#undef LOGICAL_PPPP
324
325/* Fully general three-operand expander, controlled by a predicate.
326 * This is complicated by the host-endian storage of the register file.
327 */
328/* ??? I don't expect the compiler could ever vectorize this itself.
329 * With some tables we can convert bit masks to byte masks, and with
330 * extra care wrt byte/word ordering we could use gcc generic vectors
331 * and do 16 bytes at a time.
332 */
333#define DO_ZPZZ(NAME, TYPE, H, OP) \
334void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
335{ \
336 intptr_t i, opr_sz = simd_oprsz(desc); \
337 for (i = 0; i < opr_sz; ) { \
338 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
339 do { \
340 if (pg & 1) { \
341 TYPE nn = *(TYPE *)(vn + H(i)); \
342 TYPE mm = *(TYPE *)(vm + H(i)); \
343 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
344 } \
345 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
346 } while (i & 15); \
347 } \
348}
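/*
 * For example, DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD) produces a
 * helper that walks the vector 16 bytes at a time, reloading 16
 * predicate bits from VG for each chunk, and stores nn + mm back to VD
 * only for those byte elements whose governing predicate bit is set.
 */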
349
350/* Similarly, specialized for 64-bit operands. */
351#define DO_ZPZZ_D(NAME, TYPE, OP) \
352void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
353{ \
354 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
355 TYPE *d = vd, *n = vn, *m = vm; \
356 uint8_t *pg = vg; \
357 for (i = 0; i < opr_sz; i += 1) { \
358 if (pg[H1(i)] & 1) { \
359 TYPE nn = n[i], mm = m[i]; \
360 d[i] = OP(nn, mm); \
361 } \
362 } \
363}
364
365#define DO_AND(N, M) (N & M)
366#define DO_EOR(N, M) (N ^ M)
367#define DO_ORR(N, M) (N | M)
368#define DO_BIC(N, M) (N & ~M)
369#define DO_ADD(N, M) (N + M)
370#define DO_SUB(N, M) (N - M)
371#define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
372#define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
373#define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
374#define DO_MUL(N, M) (N * M)
375
376
377/*
378 * We must avoid the C undefined behaviour cases: division by
379 * zero and signed division of INT_MIN by -1. Both of these
380 * have architecturally defined required results for Arm.
381 * We special case all signed divisions by -1 to avoid having
382 * to deduce the minimum integer for the type involved.
383 */
384#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
385#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
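/*
 * For example, the Arm-defined results are 0 for any division by zero
 * and, for signed division of the minimum integer by -1, the minimum
 * integer itself; the special cases above return 0 and -N respectively
 * without ever evaluating N / M for those inputs.
 */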
386
387DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
388DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
389DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
390DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
391
392DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
393DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
394DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
395DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
396
397DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
398DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
399DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
400DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
401
402DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
403DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
404DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
405DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
406
407DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
408DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
409DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
410DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
411
412DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
413DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
414DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
415DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
416
417DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
418DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
419DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
420DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
421
422DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
423DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
424DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
425DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
426
427DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
428DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
429DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
430DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
431
432DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
433DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
434DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
435DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
436
437DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
438DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
439DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
440DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
441
442DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
443DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
444DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
445DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
446
447/* Because the computation type is at least twice as large as required,
448 these work for both signed and unsigned source types. */
449static inline uint8_t do_mulh_b(int32_t n, int32_t m)
450{
451 return (n * m) >> 8;
452}
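/*
 * For instance, with unsigned inputs do_mulh_b(255, 255) computes 65025
 * in int32_t and returns the high byte 254, while the same bit pattern
 * taken as signed arrives as (-1) * (-1) = 1, whose high byte is 0; the
 * 32-bit intermediate is wide enough for either interpretation.
 */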
453
454static inline uint16_t do_mulh_h(int32_t n, int32_t m)
455{
456 return (n * m) >> 16;
457}
458
459static inline uint32_t do_mulh_s(int64_t n, int64_t m)
460{
461 return (n * m) >> 32;
462}
463
464static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
465{
466 uint64_t lo, hi;
467 muls64(&lo, &hi, n, m);
468 return hi;
469}
470
471static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
472{
473 uint64_t lo, hi;
474 mulu64(&lo, &hi, n, m);
475 return hi;
476}
477
478DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
479DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
480DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
481DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
482
483DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
484DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
485DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
486DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
487
488DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
489DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
490DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
491DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
492
493DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
494DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
495
496DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
497DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
498
499/* Note that all bits of the shift are significant
500 and not modulo the element size. */
501#define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
502#define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
503#define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
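/* E.g. for byte elements, DO_LSR(n, 9) is 0 and DO_ASR(n, 9) becomes
   n >> 7, replicating the sign bit, rather than shifting by 9 % 8. */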
504
505DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
506DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
507DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)
508
509DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
510DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
511DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)
512
513DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
514DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
515DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
516
517DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
518DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
519DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
520
521static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
522{
523 int8_t n1 = n, n2 = n >> 8;
524 return m + n1 + n2;
525}
526
527static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
528{
529 int16_t n1 = n, n2 = n >> 16;
530 return m + n1 + n2;
531}
532
533static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
534{
535 int32_t n1 = n, n2 = n >> 32;
536 return m + n1 + n2;
537}
538
539DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
540DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
541DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)
542
543static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
544{
545 uint8_t n1 = n, n2 = n >> 8;
546 return m + n1 + n2;
547}
548
549static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
550{
551 uint16_t n1 = n, n2 = n >> 16;
552 return m + n1 + n2;
553}
554
555static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
556{
557 uint32_t n1 = n, n2 = n >> 32;
558 return m + n1 + n2;
559}
560
561DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
562DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
563DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)
564
565#define do_srshl_b(n, m) do_sqrshl_bhs(n, m, 8, true, NULL)
566#define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL)
567#define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL)
568#define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL)
569
570DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
571DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
572DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
573DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)
574
575#define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
576#define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
577#define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL)
578#define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL)
579
580DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
581DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
582DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
583DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)
584
585/*
586 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
587 * We pass in a pointer to a dummy saturation field to trigger
588 * the saturating arithmetic but discard the information about
589 * whether it has occurred.
590 */
591#define do_sqshl_b(n, m) \
592 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
593#define do_sqshl_h(n, m) \
594 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
595#define do_sqshl_s(n, m) \
596 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
597#define do_sqshl_d(n, m) \
598 ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
599
600DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b)
601DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
602DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
603DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)
604
605#define do_uqshl_b(n, m) \
606 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
607#define do_uqshl_h(n, m) \
608 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
609#define do_uqshl_s(n, m) \
610 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
611#define do_uqshl_d(n, m) \
612 ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
613
614DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b)
615DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
616DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
617DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)
618
619#define do_sqrshl_b(n, m) \
620 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
621#define do_sqrshl_h(n, m) \
622 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
623#define do_sqrshl_s(n, m) \
624 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
625#define do_sqrshl_d(n, m) \
626 ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
627
628DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b)
629DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
630DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
631DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)
632
633#undef do_sqrshl_d
634
635#define do_uqrshl_b(n, m) \
636 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
637#define do_uqrshl_h(n, m) \
638 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
639#define do_uqrshl_s(n, m) \
640 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
641#define do_uqrshl_d(n, m) \
642 ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
643
644DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b)
645DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
646DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
647DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
648
649#undef do_uqrshl_d
650
651#define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1)
652#define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1))
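/*
 * For example, DO_HADD_BHS(-3, 2) evaluates (-1) >> 1 = -1 in int64_t,
 * the correct signed halving add; DO_HADD_D cannot widen, so it shifts
 * each operand first and adds back the carry bit (n & m & 1) that two
 * odd operands would otherwise lose.
 */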
653
654DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
655DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
656DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
657DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)
658
659DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
660DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
661DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
662DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)
663
664#define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1)
665#define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1))
666
667DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
668DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
669DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
670DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)
671
672DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
673DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
674DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
675DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)
676
677#define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1)
678#define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1))
679
680DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
681DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
682DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
683DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)
684
685DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
686DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
687DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
688DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
689
690static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
691{
692 return val >= max ? max : val <= min ? min : val;
693}
694
695#define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
696#define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
697#define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)
698
699static inline int64_t do_sqadd_d(int64_t n, int64_t m)
700{
701 int64_t r = n + m;
702 if (((r ^ n) & ~(n ^ m)) < 0) {
703 /* Signed overflow. */
704 return r < 0 ? INT64_MAX : INT64_MIN;
705 }
706 return r;
707}
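/*
 * The sign test above flags overflow exactly when n and m have the same
 * sign but r does not: (r ^ n) is negative when the result sign differs
 * from n, and ~(n ^ m) is negative when n and m agree.
 */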
708
709DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
710DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
711DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
712DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)
713
714#define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
715#define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
716#define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)
717
718static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
719{
720 uint64_t r = n + m;
721 return r < n ? UINT64_MAX : r;
722}
723
724DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
725DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
726DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
727DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)
728
729#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
730#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
731#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)
732
733static inline int64_t do_sqsub_d(int64_t n, int64_t m)
734{
735 int64_t r = n - m;
736 if (((r ^ n) & (n ^ m)) < 0) {
737 /* Signed overflow. */
738 return r < 0 ? INT64_MAX : INT64_MIN;
739 }
740 return r;
741}
742
743DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
744DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
745DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
746DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)
747
748#define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
749#define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
750#define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)
751
752static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
753{
754 return n > m ? n - m : 0;
755}
756
757DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
758DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
759DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
760DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)
761
762#define DO_SUQADD_B(n, m) \
763 do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
764#define DO_SUQADD_H(n, m) \
765 do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
766#define DO_SUQADD_S(n, m) \
767 do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)
768
769static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
770{
771 uint64_t r = n + m;
772
773 if (n < 0) {
774 /* Note that m - abs(n) cannot underflow. */
775 if (r > INT64_MAX) {
776 /* Result is either very large positive or negative. */
777 if (m > -n) {
778 /* m > abs(n), so r is a very large positive. */
779 return INT64_MAX;
780 }
781 /* Result is negative. */
782 }
783 } else {
784 /* Both inputs are positive: check for overflow. */
785 if (r < m || r > INT64_MAX) {
786 return INT64_MAX;
787 }
788 }
789 return r;
790}
791
792DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
793DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
794DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
795DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)
796
797#define DO_USQADD_B(n, m) \
798 do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
799#define DO_USQADD_H(n, m) \
800 do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
801#define DO_USQADD_S(n, m) \
802 do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)
803
804static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
805{
806 uint64_t r = n + m;
807
808 if (m < 0) {
809 return n < -m ? 0 : r;
810 }
811 return r < n ? UINT64_MAX : r;
812}
813
814DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
815DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
816DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
817DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)
818
819#undef DO_ZPZZ
820#undef DO_ZPZZ_D
821
822/*
823 * Three operand expander, operating on element pairs.
824 * If the slot I is even, the elements come from VN {I, I+1}.
825 * If the slot I is odd, the elements come from VM {I-1, I}.
826 * Load all of the input elements in each pair before overwriting output.
827 */
828#define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
829void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
830{ \
831 intptr_t i, opr_sz = simd_oprsz(desc); \
832 for (i = 0; i < opr_sz; ) { \
833 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
834 do { \
835 TYPE n0 = *(TYPE *)(vn + H(i)); \
836 TYPE m0 = *(TYPE *)(vm + H(i)); \
837 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
838 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
839 if (pg & 1) { \
840 *(TYPE *)(vd + H(i)) = OP(n0, n1); \
841 } \
842 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
843 if (pg & 1) { \
844 *(TYPE *)(vd + H(i)) = OP(m0, m1); \
845 } \
846 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
847 } while (i & 15); \
848 } \
849}
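/*
 * For example, sve2_addp_zpzz_b writes n[2k] + n[2k+1] to even result
 * slots and m[2k] + m[2k+1] to odd result slots, with each store still
 * gated by that slot's own predicate bit.
 */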
850
851/* Similarly, specialized for 64-bit operands. */
852#define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
853void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
854{ \
855 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
856 TYPE *d = vd, *n = vn, *m = vm; \
857 uint8_t *pg = vg; \
858 for (i = 0; i < opr_sz; i += 2) { \
859 TYPE n0 = n[i], n1 = n[i + 1]; \
860 TYPE m0 = m[i], m1 = m[i + 1]; \
861 if (pg[H1(i)] & 1) { \
862 d[i] = OP(n0, n1); \
863 } \
864 if (pg[H1(i + 1)] & 1) { \
865 d[i + 1] = OP(m0, m1); \
866 } \
867 } \
868}
869
870DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
871DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
872DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
873DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)
874
875DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
876DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
877DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
878DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)
879
880DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
881DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
882DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
883DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)
884
885DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
886DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
887DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
888DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)
889
890DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
891DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
892DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
893DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)
894
895#undef DO_ZPZZ_PAIR
896#undef DO_ZPZZ_PAIR_D
897
898/* Three-operand expander, controlled by a predicate, in which the
899 * third operand is "wide". That is, for D = N op M, the same 64-bit
900 * value of M is used with all of the narrower values of N.
901 */
902#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
903void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
904{ \
905 intptr_t i, opr_sz = simd_oprsz(desc); \
906 for (i = 0; i < opr_sz; ) { \
907 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
908 TYPEW mm = *(TYPEW *)(vm + i); \
909 do { \
910 if (pg & 1) { \
911 TYPE nn = *(TYPE *)(vn + H(i)); \
912 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
913 } \
914 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
915 } while (i & 7); \
916 } \
917}
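/*
 * For example, sve_lsl_zpzw_b shifts all eight byte elements that share
 * a 64-bit column by the single 64-bit count read from that column of
 * VM, still honouring the per-element predicate bits.
 */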
918
919DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
920DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
921DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
922
923DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
924DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
925DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
926
927DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
928DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
929DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
930
931#undef DO_ZPZW
932
933/* Fully general two-operand expander, controlled by a predicate.
934 */
935#define DO_ZPZ(NAME, TYPE, H, OP) \
936void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
937{ \
938 intptr_t i, opr_sz = simd_oprsz(desc); \
939 for (i = 0; i < opr_sz; ) { \
940 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
941 do { \
942 if (pg & 1) { \
943 TYPE nn = *(TYPE *)(vn + H(i)); \
944 *(TYPE *)(vd + H(i)) = OP(nn); \
945 } \
946 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
947 } while (i & 15); \
948 } \
949}
950
951/* Similarly, specialized for 64-bit operands. */
952#define DO_ZPZ_D(NAME, TYPE, OP) \
953void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
954{ \
955 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
956 TYPE *d = vd, *n = vn; \
957 uint8_t *pg = vg; \
958 for (i = 0; i < opr_sz; i += 1) { \
959 if (pg[H1(i)] & 1) { \
960 TYPE nn = n[i]; \
961 d[i] = OP(nn); \
962 } \
963 } \
964}
965
966#define DO_CLS_B(N) (clrsb32(N) - 24)
967#define DO_CLS_H(N) (clrsb32(N) - 16)
968
969DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
970DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
971DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
972DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
973
974#define DO_CLZ_B(N) (clz32(N) - 24)
975#define DO_CLZ_H(N) (clz32(N) - 16)
976
977DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
978DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
979DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
980DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
981
982DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
983DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
984DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
985DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
986
987#define DO_CNOT(N) (N == 0)
988
989DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
990DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
991DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
992DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
993
994#define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
995
996DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
997DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
998DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
999
1000#define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
1001
1002DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
1003DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
1004DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
1005
1006#define DO_NOT(N) (~N)
1007
1008DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
1009DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
1010DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
1011DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
1012
1013#define DO_SXTB(N) ((int8_t)N)
1014#define DO_SXTH(N) ((int16_t)N)
1015#define DO_SXTS(N) ((int32_t)N)
1016#define DO_UXTB(N) ((uint8_t)N)
1017#define DO_UXTH(N) ((uint16_t)N)
1018#define DO_UXTS(N) ((uint32_t)N)
1019
1020DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
1021DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
1022DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
1023DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
1024DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
1025DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
1026
1027DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
1028DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
1029DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
1030DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
1031DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
1032DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
1033
1034#define DO_ABS(N) (N < 0 ? -N : N)
1035
1036DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
1037DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
1038DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
1039DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
1040
1041#define DO_NEG(N) (-N)
1042
1043DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
1044DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
1045DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
1046DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
1047
1048DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
1049DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
1050DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
1051
1052DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
1053DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
1054
1055DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
1056
1057DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
1058DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
1059DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
1060DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
1061
1062#define DO_SQABS(X) \
1063 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
1064 x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })
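/* For example, DO_SQABS applied to an int8_t INT8_MIN saturates to
   INT8_MAX rather than wrapping back to INT8_MIN. */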
1065
1066DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
1067DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
1068DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
1069DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)
1070
1071#define DO_SQNEG(X) \
1072 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
1073 x_ == min_ ? -min_ - 1 : -x_; })
1074
1075DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
1076DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
1077DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
1078DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)
1079
1080DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
1081DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
1082
1083/* Three-operand expander, unpredicated, in which the third operand is "wide".
1084 */
1085#define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
1086void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1087{ \
1088 intptr_t i, opr_sz = simd_oprsz(desc); \
1089 for (i = 0; i < opr_sz; ) { \
1090 TYPEW mm = *(TYPEW *)(vm + i); \
1091 do { \
1092 TYPE nn = *(TYPE *)(vn + H(i)); \
1093 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
1094 i += sizeof(TYPE); \
1095 } while (i & 7); \
1096 } \
1097}
1098
1099DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
1100DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
1101DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
1102
1103DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
1104DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
1105DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
1106
1107DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
1108DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
1109DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
1110
1111#undef DO_ZZW
1112
1113#undef DO_CLS_B
1114#undef DO_CLS_H
1115#undef DO_CLZ_B
1116#undef DO_CLZ_H
1117#undef DO_CNOT
1118#undef DO_FABS
1119#undef DO_FNEG
1120#undef DO_ABS
1121#undef DO_NEG
1122#undef DO_ZPZ
1123#undef DO_ZPZ_D
1124
1125/*
1126 * Three-operand expander, unpredicated, in which the two inputs are
1127 * selected from the top or bottom half of the wide column.
1128 */
1129#define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1130void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1131{ \
1132 intptr_t i, opr_sz = simd_oprsz(desc); \
1133 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1134 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1135 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1136 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1137 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1138 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
1139 } \
1140}
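/*
 * For example, sve2_saddl_h with sel1 = sel2 = 0 (the "bottom" form)
 * sign-extends the even-numbered byte elements of VN and VM, adds them,
 * and writes the 16-bit sums to the overlapping halfword elements of
 * VD; the "top" form selects the odd-numbered bytes instead.
 */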
1141
1142DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1143DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1144DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, , H1_4, DO_ADD)
1145
1146DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1147DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1148DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, , H1_4, DO_SUB)
1149
1150DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1151DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1152DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, , H1_4, DO_ABD)
1153
1154DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1155DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1156DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, , H1_4, DO_ADD)
1157
1158DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1159DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1160DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, , H1_4, DO_SUB)
1161
1162DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1163DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1164DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, , H1_4, DO_ABD)
1165
1166DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1167DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1168DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, , H1_4, DO_MUL)
1169
1170DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1171DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1172DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, , H1_4, DO_MUL)
1173
1174/* Note that the multiply cannot overflow, but the doubling can. */
1175static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
1176{
1177 int16_t val = n * m;
1178 return DO_SQADD_H(val, val);
1179}
1180
1181static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
1182{
1183 int32_t val = n * m;
1184 return DO_SQADD_S(val, val);
1185}
1186
1187static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
1188{
1189 int64_t val = n * m;
1190 return do_sqadd_d(val, val);
1191}
1192
1193DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
1194DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1195DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, , H1_4, do_sqdmull_d)
1196
1197#undef DO_ZZZ_TB
1198
1199#define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1200void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1201{ \
1202 intptr_t i, opr_sz = simd_oprsz(desc); \
1203 int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1204 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1205 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
1206 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1207 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
1208 } \
1209}
1210
1211DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1212DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1213DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, , H1_4, DO_ADD)
1214
1215DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1216DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1217DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, , H1_4, DO_SUB)
1218
1219DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1220DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1221DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, , H1_4, DO_ADD)
1222
1223DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1224DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1225DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, , H1_4, DO_SUB)
1226
1227#undef DO_ZZZ_WTB
1228
1229#define DO_ZZZ_NTB(NAME, TYPE, H, OP) \
1230void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1231{ \
1232 intptr_t i, opr_sz = simd_oprsz(desc); \
1233 intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
1234 intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
1235 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1236 TYPE nn = *(TYPE *)(vn + H(i + sel1)); \
1237 TYPE mm = *(TYPE *)(vm + H(i + sel2)); \
1238 *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \
1239 } \
1240}
1241
1242DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
1243DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
1244DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
1245DO_ZZZ_NTB(sve2_eoril_d, uint64_t, , DO_EOR)
1246
1247#undef DO_ZZZ_NTB
1248
1249#define DO_BITPERM(NAME, TYPE, OP) \
1250void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1251{ \
1252 intptr_t i, opr_sz = simd_oprsz(desc); \
1253 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1254 TYPE nn = *(TYPE *)(vn + i); \
1255 TYPE mm = *(TYPE *)(vm + i); \
1256 *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \
1257 } \
1258}
1259
1260static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
1261{
1262 uint64_t res = 0;
1263 int db, rb = 0;
1264
1265 for (db = 0; db < n; ++db) {
1266 if ((mask >> db) & 1) {
1267 res |= ((data >> db) & 1) << rb;
1268 ++rb;
1269 }
1270 }
1271 return res;
1272}
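/*
 * Worked example: bitextract(0b11011010, 0b01010101, 8) gathers the
 * data bits at the mask positions 0, 2, 4 and 6 (values 0, 0, 1, 1) and
 * packs them into the low bits of the result, giving 0b1100.
 */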
1273
1274DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
1275DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
1276DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
1277DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
1278
1279static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
1280{
1281 uint64_t res = 0;
1282 int rb, db = 0;
1283
1284 for (rb = 0; rb < n; ++rb) {
1285 if ((mask >> rb) & 1) {
1286 res |= ((data >> db) & 1) << rb;
1287 ++db;
1288 }
1289 }
1290 return res;
1291}
1292
1293DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
1294DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
1295DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
1296DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
1297
1298static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
1299{
1300 uint64_t resm = 0, resu = 0;
1301 int db, rbm = 0, rbu = 0;
1302
1303 for (db = 0; db < n; ++db) {
1304 uint64_t val = (data >> db) & 1;
1305 if ((mask >> db) & 1) {
1306 resm |= val << rbm++;
1307 } else {
1308 resu |= val << rbu++;
1309 }
1310 }
1311
1312 return resm | (resu << rbm);
1313}
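/*
 * Worked example: bitgroup(0b11011010, 0b01010101, 8) packs the bits
 * selected by the mask (0, 0, 1, 1) into the low half and the remaining
 * bits (1, 1, 0, 1) above them, giving 0b10111100.
 */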
1314
1315DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
1316DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
1317DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
1318DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)
1319
1320#undef DO_BITPERM
1321
1322#define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
1323void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1324{ \
1325 intptr_t i, opr_sz = simd_oprsz(desc); \
1326 intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \
1327 int shift = simd_data(desc) >> 1; \
1328 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1329 TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \
1330 *(TYPEW *)(vd + HW(i)) = nn << shift; \
1331 } \
1332}
1333
1334DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
1335DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
1336DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, , H1_4)
1337
1338DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
1339DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
1340DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, , H1_4)
1341
1342#undef DO_ZZI_SHLL
1343
1344/* Two-operand reduction expander, controlled by a predicate.
1345 * The difference between TYPERED and TYPERET has to do with
1346 * sign-extension. E.g. for SMAX, TYPERED must be signed,
1347 * but TYPERET must be unsigned so that e.g. a 32-bit value
1348 * is not sign-extended to the ABI uint64_t return type.
1349 */
1350/* ??? If we were to vectorize this by hand the reduction ordering
1351 * would change. For integer operands, this is perfectly fine.
1352 */
1353#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
1354uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
1355{ \
1356 intptr_t i, opr_sz = simd_oprsz(desc); \
1357 TYPERED ret = INIT; \
1358 for (i = 0; i < opr_sz; ) { \
1359 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1360 do { \
1361 if (pg & 1) { \
1362 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
1363 ret = OP(ret, nn); \
1364 } \
1365 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
1366 } while (i & 15); \
1367 } \
1368 return (TYPERET)ret; \
1369}
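/*
 * For example, sve_smaxv_b accumulates in int8_t (TYPERED) but returns
 * uint8_t (TYPERET), so a reduction result of -1 comes back as 0xff in
 * the low byte of the uint64_t rather than sign-extended across it.
 */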
1370
1371#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
1372uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
1373{ \
1374 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1375 TYPEE *n = vn; \
1376 uint8_t *pg = vg; \
1377 TYPER ret = INIT; \
1378 for (i = 0; i < opr_sz; i += 1) { \
1379 if (pg[H1(i)] & 1) { \
1380 TYPEE nn = n[i]; \
1381 ret = OP(ret, nn); \
1382 } \
1383 } \
1384 return ret; \
1385}
1386
1387DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
1388DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
1389DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
1390DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
1391
1392DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
1393DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
1394DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
1395DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
1396
1397DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
1398DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
1399DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
1400DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
1401
1402DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1403DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1404DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1405
1406DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1407DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1408DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1409DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
1410
1411DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
1412DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
1413DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
1414DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
1415
1416DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
1417DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
1418DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
1419DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
1420
1421DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
1422DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
1423DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
1424DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
1425
1426DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
1427DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
1428DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
1429DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
1430
1431#undef DO_VPZ
1432#undef DO_VPZ_D
1433
1434/* Two vector operand, one scalar operand, unpredicated. */
1435#define DO_ZZI(NAME, TYPE, OP) \
1436void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
1437{ \
1438 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
1439 TYPE s = s64, *d = vd, *n = vn; \
1440 for (i = 0; i < opr_sz; ++i) { \
1441 d[i] = OP(n[i], s); \
1442 } \
1443}
1444
1445#define DO_SUBR(X, Y) (Y - X)
1446
1447DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
1448DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
1449DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
1450DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
1451
1452DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
1453DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
1454DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
1455DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
1456
1457DO_ZZI(sve_smini_b, int8_t, DO_MIN)
1458DO_ZZI(sve_smini_h, int16_t, DO_MIN)
1459DO_ZZI(sve_smini_s, int32_t, DO_MIN)
1460DO_ZZI(sve_smini_d, int64_t, DO_MIN)
1461
1462DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
1463DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
1464DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
1465DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
1466
1467DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
1468DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
1469DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
1470DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
1471
1472#undef DO_ZZI
1473
1474#undef DO_AND
1475#undef DO_ORR
1476#undef DO_EOR
1477#undef DO_BIC
1478#undef DO_ADD
1479#undef DO_SUB
1480#undef DO_MAX
1481#undef DO_MIN
1482#undef DO_ABD
1483#undef DO_MUL
1484#undef DO_DIV
1485#undef DO_ASR
1486#undef DO_LSR
1487#undef DO_LSL
1488#undef DO_SUBR
1489
1490/* Similar to the ARM LastActiveElement pseudocode function, except the
1491 result is multiplied by the element size. This includes the not found
1492 indication; e.g. not found for esz=3 is -8. */
1493static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
1494{
1495 uint64_t mask = pred_esz_masks[esz];
1496 intptr_t i = words;
1497
1498 do {
1499 uint64_t this_g = g[--i] & mask;
1500 if (this_g) {
1501 return i * 64 + (63 - clz64(this_g));
1502 }
1503 } while (i > 0);
1504 return (intptr_t)-1 << esz;
1505}
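/*
 * For example, with esz = 2 the return value is 4 times the index of
 * the last active word element, and "not found" is -1 << 2 = -4.
 */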
1506
1507uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
1508{
1509 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1510 uint32_t flags = PREDTEST_INIT;
1511 uint64_t *d = vd, *g = vg;
1512 intptr_t i = 0;
1513
1514 do {
1515 uint64_t this_d = d[i];
1516 uint64_t this_g = g[i];
1517
1518 if (this_g) {
1519 if (!(flags & 4)) {
1520 /* Set in D the first bit of G. */
1521 this_d |= this_g & -this_g;
1522 d[i] = this_d;
1523 }
1524 flags = iter_predtest_fwd(this_d, this_g, flags);
1525 }
1526 } while (++i < words);
1527
1528 return flags;
1529}
1530
1531uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
1532{
1533 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1534 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
1535 uint32_t flags = PREDTEST_INIT;
1536 uint64_t *d = vd, *g = vg, esz_mask;
1537 intptr_t i, next;
1538
1539 next = last_active_element(vd, words, esz) + (1 << esz);
1540 esz_mask = pred_esz_masks[esz];
1541
1542 /* Similar to the pseudocode for pnext, but scaled by ESZ
1543 so that we find the correct bit. */
1544 if (next < words * 64) {
1545 uint64_t mask = -1;
1546
1547 if (next & 63) {
1548 mask = ~((1ull << (next & 63)) - 1);
1549 next &= -64;
1550 }
1551 do {
1552 uint64_t this_g = g[next / 64] & esz_mask & mask;
1553 if (this_g != 0) {
1554 next = (next & -64) + ctz64(this_g);
1555 break;
1556 }
1557 next += 64;
1558 mask = -1;
1559 } while (next < words * 64);
1560 }
1561
1562 i = 0;
1563 do {
1564 uint64_t this_d = 0;
1565 if (i == next / 64) {
1566 this_d = 1ull << (next & 63);
1567 }
1568 d[i] = this_d;
1569 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
1570 } while (++i < words);
1571
1572 return flags;
1573}
ccd841c3 1574
1575/*
1576 * Copy Zn into Zd, and store zero into inactive elements.
1577 * If inv, store zeros into the active elements.
ccd841c3 1578 */
1579void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1580{
1581 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
60245996 1582 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1583 uint64_t *d = vd, *n = vn;
1584 uint8_t *pg = vg;
60245996 1585
68459864 1586 for (i = 0; i < opr_sz; i += 1) {
60245996 1587 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
1588 }
1589}
1590
1591void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1592{
1593 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
60245996 1594 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1595 uint64_t *d = vd, *n = vn;
1596 uint8_t *pg = vg;
60245996 1597
68459864 1598 for (i = 0; i < opr_sz; i += 1) {
60245996 1599 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
1600 }
1601}
1602
1603void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
1604{
1605 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
60245996 1606 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
1607 uint64_t *d = vd, *n = vn;
1608 uint8_t *pg = vg;
60245996 1609
68459864 1610 for (i = 0; i < opr_sz; i += 1) {
60245996 1611 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
1612 }
1613}
1614
1615void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1616{
1617 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1618 uint64_t *d = vd, *n = vn;
1619 uint8_t *pg = vg;
1620 uint8_t inv = simd_data(desc);
1621
68459864 1622 for (i = 0; i < opr_sz; i += 1) {
60245996 1623 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
1624 }
1625}
1626
1627/* Three-operand expander, immediate operand, controlled by a predicate.
1628 */
1629#define DO_ZPZI(NAME, TYPE, H, OP) \
1630void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1631{ \
1632 intptr_t i, opr_sz = simd_oprsz(desc); \
1633 TYPE imm = simd_data(desc); \
1634 for (i = 0; i < opr_sz; ) { \
1635 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1636 do { \
1637 if (pg & 1) { \
1638 TYPE nn = *(TYPE *)(vn + H(i)); \
1639 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
1640 } \
1641 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1642 } while (i & 15); \
1643 } \
1644}
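/* In the expander above, one predicate bit governs each byte of the vector,
 * so stepping I by sizeof(TYPE) while shifting the 16-bit predicate chunk
 * right by the same amount keeps the two in step; only bit 0 of each
 * element's predicate field is significant.
 */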
1645
1646/* Similarly, specialized for 64-bit operands. */
1647#define DO_ZPZI_D(NAME, TYPE, OP) \
1648void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1649{ \
1650 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1651 TYPE *d = vd, *n = vn; \
1652 TYPE imm = simd_data(desc); \
1653 uint8_t *pg = vg; \
1654 for (i = 0; i < opr_sz; i += 1) { \
1655 if (pg[H1(i)] & 1) { \
1656 TYPE nn = n[i]; \
1657 d[i] = OP(nn, imm); \
1658 } \
1659 } \
1660}
1661
1662#define DO_SHR(N, M) (N >> M)
1663#define DO_SHL(N, M) (N << M)
1664
1665/* Arithmetic shift right for division. This rounds negative numbers
1666 toward zero as per signed division. Therefore before shifting,
1667 when N is negative, add 2**M-1. */
1668#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
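/* Worked example: DO_ASRD(-7, 2) computes (-7 + 3) >> 2 = -1, matching
 * -7 / 4 truncated toward zero, whereas a plain arithmetic shift right
 * would yield -2.
 */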
1669
1670DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
1671DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
1672DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
1673DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
1674
1675DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
1676DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
1677DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
1678DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
1679
1680DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
1681DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
1682DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
1683DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
1684
1685DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
1686DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
1687DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
1688DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
1689
1690#undef DO_SHR
1691#undef DO_SHL
1692#undef DO_ASRD
1693#undef DO_ZPZI
1694#undef DO_ZPZI_D
1695
1696/* Fully general four-operand expander, controlled by a predicate.
1697 */
1698#define DO_ZPZZZ(NAME, TYPE, H, OP) \
1699void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1700 void *vg, uint32_t desc) \
1701{ \
1702 intptr_t i, opr_sz = simd_oprsz(desc); \
1703 for (i = 0; i < opr_sz; ) { \
1704 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1705 do { \
1706 if (pg & 1) { \
1707 TYPE nn = *(TYPE *)(vn + H(i)); \
1708 TYPE mm = *(TYPE *)(vm + H(i)); \
1709 TYPE aa = *(TYPE *)(va + H(i)); \
1710 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
1711 } \
1712 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1713 } while (i & 15); \
1714 } \
1715}
1716
1717/* Similarly, specialized for 64-bit operands. */
1718#define DO_ZPZZZ_D(NAME, TYPE, OP) \
1719void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1720 void *vg, uint32_t desc) \
1721{ \
1722 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1723 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
1724 uint8_t *pg = vg; \
1725 for (i = 0; i < opr_sz; i += 1) { \
1726 if (pg[H1(i)] & 1) { \
1727 TYPE aa = a[i], nn = n[i], mm = m[i]; \
1728 d[i] = OP(aa, nn, mm); \
1729 } \
1730 } \
1731}
1732
1733#define DO_MLA(A, N, M) (A + N * M)
1734#define DO_MLS(A, N, M) (A - N * M)
1735
1736DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
1737DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
1738
1739DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
1740DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
1741
1742DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
1743DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
1744
1745DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
1746DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
1747
1748#undef DO_MLA
1749#undef DO_MLS
1750#undef DO_ZPZZZ
1751#undef DO_ZPZZZ_D
1752
1753void HELPER(sve_index_b)(void *vd, uint32_t start,
1754 uint32_t incr, uint32_t desc)
1755{
1756 intptr_t i, opr_sz = simd_oprsz(desc);
1757 uint8_t *d = vd;
1758 for (i = 0; i < opr_sz; i += 1) {
1759 d[H1(i)] = start + i * incr;
1760 }
1761}
1762
1763void HELPER(sve_index_h)(void *vd, uint32_t start,
1764 uint32_t incr, uint32_t desc)
1765{
1766 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1767 uint16_t *d = vd;
1768 for (i = 0; i < opr_sz; i += 1) {
1769 d[H2(i)] = start + i * incr;
1770 }
1771}
1772
1773void HELPER(sve_index_s)(void *vd, uint32_t start,
1774 uint32_t incr, uint32_t desc)
1775{
1776 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1777 uint32_t *d = vd;
1778 for (i = 0; i < opr_sz; i += 1) {
1779 d[H4(i)] = start + i * incr;
1780 }
1781}
1782
1783void HELPER(sve_index_d)(void *vd, uint64_t start,
1784 uint64_t incr, uint32_t desc)
1785{
1786 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1787 uint64_t *d = vd;
1788 for (i = 0; i < opr_sz; i += 1) {
1789 d[i] = start + i * incr;
1790 }
1791}
1792
1793void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
1794{
1795 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1796 uint32_t sh = simd_data(desc);
1797 uint32_t *d = vd, *n = vn, *m = vm;
1798 for (i = 0; i < opr_sz; i += 1) {
1799 d[i] = n[i] + (m[i] << sh);
1800 }
1801}
1802
1803void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
1804{
1805 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1806 uint64_t sh = simd_data(desc);
1807 uint64_t *d = vd, *n = vn, *m = vm;
1808 for (i = 0; i < opr_sz; i += 1) {
1809 d[i] = n[i] + (m[i] << sh);
1810 }
1811}
1812
1813void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
1814{
1815 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1816 uint64_t sh = simd_data(desc);
1817 uint64_t *d = vd, *n = vn, *m = vm;
1818 for (i = 0; i < opr_sz; i += 1) {
1819 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
1820 }
1821}
1822
1823void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
1824{
1825 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1826 uint64_t sh = simd_data(desc);
1827 uint64_t *d = vd, *n = vn, *m = vm;
1828 for (i = 0; i < opr_sz; i += 1) {
1829 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
1830 }
1831}
1832
1833void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
1834{
1835 /* These constants are cut and pasted directly from the ARM pseudocode. */
1836 static const uint16_t coeff[] = {
1837 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
1838 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
1839 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
1840 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
1841 };
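    /* Each entry above is the fraction field of 2^(i/32) in float16 format
     * (e.g. entry 1, 0x0016, encodes 2^(1/32) ~= 1.0219), so OR-ing in the
     * exponent below assembles the FEXPA result directly.
     */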
1842 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1843 uint16_t *d = vd, *n = vn;
1844
1845 for (i = 0; i < opr_sz; i++) {
1846 uint16_t nn = n[i];
1847 intptr_t idx = extract32(nn, 0, 5);
1848 uint16_t exp = extract32(nn, 5, 5);
1849 d[i] = coeff[idx] | (exp << 10);
1850 }
1851}
1852
1853void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
1854{
1855 /* These constants are cut and pasted directly from the ARM pseudocode. */
1856 static const uint32_t coeff[] = {
1857 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
1858 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
1859 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
1860 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
1861 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
1862 0x1ef532, 0x20b051, 0x227043, 0x243516,
1863 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
1864 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
1865 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
1866 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
1867 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
1868 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
1869 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
1870 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
1871 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
1872 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
1873 };
1874 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1875 uint32_t *d = vd, *n = vn;
1876
1877 for (i = 0; i < opr_sz; i++) {
1878 uint32_t nn = n[i];
1879 intptr_t idx = extract32(nn, 0, 6);
1880 uint32_t exp = extract32(nn, 6, 8);
1881 d[i] = coeff[idx] | (exp << 23);
1882 }
1883}
1884
1885void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
1886{
1887 /* These constants are cut and pasted directly from the ARM pseudocode. */
1888 static const uint64_t coeff[] = {
1889 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
1890 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
1891 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
1892 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
1893 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
1894 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
1895 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
1896 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
1897 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
1898 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
1899 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
1900 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
1901 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
1902 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
1903 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
1904 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
1905 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
1906 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
1907 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
1908 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
1909 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
1910 0xFA7C1819E90D8ull,
1911 };
1912 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1913 uint64_t *d = vd, *n = vn;
1914
1915 for (i = 0; i < opr_sz; i++) {
1916 uint64_t nn = n[i];
1917 intptr_t idx = extract32(nn, 0, 6);
1918 uint64_t exp = extract32(nn, 6, 11);
1919 d[i] = coeff[idx] | (exp << 52);
1920 }
1921}
1922
1923void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
1924{
1925 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1926 uint16_t *d = vd, *n = vn, *m = vm;
1927 for (i = 0; i < opr_sz; i += 1) {
1928 uint16_t nn = n[i];
1929 uint16_t mm = m[i];
1930 if (mm & 1) {
1931 nn = float16_one;
1932 }
1933 d[i] = nn ^ (mm & 2) << 14;
1934 }
1935}
1936
1937void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
1938{
1939 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1940 uint32_t *d = vd, *n = vn, *m = vm;
1941 for (i = 0; i < opr_sz; i += 1) {
1942 uint32_t nn = n[i];
1943 uint32_t mm = m[i];
1944 if (mm & 1) {
1945 nn = float32_one;
1946 }
1947 d[i] = nn ^ (mm & 2) << 30;
1948 }
1949}
1950
1951void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
1952{
1953 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1954 uint64_t *d = vd, *n = vn, *m = vm;
1955 for (i = 0; i < opr_sz; i += 1) {
1956 uint64_t nn = n[i];
1957 uint64_t mm = m[i];
1958 if (mm & 1) {
1959 nn = float64_one;
1960 }
1961 d[i] = nn ^ (mm & 2) << 62;
1962 }
1963}
1964
1965/*
1966 * Signed saturating addition with scalar operand.
1967 */
1968
1969void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1970{
1971 intptr_t i, oprsz = simd_oprsz(desc);
1972
1973 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
4f07fbeb 1974 *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
1975 }
1976}
1977
1978void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1979{
1980 intptr_t i, oprsz = simd_oprsz(desc);
1981
1982 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
4f07fbeb 1983 *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
1984 }
1985}
1986
1987void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1988{
1989 intptr_t i, oprsz = simd_oprsz(desc);
1990
1991 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
4f07fbeb 1992 *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
1993 }
1994}
1995
1996void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
1997{
1998 intptr_t i, oprsz = simd_oprsz(desc);
1999
2000 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
4f07fbeb 2001 *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
2002 }
2003}
2004
2005/*
2006 * Unsigned saturating addition with scalar operand.
2007 */
2008
2009void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2010{
2011 intptr_t i, oprsz = simd_oprsz(desc);
2012
2013 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
4f07fbeb 2014 *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
2015 }
2016}
2017
2018void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2019{
2020 intptr_t i, oprsz = simd_oprsz(desc);
2021
2022 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
4f07fbeb 2023 *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
2024 }
2025}
2026
2027void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2028{
2029 intptr_t i, oprsz = simd_oprsz(desc);
2030
2031 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
4f07fbeb 2032 *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
2033 }
2034}
2035
2036void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2037{
2038 intptr_t i, oprsz = simd_oprsz(desc);
2039
2040 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
4f07fbeb 2041 *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
2042 }
2043}
2044
2045void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2046{
2047 intptr_t i, oprsz = simd_oprsz(desc);
2048
2049 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
4f07fbeb 2050 *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
2051 }
2052}
2053
2054/* Two operand predicated copy immediate with merge. All valid immediates
2055 * can fit within 17 signed bits in the simd_data field.
2056 */
2057void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
2058 uint64_t mm, uint32_t desc)
2059{
2060 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2061 uint64_t *d = vd, *n = vn;
2062 uint8_t *pg = vg;
2063
2064 mm = dup_const(MO_8, mm);
2065 for (i = 0; i < opr_sz; i += 1) {
2066 uint64_t nn = n[i];
2067 uint64_t pp = expand_pred_b(pg[H1(i)]);
2068 d[i] = (mm & pp) | (nn & ~pp);
2069 }
2070}
2071
2072void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
2073 uint64_t mm, uint32_t desc)
2074{
2075 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2076 uint64_t *d = vd, *n = vn;
2077 uint8_t *pg = vg;
2078
2079 mm = dup_const(MO_16, mm);
2080 for (i = 0; i < opr_sz; i += 1) {
2081 uint64_t nn = n[i];
2082 uint64_t pp = expand_pred_h(pg[H1(i)]);
2083 d[i] = (mm & pp) | (nn & ~pp);
2084 }
2085}
2086
2087void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
2088 uint64_t mm, uint32_t desc)
2089{
2090 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2091 uint64_t *d = vd, *n = vn;
2092 uint8_t *pg = vg;
2093
2094 mm = dup_const(MO_32, mm);
2095 for (i = 0; i < opr_sz; i += 1) {
2096 uint64_t nn = n[i];
2097 uint64_t pp = expand_pred_s(pg[H1(i)]);
2098 d[i] = (mm & pp) | (nn & ~pp);
2099 }
2100}
2101
2102void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
2103 uint64_t mm, uint32_t desc)
2104{
2105 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2106 uint64_t *d = vd, *n = vn;
2107 uint8_t *pg = vg;
2108
2109 for (i = 0; i < opr_sz; i += 1) {
2110 uint64_t nn = n[i];
2111 d[i] = (pg[H1(i)] & 1 ? mm : nn);
2112 }
2113}
2114
2115void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
2116{
2117 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2118 uint64_t *d = vd;
2119 uint8_t *pg = vg;
2120
2121 val = dup_const(MO_8, val);
2122 for (i = 0; i < opr_sz; i += 1) {
2123 d[i] = val & expand_pred_b(pg[H1(i)]);
2124 }
2125}
2126
2127void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
2128{
2129 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2130 uint64_t *d = vd;
2131 uint8_t *pg = vg;
2132
2133 val = dup_const(MO_16, val);
2134 for (i = 0; i < opr_sz; i += 1) {
2135 d[i] = val & expand_pred_h(pg[H1(i)]);
2136 }
2137}
2138
2139void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
2140{
2141 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2142 uint64_t *d = vd;
2143 uint8_t *pg = vg;
2144
2145 val = dup_const(MO_32, val);
2146 for (i = 0; i < opr_sz; i += 1) {
2147 d[i] = val & expand_pred_s(pg[H1(i)]);
2148 }
2149}
2150
2151void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
2152{
2153 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2154 uint64_t *d = vd;
2155 uint8_t *pg = vg;
2156
2157 for (i = 0; i < opr_sz; i += 1) {
2158 d[i] = (pg[H1(i)] & 1 ? val : 0);
2159 }
2160}
b94f8f60 2161
b4cd95d2 2162/* Big-endian hosts need to frob the byte indices. If the copy
2163 * happens to be 8-byte aligned, then no frobbing is necessary.
2164 */
2165static void swap_memmove(void *vd, void *vs, size_t n)
2166{
2167 uintptr_t d = (uintptr_t)vd;
2168 uintptr_t s = (uintptr_t)vs;
2169 uintptr_t o = (d | s | n) & 7;
2170 size_t i;
2171
2172#ifndef HOST_WORDS_BIGENDIAN
2173 o = 0;
2174#endif
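    /* O now reflects the coarsest alignment shared by the destination, the
     * source and the length, so the switch below can copy in the widest
     * unit (8, 4, 2 or 1 bytes) that divides all three, applying the
     * matching H1_* index fixup on big-endian hosts.
     */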
2175 switch (o) {
2176 case 0:
2177 memmove(vd, vs, n);
2178 break;
2179
2180 case 4:
2181 if (d < s || d >= s + n) {
2182 for (i = 0; i < n; i += 4) {
2183 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2184 }
2185 } else {
2186 for (i = n; i > 0; ) {
2187 i -= 4;
2188 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2189 }
2190 }
2191 break;
2192
2193 case 2:
2194 case 6:
2195 if (d < s || d >= s + n) {
2196 for (i = 0; i < n; i += 2) {
2197 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2198 }
2199 } else {
2200 for (i = n; i > 0; ) {
2201 i -= 2;
2202 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2203 }
2204 }
2205 break;
2206
2207 default:
2208 if (d < s || d >= s + n) {
2209 for (i = 0; i < n; i++) {
2210 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2211 }
2212 } else {
2213 for (i = n; i > 0; ) {
2214 i -= 1;
2215 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2216 }
2217 }
2218 break;
2219 }
2220}
2221
2222/* Similarly for memset of 0. */
2223static void swap_memzero(void *vd, size_t n)
2224{
2225 uintptr_t d = (uintptr_t)vd;
2226 uintptr_t o = (d | n) & 7;
2227 size_t i;
2228
2229 /* Usually, the first bit of a predicate is set, so N is 0. */
2230 if (likely(n == 0)) {
2231 return;
2232 }
2233
2234#ifndef HOST_WORDS_BIGENDIAN
2235 o = 0;
2236#endif
2237 switch (o) {
2238 case 0:
2239 memset(vd, 0, n);
2240 break;
2241
2242 case 4:
2243 for (i = 0; i < n; i += 4) {
2244 *(uint32_t *)H1_4(d + i) = 0;
2245 }
2246 break;
2247
2248 case 2:
2249 case 6:
2250 for (i = 0; i < n; i += 2) {
2251 *(uint16_t *)H1_2(d + i) = 0;
2252 }
2253 break;
2254
2255 default:
2256 for (i = 0; i < n; i++) {
2257 *(uint8_t *)H1(d + i) = 0;
2258 }
2259 break;
2260 }
2261}
2262
2263void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
2264{
2265 intptr_t opr_sz = simd_oprsz(desc);
2266 size_t n_ofs = simd_data(desc);
2267 size_t n_siz = opr_sz - n_ofs;
2268
2269 if (vd != vm) {
2270 swap_memmove(vd, vn + n_ofs, n_siz);
2271 swap_memmove(vd + n_siz, vm, n_ofs);
2272 } else if (vd != vn) {
2273 swap_memmove(vd + n_siz, vd, n_ofs);
2274 swap_memmove(vd, vn + n_ofs, n_siz);
2275 } else {
2276 /* vd == vn == vm. Need temp space. */
2277 ARMVectorReg tmp;
2278 swap_memmove(&tmp, vm, n_ofs);
2279 swap_memmove(vd, vd + n_ofs, n_siz);
2280 memcpy(vd + n_siz, &tmp, n_ofs);
2281 }
2282}
2283
2284#define DO_INSR(NAME, TYPE, H) \
2285void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
2286{ \
2287 intptr_t opr_sz = simd_oprsz(desc); \
2288 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
2289 *(TYPE *)(vd + H(0)) = val; \
2290}
2291
2292DO_INSR(sve_insr_b, uint8_t, H1)
2293DO_INSR(sve_insr_h, uint16_t, H1_2)
2294DO_INSR(sve_insr_s, uint32_t, H1_4)
2295DO_INSR(sve_insr_d, uint64_t, )
2296
2297#undef DO_INSR
2298
2299void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
2300{
2301 intptr_t i, j, opr_sz = simd_oprsz(desc);
2302 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2303 uint64_t f = *(uint64_t *)(vn + i);
2304 uint64_t b = *(uint64_t *)(vn + j);
2305 *(uint64_t *)(vd + i) = bswap64(b);
2306 *(uint64_t *)(vd + j) = bswap64(f);
2307 }
2308}
2309
2310void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
2311{
2312 intptr_t i, j, opr_sz = simd_oprsz(desc);
2313 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2314 uint64_t f = *(uint64_t *)(vn + i);
2315 uint64_t b = *(uint64_t *)(vn + j);
2316 *(uint64_t *)(vd + i) = hswap64(b);
2317 *(uint64_t *)(vd + j) = hswap64(f);
2318 }
2319}
2320
2321void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
2322{
2323 intptr_t i, j, opr_sz = simd_oprsz(desc);
2324 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2325 uint64_t f = *(uint64_t *)(vn + i);
2326 uint64_t b = *(uint64_t *)(vn + j);
2327 *(uint64_t *)(vd + i) = rol64(b, 32);
2328 *(uint64_t *)(vd + j) = rol64(f, 32);
2329 }
2330}
2331
2332void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
2333{
2334 intptr_t i, j, opr_sz = simd_oprsz(desc);
2335 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2336 uint64_t f = *(uint64_t *)(vn + i);
2337 uint64_t b = *(uint64_t *)(vn + j);
2338 *(uint64_t *)(vd + i) = b;
2339 *(uint64_t *)(vd + j) = f;
2340 }
2341}
2342
2343#define DO_TBL(NAME, TYPE, H) \
2344void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2345{ \
2346 intptr_t i, opr_sz = simd_oprsz(desc); \
2347 uintptr_t elem = opr_sz / sizeof(TYPE); \
2348 TYPE *d = vd, *n = vn, *m = vm; \
2349 ARMVectorReg tmp; \
2350 if (unlikely(vd == vn)) { \
2351 n = memcpy(&tmp, vn, opr_sz); \
2352 } \
2353 for (i = 0; i < elem; i++) { \
2354 TYPE j = m[H(i)]; \
2355 d[H(i)] = j < elem ? n[H(j)] : 0; \
2356 } \
2357}
2358
2359DO_TBL(sve_tbl_b, uint8_t, H1)
2360DO_TBL(sve_tbl_h, uint16_t, H2)
2361DO_TBL(sve_tbl_s, uint32_t, H4)
2362DO_TBL(sve_tbl_d, uint64_t, )
2363
2364#undef DO_TBL
2365
2366#define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
2367void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2368{ \
2369 intptr_t i, opr_sz = simd_oprsz(desc); \
2370 TYPED *d = vd; \
2371 TYPES *n = vn; \
2372 ARMVectorReg tmp; \
2373 if (unlikely(vn - vd < opr_sz)) { \
2374 n = memcpy(&tmp, n, opr_sz / 2); \
2375 } \
2376 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
2377 d[HD(i)] = n[HS(i)]; \
2378 } \
2379}
2380
2381DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
2382DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
2383DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)
2384
2385DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
2386DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
2387DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
2388
2389#undef DO_UNPK
2390
2391/* Mask of bits included in the even numbered predicates of width esz.
2392 * We also use this for expand_bits/compress_bits, and so extend the
2393 * same pattern out to 16-bit units.
2394 */
2395static const uint64_t even_bit_esz_masks[5] = {
2396 0x5555555555555555ull,
2397 0x3333333333333333ull,
2398 0x0f0f0f0f0f0f0f0full,
2399 0x00ff00ff00ff00ffull,
2400 0x0000ffff0000ffffull,
2401};
2402
2403/* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
2404 * For N==0, this corresponds to the operation that in qemu/bitops.h
2405 * we call half_shuffle64; this algorithm is from Hacker's Delight,
2406 * section 7-2 Shuffling Bits.
2407 */
2408static uint64_t expand_bits(uint64_t x, int n)
2409{
2410 int i;
2411
2412 x &= 0xffffffffu;
2413 for (i = 4; i >= n; i--) {
2414 int sh = 1 << i;
2415 x = ((x << sh) | x) & even_bit_esz_masks[i];
2416 }
2417 return x;
2418}
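/* For example, expand_bits(0xff, 0) == 0x5555: each input bit gains a zero
 * bit above it, doubling the unit width.  Likewise expand_bits(0xf, 1) ==
 * 0x33, widening 2-bit units to 4-bit units.
 */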
2419
2420/* Compress units of 2**(N+1) bits to units of 2**N bits.
2421 * For N==0, this corresponds to the operation that in qemu/bitops.h
2422 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
2423 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
2424 */
2425static uint64_t compress_bits(uint64_t x, int n)
2426{
2427 int i;
2428
2429 for (i = n; i <= 4; i++) {
2430 int sh = 1 << i;
2431 x &= even_bit_esz_masks[i];
2432 x = (x >> sh) | x;
2433 }
2434 return x & 0xffffffffu;
2435}
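/* Conversely, compress_bits(0x5555, 0) == 0xff: the significant bit of each
 * 2-bit unit is packed back down into consecutive bits.
 */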
2436
2437void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
2438{
2439 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2440 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
2441 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
8e7fefed 2442 int esize = 1 << esz;
2443 uint64_t *d = vd;
2444 intptr_t i;
2445
2446 if (oprsz <= 8) {
2447 uint64_t nn = *(uint64_t *)vn;
2448 uint64_t mm = *(uint64_t *)vm;
2449 int half = 4 * oprsz;
2450
2451 nn = extract64(nn, high * half, half);
2452 mm = extract64(mm, high * half, half);
2453 nn = expand_bits(nn, esz);
2454 mm = expand_bits(mm, esz);
8e7fefed 2455 d[0] = nn | (mm << esize);
d731d8cb 2456 } else {
8e7fefed 2457 ARMPredicateReg tmp;
2458
2459 /* We produce output faster than we consume input.
2460 Therefore we must be mindful of possible overlap. */
2461 if (vd == vn) {
2462 vn = memcpy(&tmp, vn, oprsz);
2463 if (vd == vm) {
2464 vm = vn;
2465 }
2466 } else if (vd == vm) {
2467 vm = memcpy(&tmp, vm, oprsz);
2468 }
2469 if (high) {
2470 high = oprsz >> 1;
2471 }
2472
8e7fefed 2473 if ((oprsz & 7) == 0) {
2474 uint32_t *n = vn, *m = vm;
2475 high >>= 2;
2476
8e7fefed 2477 for (i = 0; i < oprsz / 8; i++) {
2478 uint64_t nn = n[H4(high + i)];
2479 uint64_t mm = m[H4(high + i)];
2480
2481 nn = expand_bits(nn, esz);
2482 mm = expand_bits(mm, esz);
8e7fefed 2483 d[i] = nn | (mm << esize);
2484 }
2485 } else {
2486 uint8_t *n = vn, *m = vm;
2487 uint16_t *d16 = vd;
2488
2489 for (i = 0; i < oprsz / 2; i++) {
2490 uint16_t nn = n[H1(high + i)];
2491 uint16_t mm = m[H1(high + i)];
2492
2493 nn = expand_bits(nn, esz);
2494 mm = expand_bits(mm, esz);
8e7fefed 2495 d16[H2(i)] = nn | (mm << esize);
2496 }
2497 }
2498 }
2499}
2500
2501void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
2502{
2503 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2504 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
2505 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
2506 uint64_t *d = vd, *n = vn, *m = vm;
2507 uint64_t l, h;
2508 intptr_t i;
2509
2510 if (oprsz <= 8) {
2511 l = compress_bits(n[0] >> odd, esz);
2512 h = compress_bits(m[0] >> odd, esz);
226e6c04 2513 d[0] = l | (h << (4 * oprsz));
2514 } else {
2515 ARMPredicateReg tmp_m;
2516 intptr_t oprsz_16 = oprsz / 16;
2517
2518 if ((vm - vd) < (uintptr_t)oprsz) {
2519 m = memcpy(&tmp_m, vm, oprsz);
2520 }
2521
2522 for (i = 0; i < oprsz_16; i++) {
2523 l = n[2 * i + 0];
2524 h = n[2 * i + 1];
2525 l = compress_bits(l >> odd, esz);
2526 h = compress_bits(h >> odd, esz);
226e6c04 2527 d[i] = l | (h << 32);
2528 }
2529
2530 /*
2531 * For VL which is not a multiple of 512, the results from M do not
2532 * align nicely with the uint64_t for D. Put the aligned results
2533 * from M into TMP_M and then copy it into place afterward.
2534 */
d731d8cb 2535 if (oprsz & 15) {
2536 int final_shift = (oprsz & 15) * 2;
2537
2538 l = n[2 * i + 0];
2539 h = n[2 * i + 1];
2540 l = compress_bits(l >> odd, esz);
2541 h = compress_bits(h >> odd, esz);
2542 d[i] = l | (h << final_shift);
2543
2544 for (i = 0; i < oprsz_16; i++) {
2545 l = m[2 * i + 0];
2546 h = m[2 * i + 1];
2547 l = compress_bits(l >> odd, esz);
2548 h = compress_bits(h >> odd, esz);
226e6c04 2549 tmp_m.p[i] = l | (h << 32);
d731d8cb 2550 }
2551 l = m[2 * i + 0];
2552 h = m[2 * i + 1];
2553 l = compress_bits(l >> odd, esz);
2554 h = compress_bits(h >> odd, esz);
2555 tmp_m.p[i] = l | (h << final_shift);
2556
2557 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
2558 } else {
2559 for (i = 0; i < oprsz_16; i++) {
2560 l = m[2 * i + 0];
2561 h = m[2 * i + 1];
2562 l = compress_bits(l >> odd, esz);
2563 h = compress_bits(h >> odd, esz);
226e6c04 2564 d[oprsz_16 + i] = l | (h << 32);
2565 }
2566 }
2567 }
2568}
2569
2570void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
2571{
2572 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2573 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
2574 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
2575 uint64_t *d = vd, *n = vn, *m = vm;
2576 uint64_t mask;
2577 int shr, shl;
2578 intptr_t i;
2579
2580 shl = 1 << esz;
2581 shr = 0;
2582 mask = even_bit_esz_masks[esz];
2583 if (odd) {
2584 mask <<= shl;
2585 shr = shl;
2586 shl = 0;
2587 }
2588
2589 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2590 uint64_t nn = (n[i] & mask) >> shr;
2591 uint64_t mm = (m[i] & mask) << shl;
2592 d[i] = nn + mm;
2593 }
2594}
2595
2596/* Reverse units of 2**N bits. */
2597static uint64_t reverse_bits_64(uint64_t x, int n)
2598{
2599 int i, sh;
2600
2601 x = bswap64(x);
2602 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2603 uint64_t mask = even_bit_esz_masks[i];
2604 x = ((x & mask) << sh) | ((x >> sh) & mask);
2605 }
2606 return x;
2607}
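/* N selects the unit that stays intact while the units themselves are
 * reversed: reverse_bits_64(x, 3) reduces to a plain bswap64 (byte
 * reversal), while reverse_bits_64(x, 0) reverses every individual bit.
 */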
2608
2609static uint8_t reverse_bits_8(uint8_t x, int n)
2610{
2611 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
2612 int i, sh;
2613
2614 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2615 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
2616 }
2617 return x;
2618}
2619
2620void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
2621{
2622 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2623 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
2624 intptr_t i, oprsz_2 = oprsz / 2;
2625
2626 if (oprsz <= 8) {
2627 uint64_t l = *(uint64_t *)vn;
2628 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
2629 *(uint64_t *)vd = l;
2630 } else if ((oprsz & 15) == 0) {
2631 for (i = 0; i < oprsz_2; i += 8) {
2632 intptr_t ih = oprsz - 8 - i;
2633 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
2634 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
2635 *(uint64_t *)(vd + i) = h;
2636 *(uint64_t *)(vd + ih) = l;
2637 }
2638 } else {
2639 for (i = 0; i < oprsz_2; i += 1) {
2640 intptr_t il = H1(i);
2641 intptr_t ih = H1(oprsz - 1 - i);
2642 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
2643 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
2644 *(uint8_t *)(vd + il) = h;
2645 *(uint8_t *)(vd + ih) = l;
2646 }
2647 }
2648}
2649
2650void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
2651{
2652 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2653 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
2654 uint64_t *d = vd;
2655 intptr_t i;
2656
2657 if (oprsz <= 8) {
2658 uint64_t nn = *(uint64_t *)vn;
2659 int half = 4 * oprsz;
2660
2661 nn = extract64(nn, high * half, half);
2662 nn = expand_bits(nn, 0);
2663 d[0] = nn;
2664 } else {
2665 ARMPredicateReg tmp_n;
2666
2667 /* We produce output faster than we consume input.
2668 Therefore we must be mindful of possible overlap. */
2669 if ((vn - vd) < (uintptr_t)oprsz) {
2670 vn = memcpy(&tmp_n, vn, oprsz);
2671 }
2672 if (high) {
2673 high = oprsz >> 1;
2674 }
2675
fd911a21 2676 if ((oprsz & 7) == 0) {
2677 uint32_t *n = vn;
2678 high >>= 2;
2679
fd911a21 2680 for (i = 0; i < oprsz / 8; i++) {
2681 uint64_t nn = n[H4(high + i)];
2682 d[i] = expand_bits(nn, 0);
2683 }
2684 } else {
2685 uint16_t *d16 = vd;
2686 uint8_t *n = vn;
2687
2688 for (i = 0; i < oprsz / 2; i++) {
2689 uint16_t nn = n[H1(high + i)];
2690 d16[H2(i)] = expand_bits(nn, 0);
2691 }
2692 }
2693 }
2694}
2695
2696#define DO_ZIP(NAME, TYPE, H) \
2697void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2698{ \
2699 intptr_t oprsz = simd_oprsz(desc); \
2700 intptr_t i, oprsz_2 = oprsz / 2; \
2701 ARMVectorReg tmp_n, tmp_m; \
2702 /* We produce output faster than we consume input. \
2703 Therefore we must be mindful of possible overlap. */ \
2704 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
2705 vn = memcpy(&tmp_n, vn, oprsz_2); \
2706 } \
2707 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2708 vm = memcpy(&tmp_m, vm, oprsz_2); \
2709 } \
2710 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2711 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
2712 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
2713 } \
2714}
2715
2716DO_ZIP(sve_zip_b, uint8_t, H1)
2717DO_ZIP(sve_zip_h, uint16_t, H1_2)
2718DO_ZIP(sve_zip_s, uint32_t, H1_4)
2719DO_ZIP(sve_zip_d, uint64_t, )
2720
2721#define DO_UZP(NAME, TYPE, H) \
2722void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2723{ \
2724 intptr_t oprsz = simd_oprsz(desc); \
2725 intptr_t oprsz_2 = oprsz / 2; \
2726 intptr_t odd_ofs = simd_data(desc); \
2727 intptr_t i; \
2728 ARMVectorReg tmp_m; \
2729 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2730 vm = memcpy(&tmp_m, vm, oprsz); \
2731 } \
2732 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2733 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs)); \
2734 } \
2735 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2736 *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
2737 } \
2738}
2739
2740DO_UZP(sve_uzp_b, uint8_t, H1)
2741DO_UZP(sve_uzp_h, uint16_t, H1_2)
2742DO_UZP(sve_uzp_s, uint32_t, H1_4)
2743DO_UZP(sve_uzp_d, uint64_t, )
2744
2745#define DO_TRN(NAME, TYPE, H) \
2746void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2747{ \
2748 intptr_t oprsz = simd_oprsz(desc); \
2749 intptr_t odd_ofs = simd_data(desc); \
2750 intptr_t i; \
2751 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
2752 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
2753 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
2754 *(TYPE *)(vd + H(i + 0)) = ae; \
2755 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
2756 } \
2757}
2758
2759DO_TRN(sve_trn_b, uint8_t, H1)
2760DO_TRN(sve_trn_h, uint16_t, H1_2)
2761DO_TRN(sve_trn_s, uint32_t, H1_4)
2762DO_TRN(sve_trn_d, uint64_t, )
2763
2764#undef DO_ZIP
2765#undef DO_UZP
2766#undef DO_TRN
2767
2768void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
2769{
2770 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
2771 uint32_t *d = vd, *n = vn;
2772 uint8_t *pg = vg;
2773
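    /* Each predicate byte covers two 32-bit elements: bit 0 is the flag for
     * the even-numbered element and bit 4 for the odd one, hence the
     * pg[H1(i / 2)] indexing below.
     */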
2774 for (i = j = 0; i < opr_sz; i++) {
2775 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
2776 d[H4(j)] = n[H4(i)];
2777 j++;
2778 }
2779 }
2780 for (; j < opr_sz; j++) {
2781 d[H4(j)] = 0;
2782 }
2783}
2784
2785void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
2786{
2787 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
2788 uint64_t *d = vd, *n = vn;
2789 uint8_t *pg = vg;
2790
2791 for (i = j = 0; i < opr_sz; i++) {
2792 if (pg[H1(i)] & 1) {
2793 d[j] = n[i];
2794 j++;
2795 }
2796 }
2797 for (; j < opr_sz; j++) {
2798 d[j] = 0;
2799 }
2800}
2801
2802/* Similar to the ARM LastActiveElement pseudocode function, except the
2803 * result is multiplied by the element size. This includes the not found
2804 * indication; e.g. not found for esz=3 is -8.
2805 */
2806int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
2807{
2808 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
2809 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
ef23cb72 2810
2acbfbe4 2811 return last_active_element(vg, words, esz);
ef23cb72 2812}
2813
2814void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
2815{
2816 intptr_t opr_sz = simd_oprsz(desc) / 8;
2817 int esz = simd_data(desc);
2818 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
2819 intptr_t i, first_i, last_i;
2820 ARMVectorReg tmp;
2821
2822 first_i = last_i = 0;
2823 first_g = last_g = 0;
2824
2825 /* Find the extent of the active elements within VG. */
2826 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
2827 pg = *(uint64_t *)(vg + i) & mask;
2828 if (pg) {
2829 if (last_g == 0) {
2830 last_g = pg;
2831 last_i = i;
2832 }
2833 first_g = pg;
2834 first_i = i;
2835 }
2836 }
2837
2838 len = 0;
2839 if (first_g != 0) {
2840 first_i = first_i * 8 + ctz64(first_g);
2841 last_i = last_i * 8 + 63 - clz64(last_g);
2842 len = last_i - first_i + (1 << esz);
2843 if (vd == vm) {
2844 vm = memcpy(&tmp, vm, opr_sz * 8);
2845 }
2846 swap_memmove(vd, vn + first_i, len);
2847 }
2848 swap_memmove(vd + len, vm, opr_sz * 8 - len);
2849}
2850
2851void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
2852 void *vg, uint32_t desc)
2853{
2854 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2855 uint64_t *d = vd, *n = vn, *m = vm;
2856 uint8_t *pg = vg;
2857
2858 for (i = 0; i < opr_sz; i += 1) {
2859 uint64_t nn = n[i], mm = m[i];
2860 uint64_t pp = expand_pred_b(pg[H1(i)]);
2861 d[i] = (nn & pp) | (mm & ~pp);
2862 }
2863}
2864
2865void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
2866 void *vg, uint32_t desc)
2867{
2868 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2869 uint64_t *d = vd, *n = vn, *m = vm;
2870 uint8_t *pg = vg;
2871
2872 for (i = 0; i < opr_sz; i += 1) {
2873 uint64_t nn = n[i], mm = m[i];
2874 uint64_t pp = expand_pred_h(pg[H1(i)]);
2875 d[i] = (nn & pp) | (mm & ~pp);
2876 }
2877}
2878
2879void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
2880 void *vg, uint32_t desc)
2881{
2882 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2883 uint64_t *d = vd, *n = vn, *m = vm;
2884 uint8_t *pg = vg;
2885
2886 for (i = 0; i < opr_sz; i += 1) {
2887 uint64_t nn = n[i], mm = m[i];
2888 uint64_t pp = expand_pred_s(pg[H1(i)]);
2889 d[i] = (nn & pp) | (mm & ~pp);
2890 }
2891}
2892
2893void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
2894 void *vg, uint32_t desc)
2895{
2896 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2897 uint64_t *d = vd, *n = vn, *m = vm;
2898 uint8_t *pg = vg;
2899
2900 for (i = 0; i < opr_sz; i += 1) {
2901 uint64_t nn = n[i], mm = m[i];
2902 d[i] = (pg[H1(i)] & 1 ? nn : mm);
2903 }
2904}
2905
2906/* Two operand comparison controlled by a predicate.
2907 * ??? It is very tempting to want to be able to expand this inline
2908 * with x86 instructions, e.g.
2909 *
2910 * vcmpeqw zm, zn, %ymm0
2911 * vpmovmskb %ymm0, %eax
2912 * and $0x5555, %eax
2913 * and pg, %eax
2914 *
2915 * or even aarch64, e.g.
2916 *
2917 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
2918 * cmeq v0.8h, zn, zm
2919 * and v0.8h, v0.8h, mask
2920 * addv h0, v0.8h
2921 * and v0.8b, pg
2922 *
2923 * However, coming up with an abstraction that allows vector inputs and
2924 * a scalar output, and also handles the byte-ordering of sub-uint64_t
2925 * scalar outputs, is tricky.
2926 */
2927#define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
2928uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2929{ \
2930 intptr_t opr_sz = simd_oprsz(desc); \
2931 uint32_t flags = PREDTEST_INIT; \
2932 intptr_t i = opr_sz; \
2933 do { \
2934 uint64_t out = 0, pg; \
2935 do { \
2936 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2937 TYPE nn = *(TYPE *)(vn + H(i)); \
2938 TYPE mm = *(TYPE *)(vm + H(i)); \
2939 out |= nn OP mm; \
2940 } while (i & 63); \
2941 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2942 out &= pg; \
2943 *(uint64_t *)(vd + (i >> 3)) = out; \
2944 flags = iter_predtest_bwd(out, pg, flags); \
2945 } while (i > 0); \
2946 return flags; \
2947}
2948
2949#define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
2950 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2951#define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
2952 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2953#define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
2954 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2955#define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
2956 DO_CMP_PPZZ(NAME, TYPE, OP, , 0x0101010101010101ull)
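/* The MASK arguments mirror the predicate layout of one bit per vector
 * byte: e.g. the _S variants keep only every fourth bit of the 64-bit
 * guard and result words, the _D variants every eighth.
 */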
2957
2958DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
2959DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
2960DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
2961DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
2962
2963DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
2964DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
2965DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
2966DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
2967
2968DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
2969DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
2970DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
2971DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
2972
2973DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
2974DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
2975DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
2976DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
2977
2978DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
2979DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
2980DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
2981DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
2982
2983DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
2984DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
2985DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
2986DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
2987
2988#undef DO_CMP_PPZZ_B
2989#undef DO_CMP_PPZZ_H
2990#undef DO_CMP_PPZZ_S
2991#undef DO_CMP_PPZZ_D
2992#undef DO_CMP_PPZZ
2993
2994/* Similar, but the second source is "wide". */
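/* That is, each 64-bit element of VM is compared against every narrow
 * element of VN that lives in the same 64-bit chunk.
 */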
2995#define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
2996uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2997{ \
2998 intptr_t opr_sz = simd_oprsz(desc); \
2999 uint32_t flags = PREDTEST_INIT; \
3000 intptr_t i = opr_sz; \
3001 do { \
3002 uint64_t out = 0, pg; \
3003 do { \
3004 TYPEW mm = *(TYPEW *)(vm + i - 8); \
3005 do { \
3006 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3007 TYPE nn = *(TYPE *)(vn + H(i)); \
3008 out |= nn OP mm; \
3009 } while (i & 7); \
3010 } while (i & 63); \
3011 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3012 out &= pg; \
3013 *(uint64_t *)(vd + (i >> 3)) = out; \
3014 flags = iter_predtest_bwd(out, pg, flags); \
3015 } while (i > 0); \
3016 return flags; \
3017}
3018
3019#define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
3020 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
3021#define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
3022 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
3023#define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
3024 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
3025
3026DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
3027DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
3028DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
757f9cff 3029
3030DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
3031DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
3032DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
3033
3034DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
3035DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
3036DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
3037
3038DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
3039DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
3040DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
3041
3042DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
3043DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
3044DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
3045
3046DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
3047DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
3048DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
3049
3050DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
3051DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
3052DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
3053
3054DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
3055DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
3056DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
3057
3058DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
3059DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
3060DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
3061
3062DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
3063DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
3064DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
3065
3066#undef DO_CMP_PPZW_B
3067#undef DO_CMP_PPZW_H
3068#undef DO_CMP_PPZW_S
3069#undef DO_CMP_PPZW
3070
3071/* Similar, but the second source is immediate. */
3072#define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
3073uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
3074{ \
3075 intptr_t opr_sz = simd_oprsz(desc); \
3076 uint32_t flags = PREDTEST_INIT; \
3077 TYPE mm = simd_data(desc); \
3078 intptr_t i = opr_sz; \
3079 do { \
3080 uint64_t out = 0, pg; \
3081 do { \
3082 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3083 TYPE nn = *(TYPE *)(vn + H(i)); \
3084 out |= nn OP mm; \
3085 } while (i & 63); \
3086 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3087 out &= pg; \
3088 *(uint64_t *)(vd + (i >> 3)) = out; \
3089 flags = iter_predtest_bwd(out, pg, flags); \
3090 } while (i > 0); \
3091 return flags; \
3092}
3093
3094#define DO_CMP_PPZI_B(NAME, TYPE, OP) \
3095 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3096#define DO_CMP_PPZI_H(NAME, TYPE, OP) \
3097 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3098#define DO_CMP_PPZI_S(NAME, TYPE, OP) \
3099 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3100#define DO_CMP_PPZI_D(NAME, TYPE, OP) \
3101 DO_CMP_PPZI(NAME, TYPE, OP, , 0x0101010101010101ull)
3102
3103DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
3104DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
3105DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
3106DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
3107
3108DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
3109DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
3110DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
3111DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
3112
3113DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
3114DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
3115DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
3116DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
3117
3118DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
3119DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
3120DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
3121DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
3122
3123DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
3124DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
3125DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
3126DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
3127
3128DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
3129DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
3130DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
3131DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
3132
3133DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
3134DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
3135DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
3136DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
3137
3138DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
3139DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
3140DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
3141DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
3142
3143DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
3144DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
3145DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
3146DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
3147
3148DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
3149DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
3150DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
3151DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
3152
3153#undef DO_CMP_PPZI_B
3154#undef DO_CMP_PPZI_H
3155#undef DO_CMP_PPZI_S
3156#undef DO_CMP_PPZI_D
3157#undef DO_CMP_PPZI
3158
3159/* Similar to the ARM LastActive pseudocode function. */
3160static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
3161{
3162 intptr_t i;
3163
3164 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
3165 uint64_t pg = *(uint64_t *)(vg + i);
3166 if (pg) {
3167 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
3168 }
3169 }
3170 return 0;
3171}
3172
3173/* Compute a mask into RETB that is true for all G, up to and including
3174 * (if after) or excluding (if !after) the first G & N.
3175 * Return true if BRK found.
3176 */
3177static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
3178 bool brk, bool after)
3179{
3180 uint64_t b;
3181
3182 if (brk) {
3183 b = 0;
3184 } else if ((g & n) == 0) {
3185 /* For all G, no N are set; break not found. */
3186 b = g;
3187 } else {
3188 /* Break somewhere in N. Locate it. */
3189 b = g & n; /* guard true, pred true */
3190 b = b & -b; /* first such */
3191 if (after) {
3192 b = b | (b - 1); /* break after same */
3193 } else {
3194 b = b - 1; /* break before same */
3195 }
3196 brk = true;
3197 }
3198
3199 *retb = b;
3200 return brk;
3201}
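/* Worked example: g = 0xff, n = 0x10, brk = false.  The first active N bit
 * is bit 4, so with after = true RETB becomes 0x1f (break after that bit)
 * and with after = false it becomes 0x0f (break before it); true is
 * returned in both cases.
 */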
3202
3203/* Compute a zeroing BRK. */
3204static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
3205 intptr_t oprsz, bool after)
3206{
3207 bool brk = false;
3208 intptr_t i;
3209
3210 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3211 uint64_t this_b, this_g = g[i];
3212
3213 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3214 d[i] = this_b & this_g;
3215 }
3216}
3217
3218/* Likewise, but also compute flags. */
3219static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
3220 intptr_t oprsz, bool after)
3221{
3222 uint32_t flags = PREDTEST_INIT;
3223 bool brk = false;
3224 intptr_t i;
3225
3226 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3227 uint64_t this_b, this_d, this_g = g[i];
3228
3229 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3230 d[i] = this_d = this_b & this_g;
3231 flags = iter_predtest_fwd(this_d, this_g, flags);
3232 }
3233 return flags;
3234}
3235
3236/* Compute a merging BRK. */
3237static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
3238 intptr_t oprsz, bool after)
3239{
3240 bool brk = false;
3241 intptr_t i;
3242
3243 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3244 uint64_t this_b, this_g = g[i];
3245
3246 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3247 d[i] = (this_b & this_g) | (d[i] & ~this_g);
3248 }
3249}
3250
3251/* Likewise, but also compute flags. */
3252static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
3253 intptr_t oprsz, bool after)
3254{
3255 uint32_t flags = PREDTEST_INIT;
3256 bool brk = false;
3257 intptr_t i;
3258
3259 for (i = 0; i < oprsz / 8; ++i) {
3260 uint64_t this_b, this_d = d[i], this_g = g[i];
3261
3262 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3263 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
3264 flags = iter_predtest_fwd(this_d, this_g, flags);
3265 }
3266 return flags;
3267}
3268
3269static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
3270{
3271 /* It is quicker to zero the whole predicate than loop on OPRSZ.
3272 * The compiler should turn this into 4 64-bit integer stores.
3273 */
3274 memset(d, 0, sizeof(ARMPredicateReg));
3275 return PREDTEST_INIT;
3276}
3277
3278void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
3279 uint32_t pred_desc)
3280{
04c774a2 3281 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3282 if (last_active_pred(vn, vg, oprsz)) {
3283 compute_brk_z(vd, vm, vg, oprsz, true);
3284 } else {
3285 do_zero(vd, oprsz);
3286 }
3287}
3288
3289uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
3290 uint32_t pred_desc)
3291{
04c774a2 3292 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3293 if (last_active_pred(vn, vg, oprsz)) {
3294 return compute_brks_z(vd, vm, vg, oprsz, true);
3295 } else {
3296 return do_zero(vd, oprsz);
3297 }
3298}
3299
3300void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
3301 uint32_t pred_desc)
3302{
04c774a2 3303 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3304 if (last_active_pred(vn, vg, oprsz)) {
3305 compute_brk_z(vd, vm, vg, oprsz, false);
3306 } else {
3307 do_zero(vd, oprsz);
3308 }
3309}
3310
3311uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
3312 uint32_t pred_desc)
3313{
04c774a2 3314 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3315 if (last_active_pred(vn, vg, oprsz)) {
3316 return compute_brks_z(vd, vm, vg, oprsz, false);
3317 } else {
3318 return do_zero(vd, oprsz);
3319 }
3320}
3321
3322void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3323{
04c774a2 3324 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3325 compute_brk_z(vd, vn, vg, oprsz, true);
3326}
3327
3328uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3329{
04c774a2 3330 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3331 return compute_brks_z(vd, vn, vg, oprsz, true);
3332}
3333
3334void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3335{
04c774a2 3336 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3337 compute_brk_z(vd, vn, vg, oprsz, false);
3338}
3339
3340uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3341{
04c774a2 3342 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3343 return compute_brks_z(vd, vn, vg, oprsz, false);
3344}
3345
3346void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3347{
04c774a2 3348 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3349 compute_brk_m(vd, vn, vg, oprsz, true);
3350}
3351
3352uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3353{
04c774a2 3354 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3355 return compute_brks_m(vd, vn, vg, oprsz, true);
3356}
3357
3358void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3359{
04c774a2 3360 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3361 compute_brk_m(vd, vn, vg, oprsz, false);
3362}
3363
3364uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3365{
04c774a2 3366 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3367 return compute_brks_m(vd, vn, vg, oprsz, false);
3368}
3369
3370void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3371{
04c774a2 3372 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3373 if (!last_active_pred(vn, vg, oprsz)) {
3374 do_zero(vd, oprsz);
3375 }
3376}
3377
3378/* As if PredTest(Ones(PL), D, esz). */
3379static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
3380 uint64_t esz_mask)
3381{
3382 uint32_t flags = PREDTEST_INIT;
3383 intptr_t i;
3384
3385 for (i = 0; i < oprsz / 8; i++) {
3386 flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
3387 }
3388 if (oprsz & 7) {
3389 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
3390 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
3391 }
3392 return flags;
3393}
3394
3395uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3396{
04c774a2 3397 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3398 if (last_active_pred(vn, vg, oprsz)) {
3399 return predtest_ones(vd, oprsz, -1);
3400 } else {
3401 return do_zero(vd, oprsz);
3402 }
3403}
9ee3a611
RH
3404
3405uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
3406{
f556a201
RH
3407 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
3408 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
9ee3a611
RH
3409 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
3410 intptr_t i;
3411
f556a201 3412 for (i = 0; i < words; ++i) {
9ee3a611
RH
3413 uint64_t t = n[i] & g[i] & mask;
3414 sum += ctpop64(t);
3415 }
3416 return sum;
3417}
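/*
 * For example, with 32-bit elements (esz == 2) the mask is
 * 0x1111111111111111: the predicate holds one bit per vector byte and an
 * element is governed by the bit of its lowest byte, so only every
 * fourth bit survives.  With n[0] = 0x13 and g[0] = 0xff the masked
 * value is 0x11 and ctpop64 reports two active elements.
 */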
caf1cefc
RH
3418
3419uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc)
3420{
e610906c
RH
3421 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3422 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
caf1cefc
RH
3423 uint64_t esz_mask = pred_esz_masks[esz];
3424 ARMPredicateReg *d = vd;
3425 uint32_t flags;
3426 intptr_t i;
3427
3428 /* Begin with a zero predicate register. */
3429 flags = do_zero(d, oprsz);
3430 if (count == 0) {
3431 return flags;
3432 }
3433
caf1cefc
RH
3434 /* Set all of the requested bits. */
3435 for (i = 0; i < count / 64; ++i) {
3436 d->p[i] = esz_mask;
3437 }
3438 if (count & 63) {
3439 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
3440 }
3441
3442 return predtest_ones(d, oprsz, esz_mask);
3443}
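/*
 * For example, with 16-bit elements (esz == 1) and count == 6 predicate
 * bits requested, d->p[0] = MAKE_64BIT_MASK(0, 6) & 0x5555555555555555
 * = 0x15, i.e. the first three halfword elements become active, and
 * predtest_ones() then computes the NZCV result as if the new predicate
 * had been tested against an all-true governing predicate.
 */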
c4e7c493 3444
23fbe79f
RH
3445/* Recursive reduction on a function;
3446 * Cf. the ARM ARM function ReducePredicated.
3447 *
3448 * While it would be possible to write this without the DATA temporary,
3449 * it is much simpler to process the predicate register this way.
3450 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
3451 * little to gain with a more complex non-recursive form.
3452 */
3453#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
3454static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
3455{ \
3456 if (n == 1) { \
3457 return *data; \
3458 } else { \
3459 uintptr_t half = n / 2; \
3460 TYPE lo = NAME##_reduce(data, status, half); \
3461 TYPE hi = NAME##_reduce(data + half, status, half); \
3462 return TYPE##_##FUNC(lo, hi, status); \
3463 } \
3464} \
3465uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
3466{ \
c648c9b7 3467 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \
23fbe79f
RH
3468 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
3469 for (i = 0; i < oprsz; ) { \
3470 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
3471 do { \
3472 TYPE nn = *(TYPE *)(vn + H(i)); \
3473 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
3474 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
3475 } while (i & 15); \
3476 } \
3477 for (; i < maxsz; i += sizeof(TYPE)) { \
3478 *(TYPE *)((void *)data + i) = IDENT; \
3479 } \
3480 return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
3481}
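/*
 * For example, with eight elements the recursion evaluates
 * FUNC(FUNC(FUNC(d0,d1), FUNC(d2,d3)), FUNC(FUNC(d4,d5), FUNC(d6,d7))).
 * Inactive lanes and the tail between oprsz and maxsz have already been
 * filled with IDENT, so they cannot perturb the result of the tree.
 */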
3482
3483DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
3484DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
3485DO_REDUCE(sve_faddv_d, float64, , add, float64_zero)
3486
3487/* Identity is floatN_default_nan, without the function call. */
3488DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
3489DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
3490DO_REDUCE(sve_fminnmv_d, float64, , minnum, 0x7FF8000000000000ULL)
3491
3492DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
3493DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
3494DO_REDUCE(sve_fmaxnmv_d, float64, , maxnum, 0x7FF8000000000000ULL)
3495
3496DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
3497DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
3498DO_REDUCE(sve_fminv_d, float64, , min, float64_infinity)
3499
3500DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
3501DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
3502DO_REDUCE(sve_fmaxv_d, float64, , max, float64_chs(float64_infinity))
3503
3504#undef DO_REDUCE
3505
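/*
 * FADDA is a strictly ordered reduction: the elements are accumulated in
 * increasing element order rather than with the pairwise tree used by
 * FADDV above, since floating-point addition is not associative and the
 * architecture requires the sequential result here.
 */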
7f9ddf64
RH
3506uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
3507 void *status, uint32_t desc)
3508{
3509 intptr_t i = 0, opr_sz = simd_oprsz(desc);
3510 float16 result = nn;
3511
3512 do {
3513 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
3514 do {
3515 if (pg & 1) {
3516 float16 mm = *(float16 *)(vm + H1_2(i));
3517 result = float16_add(result, mm, status);
3518 }
3519 i += sizeof(float16), pg >>= sizeof(float16);
3520 } while (i & 15);
3521 } while (i < opr_sz);
3522
3523 return result;
3524}
3525
3526uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
3527 void *status, uint32_t desc)
3528{
3529 intptr_t i = 0, opr_sz = simd_oprsz(desc);
3530 float32 result = nn;
3531
3532 do {
3533 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
3534 do {
3535 if (pg & 1) {
3536 float32 mm = *(float32 *)(vm + H1_2(i));
3537 result = float32_add(result, mm, status);
3538 }
3539 i += sizeof(float32), pg >>= sizeof(float32);
3540 } while (i & 15);
3541 } while (i < opr_sz);
3542
3543 return result;
3544}
3545
3546uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
3547 void *status, uint32_t desc)
3548{
3549 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
3550 uint64_t *m = vm;
3551 uint8_t *pg = vg;
3552
3553 for (i = 0; i < opr_sz; i++) {
3554 if (pg[H1(i)] & 1) {
3555 nn = float64_add(nn, m[i], status);
3556 }
3557 }
3558
3559 return nn;
3560}
3561
ec3b87c2
RH
3562/* Fully general three-operand expander, controlled by a predicate,
3563 * with the extra float_status parameter.
3564 */
3565#define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
3566void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
3567 void *status, uint32_t desc) \
3568{ \
3569 intptr_t i = simd_oprsz(desc); \
3570 uint64_t *g = vg; \
3571 do { \
3572 uint64_t pg = g[(i - 1) >> 6]; \
3573 do { \
3574 i -= sizeof(TYPE); \
3575 if (likely((pg >> (i & 63)) & 1)) { \
3576 TYPE nn = *(TYPE *)(vn + H(i)); \
3577 TYPE mm = *(TYPE *)(vm + H(i)); \
3578 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
3579 } \
3580 } while (i & 63); \
3581 } while (i != 0); \
3582}
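/*
 * The expansion walks the vector from the top down, one 64-bit predicate
 * word at a time.  The predicate has one bit per vector byte, and an
 * element is governed by the bit at its lowest byte offset: e.g. a
 * 16-bit element at byte offset 6 is controlled by bit 6 of g[0].
 */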
3583
3584DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
3585DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
3586DO_ZPZZ_FP(sve_fadd_d, uint64_t, , float64_add)
3587
3588DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
3589DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
3590DO_ZPZZ_FP(sve_fsub_d, uint64_t, , float64_sub)
3591
3592DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
3593DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
3594DO_ZPZZ_FP(sve_fmul_d, uint64_t, , float64_mul)
3595
3596DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
3597DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
3598DO_ZPZZ_FP(sve_fdiv_d, uint64_t, , float64_div)
3599
3600DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
3601DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
3602DO_ZPZZ_FP(sve_fmin_d, uint64_t, , float64_min)
3603
3604DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
3605DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
3606DO_ZPZZ_FP(sve_fmax_d, uint64_t, , float64_max)
3607
3608DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
3609DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
3610DO_ZPZZ_FP(sve_fminnum_d, uint64_t, , float64_minnum)
3611
3612DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
3613DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
3614DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, , float64_maxnum)
3615
3616static inline float16 abd_h(float16 a, float16 b, float_status *s)
3617{
3618 return float16_abs(float16_sub(a, b, s));
3619}
3620
3621static inline float32 abd_s(float32 a, float32 b, float_status *s)
3622{
3623 return float32_abs(float32_sub(a, b, s));
3624}
3625
3626static inline float64 abd_d(float64 a, float64 b, float_status *s)
3627{
3628 return float64_abs(float64_sub(a, b, s));
3629}
3630
3631DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
3632DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
3633DO_ZPZZ_FP(sve_fabd_d, uint64_t, , abd_d)
3634
3635static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
3636{
3637 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
3638 return float64_scalbn(a, b_int, s);
3639}
3640
3641DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
3642DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
3643DO_ZPZZ_FP(sve_fscalbn_d, int64_t, , scalbn_d)
3644
3645DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
3646DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
3647DO_ZPZZ_FP(sve_fmulx_d, uint64_t, , helper_vfp_mulxd)
3648
3649#undef DO_ZPZZ_FP
3650
cc48affe
RH
3651/* Three-operand expander, with one scalar operand, controlled by
3652 * a predicate, with the extra float_status parameter.
3653 */
3654#define DO_ZPZS_FP(NAME, TYPE, H, OP) \
3655void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
3656 void *status, uint32_t desc) \
3657{ \
3658 intptr_t i = simd_oprsz(desc); \
3659 uint64_t *g = vg; \
3660 TYPE mm = scalar; \
3661 do { \
3662 uint64_t pg = g[(i - 1) >> 6]; \
3663 do { \
3664 i -= sizeof(TYPE); \
3665 if (likely((pg >> (i & 63)) & 1)) { \
3666 TYPE nn = *(TYPE *)(vn + H(i)); \
3667 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
3668 } \
3669 } while (i & 63); \
3670 } while (i != 0); \
3671}
3672
3673DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
3674DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
3675DO_ZPZS_FP(sve_fadds_d, float64, , float64_add)
3676
3677DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
3678DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
3679DO_ZPZS_FP(sve_fsubs_d, float64, , float64_sub)
3680
3681DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
3682DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
3683DO_ZPZS_FP(sve_fmuls_d, float64, , float64_mul)
3684
3685static inline float16 subr_h(float16 a, float16 b, float_status *s)
3686{
3687 return float16_sub(b, a, s);
3688}
3689
3690static inline float32 subr_s(float32 a, float32 b, float_status *s)
3691{
3692 return float32_sub(b, a, s);
3693}
3694
3695static inline float64 subr_d(float64 a, float64 b, float_status *s)
3696{
3697 return float64_sub(b, a, s);
3698}
3699
3700DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
3701DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
3702DO_ZPZS_FP(sve_fsubrs_d, float64, , subr_d)
3703
3704DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
3705DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
3706DO_ZPZS_FP(sve_fmaxnms_d, float64, , float64_maxnum)
3707
3708DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
3709DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
3710DO_ZPZS_FP(sve_fminnms_d, float64, , float64_minnum)
3711
3712DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
3713DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
3714DO_ZPZS_FP(sve_fmaxs_d, float64, , float64_max)
3715
3716DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
3717DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
3718DO_ZPZS_FP(sve_fmins_d, float64, , float64_min)
3719
8092c6a3
RH
3720/* Fully general two-operand expander, controlled by a predicate,
3721 * with the extra float_status parameter.
3722 */
3723#define DO_ZPZ_FP(NAME, TYPE, H, OP) \
3724void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
3725{ \
3726 intptr_t i = simd_oprsz(desc); \
3727 uint64_t *g = vg; \
3728 do { \
3729 uint64_t pg = g[(i - 1) >> 6]; \
3730 do { \
3731 i -= sizeof(TYPE); \
3732 if (likely((pg >> (i & 63)) & 1)) { \
3733 TYPE nn = *(TYPE *)(vn + H(i)); \
3734 *(TYPE *)(vd + H(i)) = OP(nn, status); \
3735 } \
3736 } while (i & 63); \
3737 } while (i != 0); \
3738}
3739
46d33d1e
RH
3740/* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
3741 * FZ16. When converting from fp16, this affects flushing input denormals;
3742 * when converting to fp16, this affects flushing output denormals.
3743 */
3744static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
3745{
c120391c 3746 bool save = get_flush_inputs_to_zero(fpst);
46d33d1e
RH
3747 float32 ret;
3748
3749 set_flush_inputs_to_zero(false, fpst);
3750 ret = float16_to_float32(f, true, fpst);
3751 set_flush_inputs_to_zero(save, fpst);
3752 return ret;
3753}
3754
3755static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
3756{
c120391c 3757 bool save = get_flush_inputs_to_zero(fpst);
46d33d1e
RH
3758 float64 ret;
3759
3760 set_flush_inputs_to_zero(false, fpst);
3761 ret = float16_to_float64(f, true, fpst);
3762 set_flush_inputs_to_zero(save, fpst);
3763 return ret;
3764}
3765
3766static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
3767{
c120391c 3768 bool save = get_flush_to_zero(fpst);
46d33d1e
RH
3769 float16 ret;
3770
3771 set_flush_to_zero(false, fpst);
3772 ret = float32_to_float16(f, true, fpst);
3773 set_flush_to_zero(save, fpst);
3774 return ret;
3775}
3776
3777static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
3778{
c120391c 3779 bool save = get_flush_to_zero(fpst);
46d33d1e
RH
3780 float16 ret;
3781
3782 set_flush_to_zero(false, fpst);
3783 ret = float64_to_float16(f, true, fpst);
3784 set_flush_to_zero(save, fpst);
3785 return ret;
3786}
3787
df4de1af
RH
3788static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
3789{
3790 if (float16_is_any_nan(f)) {
3791 float_raise(float_flag_invalid, s);
3792 return 0;
3793 }
3794 return float16_to_int16_round_to_zero(f, s);
3795}
3796
3797static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
3798{
3799 if (float16_is_any_nan(f)) {
3800 float_raise(float_flag_invalid, s);
3801 return 0;
3802 }
3803 return float16_to_int64_round_to_zero(f, s);
3804}
3805
3806static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
3807{
3808 if (float32_is_any_nan(f)) {
3809 float_raise(float_flag_invalid, s);
3810 return 0;
3811 }
3812 return float32_to_int64_round_to_zero(f, s);
3813}
3814
3815static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
3816{
3817 if (float64_is_any_nan(f)) {
3818 float_raise(float_flag_invalid, s);
3819 return 0;
3820 }
3821 return float64_to_int64_round_to_zero(f, s);
3822}
3823
3824static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
3825{
3826 if (float16_is_any_nan(f)) {
3827 float_raise(float_flag_invalid, s);
3828 return 0;
3829 }
3830 return float16_to_uint16_round_to_zero(f, s);
3831}
3832
3833static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
3834{
3835 if (float16_is_any_nan(f)) {
3836 float_raise(float_flag_invalid, s);
3837 return 0;
3838 }
3839 return float16_to_uint64_round_to_zero(f, s);
3840}
3841
3842static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
3843{
3844 if (float32_is_any_nan(f)) {
3845 float_raise(float_flag_invalid, s);
3846 return 0;
3847 }
3848 return float32_to_uint64_round_to_zero(f, s);
3849}
3850
3851static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
3852{
3853 if (float64_is_any_nan(f)) {
3854 float_raise(float_flag_invalid, s);
3855 return 0;
3856 }
3857 return float64_to_uint64_round_to_zero(f, s);
3858}
3859
46d33d1e
RH
3860DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
3861DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
3862DO_ZPZ_FP(sve_fcvt_dh, uint64_t, , sve_f64_to_f16)
3863DO_ZPZ_FP(sve_fcvt_hd, uint64_t, , sve_f16_to_f64)
3864DO_ZPZ_FP(sve_fcvt_ds, uint64_t, , float64_to_float32)
3865DO_ZPZ_FP(sve_fcvt_sd, uint64_t, , float32_to_float64)
3866
df4de1af
RH
3867DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
3868DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
3869DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
3870DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, , vfp_float16_to_int64_rtz)
3871DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, , vfp_float32_to_int64_rtz)
3872DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, , helper_vfp_tosizd)
3873DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, , vfp_float64_to_int64_rtz)
3874
3875DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
3876DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
3877DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
3878DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, , vfp_float16_to_uint64_rtz)
3879DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, , vfp_float32_to_uint64_rtz)
3880DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, , helper_vfp_touizd)
3881DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, , vfp_float64_to_uint64_rtz)
3882
cda3c753
RH
3883DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
3884DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
3885DO_ZPZ_FP(sve_frint_d, uint64_t, , helper_rintd)
3886
3887DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
3888DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
3889DO_ZPZ_FP(sve_frintx_d, uint64_t, , float64_round_to_int)
3890
ec5b375b
RH
3891DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
3892DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
3893DO_ZPZ_FP(sve_frecpx_d, uint64_t, , helper_frecpx_f64)
3894
3895DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
3896DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
3897DO_ZPZ_FP(sve_fsqrt_d, uint64_t, , float64_sqrt)
3898
8092c6a3
RH
3899DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
3900DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
3901DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
3902DO_ZPZ_FP(sve_scvt_sd, uint64_t, , int32_to_float64)
3903DO_ZPZ_FP(sve_scvt_dh, uint64_t, , int64_to_float16)
3904DO_ZPZ_FP(sve_scvt_ds, uint64_t, , int64_to_float32)
3905DO_ZPZ_FP(sve_scvt_dd, uint64_t, , int64_to_float64)
3906
3907DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
3908DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
3909DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
3910DO_ZPZ_FP(sve_ucvt_sd, uint64_t, , uint32_to_float64)
3911DO_ZPZ_FP(sve_ucvt_dh, uint64_t, , uint64_to_float16)
3912DO_ZPZ_FP(sve_ucvt_ds, uint64_t, , uint64_to_float32)
3913DO_ZPZ_FP(sve_ucvt_dd, uint64_t, , uint64_to_float64)
3914
3915#undef DO_ZPZ_FP
3916
08975da9
RH
3917static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
3918 float_status *status, uint32_t desc,
6ceabaad
RH
3919 uint16_t neg1, uint16_t neg3)
3920{
3921 intptr_t i = simd_oprsz(desc);
6ceabaad
RH
3922 uint64_t *g = vg;
3923
3924 do {
3925 uint64_t pg = g[(i - 1) >> 6];
3926 do {
3927 i -= 2;
3928 if (likely((pg >> (i & 63)) & 1)) {
3929 float16 e1, e2, e3, r;
3930
3931 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
3932 e2 = *(uint16_t *)(vm + H1_2(i));
3933 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
08975da9 3934 r = float16_muladd(e1, e2, e3, 0, status);
6ceabaad
RH
3935 *(uint16_t *)(vd + H1_2(i)) = r;
3936 }
3937 } while (i & 63);
3938 } while (i != 0);
3939}
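/*
 * The neg1/neg3 arguments simply flip the sign bit of the first operand
 * and of the addend before the fused multiply-add, giving the four
 * variants below:
 *   fmla:  fma( n, m,  a)    neg1 = 0,      neg3 = 0
 *   fmls:  fma(-n, m,  a)    neg1 = 0x8000, neg3 = 0
 *   fnmla: fma(-n, m, -a)    neg1 = 0x8000, neg3 = 0x8000
 *   fnmls: fma( n, m, -a)    neg1 = 0,      neg3 = 0x8000
 */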
3940
08975da9
RH
3941void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3942 void *vg, void *status, uint32_t desc)
6ceabaad 3943{
08975da9 3944 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
6ceabaad
RH
3945}
3946
08975da9
RH
3947void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3948 void *vg, void *status, uint32_t desc)
6ceabaad 3949{
08975da9 3950 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
6ceabaad
RH
3951}
3952
08975da9
RH
3953void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3954 void *vg, void *status, uint32_t desc)
6ceabaad 3955{
08975da9 3956 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
6ceabaad
RH
3957}
3958
08975da9
RH
3959void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
3960 void *vg, void *status, uint32_t desc)
6ceabaad 3961{
08975da9 3962 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
6ceabaad
RH
3963}
3964
08975da9
RH
3965static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
3966 float_status *status, uint32_t desc,
6ceabaad
RH
3967 uint32_t neg1, uint32_t neg3)
3968{
3969 intptr_t i = simd_oprsz(desc);
6ceabaad
RH
3970 uint64_t *g = vg;
3971
3972 do {
3973 uint64_t pg = g[(i - 1) >> 6];
3974 do {
3975 i -= 4;
3976 if (likely((pg >> (i & 63)) & 1)) {
3977 float32 e1, e2, e3, r;
3978
3979 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
3980 e2 = *(uint32_t *)(vm + H1_4(i));
3981 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
08975da9 3982 r = float32_muladd(e1, e2, e3, 0, status);
6ceabaad
RH
3983 *(uint32_t *)(vd + H1_4(i)) = r;
3984 }
3985 } while (i & 63);
3986 } while (i != 0);
3987}
3988
08975da9
RH
3989void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3990 void *vg, void *status, uint32_t desc)
6ceabaad 3991{
08975da9 3992 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
6ceabaad
RH
3993}
3994
08975da9
RH
3995void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
3996 void *vg, void *status, uint32_t desc)
6ceabaad 3997{
08975da9 3998 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
6ceabaad
RH
3999}
4000
08975da9
RH
4001void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4002 void *vg, void *status, uint32_t desc)
6ceabaad 4003{
08975da9 4004 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
6ceabaad
RH
4005}
4006
08975da9
RH
4007void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4008 void *vg, void *status, uint32_t desc)
6ceabaad 4009{
08975da9 4010 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
6ceabaad
RH
4011}
4012
08975da9
RH
4013static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
4014 float_status *status, uint32_t desc,
6ceabaad
RH
4015 uint64_t neg1, uint64_t neg3)
4016{
4017 intptr_t i = simd_oprsz(desc);
6ceabaad
RH
4018 uint64_t *g = vg;
4019
4020 do {
4021 uint64_t pg = g[(i - 1) >> 6];
4022 do {
4023 i -= 8;
4024 if (likely((pg >> (i & 63)) & 1)) {
4025 float64 e1, e2, e3, r;
4026
4027 e1 = *(uint64_t *)(vn + i) ^ neg1;
4028 e2 = *(uint64_t *)(vm + i);
4029 e3 = *(uint64_t *)(va + i) ^ neg3;
08975da9 4030 r = float64_muladd(e1, e2, e3, 0, status);
6ceabaad
RH
4031 *(uint64_t *)(vd + i) = r;
4032 }
4033 } while (i & 63);
4034 } while (i != 0);
4035}
4036
08975da9
RH
4037void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4038 void *vg, void *status, uint32_t desc)
6ceabaad 4039{
08975da9 4040 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
6ceabaad
RH
4041}
4042
08975da9
RH
4043void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4044 void *vg, void *status, uint32_t desc)
6ceabaad 4045{
08975da9 4046 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
6ceabaad
RH
4047}
4048
08975da9
RH
4049void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4050 void *vg, void *status, uint32_t desc)
6ceabaad 4051{
08975da9 4052 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
6ceabaad
RH
4053}
4054
08975da9
RH
4055void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4056 void *vg, void *status, uint32_t desc)
6ceabaad 4057{
08975da9 4058 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
6ceabaad
RH
4059}
4060
abfdefd5
RH
4061/* Two operand floating-point comparison controlled by a predicate.
4062 * Unlike the integer version, we are not allowed to optimistically
4063 * compare operands, since the comparison may have side effects wrt
4064 * the FPSR.
4065 */
4066#define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
4067void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
4068 void *status, uint32_t desc) \
4069{ \
4070 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
4071 uint64_t *d = vd, *g = vg; \
4072 do { \
4073 uint64_t out = 0, pg = g[j]; \
4074 do { \
4075 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
4076 if (likely((pg >> (i & 63)) & 1)) { \
4077 TYPE nn = *(TYPE *)(vn + H(i)); \
4078 TYPE mm = *(TYPE *)(vm + H(i)); \
4079 out |= OP(TYPE, nn, mm, status); \
4080 } \
4081 } while (i & 63); \
4082 d[j--] = out; \
4083 } while (i > 0); \
4084}
4085
4086#define DO_FPCMP_PPZZ_H(NAME, OP) \
4087 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
4088#define DO_FPCMP_PPZZ_S(NAME, OP) \
4089 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
4090#define DO_FPCMP_PPZZ_D(NAME, OP) \
4091 DO_FPCMP_PPZZ(NAME##_d, float64, , OP)
4092
4093#define DO_FPCMP_PPZZ_ALL(NAME, OP) \
4094 DO_FPCMP_PPZZ_H(NAME, OP) \
4095 DO_FPCMP_PPZZ_S(NAME, OP) \
4096 DO_FPCMP_PPZZ_D(NAME, OP)
4097
4098#define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
4099#define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
4d2e2a03
RH
4100#define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
4101#define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
abfdefd5
RH
4102#define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
4103#define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
4104#define DO_FCMUO(TYPE, X, Y, ST) \
4105 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
4106#define DO_FACGE(TYPE, X, Y, ST) \
4107 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
4108#define DO_FACGT(TYPE, X, Y, ST) \
4109 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
4110
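/*
 * The GE/GT macros compare (Y, X) and test <= 0 / < 0 so that an
 * unordered result (float_relation_unordered == 2) naturally yields
 * false without an explicit check, while the signalling compare still
 * raises Invalid for any NaN input.  EQ/NE/UO use the quiet compare,
 * which only raises Invalid for signalling NaNs.
 */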
4111DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
4112DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
4113DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
4114DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
4115DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
4116DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
4117DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
4118
4119#undef DO_FPCMP_PPZZ_ALL
4120#undef DO_FPCMP_PPZZ_D
4121#undef DO_FPCMP_PPZZ_S
4122#undef DO_FPCMP_PPZZ_H
4123#undef DO_FPCMP_PPZZ
4124
4d2e2a03
RH
4125/* One operand floating-point comparison against zero, controlled
4126 * by a predicate.
4127 */
4128#define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
4129void HELPER(NAME)(void *vd, void *vn, void *vg, \
4130 void *status, uint32_t desc) \
4131{ \
4132 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
4133 uint64_t *d = vd, *g = vg; \
4134 do { \
4135 uint64_t out = 0, pg = g[j]; \
4136 do { \
4137 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
4138 if ((pg >> (i & 63)) & 1) { \
4139 TYPE nn = *(TYPE *)(vn + H(i)); \
4140 out |= OP(TYPE, nn, 0, status); \
4141 } \
4142 } while (i & 63); \
4143 d[j--] = out; \
4144 } while (i > 0); \
4145}
4146
4147#define DO_FPCMP_PPZ0_H(NAME, OP) \
4148 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
4149#define DO_FPCMP_PPZ0_S(NAME, OP) \
4150 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
4151#define DO_FPCMP_PPZ0_D(NAME, OP) \
4152 DO_FPCMP_PPZ0(NAME##_d, float64, , OP)
4153
4154#define DO_FPCMP_PPZ0_ALL(NAME, OP) \
4155 DO_FPCMP_PPZ0_H(NAME, OP) \
4156 DO_FPCMP_PPZ0_S(NAME, OP) \
4157 DO_FPCMP_PPZ0_D(NAME, OP)
4158
4159DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
4160DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
4161DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
4162DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
4163DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
4164DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
4165
67fcd9ad
RH
4166/* FP Trig Multiply-Add. */
4167
4168void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
4169{
4170 static const float16 coeff[16] = {
4171 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4172 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4173 };
4174 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
4175 intptr_t x = simd_data(desc);
4176 float16 *d = vd, *n = vn, *m = vm;
4177 for (i = 0; i < opr_sz; i++) {
4178 float16 mm = m[i];
4179 intptr_t xx = x;
4180 if (float16_is_neg(mm)) {
4181 mm = float16_abs(mm);
4182 xx += 8;
4183 }
4184 d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
4185 }
4186}
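/*
 * Each element computes fma(n, |m|, coeff[x + (m < 0 ? 8 : 0)]): the
 * immediate x selects a coefficient and a negative Zm element switches
 * to the second half of the table.  The two halves appear to hold the
 * polynomial coefficients of the sine and cosine series respectively,
 * which is how a sequence of FTMAD instructions builds sin/cos
 * approximations.
 */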
4187
4188void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
4189{
4190 static const float32 coeff[16] = {
4191 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
4192 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
4193 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
4194 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
4195 };
4196 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
4197 intptr_t x = simd_data(desc);
4198 float32 *d = vd, *n = vn, *m = vm;
4199 for (i = 0; i < opr_sz; i++) {
4200 float32 mm = m[i];
4201 intptr_t xx = x;
4202 if (float32_is_neg(mm)) {
4203 mm = float32_abs(mm);
4204 xx += 8;
4205 }
4206 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
4207 }
4208}
4209
4210void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
4211{
4212 static const float64 coeff[16] = {
4213 0x3ff0000000000000ull, 0xbfc5555555555543ull,
4214 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
4215 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
4216 0x3de5d8408868552full, 0x0000000000000000ull,
4217 0x3ff0000000000000ull, 0xbfe0000000000000ull,
4218 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
4219 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
4220 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
4221 };
4222 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
4223 intptr_t x = simd_data(desc);
4224 float64 *d = vd, *n = vn, *m = vm;
4225 for (i = 0; i < opr_sz; i++) {
4226 float64 mm = m[i];
4227 intptr_t xx = x;
4228 if (float64_is_neg(mm)) {
4229 mm = float64_abs(mm);
4230 xx += 8;
4231 }
4232 d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
4233 }
4234}
4235
76a9d9cd
RH
4236/*
4237 * FP Complex Add
4238 */
4239
4240void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
4241 void *vs, uint32_t desc)
4242{
4243 intptr_t j, i = simd_oprsz(desc);
4244 uint64_t *g = vg;
4245 float16 neg_imag = float16_set_sign(0, simd_data(desc));
4246 float16 neg_real = float16_chs(neg_imag);
4247
4248 do {
4249 uint64_t pg = g[(i - 1) >> 6];
4250 do {
4251 float16 e0, e1, e2, e3;
4252
4253 /* I holds the real index; J holds the imag index. */
4254 j = i - sizeof(float16);
4255 i -= 2 * sizeof(float16);
4256
4257 e0 = *(float16 *)(vn + H1_2(i));
4258 e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
4259 e2 = *(float16 *)(vn + H1_2(j));
4260 e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
4261
4262 if (likely((pg >> (i & 63)) & 1)) {
4263 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
4264 }
4265 if (likely((pg >> (j & 63)) & 1)) {
4266 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
4267 }
4268 } while (i & 63);
4269 } while (i != 0);
4270}
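/*
 * Only the sign constants depend on the rotation.  With simd_data == 0
 * the imaginary element of Zm is negated before being added into the
 * real lane (d.re = n.re - m.im, d.im = n.im + m.re, the #90 rotation);
 * with simd_data == 1 the real element of Zm is negated instead
 * (d.re = n.re + m.im, d.im = n.im - m.re, the #270 rotation).
 */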
4271
4272void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
4273 void *vs, uint32_t desc)
4274{
4275 intptr_t j, i = simd_oprsz(desc);
4276 uint64_t *g = vg;
4277 float32 neg_imag = float32_set_sign(0, simd_data(desc));
4278 float32 neg_real = float32_chs(neg_imag);
4279
4280 do {
4281 uint64_t pg = g[(i - 1) >> 6];
4282 do {
4283 float32 e0, e1, e2, e3;
4284
4285 /* I holds the real index; J holds the imag index. */
4286 j = i - sizeof(float32);
4287 i -= 2 * sizeof(float32);
4288
4289 e0 = *(float32 *)(vn + H1_2(i));
4290 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
4291 e2 = *(float32 *)(vn + H1_2(j));
4292 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
4293
4294 if (likely((pg >> (i & 63)) & 1)) {
4295 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
4296 }
4297 if (likely((pg >> (j & 63)) & 1)) {
4298 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
4299 }
4300 } while (i & 63);
4301 } while (i != 0);
4302}
4303
4304void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
4305 void *vs, uint32_t desc)
4306{
4307 intptr_t j, i = simd_oprsz(desc);
4308 uint64_t *g = vg;
4309 float64 neg_imag = float64_set_sign(0, simd_data(desc));
4310 float64 neg_real = float64_chs(neg_imag);
4311
4312 do {
4313 uint64_t pg = g[(i - 1) >> 6];
4314 do {
4315 float64 e0, e1, e2, e3;
4316
4317 /* I holds the real index; J holds the imag index. */
4318 j = i - sizeof(float64);
4319 i -= 2 * sizeof(float64);
4320
4321 e0 = *(float64 *)(vn + H1_2(i));
4322 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
4323 e2 = *(float64 *)(vn + H1_2(j));
4324 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
4325
4326 if (likely((pg >> (i & 63)) & 1)) {
4327 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
4328 }
4329 if (likely((pg >> (j & 63)) & 1)) {
4330 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
4331 }
4332 } while (i & 63);
4333 } while (i != 0);
4334}
4335
05f48bab
RH
4336/*
4337 * FP Complex Multiply
4338 */
4339
08975da9
RH
4340void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4341 void *vg, void *status, uint32_t desc)
05f48bab
RH
4342{
4343 intptr_t j, i = simd_oprsz(desc);
08975da9 4344 unsigned rot = simd_data(desc);
05f48bab
RH
4345 bool flip = rot & 1;
4346 float16 neg_imag, neg_real;
05f48bab
RH
4347 uint64_t *g = vg;
4348
4349 neg_imag = float16_set_sign(0, (rot & 2) != 0);
4350 neg_real = float16_set_sign(0, rot == 1 || rot == 2);
4351
4352 do {
4353 uint64_t pg = g[(i - 1) >> 6];
4354 do {
4355 float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
4356
4357 /* I holds the real index; J holds the imag index. */
4358 j = i - sizeof(float16);
4359 i -= 2 * sizeof(float16);
4360
4361 nr = *(float16 *)(vn + H1_2(i));
4362 ni = *(float16 *)(vn + H1_2(j));
4363 mr = *(float16 *)(vm + H1_2(i));
4364 mi = *(float16 *)(vm + H1_2(j));
4365
4366 e2 = (flip ? ni : nr);
4367 e1 = (flip ? mi : mr) ^ neg_real;
4368 e4 = e2;
4369 e3 = (flip ? mr : mi) ^ neg_imag;
4370
4371 if (likely((pg >> (i & 63)) & 1)) {
4372 d = *(float16 *)(va + H1_2(i));
08975da9 4373 d = float16_muladd(e2, e1, d, 0, status);
05f48bab
RH
4374 *(float16 *)(vd + H1_2(i)) = d;
4375 }
4376 if (likely((pg >> (j & 63)) & 1)) {
4377 d = *(float16 *)(va + H1_2(j));
08975da9 4378 d = float16_muladd(e4, e3, d, 0, status);
05f48bab
RH
4379 *(float16 *)(vd + H1_2(j)) = d;
4380 }
4381 } while (i & 63);
4382 } while (i != 0);
4383}
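/*
 * Reading off e1..e4 above for the four rotations:
 *   rot 0:  d.re += n.re * m.re;  d.im += n.re * m.im
 *   rot 1:  d.re -= n.im * m.im;  d.im += n.im * m.re
 *   rot 2:  d.re -= n.re * m.re;  d.im -= n.re * m.im
 *   rot 3:  d.re += n.im * m.im;  d.im -= n.im * m.re
 * so the usual rot 0 + rot 1 instruction pair accumulates a full complex
 * multiply, d += n * m.
 */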
4384
08975da9
RH
4385void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4386 void *vg, void *status, uint32_t desc)
05f48bab
RH
4387{
4388 intptr_t j, i = simd_oprsz(desc);
08975da9 4389 unsigned rot = simd_data(desc);
05f48bab
RH
4390 bool flip = rot & 1;
4391 float32 neg_imag, neg_real;
05f48bab
RH
4392 uint64_t *g = vg;
4393
4394 neg_imag = float32_set_sign(0, (rot & 2) != 0);
4395 neg_real = float32_set_sign(0, rot == 1 || rot == 2);
4396
4397 do {
4398 uint64_t pg = g[(i - 1) >> 6];
4399 do {
4400 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
4401
4402 /* I holds the real index; J holds the imag index. */
4403 j = i - sizeof(float32);
4404 i -= 2 * sizeof(float32);
4405
4406 nr = *(float32 *)(vn + H1_2(i));
4407 ni = *(float32 *)(vn + H1_2(j));
4408 mr = *(float32 *)(vm + H1_2(i));
4409 mi = *(float32 *)(vm + H1_2(j));
4410
4411 e2 = (flip ? ni : nr);
4412 e1 = (flip ? mi : mr) ^ neg_real;
4413 e4 = e2;
4414 e3 = (flip ? mr : mi) ^ neg_imag;
4415
4416 if (likely((pg >> (i & 63)) & 1)) {
4417 d = *(float32 *)(va + H1_2(i));
08975da9 4418 d = float32_muladd(e2, e1, d, 0, status);
05f48bab
RH
4419 *(float32 *)(vd + H1_2(i)) = d;
4420 }
4421 if (likely((pg >> (j & 63)) & 1)) {
4422 d = *(float32 *)(va + H1_2(j));
08975da9 4423 d = float32_muladd(e4, e3, d, 0, status);
05f48bab
RH
4424 *(float32 *)(vd + H1_2(j)) = d;
4425 }
4426 } while (i & 63);
4427 } while (i != 0);
4428}
4429
08975da9
RH
4430void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4431 void *vg, void *status, uint32_t desc)
05f48bab
RH
4432{
4433 intptr_t j, i = simd_oprsz(desc);
08975da9 4434 unsigned rot = simd_data(desc);
05f48bab
RH
4435 bool flip = rot & 1;
4436 float64 neg_imag, neg_real;
05f48bab
RH
4437 uint64_t *g = vg;
4438
4439 neg_imag = float64_set_sign(0, (rot & 2) != 0);
4440 neg_real = float64_set_sign(0, rot == 1 || rot == 2);
4441
4442 do {
4443 uint64_t pg = g[(i - 1) >> 6];
4444 do {
4445 float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
4446
4447 /* I holds the real index; J holds the imag index. */
4448 j = i - sizeof(float64);
4449 i -= 2 * sizeof(float64);
4450
4451 nr = *(float64 *)(vn + H1_2(i));
4452 ni = *(float64 *)(vn + H1_2(j));
4453 mr = *(float64 *)(vm + H1_2(i));
4454 mi = *(float64 *)(vm + H1_2(j));
4455
4456 e2 = (flip ? ni : nr);
4457 e1 = (flip ? mi : mr) ^ neg_real;
4458 e4 = e2;
4459 e3 = (flip ? mr : mi) ^ neg_imag;
4460
4461 if (likely((pg >> (i & 63)) & 1)) {
4462 d = *(float64 *)(va + H1_2(i));
08975da9 4463 d = float64_muladd(e2, e1, d, 0, status);
05f48bab
RH
4464 *(float64 *)(vd + H1_2(i)) = d;
4465 }
4466 if (likely((pg >> (j & 63)) & 1)) {
4467 d = *(float64 *)(va + H1_2(j));
08975da9 4468 d = float64_muladd(e4, e3, d, 0, status);
05f48bab
RH
4469 *(float64 *)(vd + H1_2(j)) = d;
4470 }
4471 } while (i & 63);
4472 } while (i != 0);
4473}
4474
c4e7c493
RH
4475/*
4476 * Load contiguous data, protected by a governing predicate.
4477 */
9123aeb6
RH
4478
4479/*
cf4a49b7
RH
4480 * Load one element into @vd + @reg_off from @host.
4481 * The controlling predicate is known to be true.
9123aeb6 4482 */
cf4a49b7 4483typedef void sve_ldst1_host_fn(void *vd, intptr_t reg_off, void *host);
9123aeb6
RH
4484
4485/*
4486 * Load one element into @vd + @reg_off from (@env, @vaddr, @ra).
4487 * The controlling predicate is known to be true.
4488 */
6799ce7b
RH
4489typedef void sve_ldst1_tlb_fn(CPUARMState *env, void *vd, intptr_t reg_off,
4490 target_ulong vaddr, uintptr_t retaddr);
9123aeb6
RH
4491
4492/*
4493 * Generate the above primitives.
4494 */
4495
4496#define DO_LD_HOST(NAME, H, TYPEE, TYPEM, HOST) \
cf4a49b7
RH
4497static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
4498{ \
4499 TYPEM val = HOST(host); \
4500 *(TYPEE *)(vd + H(reg_off)) = val; \
9123aeb6
RH
4501}
4502
0fa476c1
RH
4503#define DO_ST_HOST(NAME, H, TYPEE, TYPEM, HOST) \
4504static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
4505{ HOST(host, (TYPEM)*(TYPEE *)(vd + H(reg_off))); }
4506
6799ce7b 4507#define DO_LD_TLB(NAME, H, TYPEE, TYPEM, TLB) \
9123aeb6 4508static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
6799ce7b 4509 target_ulong addr, uintptr_t ra) \
9123aeb6 4510{ \
c4af8ba1
RH
4511 *(TYPEE *)(vd + H(reg_off)) = \
4512 (TYPEM)TLB(env, useronly_clean_ptr(addr), ra); \
9123aeb6 4513}
6799ce7b
RH
4514
4515#define DO_ST_TLB(NAME, H, TYPEE, TYPEM, TLB) \
9123aeb6 4516static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
6799ce7b 4517 target_ulong addr, uintptr_t ra) \
9123aeb6 4518{ \
c4af8ba1
RH
4519 TLB(env, useronly_clean_ptr(addr), \
4520 (TYPEM)*(TYPEE *)(vd + H(reg_off)), ra); \
9123aeb6 4521}
9123aeb6
RH
4522
4523#define DO_LD_PRIM_1(NAME, H, TE, TM) \
4524 DO_LD_HOST(NAME, H, TE, TM, ldub_p) \
6799ce7b 4525 DO_LD_TLB(NAME, H, TE, TM, cpu_ldub_data_ra)
9123aeb6
RH
4526
4527DO_LD_PRIM_1(ld1bb, H1, uint8_t, uint8_t)
4528DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t)
4529DO_LD_PRIM_1(ld1bhs, H1_2, uint16_t, int8_t)
4530DO_LD_PRIM_1(ld1bsu, H1_4, uint32_t, uint8_t)
4531DO_LD_PRIM_1(ld1bss, H1_4, uint32_t, int8_t)
4532DO_LD_PRIM_1(ld1bdu, , uint64_t, uint8_t)
4533DO_LD_PRIM_1(ld1bds, , uint64_t, int8_t)
4534
6799ce7b 4535#define DO_ST_PRIM_1(NAME, H, TE, TM) \
0fa476c1 4536 DO_ST_HOST(st1##NAME, H, TE, TM, stb_p) \
6799ce7b
RH
4537 DO_ST_TLB(st1##NAME, H, TE, TM, cpu_stb_data_ra)
4538
4539DO_ST_PRIM_1(bb, H1, uint8_t, uint8_t)
4540DO_ST_PRIM_1(bh, H1_2, uint16_t, uint8_t)
4541DO_ST_PRIM_1(bs, H1_4, uint32_t, uint8_t)
4542DO_ST_PRIM_1(bd, , uint64_t, uint8_t)
9123aeb6 4543
6799ce7b
RH
4544#define DO_LD_PRIM_2(NAME, H, TE, TM, LD) \
4545 DO_LD_HOST(ld1##NAME##_be, H, TE, TM, LD##_be_p) \
4546 DO_LD_HOST(ld1##NAME##_le, H, TE, TM, LD##_le_p) \
4547 DO_LD_TLB(ld1##NAME##_be, H, TE, TM, cpu_##LD##_be_data_ra) \
4548 DO_LD_TLB(ld1##NAME##_le, H, TE, TM, cpu_##LD##_le_data_ra)
9123aeb6 4549
6799ce7b 4550#define DO_ST_PRIM_2(NAME, H, TE, TM, ST) \
0fa476c1
RH
4551 DO_ST_HOST(st1##NAME##_be, H, TE, TM, ST##_be_p) \
4552 DO_ST_HOST(st1##NAME##_le, H, TE, TM, ST##_le_p) \
6799ce7b
RH
4553 DO_ST_TLB(st1##NAME##_be, H, TE, TM, cpu_##ST##_be_data_ra) \
4554 DO_ST_TLB(st1##NAME##_le, H, TE, TM, cpu_##ST##_le_data_ra)
9123aeb6 4555
6799ce7b
RH
4556DO_LD_PRIM_2(hh, H1_2, uint16_t, uint16_t, lduw)
4557DO_LD_PRIM_2(hsu, H1_4, uint32_t, uint16_t, lduw)
4558DO_LD_PRIM_2(hss, H1_4, uint32_t, int16_t, lduw)
4559DO_LD_PRIM_2(hdu, , uint64_t, uint16_t, lduw)
4560DO_LD_PRIM_2(hds, , uint64_t, int16_t, lduw)
9123aeb6 4561
6799ce7b
RH
4562DO_ST_PRIM_2(hh, H1_2, uint16_t, uint16_t, stw)
4563DO_ST_PRIM_2(hs, H1_4, uint32_t, uint16_t, stw)
4564DO_ST_PRIM_2(hd, , uint64_t, uint16_t, stw)
9123aeb6 4565
6799ce7b
RH
4566DO_LD_PRIM_2(ss, H1_4, uint32_t, uint32_t, ldl)
4567DO_LD_PRIM_2(sdu, , uint64_t, uint32_t, ldl)
4568DO_LD_PRIM_2(sds, , uint64_t, int32_t, ldl)
9123aeb6 4569
6799ce7b
RH
4570DO_ST_PRIM_2(ss, H1_4, uint32_t, uint32_t, stl)
4571DO_ST_PRIM_2(sd, , uint64_t, uint32_t, stl)
4572
4573DO_LD_PRIM_2(dd, , uint64_t, uint64_t, ldq)
4574DO_ST_PRIM_2(dd, , uint64_t, uint64_t, stq)
9123aeb6
RH
4575
4576#undef DO_LD_TLB
6799ce7b 4577#undef DO_ST_TLB
9123aeb6
RH
4578#undef DO_LD_HOST
4579#undef DO_LD_PRIM_1
6799ce7b 4580#undef DO_ST_PRIM_1
9123aeb6 4581#undef DO_LD_PRIM_2
6799ce7b 4582#undef DO_ST_PRIM_2
9123aeb6
RH
4583
4584/*
4585 * Skip through a sequence of inactive elements in the guarding predicate @vg,
4586 * beginning at @reg_off bounded by @reg_max. Return the offset of the active
4587 * element >= @reg_off, or @reg_max if there were no active elements at all.
4588 */
4589static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
4590 intptr_t reg_max, int esz)
4591{
4592 uint64_t pg_mask = pred_esz_masks[esz];
4593 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
4594
4595 /* In normal usage, the first element is active. */
4596 if (likely(pg & 1)) {
4597 return reg_off;
4598 }
4599
4600 if (pg == 0) {
4601 reg_off &= -64;
4602 do {
4603 reg_off += 64;
4604 if (unlikely(reg_off >= reg_max)) {
4605 /* The entire predicate was false. */
4606 return reg_max;
4607 }
4608 pg = vg[reg_off >> 6] & pg_mask;
4609 } while (pg == 0);
4610 }
4611 reg_off += ctz64(pg);
4612
4613 /* We should never see an out of range predicate bit set. */
4614 tcg_debug_assert(reg_off < reg_max);
4615 return reg_off;
4616}
4617
b4cd95d2
RH
4618/*
4619 * Resolve the guest virtual address to info->host and info->flags.
4620 * If @nofault, return false if the page is invalid, otherwise
4621 * exit via page fault exception.
4622 */
4623
4624typedef struct {
4625 void *host;
4626 int flags;
4627 MemTxAttrs attrs;
4628} SVEHostPage;
4629
4630static bool sve_probe_page(SVEHostPage *info, bool nofault,
4631 CPUARMState *env, target_ulong addr,
4632 int mem_off, MMUAccessType access_type,
4633 int mmu_idx, uintptr_t retaddr)
4634{
4635 int flags;
4636
4637 addr += mem_off;
c4af8ba1
RH
4638
4639 /*
4640 * User-only currently always issues with TBI. See the comment
4641 * above useronly_clean_ptr. Usually we clean this top byte away
4642 * during translation, but we can't do that for e.g. vector + imm
4643 * addressing modes.
4644 *
4645 * We currently always enable TBI for user-only, and do not provide
4646 * a way to turn it off. So clean the pointer unconditionally here,
4647 * rather than look it up here, or pass it down from above.
4648 */
4649 addr = useronly_clean_ptr(addr);
4650
b4cd95d2
RH
4651 flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault,
4652 &info->host, retaddr);
4653 info->flags = flags;
4654
4655 if (flags & TLB_INVALID_MASK) {
4656 g_assert(nofault);
4657 return false;
4658 }
4659
4660 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
4661 info->host -= mem_off;
4662
4663#ifdef CONFIG_USER_ONLY
4664 memset(&info->attrs, 0, sizeof(info->attrs));
4665#else
4666 /*
4667 * Find the iotlbentry for addr and return the transaction attributes.
4668 * This *must* be present in the TLB because we just found the mapping.
4669 */
4670 {
4671 uintptr_t index = tlb_index(env, mmu_idx, addr);
4672
4673# ifdef CONFIG_DEBUG_TCG
4674 CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
4675 target_ulong comparator = (access_type == MMU_DATA_LOAD
4676 ? entry->addr_read
4677 : tlb_addr_write(entry));
4678 g_assert(tlb_hit(comparator, addr));
4679# endif
4680
4681 CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
4682 info->attrs = iotlbentry->attrs;
4683 }
4684#endif
4685
4686 return true;
4687}
4688
4689
4690/*
4691 * Analyse contiguous data, protected by a governing predicate.
4692 */
4693
4694typedef enum {
4695 FAULT_NO,
4696 FAULT_FIRST,
4697 FAULT_ALL,
4698} SVEContFault;
4699
4700typedef struct {
4701 /*
4702 * First and last element wholly contained within the two pages.
4703 * mem_off_first[0] and reg_off_first[0] are always set >= 0.
4704 * reg_off_last[0] may be < 0 if the first element crosses pages.
4705 * All of mem_off_first[1], reg_off_first[1] and reg_off_last[1]
4706 * are set >= 0 only if there are complete elements on a second page.
4707 *
4708 * The reg_off_* offsets are relative to the internal vector register.
4709 * The mem_off_first offset is relative to the memory address; the
4710 * two offsets are different when a load operation extends, a store
4711 * operation truncates, or for multi-register operations.
4712 */
4713 int16_t mem_off_first[2];
4714 int16_t reg_off_first[2];
4715 int16_t reg_off_last[2];
4716
4717 /*
4718 * One element that is misaligned and spans both pages,
4719 * or -1 if there is no such active element.
4720 */
4721 int16_t mem_off_split;
4722 int16_t reg_off_split;
4723
4724 /*
4725 * The byte offset at which the entire operation crosses a page boundary.
4726 * Set >= 0 if and only if the entire operation spans two pages.
4727 */
4728 int16_t page_split;
4729
4730 /* TLB data for the two pages. */
4731 SVEHostPage page[2];
4732} SVEContLdSt;
4733
4734/*
4735 * Find first active element on each page, and a loose bound for the
4736 * final element on each page. Identify any single element that spans
4737 * the page boundary. Return true if there are any active elements.
4738 */
b854fd06
RH
4739static bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr,
4740 uint64_t *vg, intptr_t reg_max,
4741 int esz, int msize)
b4cd95d2
RH
4742{
4743 const int esize = 1 << esz;
4744 const uint64_t pg_mask = pred_esz_masks[esz];
4745 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
4746 intptr_t mem_off_last, mem_off_split;
4747 intptr_t page_split, elt_split;
4748 intptr_t i;
4749
4750 /* Set all of the element indices to -1, and the TLB data to 0. */
4751 memset(info, -1, offsetof(SVEContLdSt, page));
4752 memset(info->page, 0, sizeof(info->page));
4753
4754 /* Gross scan over the entire predicate to find bounds. */
4755 i = 0;
4756 do {
4757 uint64_t pg = vg[i] & pg_mask;
4758 if (pg) {
4759 reg_off_last = i * 64 + 63 - clz64(pg);
4760 if (reg_off_first < 0) {
4761 reg_off_first = i * 64 + ctz64(pg);
4762 }
4763 }
4764 } while (++i * 64 < reg_max);
4765
4766 if (unlikely(reg_off_first < 0)) {
4767 /* No active elements, no pages touched. */
4768 return false;
4769 }
4770 tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
4771
4772 info->reg_off_first[0] = reg_off_first;
4773 info->mem_off_first[0] = (reg_off_first >> esz) * msize;
4774 mem_off_last = (reg_off_last >> esz) * msize;
4775
4776 page_split = -(addr | TARGET_PAGE_MASK);
4777 if (likely(mem_off_last + msize <= page_split)) {
4778 /* The entire operation fits within a single page. */
4779 info->reg_off_last[0] = reg_off_last;
4780 return true;
4781 }
4782
4783 info->page_split = page_split;
4784 elt_split = page_split / msize;
4785 reg_off_split = elt_split << esz;
4786 mem_off_split = elt_split * msize;
4787
4788 /*
4789 * This is the last full element on the first page, but it is not
4790 * necessarily active. If there is no full element, i.e. the first
4791 * active element is the one that's split, this value remains -1.
4792 * It is useful as iteration bounds.
4793 */
4794 if (elt_split != 0) {
4795 info->reg_off_last[0] = reg_off_split - esize;
4796 }
4797
4798 /* Determine if an unaligned element spans the pages. */
4799 if (page_split % msize != 0) {
4800 /* It is helpful to know if the split element is active. */
4801 if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
4802 info->reg_off_split = reg_off_split;
4803 info->mem_off_split = mem_off_split;
4804
4805 if (reg_off_split == reg_off_last) {
4806 /* The page crossing element is last. */
4807 return true;
4808 }
4809 }
4810 reg_off_split += esize;
4811 mem_off_split += msize;
4812 }
4813
4814 /*
4815 * We do want the first active element on the second page, because
4816 * this may affect the address reported in an exception.
4817 */
4818 reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
4819 tcg_debug_assert(reg_off_split <= reg_off_last);
4820 info->reg_off_first[1] = reg_off_split;
4821 info->mem_off_first[1] = (reg_off_split >> esz) * msize;
4822 info->reg_off_last[1] = reg_off_last;
4823 return true;
4824}
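/*
 * A worked example: four active 4-byte elements (esz == 2, msize == 4,
 * so register and memory offsets coincide) whose base address lies
 * 6 bytes below a page boundary.  Element 0 is wholly on the first page
 * (reg_off_first[0] = 0, reg_off_last[0] = 0), element 1 straddles the
 * boundary (reg_off_split = 4, mem_off_split = 4, page_split = 6), and
 * elements 2 and 3 are wholly on the second page (reg_off_first[1] = 8,
 * reg_off_last[1] = 12).
 */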
4825
4826/*
4827 * Resolve the guest virtual addresses to info->page[].
4828 * Control the generation of page faults with @fault. Return false if
4829 * there is no work to do, which can only happen with @fault == FAULT_NO.
4830 */
b854fd06
RH
4831static bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
4832 CPUARMState *env, target_ulong addr,
4833 MMUAccessType access_type, uintptr_t retaddr)
b4cd95d2
RH
4834{
4835 int mmu_idx = cpu_mmu_index(env, false);
4836 int mem_off = info->mem_off_first[0];
4837 bool nofault = fault == FAULT_NO;
4838 bool have_work = true;
4839
4840 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
4841 access_type, mmu_idx, retaddr)) {
4842 /* No work to be done. */
4843 return false;
4844 }
4845
4846 if (likely(info->page_split < 0)) {
4847 /* The entire operation was on the one page. */
4848 return true;
4849 }
4850
4851 /*
4852 * If the second page is invalid, then we want the fault address to be
4853 * the first byte on that page which is accessed.
4854 */
4855 if (info->mem_off_split >= 0) {
4856 /*
4857 * There is an element split across the pages. The fault address
4858 * should be the first byte of the second page.
4859 */
4860 mem_off = info->page_split;
4861 /*
4862 * If the split element is also the first active element
4863 * of the vector, then: For first-fault we should continue
4864 * to generate faults for the second page. For no-fault,
4865 * we have work only if the second page is valid.
4866 */
4867 if (info->mem_off_first[0] < info->mem_off_split) {
4868 nofault = FAULT_FIRST;
4869 have_work = false;
4870 }
4871 } else {
4872 /*
4873 * There is no element split across the pages. The fault address
4874 * should be the first active element on the second page.
4875 */
4876 mem_off = info->mem_off_first[1];
4877 /*
4878 * There must have been one active element on the first page,
4879 * so we're out of first-fault territory.
4880 */
4881 nofault = fault != FAULT_ALL;
4882 }
4883
4884 have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
4885 access_type, mmu_idx, retaddr);
4886 return have_work;
4887}
4888
4bcc3f0f
RH
4889static void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
4890 uint64_t *vg, target_ulong addr,
4891 int esize, int msize, int wp_access,
4892 uintptr_t retaddr)
4893{
4894#ifndef CONFIG_USER_ONLY
4895 intptr_t mem_off, reg_off, reg_last;
4896 int flags0 = info->page[0].flags;
4897 int flags1 = info->page[1].flags;
4898
4899 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
4900 return;
4901 }
4902
4903 /* Indicate that watchpoints are handled. */
4904 info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
4905 info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
4906
4907 if (flags0 & TLB_WATCHPOINT) {
4908 mem_off = info->mem_off_first[0];
4909 reg_off = info->reg_off_first[0];
4910 reg_last = info->reg_off_last[0];
4911
4912 while (reg_off <= reg_last) {
4913 uint64_t pg = vg[reg_off >> 6];
4914 do {
4915 if ((pg >> (reg_off & 63)) & 1) {
4916 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
4917 msize, info->page[0].attrs,
4918 wp_access, retaddr);
4919 }
4920 reg_off += esize;
4921 mem_off += msize;
4922 } while (reg_off <= reg_last && (reg_off & 63));
4923 }
4924 }
4925
4926 mem_off = info->mem_off_split;
4927 if (mem_off >= 0) {
4928 cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
4929 info->page[0].attrs, wp_access, retaddr);
4930 }
4931
4932 mem_off = info->mem_off_first[1];
4933 if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
4934 reg_off = info->reg_off_first[1];
4935 reg_last = info->reg_off_last[1];
4936
4937 do {
4938 uint64_t pg = vg[reg_off >> 6];
4939 do {
4940 if ((pg >> (reg_off & 63)) & 1) {
4941 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
4942 msize, info->page[1].attrs,
4943 wp_access, retaddr);
4944 }
4945 reg_off += esize;
4946 mem_off += msize;
4947 } while (reg_off & 63);
4948 } while (reg_off <= reg_last);
4949 }
4950#endif
4951}
4952
4c3310c7
RH
4953static void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
4954 uint64_t *vg, target_ulong addr, int esize,
4955 int msize, uint32_t mtedesc, uintptr_t ra)
206adacf
RH
4956{
4957 intptr_t mem_off, reg_off, reg_last;
4958
4959 /* Process the page only if MemAttr == Tagged. */
4960 if (arm_tlb_mte_tagged(&info->page[0].attrs)) {
4961 mem_off = info->mem_off_first[0];
4962 reg_off = info->reg_off_first[0];
4963 reg_last = info->reg_off_split;
4964 if (reg_last < 0) {
4965 reg_last = info->reg_off_last[0];
4966 }
4967
4968 do {
4969 uint64_t pg = vg[reg_off >> 6];
4970 do {
4971 if ((pg >> (reg_off & 63)) & 1) {
4c3310c7 4972 mte_check(env, mtedesc, addr, ra);
206adacf
RH
4973 }
4974 reg_off += esize;
4975 mem_off += msize;
4976 } while (reg_off <= reg_last && (reg_off & 63));
4977 } while (reg_off <= reg_last);
4978 }
4979
4980 mem_off = info->mem_off_first[1];
4981 if (mem_off >= 0 && arm_tlb_mte_tagged(&info->page[1].attrs)) {
4982 reg_off = info->reg_off_first[1];
4983 reg_last = info->reg_off_last[1];
4984
4985 do {
4986 uint64_t pg = vg[reg_off >> 6];
4987 do {
4988 if ((pg >> (reg_off & 63)) & 1) {
4c3310c7 4989 mte_check(env, mtedesc, addr, ra);
206adacf
RH
4990 }
4991 reg_off += esize;
4992 mem_off += msize;
4993 } while (reg_off & 63);
4994 } while (reg_off <= reg_last);
4995 }
4996}
4997
9123aeb6 4998/*
5c9b8458 4999 * Common helper for all contiguous 1,2,3,4-register predicated loads.
9123aeb6 5000 */
b854fd06 5001static inline QEMU_ALWAYS_INLINE
5c9b8458 5002void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
b854fd06 5003 uint32_t desc, const uintptr_t retaddr,
206adacf 5004 const int esz, const int msz, const int N, uint32_t mtedesc,
b854fd06 5005 sve_ldst1_host_fn *host_fn,
4c3310c7 5006 sve_ldst1_tlb_fn *tlb_fn)
b854fd06 5007{
ba080b86 5008 const unsigned rd = simd_data(desc);
9123aeb6 5009 const intptr_t reg_max = simd_oprsz(desc);
b854fd06
RH
5010 intptr_t reg_off, reg_last, mem_off;
5011 SVEContLdSt info;
9123aeb6 5012 void *host;
5c9b8458 5013 int flags, i;
9123aeb6 5014
b854fd06 5015 /* Find the active elements. */
5c9b8458 5016 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
9123aeb6 5017 /* The entire predicate was false; no load occurs. */
5c9b8458
RH
5018 for (i = 0; i < N; ++i) {
5019 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5020 }
9123aeb6
RH
5021 return;
5022 }
9123aeb6 5023
b854fd06
RH
5024 /* Probe the page(s). Exit with exception for any invalid page. */
5025 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
9123aeb6 5026
4bcc3f0f 5027 /* Handle watchpoints for all active elements. */
5c9b8458 5028 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
4bcc3f0f
RH
5029 BP_MEM_READ, retaddr);
5030
206adacf
RH
5031 /*
5032 * Handle mte checks for all active elements.
5033 * Since TBI must be set for MTE, !mtedesc => !mte_active.
5034 */
4c3310c7
RH
5035 if (mtedesc) {
5036 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
5037 mtedesc, retaddr);
206adacf 5038 }
4bcc3f0f 5039
b854fd06
RH
5040 flags = info.page[0].flags | info.page[1].flags;
5041 if (unlikely(flags != 0)) {
9123aeb6 5042#ifdef CONFIG_USER_ONLY
b854fd06 5043 g_assert_not_reached();
9123aeb6 5044#else
b854fd06 5045 /*
4bcc3f0f 5046 * At least one page includes MMIO.
b854fd06
RH
5047 * Any bus operation can fail with cpu_transaction_failed,
5048 * which for ARM will raise SyncExternal. Perform the load
5049 * into scratch memory to preserve register state until the end.
5050 */
5c9b8458 5051 ARMVectorReg scratch[4] = { };
b854fd06 5052
b854fd06
RH
5053 mem_off = info.mem_off_first[0];
5054 reg_off = info.reg_off_first[0];
5055 reg_last = info.reg_off_last[1];
5056 if (reg_last < 0) {
5057 reg_last = info.reg_off_split;
5058 if (reg_last < 0) {
5059 reg_last = info.reg_off_last[0];
9123aeb6
RH
5060 }
5061 }
5062
b854fd06
RH
5063 do {
5064 uint64_t pg = vg[reg_off >> 6];
5065 do {
5066 if ((pg >> (reg_off & 63)) & 1) {
5c9b8458
RH
5067 for (i = 0; i < N; ++i) {
5068 tlb_fn(env, &scratch[i], reg_off,
5069 addr + mem_off + (i << msz), retaddr);
5070 }
b854fd06
RH
5071 }
5072 reg_off += 1 << esz;
5c9b8458 5073 mem_off += N << msz;
b854fd06
RH
5074 } while (reg_off & 63);
5075 } while (reg_off <= reg_last);
5076
5c9b8458
RH
5077 for (i = 0; i < N; ++i) {
5078 memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
5079 }
b854fd06 5080 return;
9123aeb6 5081#endif
b854fd06
RH
5082 }
5083
5084 /* The entire operation is in RAM, on valid pages. */
5085
5c9b8458
RH
5086 for (i = 0; i < N; ++i) {
5087 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5088 }
5089
b854fd06
RH
5090 mem_off = info.mem_off_first[0];
5091 reg_off = info.reg_off_first[0];
5092 reg_last = info.reg_off_last[0];
5093 host = info.page[0].host;
5094
5095 while (reg_off <= reg_last) {
5096 uint64_t pg = vg[reg_off >> 6];
5097 do {
5098 if ((pg >> (reg_off & 63)) & 1) {
5c9b8458
RH
5099 for (i = 0; i < N; ++i) {
5100 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5101 host + mem_off + (i << msz));
5102 }
b854fd06
RH
5103 }
5104 reg_off += 1 << esz;
5c9b8458 5105 mem_off += N << msz;
b854fd06
RH
5106 } while (reg_off <= reg_last && (reg_off & 63));
5107 }
9123aeb6 5108
b854fd06
RH
5109 /*
5110 * Use the slow path to manage the cross-page misalignment.
5111 * But we know this is RAM and cannot trap.
5112 */
5113 mem_off = info.mem_off_split;
5114 if (unlikely(mem_off >= 0)) {
5c9b8458
RH
5115 reg_off = info.reg_off_split;
5116 for (i = 0; i < N; ++i) {
5117 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5118 addr + mem_off + (i << msz), retaddr);
5119 }
b854fd06
RH
5120 }
5121
5122 mem_off = info.mem_off_first[1];
5123 if (unlikely(mem_off >= 0)) {
5124 reg_off = info.reg_off_first[1];
5125 reg_last = info.reg_off_last[1];
5126 host = info.page[1].host;
5127
5128 do {
5129 uint64_t pg = vg[reg_off >> 6];
5130 do {
5131 if ((pg >> (reg_off & 63)) & 1) {
5c9b8458
RH
5132 for (i = 0; i < N; ++i) {
5133 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5134 host + mem_off + (i << msz));
5135 }
b854fd06
RH
5136 }
5137 reg_off += 1 << esz;
5c9b8458 5138 mem_off += N << msz;
b854fd06
RH
5139 } while (reg_off & 63);
5140 } while (reg_off <= reg_last);
5141 }
c4e7c493
RH
5142}
5143
206adacf
RH
5144static inline QEMU_ALWAYS_INLINE
5145void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
5146 uint32_t desc, const uintptr_t ra,
5147 const int esz, const int msz, const int N,
5148 sve_ldst1_host_fn *host_fn,
5149 sve_ldst1_tlb_fn *tlb_fn)
5150{
5151 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5152 int bit55 = extract64(addr, 55, 1);
5153
5154 /* Remove mtedesc from the normal sve descriptor. */
5155 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5156
5157 /* Perform gross MTE suppression early. */
5158 if (!tbi_check(desc, bit55) ||
5159 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
5160 mtedesc = 0;
5161 }
5162
4c3310c7 5163 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
9123aeb6
RH
5164}
5165
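
/*
 * For reference, the descriptor split performed by sve_ldN_r_mte() above
 * (and by the other *_mte wrappers in this file) assumes this layout,
 * with the packing itself done at translate time:
 *
 *   bits [31 : SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT]      mtedesc
 *   bits [SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT - 1 : 0]   ordinary SVE desc
 *
 * A sketch of the inverse operation, assuming the low part leaves the
 * upper bits clear (illustrative only):
 */
static inline uint32_t sve_mte_desc_pack_sketch(uint32_t sve_desc,
                                                uint32_t mtedesc)
{
    return sve_desc | (mtedesc << (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT));
}
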
206adacf
RH
5166#define DO_LD1_1(NAME, ESZ) \
5167void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \
5168 target_ulong addr, uint32_t desc) \
5169{ \
5170 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \
4c3310c7 5171 sve_##NAME##_host, sve_##NAME##_tlb); \
206adacf
RH
5172} \
5173void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \
5174 target_ulong addr, uint32_t desc) \
5175{ \
5176 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \
5177 sve_##NAME##_host, sve_##NAME##_tlb); \
5178}
5179
5180#define DO_LD1_2(NAME, ESZ, MSZ) \
5181void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \
5182 target_ulong addr, uint32_t desc) \
5183{ \
5184 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
4c3310c7 5185 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
206adacf
RH
5186} \
5187void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \
5188 target_ulong addr, uint32_t desc) \
5189{ \
5190 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
4c3310c7 5191 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
206adacf
RH
5192} \
5193void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
4c3310c7 5194 target_ulong addr, uint32_t desc) \
206adacf
RH
5195{ \
5196 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
5197 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
5198} \
5199void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
4c3310c7 5200 target_ulong addr, uint32_t desc) \
206adacf
RH
5201{ \
5202 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
5203 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
9123aeb6
RH
5204}
5205
5c9b8458
RH
5206DO_LD1_1(ld1bb, MO_8)
5207DO_LD1_1(ld1bhu, MO_16)
5208DO_LD1_1(ld1bhs, MO_16)
5209DO_LD1_1(ld1bsu, MO_32)
5210DO_LD1_1(ld1bss, MO_32)
5211DO_LD1_1(ld1bdu, MO_64)
5212DO_LD1_1(ld1bds, MO_64)
9123aeb6 5213
5c9b8458
RH
5214DO_LD1_2(ld1hh, MO_16, MO_16)
5215DO_LD1_2(ld1hsu, MO_32, MO_16)
5216DO_LD1_2(ld1hss, MO_32, MO_16)
5217DO_LD1_2(ld1hdu, MO_64, MO_16)
5218DO_LD1_2(ld1hds, MO_64, MO_16)
9123aeb6 5219
5c9b8458
RH
5220DO_LD1_2(ld1ss, MO_32, MO_32)
5221DO_LD1_2(ld1sdu, MO_64, MO_32)
5222DO_LD1_2(ld1sds, MO_64, MO_32)
9123aeb6 5223
5c9b8458 5224DO_LD1_2(ld1dd, MO_64, MO_64)
9123aeb6
RH
5225
5226#undef DO_LD1_1
5227#undef DO_LD1_2
5228
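
/*
 * For illustration, DO_LD1_1(ld1bhu, MO_16) above expands to roughly:
 *
 *   void HELPER(sve_ld1bhu_r)(CPUARMState *env, void *vg,
 *                             target_ulong addr, uint32_t desc)
 *   {
 *       sve_ldN_r(env, vg, addr, desc, GETPC(), MO_16, MO_8, 1, 0,
 *                 sve_ld1bhu_host, sve_ld1bhu_tlb);
 *   }
 *
 * i.e. a one-register load of byte-sized memory elements (msz = MO_8)
 * zero-extended into halfword vector elements (esz = MO_16), plus a
 * matching _r_mte variant routed through sve_ldN_r_mte().
 */
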
206adacf
RH
5229#define DO_LDN_1(N) \
5230void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \
5231 target_ulong addr, uint32_t desc) \
5232{ \
5233 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \
4c3310c7 5234 sve_ld1bb_host, sve_ld1bb_tlb); \
206adacf
RH
5235} \
5236void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \
5237 target_ulong addr, uint32_t desc) \
5238{ \
5239 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \
5240 sve_ld1bb_host, sve_ld1bb_tlb); \
f27d4dc2
RH
5241}
5242
206adacf
RH
5243#define DO_LDN_2(N, SUFF, ESZ) \
5244void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \
5245 target_ulong addr, uint32_t desc) \
5246{ \
5247 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
4c3310c7 5248 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
206adacf
RH
5249} \
5250void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \
5251 target_ulong addr, uint32_t desc) \
5252{ \
5253 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
4c3310c7 5254 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
206adacf
RH
5255} \
5256void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \
5257 target_ulong addr, uint32_t desc) \
5258{ \
5259 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
5260 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
5261} \
5262void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \
5263 target_ulong addr, uint32_t desc) \
5264{ \
5265 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
5266 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
c4e7c493
RH
5267}
5268
f27d4dc2
RH
5269DO_LDN_1(2)
5270DO_LDN_1(3)
5271DO_LDN_1(4)
c4e7c493 5272
5c9b8458
RH
5273DO_LDN_2(2, hh, MO_16)
5274DO_LDN_2(3, hh, MO_16)
5275DO_LDN_2(4, hh, MO_16)
c4e7c493 5276
5c9b8458
RH
5277DO_LDN_2(2, ss, MO_32)
5278DO_LDN_2(3, ss, MO_32)
5279DO_LDN_2(4, ss, MO_32)
c4e7c493 5280
5c9b8458
RH
5281DO_LDN_2(2, dd, MO_64)
5282DO_LDN_2(3, dd, MO_64)
5283DO_LDN_2(4, dd, MO_64)
c4e7c493 5284
f27d4dc2
RH
5285#undef DO_LDN_1
5286#undef DO_LDN_2
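
/*
 * A worked example of the layout handled by sve_ldN_r() for the
 * multi-register forms above: for LD3H (N = 3, halfword elements) with
 * all elements active, consecutive memory halfwords m[0], m[1], m[2],
 * m[3], ... are de-interleaved so that element j of register rd+i comes
 * from memory element 3*j + i:
 *
 *   zd0[0] = m[0], zd1[0] = m[1], zd2[0] = m[2],
 *   zd0[1] = m[3], zd1[1] = m[4], zd2[1] = m[5], ...
 */
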
e2654d75
RH
5287
5288/*
5289 * Load contiguous data, first-fault and no-fault.
9123aeb6
RH
5290 *
5291 * For user-only, one could argue that we should hold the mmap_lock during
5292 * the operation so that there is no race between page_check_range and the
5293 * load operation. However, unmapping pages out from under a running thread
5294 * is extraordinarily unlikely. This theoretical race condition also affects
5295 * linux-user/ in its get_user/put_user macros.
5296 *
5297 * TODO: Construct some helpers, written in assembly, that interact with
5298 * handle_cpu_signal to produce memory ops which can properly report errors
5299 * without racing.
e2654d75
RH
5300 */
5301
e2654d75
RH
5302/* Fault on byte I. All bits in FFR from I are cleared. The vector
5303 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
5304 * option, which leaves subsequent data unchanged.
5305 */
5306static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
5307{
5308 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
5309
5310 if (i & 63) {
5311 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
5312 i = ROUND_UP(i, 64);
5313 }
5314 for (; i < oprsz; i += 64) {
5315 ffr[i / 64] = 0;
5316 }
5317}
5318
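
/*
 * A worked example: for a 256-bit vector (oprsz = 32 predicate bits) and
 * a fault recorded at element byte offset i = 8 (the second doubleword
 * element of an LD1D), record_fault() keeps FFR bits 0..7 via
 * MAKE_64BIT_MASK(0, 8) and clears everything above, so only the first
 * element remains marked as successfully loaded.
 */
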
9123aeb6 5319/*
c647673c 5320 * Common helper for all contiguous no-fault and first-fault loads.
9123aeb6 5321 */
c647673c
RH
5322static inline QEMU_ALWAYS_INLINE
5323void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
aa13f7c3 5324 uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
c647673c
RH
5325 const int esz, const int msz, const SVEContFault fault,
5326 sve_ldst1_host_fn *host_fn,
5327 sve_ldst1_tlb_fn *tlb_fn)
5328{
ba080b86 5329 const unsigned rd = simd_data(desc);
500d0484 5330 void *vd = &env->vfp.zregs[rd];
9123aeb6 5331 const intptr_t reg_max = simd_oprsz(desc);
c647673c
RH
5332 intptr_t reg_off, mem_off, reg_last;
5333 SVEContLdSt info;
5334 int flags;
9123aeb6
RH
5335 void *host;
5336
c647673c
RH
5337 /* Find the active elements. */
5338 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
9123aeb6
RH
5339 /* The entire predicate was false; no load occurs. */
5340 memset(vd, 0, reg_max);
5341 return;
5342 }
c647673c 5343 reg_off = info.reg_off_first[0];
9123aeb6 5344
c647673c
RH
5345 /* Probe the page(s). */
5346 if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
5347 /* Fault on first element. */
5348 tcg_debug_assert(fault == FAULT_NO);
5349 memset(vd, 0, reg_max);
5350 goto do_fault;
5351 }
5352
5353 mem_off = info.mem_off_first[0];
5354 flags = info.page[0].flags;
5355
aa13f7c3
RH
5356 /*
5357 * Disable MTE checking if the Tagged bit is not set. Since TBI must
5358 * be set within MTEDESC for MTE, !mtedesc => !mte_active.
5359 */
5360 if (!arm_tlb_mte_tagged(&info.page[0].attrs)) {
5361 mtedesc = 0;
5362 }
5363
c647673c 5364 if (fault == FAULT_FIRST) {
aa13f7c3
RH
5365 /* Trapping mte check for the first-fault element. */
5366 if (mtedesc) {
bd47b61c 5367 mte_check(env, mtedesc, addr + mem_off, retaddr);
aa13f7c3
RH
5368 }
5369
c647673c
RH
5370 /*
5371 * Special handling of the first active element,
5372 * if it crosses a page boundary or is MMIO.
5373 */
5374 bool is_split = mem_off == info.mem_off_split;
c647673c
RH
5375 if (unlikely(flags != 0) || unlikely(is_split)) {
5376 /*
5377 * Use the slow path for cross-page handling.
5378 * Might trap for MMIO or watchpoints.
5379 */
5380 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
5381
5382 /* After any fault, zero the other elements. */
9123aeb6 5383 swap_memzero(vd, reg_off);
c647673c
RH
5384 reg_off += 1 << esz;
5385 mem_off += 1 << msz;
5386 swap_memzero(vd + reg_off, reg_max - reg_off);
5387
5388 if (is_split) {
5389 goto second_page;
5390 }
5391 } else {
5392 memset(vd, 0, reg_max);
5393 }
5394 } else {
5395 memset(vd, 0, reg_max);
5396 if (unlikely(mem_off == info.mem_off_split)) {
5397 /* The first active element crosses a page boundary. */
5398 flags |= info.page[1].flags;
5399 if (unlikely(flags & TLB_MMIO)) {
5400 /* Some page is MMIO, see below. */
5401 goto do_fault;
5402 }
5403 if (unlikely(flags & TLB_WATCHPOINT) &&
5404 (cpu_watchpoint_address_matches
5405 (env_cpu(env), addr + mem_off, 1 << msz)
5406 & BP_MEM_READ)) {
5407 /* Watchpoint hit, see below. */
5408 goto do_fault;
5409 }
d304d280 5410 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
aa13f7c3
RH
5411 goto do_fault;
5412 }
c647673c
RH
5413 /*
5414 * Use the slow path for cross-page handling.
5415 * This is RAM, without a watchpoint, and will not trap.
5416 */
5417 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
5418 goto second_page;
9123aeb6
RH
5419 }
5420 }
5421
9123aeb6 5422 /*
c647673c
RH
5423 * From this point on, all memory operations are MemSingleNF.
5424 *
5425 * Per the MemSingleNF pseudocode, a no-fault load from Device memory
5426 * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
5427 *
5428 * Unfortunately we do not have access to the memory attributes from the
5429 * PTE to tell Device memory from Normal memory. So we make a mostly
5430 * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
5431 * This gives the right answer for the common cases of "Normal memory,
5432 * backed by host RAM" and "Device memory, backed by MMIO".
5433 * The architecture allows us to suppress an NF load and return
5434 * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
5435 * case of "Normal memory, backed by MMIO" is permitted. The case we
5436 * get wrong is "Device memory, backed by host RAM", for which we
5437 * should return (UNKNOWN, FAULT) but do not.
5438 *
5439 * Similarly, CPU_BP breakpoints would raise exceptions, and so
5440 * return (UNKNOWN, FAULT). For simplicity, we consider gdb and
5441 * architectural breakpoints the same.
9123aeb6 5442 */
c647673c
RH
5443 if (unlikely(flags & TLB_MMIO)) {
5444 goto do_fault;
9123aeb6 5445 }
9123aeb6 5446
c647673c
RH
5447 reg_last = info.reg_off_last[0];
5448 host = info.page[0].host;
9123aeb6 5449
c647673c
RH
5450 do {
5451 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
cf4a49b7 5452 do {
c647673c
RH
5453 if ((pg >> (reg_off & 63)) & 1) {
5454 if (unlikely(flags & TLB_WATCHPOINT) &&
5455 (cpu_watchpoint_address_matches
5456 (env_cpu(env), addr + mem_off, 1 << msz)
5457 & BP_MEM_READ)) {
5458 goto do_fault;
5459 }
d304d280 5460 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
aa13f7c3
RH
5461 goto do_fault;
5462 }
c647673c
RH
5463 host_fn(vd, reg_off, host + mem_off);
5464 }
cf4a49b7 5465 reg_off += 1 << esz;
c647673c
RH
5466 mem_off += 1 << msz;
5467 } while (reg_off <= reg_last && (reg_off & 63));
5468 } while (reg_off <= reg_last);
9123aeb6 5469
c647673c
RH
5470 /*
5471 * MemSingleNF is allowed to fail for any reason. We have special
5472 * code above to handle the first element crossing a page boundary.
5473 * As an implementation choice, decline to handle a cross-page element
5474 * in any other position.
5475 */
5476 reg_off = info.reg_off_split;
5477 if (reg_off >= 0) {
5478 goto do_fault;
5479 }
9123aeb6 5480
c647673c
RH
5481 second_page:
5482 reg_off = info.reg_off_first[1];
5483 if (likely(reg_off < 0)) {
5484 /* No active elements on the second page. All done. */
9123aeb6
RH
5485 return;
5486 }
9123aeb6 5487
9123aeb6 5488 /*
c647673c
RH
5489 * MemSingleNF is allowed to fail for any reason. As an implementation
5490 * choice, decline to handle elements on the second page. This should
5491 * be low frequency as the guest walks through memory -- the next
5492 * iteration of the guest's loop should be aligned on the page boundary,
5493 * and then all following iterations will stay aligned.
9123aeb6 5494 */
9123aeb6 5495
c647673c 5496 do_fault:
9123aeb6
RH
5497 record_fault(env, reg_off, reg_max);
5498}
5499
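
/*
 * To summarize the behaviour above: only the very first active element
 * may take a real fault, and only for FAULT_FIRST (FAULT_NO suppresses
 * even that and reports it through FFR).  After the first element, any
 * element that would need to trap (MMIO, watchpoint hit, MTE check
 * failure) or that lies beyond the first page instead stops the walk,
 * and record_fault() clears FFR from that element onward without
 * raising an exception.
 */
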
aa13f7c3
RH
5500static inline QEMU_ALWAYS_INLINE
5501void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
5502 uint32_t desc, const uintptr_t retaddr,
5503 const int esz, const int msz, const SVEContFault fault,
5504 sve_ldst1_host_fn *host_fn,
5505 sve_ldst1_tlb_fn *tlb_fn)
5506{
5507 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5508 int bit55 = extract64(addr, 55, 1);
5509
5510 /* Remove mtedesc from the normal sve descriptor. */
5511 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5512
5513 /* Perform gross MTE suppression early. */
5514 if (!tbi_check(desc, bit55) ||
5515 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
5516 mtedesc = 0;
5517 }
5518
5519 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
5520 esz, msz, fault, host_fn, tlb_fn);
5521}
5522
5523#define DO_LDFF1_LDNF1_1(PART, ESZ) \
9123aeb6
RH
5524void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \
5525 target_ulong addr, uint32_t desc) \
e2654d75 5526{ \
aa13f7c3 5527 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
c647673c 5528 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
e2654d75 5529} \
9123aeb6
RH
5530void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \
5531 target_ulong addr, uint32_t desc) \
e2654d75 5532{ \
aa13f7c3
RH
5533 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
5534 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
5535} \
5536void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \
5537 target_ulong addr, uint32_t desc) \
5538{ \
5539 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
5540 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
5541} \
5542void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \
5543 target_ulong addr, uint32_t desc) \
5544{ \
5545 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
c647673c 5546 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
e2654d75
RH
5547}
5548
aa13f7c3 5549#define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
7d0a57a2
RH
5550void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \
5551 target_ulong addr, uint32_t desc) \
e2654d75 5552{ \
aa13f7c3 5553 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
c647673c 5554 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
9123aeb6 5555} \
7d0a57a2
RH
5556void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \
5557 target_ulong addr, uint32_t desc) \
9123aeb6 5558{ \
aa13f7c3 5559 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
c647673c 5560 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
7d0a57a2
RH
5561} \
5562void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \
5563 target_ulong addr, uint32_t desc) \
5564{ \
aa13f7c3 5565 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
c647673c 5566 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
7d0a57a2
RH
5567} \
5568void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \
5569 target_ulong addr, uint32_t desc) \
5570{ \
aa13f7c3 5571 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
c647673c 5572 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
aa13f7c3
RH
5573} \
5574void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
5575 target_ulong addr, uint32_t desc) \
5576{ \
5577 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
5578 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
5579} \
5580void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
5581 target_ulong addr, uint32_t desc) \
5582{ \
5583 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
5584 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
5585} \
5586void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
5587 target_ulong addr, uint32_t desc) \
5588{ \
5589 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
5590 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
5591} \
5592void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
5593 target_ulong addr, uint32_t desc) \
5594{ \
5595 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
5596 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
e2654d75
RH
5597}
5598
c647673c
RH
5599DO_LDFF1_LDNF1_1(bb, MO_8)
5600DO_LDFF1_LDNF1_1(bhu, MO_16)
5601DO_LDFF1_LDNF1_1(bhs, MO_16)
5602DO_LDFF1_LDNF1_1(bsu, MO_32)
5603DO_LDFF1_LDNF1_1(bss, MO_32)
5604DO_LDFF1_LDNF1_1(bdu, MO_64)
5605DO_LDFF1_LDNF1_1(bds, MO_64)
e2654d75 5606
c647673c
RH
5607DO_LDFF1_LDNF1_2(hh, MO_16, MO_16)
5608DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
5609DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
5610DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
5611DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
e2654d75 5612
c647673c
RH
5613DO_LDFF1_LDNF1_2(ss, MO_32, MO_32)
5614DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
5615DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
e2654d75 5616
c647673c 5617DO_LDFF1_LDNF1_2(dd, MO_64, MO_64)
e2654d75 5618
9123aeb6
RH
5619#undef DO_LDFF1_LDNF1_1
5620#undef DO_LDFF1_LDNF1_2
1a039c7e 5621
9fd46c83 5622/*
0fa476c1 5623 * Common helper for all contiguous 1,2,3,4-register predicated stores.
9fd46c83 5624 */
0fa476c1
RH
5625
5626static inline QEMU_ALWAYS_INLINE
71b9f394
RH
5627void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
5628 uint32_t desc, const uintptr_t retaddr,
5629 const int esz, const int msz, const int N, uint32_t mtedesc,
0fa476c1 5630 sve_ldst1_host_fn *host_fn,
4c3310c7 5631 sve_ldst1_tlb_fn *tlb_fn)
9fd46c83 5632{
ba080b86 5633 const unsigned rd = simd_data(desc);
0fa476c1
RH
5634 const intptr_t reg_max = simd_oprsz(desc);
5635 intptr_t reg_off, reg_last, mem_off;
5636 SVEContLdSt info;
5637 void *host;
5638 int i, flags;
1a039c7e 5639
0fa476c1
RH
5640 /* Find the active elements. */
5641 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
5642 /* The entire predicate was false; no store occurs. */
5643 return;
9fd46c83 5644 }
1a039c7e 5645
0fa476c1
RH
5646 /* Probe the page(s). Exit with exception for any invalid page. */
5647 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
1a039c7e 5648
0fa476c1
RH
5649 /* Handle watchpoints for all active elements. */
5650 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
5651 BP_MEM_WRITE, retaddr);
5652
71b9f394
RH
5653 /*
5654 * Handle MTE checks for all active elements.
5655 * Since TBI must be set for MTE, !mtedesc => !mte_active.
5656 */
4c3310c7
RH
5657 if (mtedesc) {
5658 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
5659 mtedesc, retaddr);
71b9f394 5660 }
0fa476c1
RH
5661
5662 flags = info.page[0].flags | info.page[1].flags;
5663 if (unlikely(flags != 0)) {
5664#ifdef CONFIG_USER_ONLY
5665 g_assert_not_reached();
5666#else
5667 /*
5668 * At least one page includes MMIO.
5669 * Any bus operation can fail with cpu_transaction_failed,
5670 * which for ARM will raise SyncExternal. We cannot avoid
5671 * this fault and will leave with the store incomplete.
5672 */
5673 mem_off = info.mem_off_first[0];
5674 reg_off = info.reg_off_first[0];
5675 reg_last = info.reg_off_last[1];
5676 if (reg_last < 0) {
5677 reg_last = info.reg_off_split;
5678 if (reg_last < 0) {
5679 reg_last = info.reg_off_last[0];
9fd46c83 5680 }
0fa476c1
RH
5681 }
5682
5683 do {
5684 uint64_t pg = vg[reg_off >> 6];
5685 do {
5686 if ((pg >> (reg_off & 63)) & 1) {
5687 for (i = 0; i < N; ++i) {
5688 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5689 addr + mem_off + (i << msz), retaddr);
5690 }
5691 }
5692 reg_off += 1 << esz;
5693 mem_off += N << msz;
5694 } while (reg_off & 63);
5695 } while (reg_off <= reg_last);
5696 return;
5697#endif
1a039c7e 5698 }
1a039c7e 5699
0fa476c1
RH
5700 mem_off = info.mem_off_first[0];
5701 reg_off = info.reg_off_first[0];
5702 reg_last = info.reg_off_last[0];
5703 host = info.page[0].host;
1a039c7e 5704
0fa476c1
RH
5705 while (reg_off <= reg_last) {
5706 uint64_t pg = vg[reg_off >> 6];
9fd46c83 5707 do {
0fa476c1
RH
5708 if ((pg >> (reg_off & 63)) & 1) {
5709 for (i = 0; i < N; ++i) {
5710 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5711 host + mem_off + (i << msz));
5712 }
9fd46c83 5713 }
0fa476c1
RH
5714 reg_off += 1 << esz;
5715 mem_off += N << msz;
5716 } while (reg_off <= reg_last && (reg_off & 63));
1a039c7e 5717 }
1a039c7e 5718
0fa476c1
RH
5719 /*
5720 * Use the slow path to manage the cross-page misalignment.
5721 * But we know this is RAM and cannot trap.
5722 */
5723 mem_off = info.mem_off_split;
5724 if (unlikely(mem_off >= 0)) {
5725 reg_off = info.reg_off_split;
5726 for (i = 0; i < N; ++i) {
5727 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5728 addr + mem_off + (i << msz), retaddr);
5729 }
5730 }
5731
5732 mem_off = info.mem_off_first[1];
5733 if (unlikely(mem_off >= 0)) {
5734 reg_off = info.reg_off_first[1];
5735 reg_last = info.reg_off_last[1];
5736 host = info.page[1].host;
1a039c7e 5737
9fd46c83 5738 do {
0fa476c1
RH
5739 uint64_t pg = vg[reg_off >> 6];
5740 do {
5741 if ((pg >> (reg_off & 63)) & 1) {
5742 for (i = 0; i < N; ++i) {
5743 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5744 host + mem_off + (i << msz));
5745 }
5746 }
5747 reg_off += 1 << esz;
5748 mem_off += N << msz;
5749 } while (reg_off & 63);
5750 } while (reg_off <= reg_last);
1a039c7e 5751 }
9fd46c83
RH
5752}
5753
71b9f394
RH
5754static inline QEMU_ALWAYS_INLINE
5755void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
5756 uint32_t desc, const uintptr_t ra,
5757 const int esz, const int msz, const int N,
5758 sve_ldst1_host_fn *host_fn,
5759 sve_ldst1_tlb_fn *tlb_fn)
5760{
5761 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5762 int bit55 = extract64(addr, 55, 1);
5763
5764 /* Remove mtedesc from the normal sve descriptor. */
5765 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5766
5767 /* Perform gross MTE suppression early. */
5768 if (!tbi_check(desc, bit55) ||
5769 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
5770 mtedesc = 0;
5771 }
5772
4c3310c7 5773 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
1a039c7e 5774}
f6dbf62a 5775
71b9f394
RH
5776#define DO_STN_1(N, NAME, ESZ) \
5777void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \
5778 target_ulong addr, uint32_t desc) \
5779{ \
5780 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \
4c3310c7 5781 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
71b9f394
RH
5782} \
5783void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \
5784 target_ulong addr, uint32_t desc) \
5785{ \
5786 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \
5787 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
5788}
5789
5790#define DO_STN_2(N, NAME, ESZ, MSZ) \
5791void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \
5792 target_ulong addr, uint32_t desc) \
5793{ \
5794 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
4c3310c7 5795 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
71b9f394
RH
5796} \
5797void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \
5798 target_ulong addr, uint32_t desc) \
5799{ \
5800 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
4c3310c7 5801 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
71b9f394
RH
5802} \
5803void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
5804 target_ulong addr, uint32_t desc) \
5805{ \
5806 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
5807 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
5808} \
5809void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
5810 target_ulong addr, uint32_t desc) \
5811{ \
5812 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
5813 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
0fa476c1
RH
5814}
5815
5816DO_STN_1(1, bb, MO_8)
5817DO_STN_1(1, bh, MO_16)
5818DO_STN_1(1, bs, MO_32)
5819DO_STN_1(1, bd, MO_64)
5820DO_STN_1(2, bb, MO_8)
5821DO_STN_1(3, bb, MO_8)
5822DO_STN_1(4, bb, MO_8)
5823
5824DO_STN_2(1, hh, MO_16, MO_16)
5825DO_STN_2(1, hs, MO_32, MO_16)
5826DO_STN_2(1, hd, MO_64, MO_16)
5827DO_STN_2(2, hh, MO_16, MO_16)
5828DO_STN_2(3, hh, MO_16, MO_16)
5829DO_STN_2(4, hh, MO_16, MO_16)
5830
5831DO_STN_2(1, ss, MO_32, MO_32)
5832DO_STN_2(1, sd, MO_64, MO_32)
5833DO_STN_2(2, ss, MO_32, MO_32)
5834DO_STN_2(3, ss, MO_32, MO_32)
5835DO_STN_2(4, ss, MO_32, MO_32)
5836
5837DO_STN_2(1, dd, MO_64, MO_64)
5838DO_STN_2(2, dd, MO_64, MO_64)
5839DO_STN_2(3, dd, MO_64, MO_64)
5840DO_STN_2(4, dd, MO_64, MO_64)
9fd46c83
RH
5841
5842#undef DO_STN_1
5843#undef DO_STN_2
5844
d4f75f25
RH
5845/*
5846 * Loads with a vector index.
5847 */
673e9fa6 5848
d4f75f25
RH
5849/*
5850 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
5851 */
5852typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
5853
5854static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
5855{
5856 return *(uint32_t *)(reg + H1_4(reg_ofs));
673e9fa6
RH
5857}
5858
d4f75f25
RH
5859static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
5860{
5861 return *(int32_t *)(reg + H1_4(reg_ofs));
5862}
5863
5864static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
5865{
5866 return (uint32_t)*(uint64_t *)(reg + reg_ofs);
5867}
5868
5869static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
5870{
5871 return (int32_t)*(uint64_t *)(reg + reg_ofs);
5872}
5873
5874static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
5875{
5876 return *(uint64_t *)(reg + reg_ofs);
673e9fa6
RH
5877}
5878
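
/*
 * A quick illustration of the difference between the _zsu_ and _zss_
 * extractors above: for a 64-bit vector element holding
 * 0x00000000fffffffe, off_zsu_d() yields 0xfffffffe (the low 32 bits
 * zero-extended) while off_zss_d() yields (target_ulong)-2 (the same
 * bits sign-extended); off_zd_d() uses the full 64-bit value.
 */
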
10a85e2c
RH
5879static inline QEMU_ALWAYS_INLINE
5880void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5881 target_ulong base, uint32_t desc, uintptr_t retaddr,
d28d12f0
RH
5882 uint32_t mtedesc, int esize, int msize,
5883 zreg_off_fn *off_fn,
10a85e2c
RH
5884 sve_ldst1_host_fn *host_fn,
5885 sve_ldst1_tlb_fn *tlb_fn)
d4f75f25 5886{
10a85e2c
RH
5887 const int mmu_idx = cpu_mmu_index(env, false);
5888 const intptr_t reg_max = simd_oprsz(desc);
ba080b86 5889 const int scale = simd_data(desc);
10a85e2c
RH
5890 ARMVectorReg scratch;
5891 intptr_t reg_off;
5892 SVEHostPage info, info2;
d4f75f25 5893
10a85e2c
RH
5894 memset(&scratch, 0, reg_max);
5895 reg_off = 0;
5896 do {
5897 uint64_t pg = vg[reg_off >> 6];
d4f75f25
RH
5898 do {
5899 if (likely(pg & 1)) {
10a85e2c
RH
5900 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
5901 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
5902
5903 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
5904 mmu_idx, retaddr);
5905
5906 if (likely(in_page >= msize)) {
5907 if (unlikely(info.flags & TLB_WATCHPOINT)) {
5908 cpu_check_watchpoint(env_cpu(env), addr, msize,
5909 info.attrs, BP_MEM_READ, retaddr);
5910 }
d28d12f0 5911 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
bd47b61c 5912 mte_check(env, mtedesc, addr, retaddr);
d28d12f0 5913 }
10a85e2c
RH
5914 host_fn(&scratch, reg_off, info.host);
5915 } else {
5916 /* Element crosses the page boundary. */
5917 sve_probe_page(&info2, false, env, addr + in_page, 0,
5918 MMU_DATA_LOAD, mmu_idx, retaddr);
5919 if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
5920 cpu_check_watchpoint(env_cpu(env), addr,
5921 msize, info.attrs,
5922 BP_MEM_READ, retaddr);
5923 }
d28d12f0 5924 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
bd47b61c 5925 mte_check(env, mtedesc, addr, retaddr);
d28d12f0 5926 }
10a85e2c
RH
5927 tlb_fn(env, &scratch, reg_off, addr, retaddr);
5928 }
d4f75f25 5929 }
10a85e2c
RH
5930 reg_off += esize;
5931 pg >>= esize;
5932 } while (reg_off & 63);
5933 } while (reg_off < reg_max);
d4f75f25
RH
5934
5935 /* Wait until all exceptions have been raised to write back. */
10a85e2c 5936 memcpy(vd, &scratch, reg_max);
d4f75f25
RH
5937}
5938
d28d12f0
RH
5939static inline QEMU_ALWAYS_INLINE
5940void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
5941 target_ulong base, uint32_t desc, uintptr_t retaddr,
5942 int esize, int msize, zreg_off_fn *off_fn,
5943 sve_ldst1_host_fn *host_fn,
5944 sve_ldst1_tlb_fn *tlb_fn)
5945{
5946 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5947 /* Remove mtedesc from the normal sve descriptor. */
5948 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5949
5950 /*
5951 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
5952 * offset base entirely over the address space hole to change the
5953 * pointer tag, or change the bit55 selector. So we could examine
5954 * TBI + TCMA here, as we do for sve_ldN_r_mte().
5955 */
5956 sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
5957 esize, msize, off_fn, host_fn, tlb_fn);
5958}
5959
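
/*
 * A worked example of the per-element address computation used by
 * sve_ld1_z() above, addr = base + (off_fn(vm, reg_off) << scale),
 * assuming an LD1D gather with 64-bit offsets in zm:
 *
 *   base = 0x1000, zm.d = { 0x10, 0x20, 0x30, ... }
 *   unscaled offsets (scale = 0):   0x1010, 0x1020, 0x1030, ...
 *   scaled offsets (scale = 3):     0x1080, 0x1100, 0x1180, ...
 *
 * The scale itself arrives through simd_data(desc).
 */
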
10a85e2c
RH
5960#define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
5961void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
5962 void *vm, target_ulong base, uint32_t desc) \
5963{ \
d28d12f0 5964 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
10a85e2c 5965 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
d28d12f0
RH
5966} \
5967void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
5968 void *vm, target_ulong base, uint32_t desc) \
5969{ \
5970 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
5971 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
10a85e2c 5972}
d4f75f25 5973
10a85e2c
RH
5974#define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
5975void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
5976 void *vm, target_ulong base, uint32_t desc) \
5977{ \
d28d12f0 5978 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
10a85e2c 5979 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
d28d12f0
RH
5980} \
5981void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
5982 void *vm, target_ulong base, uint32_t desc) \
5983{ \
5984 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
5985 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
10a85e2c
RH
5986}
5987
5988DO_LD1_ZPZ_S(bsu, zsu, MO_8)
5989DO_LD1_ZPZ_S(bsu, zss, MO_8)
5990DO_LD1_ZPZ_D(bdu, zsu, MO_8)
5991DO_LD1_ZPZ_D(bdu, zss, MO_8)
5992DO_LD1_ZPZ_D(bdu, zd, MO_8)
5993
5994DO_LD1_ZPZ_S(bss, zsu, MO_8)
5995DO_LD1_ZPZ_S(bss, zss, MO_8)
5996DO_LD1_ZPZ_D(bds, zsu, MO_8)
5997DO_LD1_ZPZ_D(bds, zss, MO_8)
5998DO_LD1_ZPZ_D(bds, zd, MO_8)
5999
6000DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
6001DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
6002DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
6003DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
6004DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
6005
6006DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
6007DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
6008DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
6009DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
6010DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
6011
6012DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
6013DO_LD1_ZPZ_S(hss_le, zss, MO_16)
6014DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
6015DO_LD1_ZPZ_D(hds_le, zss, MO_16)
6016DO_LD1_ZPZ_D(hds_le, zd, MO_16)
6017
6018DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
6019DO_LD1_ZPZ_S(hss_be, zss, MO_16)
6020DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
6021DO_LD1_ZPZ_D(hds_be, zss, MO_16)
6022DO_LD1_ZPZ_D(hds_be, zd, MO_16)
6023
6024DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
6025DO_LD1_ZPZ_S(ss_le, zss, MO_32)
6026DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
6027DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
6028DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
6029
6030DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
6031DO_LD1_ZPZ_S(ss_be, zss, MO_32)
6032DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
6033DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
6034DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
6035
6036DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
6037DO_LD1_ZPZ_D(sds_le, zss, MO_32)
6038DO_LD1_ZPZ_D(sds_le, zd, MO_32)
6039
6040DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
6041DO_LD1_ZPZ_D(sds_be, zss, MO_32)
6042DO_LD1_ZPZ_D(sds_be, zd, MO_32)
6043
6044DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
6045DO_LD1_ZPZ_D(dd_le, zss, MO_64)
6046DO_LD1_ZPZ_D(dd_le, zd, MO_64)
6047
6048DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
6049DO_LD1_ZPZ_D(dd_be, zss, MO_64)
6050DO_LD1_ZPZ_D(dd_be, zd, MO_64)
d4f75f25
RH
6051
6052#undef DO_LD1_ZPZ_S
6053#undef DO_LD1_ZPZ_D
673e9fa6 6054
ed67eb7f
RH
6055/* First fault loads with a vector index. */
6056
116347ce 6057/*
50de9b78 6058 * Common helpers for all gather first-faulting loads.
116347ce 6059 */
50de9b78
RH
6060
6061static inline QEMU_ALWAYS_INLINE
6062void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6063 target_ulong base, uint32_t desc, uintptr_t retaddr,
d28d12f0
RH
6064 uint32_t mtedesc, const int esz, const int msz,
6065 zreg_off_fn *off_fn,
50de9b78
RH
6066 sve_ldst1_host_fn *host_fn,
6067 sve_ldst1_tlb_fn *tlb_fn)
116347ce 6068{
50de9b78 6069 const int mmu_idx = cpu_mmu_index(env, false);
ba080b86
RH
6070 const intptr_t reg_max = simd_oprsz(desc);
6071 const int scale = simd_data(desc);
50de9b78
RH
6072 const int esize = 1 << esz;
6073 const int msize = 1 << msz;
50de9b78
RH
6074 intptr_t reg_off;
6075 SVEHostPage info;
6076 target_ulong addr, in_page;
116347ce
RH
6077
6078 /* Skip to the first true predicate. */
50de9b78
RH
6079 reg_off = find_next_active(vg, 0, reg_max, esz);
6080 if (unlikely(reg_off >= reg_max)) {
6081 /* The entire predicate was false; no load occurs. */
6082 memset(vd, 0, reg_max);
6083 return;
116347ce
RH
6084 }
6085
50de9b78
RH
6086 /*
6087 * Probe the first element, allowing faults.
6088 */
6089 addr = base + (off_fn(vm, reg_off) << scale);
d28d12f0 6090 if (mtedesc) {
bd47b61c 6091 mte_check(env, mtedesc, addr, retaddr);
d28d12f0 6092 }
50de9b78 6093 tlb_fn(env, vd, reg_off, addr, retaddr);
ed67eb7f 6094
50de9b78
RH
6095 /* After any fault, zero the other elements. */
6096 swap_memzero(vd, reg_off);
6097 reg_off += esize;
6098 swap_memzero(vd + reg_off, reg_max - reg_off);
116347ce 6099
50de9b78
RH
6100 /*
6101 * Probe the remaining elements, not allowing faults.
6102 */
6103 while (reg_off < reg_max) {
6104 uint64_t pg = vg[reg_off >> 6];
6105 do {
6106 if (likely((pg >> (reg_off & 63)) & 1)) {
6107 addr = base + (off_fn(vm, reg_off) << scale);
6108 in_page = -(addr | TARGET_PAGE_MASK);
116347ce 6109
50de9b78
RH
6110 if (unlikely(in_page < msize)) {
6111 /* Stop if the element crosses a page boundary. */
6112 goto fault;
6113 }
ed67eb7f 6114
50de9b78
RH
6115 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
6116 mmu_idx, retaddr);
6117 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
6118 goto fault;
6119 }
6120 if (unlikely(info.flags & TLB_WATCHPOINT) &&
6121 (cpu_watchpoint_address_matches
6122 (env_cpu(env), addr, msize) & BP_MEM_READ)) {
6123 goto fault;
6124 }
d28d12f0
RH
6125 if (mtedesc &&
6126 arm_tlb_mte_tagged(&info.attrs) &&
d304d280 6127 !mte_probe(env, mtedesc, addr)) {
d28d12f0
RH
6128 goto fault;
6129 }
116347ce 6130
50de9b78 6131 host_fn(vd, reg_off, info.host);
116347ce 6132 }
50de9b78
RH
6133 reg_off += esize;
6134 } while (reg_off & 63);
116347ce 6135 }
50de9b78 6136 return;
116347ce 6137
50de9b78
RH
6138 fault:
6139 record_fault(env, reg_off, reg_max);
116347ce
RH
6140}
6141
d28d12f0
RH
6142static inline QEMU_ALWAYS_INLINE
6143void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6144 target_ulong base, uint32_t desc, uintptr_t retaddr,
6145 const int esz, const int msz,
6146 zreg_off_fn *off_fn,
6147 sve_ldst1_host_fn *host_fn,
6148 sve_ldst1_tlb_fn *tlb_fn)
6149{
6150 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6151 /* Remove mtedesc from the normal sve descriptor. */
6152 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6153
6154 /*
6155 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6156 * offset base entirely over the address space hole to change the
6157 * pointer tag, or change the bit55 selector. So we could here
6158 * examine TBI + TCMA like we do for sve_ldN_r_mte().
6159 */
6160 sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6161 esz, msz, off_fn, host_fn, tlb_fn);
50de9b78
RH
6162}
6163
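
/*
 * As with the contiguous first-fault forms, only the first active
 * element of a gather first-fault load may take a real fault: it is
 * probed above with a trapping access.  Every later element is probed
 * non-faulting via sve_probe_page(), and any problem -- missing page,
 * MMIO, watchpoint, failed MTE probe, or an element that crosses a page
 * boundary -- simply clears FFR from that element onward via
 * record_fault().
 */
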
d28d12f0
RH
6164#define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \
6165void HELPER(sve_ldff##MEM##_##OFS) \
6166 (CPUARMState *env, void *vd, void *vg, \
6167 void *vm, target_ulong base, uint32_t desc) \
6168{ \
6169 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \
6170 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6171} \
6172void HELPER(sve_ldff##MEM##_##OFS##_mte) \
6173 (CPUARMState *env, void *vd, void *vg, \
6174 void *vm, target_ulong base, uint32_t desc) \
6175{ \
6176 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \
6177 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6178}
6179
6180#define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \
6181void HELPER(sve_ldff##MEM##_##OFS) \
6182 (CPUARMState *env, void *vd, void *vg, \
6183 void *vm, target_ulong base, uint32_t desc) \
6184{ \
6185 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \
6186 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6187} \
6188void HELPER(sve_ldff##MEM##_##OFS##_mte) \
6189 (CPUARMState *env, void *vd, void *vg, \
6190 void *vm, target_ulong base, uint32_t desc) \
6191{ \
6192 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \
6193 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
50de9b78
RH
6194}
6195
6196DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
6197DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
6198DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
6199DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
6200DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
6201
6202DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
6203DO_LDFF1_ZPZ_S(bss, zss, MO_8)
6204DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
6205DO_LDFF1_ZPZ_D(bds, zss, MO_8)
6206DO_LDFF1_ZPZ_D(bds, zd, MO_8)
6207
6208DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
6209DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
6210DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
6211DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
6212DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
6213
6214DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
6215DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
6216DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
6217DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
6218DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
6219
6220DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
6221DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
6222DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
6223DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
6224DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
6225
6226DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
6227DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
6228DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
6229DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
6230DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
6231
6232DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32)
6233DO_LDFF1_ZPZ_S(ss_le, zss, MO_32)
6234DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
6235DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
6236DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
6237
6238DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32)
6239DO_LDFF1_ZPZ_S(ss_be, zss, MO_32)
6240DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
6241DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
6242DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
6243
6244DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
6245DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
6246DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
6247
6248DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
6249DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
6250DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
6251
6252DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
6253DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
6254DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
6255
6256DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
6257DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
6258DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
ed67eb7f 6259
f6dbf62a
RH
6260/* Stores with a vector index. */
6261
88a660a4
RH
6262static inline QEMU_ALWAYS_INLINE
6263void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6264 target_ulong base, uint32_t desc, uintptr_t retaddr,
d28d12f0
RH
6265 uint32_t mtedesc, int esize, int msize,
6266 zreg_off_fn *off_fn,
88a660a4
RH
6267 sve_ldst1_host_fn *host_fn,
6268 sve_ldst1_tlb_fn *tlb_fn)
78cf1b88 6269{
88a660a4
RH
6270 const int mmu_idx = cpu_mmu_index(env, false);
6271 const intptr_t reg_max = simd_oprsz(desc);
ba080b86 6272 const int scale = simd_data(desc);
88a660a4
RH
6273 void *host[ARM_MAX_VQ * 4];
6274 intptr_t reg_off, i;
6275 SVEHostPage info, info2;
f6dbf62a 6276
88a660a4
RH
6277 /*
6278 * Probe all of the elements for host addresses and flags.
6279 */
6280 i = reg_off = 0;
6281 do {
6282 uint64_t pg = vg[reg_off >> 6];
78cf1b88 6283 do {
88a660a4
RH
6284 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6285 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
f6dbf62a 6286
88a660a4
RH
6287 host[i] = NULL;
6288 if (likely((pg >> (reg_off & 63)) & 1)) {
6289 if (likely(in_page >= msize)) {
6290 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
6291 mmu_idx, retaddr);
6292 host[i] = info.host;
6293 } else {
6294 /*
6295 * Element crosses the page boundary.
6296 * Probe both pages, but do not record the host address,
6297 * so that we use the slow path.
6298 */
6299 sve_probe_page(&info, false, env, addr, 0,
6300 MMU_DATA_STORE, mmu_idx, retaddr);
6301 sve_probe_page(&info2, false, env, addr + in_page, 0,
6302 MMU_DATA_STORE, mmu_idx, retaddr);
6303 info.flags |= info2.flags;
6304 }
f6dbf62a 6305
88a660a4
RH
6306 if (unlikely(info.flags & TLB_WATCHPOINT)) {
6307 cpu_check_watchpoint(env_cpu(env), addr, msize,
6308 info.attrs, BP_MEM_WRITE, retaddr);
6309 }
d28d12f0
RH
6310
6311 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
bd47b61c 6312 mte_check(env, mtedesc, addr, retaddr);
d28d12f0 6313 }
88a660a4
RH
6314 }
6315 i += 1;
6316 reg_off += esize;
6317 } while (reg_off & 63);
6318 } while (reg_off < reg_max);
6319
6320 /*
6321 * Now that we have recognized all exceptions except SyncExternal
6322 * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
6323 *
6324 * Note for the common case of an element in RAM, not crossing a page
6325 * boundary, we have stored the host address in host[]. This doubles
6326 * as a first-level check against the predicate, since only enabled
6327 * elements have non-null host addresses.
6328 */
6329 i = reg_off = 0;
6330 do {
6331 void *h = host[i];
6332 if (likely(h != NULL)) {
6333 host_fn(vd, reg_off, h);
6334 } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
6335 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6336 tlb_fn(env, vd, reg_off, addr, retaddr);
78cf1b88 6337 }
88a660a4
RH
6338 i += 1;
6339 reg_off += esize;
6340 } while (reg_off < reg_max);
78cf1b88 6341}
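
/*
 * Note on the sizing of host[] above: reg_max is at most ARM_MAX_VQ * 16
 * bytes and the smallest element size used by these scatter stores is
 * 4 bytes (the _S forms), so at most ARM_MAX_VQ * 4 elements ever need
 * a cached host address.
 */
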
f6dbf62a 6342
d28d12f0
RH
6343static inline QEMU_ALWAYS_INLINE
6344void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6345 target_ulong base, uint32_t desc, uintptr_t retaddr,
6346 int esize, int msize, zreg_off_fn *off_fn,
6347 sve_ldst1_host_fn *host_fn,
6348 sve_ldst1_tlb_fn *tlb_fn)
6349{
6350 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6351 /* Remove mtedesc from the normal sve descriptor. */
6352 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6353
6354 /*
6355 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6356 * offset base entirely over the address space hole to change the
6357 * pointer tag, or change the bit55 selector. So we could examine
6358 * TBI + TCMA here, as we do for sve_ldN_r_mte().
6359 */
6360 sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6361 esize, msize, off_fn, host_fn, tlb_fn);
6362}
6363
6364#define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \
6365void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
88a660a4 6366 void *vm, target_ulong base, uint32_t desc) \
d28d12f0
RH
6367{ \
6368 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
6369 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
6370} \
6371void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6372 void *vm, target_ulong base, uint32_t desc) \
6373{ \
6374 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
6375 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
78cf1b88 6376}
f6dbf62a 6377
d28d12f0
RH
6378#define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \
6379void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
88a660a4 6380 void *vm, target_ulong base, uint32_t desc) \
d28d12f0
RH
6381{ \
6382 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
6383 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
6384} \
6385void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6386 void *vm, target_ulong base, uint32_t desc) \
6387{ \
6388 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
6389 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
88a660a4
RH
6390}
6391
6392DO_ST1_ZPZ_S(bs, zsu, MO_8)
6393DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
6394DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
6395DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
6396DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
6397
6398DO_ST1_ZPZ_S(bs, zss, MO_8)
6399DO_ST1_ZPZ_S(hs_le, zss, MO_16)
6400DO_ST1_ZPZ_S(hs_be, zss, MO_16)
6401DO_ST1_ZPZ_S(ss_le, zss, MO_32)
6402DO_ST1_ZPZ_S(ss_be, zss, MO_32)
6403
6404DO_ST1_ZPZ_D(bd, zsu, MO_8)
6405DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
6406DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
6407DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
6408DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
6409DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
6410DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
6411
6412DO_ST1_ZPZ_D(bd, zss, MO_8)
6413DO_ST1_ZPZ_D(hd_le, zss, MO_16)
6414DO_ST1_ZPZ_D(hd_be, zss, MO_16)
6415DO_ST1_ZPZ_D(sd_le, zss, MO_32)
6416DO_ST1_ZPZ_D(sd_be, zss, MO_32)
6417DO_ST1_ZPZ_D(dd_le, zss, MO_64)
6418DO_ST1_ZPZ_D(dd_be, zss, MO_64)
6419
6420DO_ST1_ZPZ_D(bd, zd, MO_8)
6421DO_ST1_ZPZ_D(hd_le, zd, MO_16)
6422DO_ST1_ZPZ_D(hd_be, zd, MO_16)
6423DO_ST1_ZPZ_D(sd_le, zd, MO_32)
6424DO_ST1_ZPZ_D(sd_be, zd, MO_32)
6425DO_ST1_ZPZ_D(dd_le, zd, MO_64)
6426DO_ST1_ZPZ_D(dd_be, zd, MO_64)
78cf1b88
RH
6427
6428#undef DO_ST1_ZPZ_S
6429#undef DO_ST1_ZPZ_D