1/*
2 * ARM SVE Operations
3 *
4 * Copyright (c) 2018 Linaro, Ltd.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18 */
19
20#include "qemu/osdep.h"
21#include "cpu.h"
22#include "internals.h"
23#include "exec/exec-all.h"
24#include "exec/cpu_ldst.h"
25#include "exec/helper-proto.h"
26#include "tcg/tcg-gvec-desc.h"
27#include "fpu/softfloat.h"
28#include "tcg/tcg.h"
29#include "vec_internal.h"
30
31
32/* Note that vector data is stored in host-endian 64-bit chunks,
33 so addressing units smaller than that need a host-endian fixup. */
34#ifdef HOST_WORDS_BIGENDIAN
35#define H1(x) ((x) ^ 7)
36#define H1_2(x) ((x) ^ 6)
37#define H1_4(x) ((x) ^ 4)
38#define H2(x) ((x) ^ 3)
39#define H4(x) ((x) ^ 1)
40#else
41#define H1(x) (x)
42#define H1_2(x) (x)
43#define H1_4(x) (x)
44#define H2(x) (x)
45#define H4(x) (x)
46#endif
47
48/* Return a value for NZCV as per the ARM PredTest pseudofunction.
49 *
50 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
51 * and bit 0 set if C is set. Compare the definitions of these variables
52 * within CPUARMState.
53 */
54
55/* For no G bits set, NZCV = C. */
56#define PREDTEST_INIT 1
57
58/* This is an iterative function, called for each Pd and Pg word
59 * moving forward.
60 */
61static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
62{
63 if (likely(g)) {
64 /* Compute N from first D & G.
65 Use bit 2 to signal first G bit seen. */
66 if (!(flags & 4)) {
67 flags |= ((d & (g & -g)) != 0) << 31;
68 flags |= 4;
69 }
70
71 /* Accumulate Z from each D & G. */
72 flags |= ((d & g) != 0) << 1;
73
74 /* Compute C from last !(D & G). Replace previous. */
75 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
76 }
77 return flags;
78}
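/*
 * Illustrative example (not part of the upstream file): decoding the
 * packed flags for a single predicate word.  With d = g = 1 there is one
 * active element and it is true, so N is set (bit 31), Z is clear
 * (bit 1 set) and C is clear (bit 0 clear); bit 2 is only the internal
 * "first G bit seen" marker.
 */
static inline uint32_t example_predtest_single(void)
{
    return iter_predtest_fwd(1, 1, PREDTEST_INIT);   /* 0x80000006 */
}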
79
80/* This is an iterative function, called for each Pd and Pg word
81 * moving backward.
82 */
83static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
84{
85 if (likely(g)) {
86 /* Compute C from first (i.e. last) !(D & G).
87 Use bit 2 to signal first G bit seen. */
88 if (!(flags & 4)) {
89 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
90 flags |= (d & pow2floor(g)) == 0;
91 }
92
93 /* Accumulate Z from each D & G. */
94 flags |= ((d & g) != 0) << 1;
95
96 /* Compute N from last (i.e. first) D & G. Replace previous. */
97 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
98 }
99 return flags;
100}
101
102/* The same for a single word predicate. */
103uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
104{
105 return iter_predtest_fwd(d, g, PREDTEST_INIT);
106}
107
108/* The same for a multi-word predicate. */
109uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
110{
111 uint32_t flags = PREDTEST_INIT;
112 uint64_t *d = vd, *g = vg;
113 uintptr_t i = 0;
114
115 do {
116 flags = iter_predtest_fwd(d[i], g[i], flags);
117 } while (++i < words);
118
119 return flags;
120}
121
122/* Expand active predicate bits to bytes, for byte elements.
123 * for (i = 0; i < 256; ++i) {
124 * unsigned long m = 0;
125 * for (j = 0; j < 8; j++) {
126 * if ((i >> j) & 1) {
127 * m |= 0xfful << (j << 3);
128 * }
129 * }
130 * printf("0x%016lx,\n", m);
131 * }
132 */
133static inline uint64_t expand_pred_b(uint8_t byte)
134{
135 static const uint64_t word[256] = {
136 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
137 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
138 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
139 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
140 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
141 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
142 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
143 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
144 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
145 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
146 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
147 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
148 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
149 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
150 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
151 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
152 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
153 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
154 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
155 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
156 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
157 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
158 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
159 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
160 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
161 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
162 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
163 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
164 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
165 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
166 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
167 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
168 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
169 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
170 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
171 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
172 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
173 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
174 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
175 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
176 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
177 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
178 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
179 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
180 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
181 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
182 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
183 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
184 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
185 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
186 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
187 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
188 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
189 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
190 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
191 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
192 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
193 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
194 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
195 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
196 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
197 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
198 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
199 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
200 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
201 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
202 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
203 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
204 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
205 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
206 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
207 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
208 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
209 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
210 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
211 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
212 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
213 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
214 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
215 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
216 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
217 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
218 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
219 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
220 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
221 0xffffffffffffffff,
222 };
223 return word[byte];
224}
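/*
 * Usage sketch (illustrative, not in the upstream file): each predicate
 * bit governs one byte element, so a predicate byte of 0x0f expands to a
 * mask covering the four least significant bytes of the 64-bit chunk.
 */
static inline uint64_t example_expand_pred_b(void)
{
    return expand_pred_b(0x0f);   /* 0x00000000ffffffff */
}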
225
226/* Similarly for half-word elements.
227 * for (i = 0; i < 256; ++i) {
228 * unsigned long m = 0;
229 * if (i & 0xaa) {
230 * continue;
231 * }
232 * for (j = 0; j < 8; j += 2) {
233 * if ((i >> j) & 1) {
234 * m |= 0xfffful << (j << 3);
235 * }
236 * }
237 * printf("[0x%x] = 0x%016lx,\n", i, m);
238 * }
239 */
240static inline uint64_t expand_pred_h(uint8_t byte)
241{
242 static const uint64_t word[] = {
243 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
244 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
245 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
246 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
247 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
248 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
249 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
250 [0x55] = 0xffffffffffffffff,
251 };
252 return word[byte & 0x55];
253}
254
255/* Similarly for single word elements. */
256static inline uint64_t expand_pred_s(uint8_t byte)
257{
258 static const uint64_t word[] = {
259 [0x01] = 0x00000000ffffffffull,
260 [0x10] = 0xffffffff00000000ull,
261 [0x11] = 0xffffffffffffffffull,
262 };
263 return word[byte & 0x11];
264}
265
266/* Swap 16-bit words within a 32-bit word. */
267static inline uint32_t hswap32(uint32_t h)
268{
269 return rol32(h, 16);
270}
271
272/* Swap 16-bit words within a 64-bit word. */
273static inline uint64_t hswap64(uint64_t h)
274{
275 uint64_t m = 0x0000ffff0000ffffull;
276 h = rol64(h, 32);
277 return ((h & m) << 16) | ((h >> 16) & m);
278}
279
280/* Swap 32-bit words within a 64-bit word. */
281static inline uint64_t wswap64(uint64_t h)
282{
283 return rol64(h, 32);
284}
285
286#define LOGICAL_PPPP(NAME, FUNC) \
287void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
288{ \
289 uintptr_t opr_sz = simd_oprsz(desc); \
290 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
291 uintptr_t i; \
292 for (i = 0; i < opr_sz / 8; ++i) { \
293 d[i] = FUNC(n[i], m[i], g[i]); \
294 } \
295}
296
297#define DO_AND(N, M, G) (((N) & (M)) & (G))
298#define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
299#define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
300#define DO_ORR(N, M, G) (((N) | (M)) & (G))
301#define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
302#define DO_NOR(N, M, G) (~((N) | (M)) & (G))
303#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
304#define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
305
306LOGICAL_PPPP(sve_and_pppp, DO_AND)
307LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
308LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
309LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
310LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
311LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
312LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
313LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
314
315#undef DO_AND
316#undef DO_BIC
317#undef DO_EOR
318#undef DO_ORR
319#undef DO_ORN
320#undef DO_NOR
321#undef DO_NAND
322#undef DO_SEL
323#undef LOGICAL_PPPP
324
325/* Fully general three-operand expander, controlled by a predicate.
326 * This is complicated by the host-endian storage of the register file.
327 */
328/* ??? I don't expect the compiler could ever vectorize this itself.
329 * With some tables we can convert bit masks to byte masks, and with
330 * extra care wrt byte/word ordering we could use gcc generic vectors
331 * and do 16 bytes at a time.
332 */
333#define DO_ZPZZ(NAME, TYPE, H, OP) \
334void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
335{ \
336 intptr_t i, opr_sz = simd_oprsz(desc); \
337 for (i = 0; i < opr_sz; ) { \
338 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
339 do { \
340 if (pg & 1) { \
341 TYPE nn = *(TYPE *)(vn + H(i)); \
342 TYPE mm = *(TYPE *)(vm + H(i)); \
343 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
344 } \
345 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
346 } while (i & 15); \
347 } \
348}
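/*
 * Illustrative expansion (not part of the upstream file): what DO_ZPZZ
 * generates for a predicated 16-bit add, written out by hand.  One
 * 16-bit predicate word covers 16 bytes of data, and the predicate is
 * shifted by sizeof(TYPE) per element so that only bit 0 of each
 * element's slice of the predicate is tested.
 */
static inline void example_zpzz_add_h(void *vd, void *vn, void *vm,
                                      void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    for (i = 0; i < opr_sz; ) {
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
        do {
            if (pg & 1) {
                uint16_t nn = *(uint16_t *)(vn + H1_2(i));
                uint16_t mm = *(uint16_t *)(vm + H1_2(i));
                *(uint16_t *)(vd + H1_2(i)) = nn + mm;
            }
            i += 2, pg >>= 2;
        } while (i & 15);
    }
}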
349
350/* Similarly, specialized for 64-bit operands. */
351#define DO_ZPZZ_D(NAME, TYPE, OP) \
352void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
353{ \
354 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
355 TYPE *d = vd, *n = vn, *m = vm; \
356 uint8_t *pg = vg; \
357 for (i = 0; i < opr_sz; i += 1) { \
358 if (pg[H1(i)] & 1) { \
359 TYPE nn = n[i], mm = m[i]; \
360 d[i] = OP(nn, mm); \
361 } \
362 } \
363}
364
365#define DO_AND(N, M) (N & M)
366#define DO_EOR(N, M) (N ^ M)
367#define DO_ORR(N, M) (N | M)
368#define DO_BIC(N, M) (N & ~M)
369#define DO_ADD(N, M) (N + M)
370#define DO_SUB(N, M) (N - M)
371#define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
372#define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
373#define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
374#define DO_MUL(N, M) (N * M)
375
376
377/*
378 * We must avoid the C undefined behaviour cases: division by
379 * zero and signed division of INT_MIN by -1. Both of these
380 * have architecturally defined required results for Arm.
381 * We special case all signed divisions by -1 to avoid having
382 * to deduce the minimum integer for the type involved.
383 */
384#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
385#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
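/*
 * Worked example (illustrative): the M == -1 special case avoids the
 * trapping INT32_MIN / -1; the negation then truncates back to the
 * element type (QEMU is built with -fwrapv), matching the Arm-defined
 * result.  Division by zero likewise yields 0.
 */
static inline int32_t example_sdiv_min_by_minus1(void)
{
    int32_t n = INT32_MIN, m = -1;
    return DO_SDIV(n, m);   /* INT32_MIN */
}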
386
387DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
388DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
389DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
390DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
391
392DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
393DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
394DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
395DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
396
397DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
398DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
399DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
400DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
401
402DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
403DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
404DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
405DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
406
407DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
408DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
409DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
410DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
411
412DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
413DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
414DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
415DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
416
417DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
418DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
419DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
420DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
421
422DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
423DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
424DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
425DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
426
427DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
428DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
429DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
430DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
431
432DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
433DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
434DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
435DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
436
437DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
438DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
439DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
440DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
441
442DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
443DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
444DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
445DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
446
447/* Because the computation type is at least twice as large as required,
448 these work for both signed and unsigned source types. */
449static inline uint8_t do_mulh_b(int32_t n, int32_t m)
450{
451 return (n * m) >> 8;
452}
453
454static inline uint16_t do_mulh_h(int32_t n, int32_t m)
455{
456 return (n * m) >> 16;
457}
458
459static inline uint32_t do_mulh_s(int64_t n, int64_t m)
460{
461 return (n * m) >> 32;
462}
463
464static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
465{
466 uint64_t lo, hi;
467 muls64(&lo, &hi, n, m);
468 return hi;
469}
470
471static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
472{
473 uint64_t lo, hi;
474 mulu64(&lo, &hi, n, m);
475 return hi;
476}
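/*
 * Illustrative check (not in the upstream file): because do_mulh_b
 * computes in 32 bits, sign-extended inputs give the signed high half
 * and zero-extended inputs give the unsigned high half.
 *   signed:   do_mulh_b((int8_t)-2, 3)    -> (-6  >> 8) truncates to 0xff
 *   unsigned: do_mulh_b((uint8_t)0xfe, 3) -> (762 >> 8) truncates to 0x02
 */
static inline uint8_t example_umulh_b(void)
{
    return do_mulh_b((uint8_t)0xfe, 3);   /* 0x02 */
}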
477
478DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
479DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
480DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
481DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
482
483DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
484DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
485DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
486DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
487
488DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
489DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
490DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
491DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
492
493DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
494DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
495
496DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
497DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
498
499/* Note that all bits of the shift are significant
500 and not modulo the element size. */
501#define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
502#define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
503#define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
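/*
 * Worked examples (illustrative): out-of-range shift counts saturate the
 * result instead of wrapping the count.
 *   DO_LSR((uint8_t)0x80, 9) == 0      (all bits shifted out)
 *   DO_LSL((uint8_t)0x01, 8) == 0
 *   DO_ASR((int8_t)-4, 100)  == -1     (count clamped to 7, sign fills)
 */
static inline uint8_t example_lsr_out_of_range(void)
{
    uint8_t n = 0x80;
    uint64_t m = 9;
    return DO_LSR(n, m);   /* 0 */
}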
504
505DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
506DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
507DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)
508
509DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
510DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
511DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)
512
513DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
514DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
515DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
516
517DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
518DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
519DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
520
521static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
522{
523 int8_t n1 = n, n2 = n >> 8;
524 return m + n1 + n2;
525}
526
527static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
528{
529 int16_t n1 = n, n2 = n >> 16;
530 return m + n1 + n2;
531}
532
533static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
534{
535 int32_t n1 = n, n2 = n >> 32;
536 return m + n1 + n2;
537}
538
539DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
540DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
541DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)
542
543static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
544{
545 uint8_t n1 = n, n2 = n >> 8;
546 return m + n1 + n2;
547}
548
549static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
550{
551 uint16_t n1 = n, n2 = n >> 16;
552 return m + n1 + n2;
553}
554
555static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
556{
557 uint32_t n1 = n, n2 = n >> 32;
558 return m + n1 + n2;
559}
560
561DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
562DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
563DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)
564
565#define do_srshl_b(n, m) do_sqrshl_bhs(n, m, 8, true, NULL)
566#define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL)
567#define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL)
568#define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL)
569
570DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
571DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
572DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
573DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)
574
575#define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
576#define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
577#define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL)
578#define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL)
579
580DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
581DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
582DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
583DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)
584
585/*
586 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
587 * We pass in a pointer to a dummy saturation field to trigger
588 * the saturating arithmetic but discard the information about
589 * whether it has occurred.
590 */
591#define do_sqshl_b(n, m) \
592 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
593#define do_sqshl_h(n, m) \
594 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
595#define do_sqshl_s(n, m) \
596 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
597#define do_sqshl_d(n, m) \
598 ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
599
600DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1_2, do_sqshl_b)
601DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
602DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
603DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)
604
605#define do_uqshl_b(n, m) \
606 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
607#define do_uqshl_h(n, m) \
608 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
609#define do_uqshl_s(n, m) \
610 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
611#define do_uqshl_d(n, m) \
612 ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
613
614DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1_2, do_uqshl_b)
615DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
616DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
617DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)
618
619#define do_sqrshl_b(n, m) \
620 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
621#define do_sqrshl_h(n, m) \
622 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
623#define do_sqrshl_s(n, m) \
624 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
625#define do_sqrshl_d(n, m) \
626 ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
627
628DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1_2, do_sqrshl_b)
629DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
630DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
631DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)
632
633#undef do_sqrshl_d
634
635#define do_uqrshl_b(n, m) \
636 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
637#define do_uqrshl_h(n, m) \
638 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
639#define do_uqrshl_s(n, m) \
640 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
641#define do_uqrshl_d(n, m) \
642 ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
643
644DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1_2, do_uqrshl_b)
645DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
646DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
647DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
648
649#undef do_uqrshl_d
650
651#define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1)
652#define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1))
653
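/*
 * Illustrative note (not part of the upstream file): the 64-bit halving
 * add has no wider type to compute in, so it averages the shifted
 * operands and re-adds the carry that is lost when both low bits are set.
 */
static inline uint64_t example_uhadd_d(void)
{
    uint64_t n = UINT64_MAX, m = 3;
    /* (n + m) >> 1 without intermediate overflow: 0x8000000000000001. */
    return DO_HADD_D(n, m);
}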
654DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
655DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
656DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
657DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)
658
659DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
660DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
661DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
662DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)
663
664#define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1)
665#define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1))
666
667DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
668DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
669DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
670DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)
671
672DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
673DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
674DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
675DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)
676
677#define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1)
678#define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1))
679
680DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
681DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
682DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
683DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)
684
685DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
686DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
687DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
688DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
689
690static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
691{
692 return val >= max ? max : val <= min ? min : val;
693}
694
695#define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
696#define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
697#define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)
698
699static inline int64_t do_sqadd_d(int64_t n, int64_t m)
700{
701 int64_t r = n + m;
702 if (((r ^ n) & ~(n ^ m)) < 0) {
703 /* Signed overflow. */
704 return r < 0 ? INT64_MAX : INT64_MIN;
705 }
706 return r;
707}
708
709DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
710DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
711DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
712DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)
713
714#define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
715#define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
716#define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)
717
718static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
719{
720 uint64_t r = n + m;
721 return r < n ? UINT64_MAX : r;
722}
723
724DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
725DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
726DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
727DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)
728
729#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
730#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
731#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)
732
733static inline int64_t do_sqsub_d(int64_t n, int64_t m)
734{
735 int64_t r = n - m;
736 if (((r ^ n) & (n ^ m)) < 0) {
737 /* Signed overflow. */
738 return r < 0 ? INT64_MAX : INT64_MIN;
739 }
740 return r;
741}
742
743DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
744DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
745DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
746DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)
747
748#define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
749#define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
750#define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)
751
752static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
753{
754 return n > m ? n - m : 0;
755}
756
757DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
758DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
759DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
760DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)
761
762#define DO_SUQADD_B(n, m) \
763 do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
764#define DO_SUQADD_H(n, m) \
765 do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
766#define DO_SUQADD_S(n, m) \
767 do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)
768
769static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
770{
771 uint64_t r = n + m;
772
773 if (n < 0) {
774 /* Note that m - abs(n) cannot underflow. */
775 if (r > INT64_MAX) {
776 /* Result is either very large positive or negative. */
777 if (m > -n) {
778 /* m > abs(n), so r is a very large positive. */
779 return INT64_MAX;
780 }
781 /* Result is negative. */
782 }
783 } else {
784 /* Both inputs are positive: check for overflow. */
785 if (r < m || r > INT64_MAX) {
786 return INT64_MAX;
787 }
788 }
789 return r;
790}
791
792DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
793DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
794DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
795DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)
796
797#define DO_USQADD_B(n, m) \
798 do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
799#define DO_USQADD_H(n, m) \
800 do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
801#define DO_USQADD_S(n, m) \
802 do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)
803
804static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
805{
806 uint64_t r = n + m;
807
808 if (m < 0) {
809 return n < -m ? 0 : r;
810 }
811 return r < n ? UINT64_MAX : r;
812}
813
814DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
815DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
816DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
817DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)
818
819#undef DO_ZPZZ
820#undef DO_ZPZZ_D
821
822/*
823 * Three operand expander, operating on element pairs.
824 * If the slot I is even, the elements come from VN {I, I+1}.
825 * If the slot I is odd, the elements come from VM {I-1, I}.
826 * Load all of the input elements in each pair before overwriting output.
827 */
828#define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
829void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
830{ \
831 intptr_t i, opr_sz = simd_oprsz(desc); \
832 for (i = 0; i < opr_sz; ) { \
833 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
834 do { \
835 TYPE n0 = *(TYPE *)(vn + H(i)); \
836 TYPE m0 = *(TYPE *)(vm + H(i)); \
837 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
838 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
839 if (pg & 1) { \
840 *(TYPE *)(vd + H(i)) = OP(n0, n1); \
841 } \
842 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
843 if (pg & 1) { \
844 *(TYPE *)(vd + H(i)) = OP(m0, m1); \
845 } \
846 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
847 } while (i & 15); \
848 } \
849}
850
851/* Similarly, specialized for 64-bit operands. */
852#define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
853void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
854{ \
855 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
856 TYPE *d = vd, *n = vn, *m = vm; \
857 uint8_t *pg = vg; \
858 for (i = 0; i < opr_sz; i += 2) { \
859 TYPE n0 = n[i], n1 = n[i + 1]; \
860 TYPE m0 = m[i], m1 = m[i + 1]; \
861 if (pg[H1(i)] & 1) { \
862 d[i] = OP(n0, n1); \
863 } \
864 if (pg[H1(i + 1)] & 1) { \
865 d[i + 1] = OP(m0, m1); \
866 } \
867 } \
868}
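/*
 * Illustrative sketch (not in the upstream file): with a fully active
 * predicate the pairwise expanders interleave results from the two
 * sources, e.g. for 64-bit ADDP over one pair-group:
 *   d[0] = n[0] + n[1];  d[1] = m[0] + m[1];
 * which is why both elements of each pair are loaded before d is
 * written (d may alias n or m).
 */
static inline void example_addp_d_pattern(uint64_t *d, const uint64_t *n,
                                          const uint64_t *m)
{
    uint64_t n0 = n[0], n1 = n[1], m0 = m[0], m1 = m[1];
    d[0] = n0 + n1;
    d[1] = m0 + m1;
}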
869
870DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
871DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
872DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
873DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)
874
875DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
876DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
877DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
878DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)
879
880DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
881DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
882DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
883DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)
884
885DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
886DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
887DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
888DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)
889
890DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
891DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
892DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
893DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)
894
895#undef DO_ZPZZ_PAIR
896#undef DO_ZPZZ_PAIR_D
897
898#define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP) \
899void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
900 void *status, uint32_t desc) \
901{ \
902 intptr_t i, opr_sz = simd_oprsz(desc); \
903 for (i = 0; i < opr_sz; ) { \
904 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
905 do { \
906 TYPE n0 = *(TYPE *)(vn + H(i)); \
907 TYPE m0 = *(TYPE *)(vm + H(i)); \
908 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
909 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
910 if (pg & 1) { \
911 *(TYPE *)(vd + H(i)) = OP(n0, n1, status); \
912 } \
913 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
914 if (pg & 1) { \
915 *(TYPE *)(vd + H(i)) = OP(m0, m1, status); \
916 } \
917 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
918 } while (i & 15); \
919 } \
920}
921
922DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
923DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
924DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, , float64_add)
925
926DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
927DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
928DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, , float64_maxnum)
929
930DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
931DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
932DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, , float64_minnum)
933
934DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
935DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
936DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, , float64_max)
937
938DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
939DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
940DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, , float64_min)
941
942#undef DO_ZPZZ_PAIR_FP
943
944/* Three-operand expander, controlled by a predicate, in which the
945 * third operand is "wide". That is, for D = N op M, the same 64-bit
946 * value of M is used with all of the narrower values of N.
947 */
948#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
949void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
950{ \
951 intptr_t i, opr_sz = simd_oprsz(desc); \
952 for (i = 0; i < opr_sz; ) { \
953 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
954 TYPEW mm = *(TYPEW *)(vm + i); \
955 do { \
956 if (pg & 1) { \
957 TYPE nn = *(TYPE *)(vn + H(i)); \
958 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
959 } \
960 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
961 } while (i & 7); \
962 } \
963}
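/*
 * Illustrative expansion (not part of the upstream file): hand-written
 * equivalent of DO_ZPZW for a byte-sized LSR, showing that one 64-bit
 * shift count from VM is shared by all eight byte elements in the same
 * 64-bit chunk.
 */
static inline void example_lsr_zpzw_b(void *vd, void *vn, void *vm,
                                      void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    for (i = 0; i < opr_sz; ) {
        uint8_t pg = *(uint8_t *)(vg + H1(i >> 3));
        uint64_t mm = *(uint64_t *)(vm + i);    /* one count per chunk */
        do {
            if (pg & 1) {
                uint8_t nn = *(uint8_t *)(vn + H1(i));
                *(uint8_t *)(vd + H1(i)) = DO_LSR(nn, mm);
            }
            i += 1, pg >>= 1;
        } while (i & 7);
    }
}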
964
965DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
966DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
967DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
968
969DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
970DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
971DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
972
973DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
974DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
975DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
976
977#undef DO_ZPZW
978
979/* Fully general two-operand expander, controlled by a predicate.
980 */
981#define DO_ZPZ(NAME, TYPE, H, OP) \
982void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
983{ \
984 intptr_t i, opr_sz = simd_oprsz(desc); \
985 for (i = 0; i < opr_sz; ) { \
986 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
987 do { \
988 if (pg & 1) { \
989 TYPE nn = *(TYPE *)(vn + H(i)); \
990 *(TYPE *)(vd + H(i)) = OP(nn); \
991 } \
992 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
993 } while (i & 15); \
994 } \
995}
996
997/* Similarly, specialized for 64-bit operands. */
998#define DO_ZPZ_D(NAME, TYPE, OP) \
999void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1000{ \
1001 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1002 TYPE *d = vd, *n = vn; \
1003 uint8_t *pg = vg; \
1004 for (i = 0; i < opr_sz; i += 1) { \
1005 if (pg[H1(i)] & 1) { \
1006 TYPE nn = n[i]; \
1007 d[i] = OP(nn); \
1008 } \
1009 } \
1010}
1011
1012#define DO_CLS_B(N) (clrsb32(N) - 24)
1013#define DO_CLS_H(N) (clrsb32(N) - 16)
1014
1015DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
1016DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
1017DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
1018DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
1019
1020#define DO_CLZ_B(N) (clz32(N) - 24)
1021#define DO_CLZ_H(N) (clz32(N) - 16)
1022
1023DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
1024DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
1025DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
1026DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
1027
1028DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
1029DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
1030DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
1031DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
1032
1033#define DO_CNOT(N) (N == 0)
1034
1035DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
1036DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
1037DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
1038DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
1039
1040#define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
1041
1042DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
1043DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
1044DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
1045
1046#define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
1047
1048DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
1049DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
1050DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
1051
1052#define DO_NOT(N) (~N)
1053
1054DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
1055DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
1056DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
1057DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
1058
1059#define DO_SXTB(N) ((int8_t)N)
1060#define DO_SXTH(N) ((int16_t)N)
1061#define DO_SXTS(N) ((int32_t)N)
1062#define DO_UXTB(N) ((uint8_t)N)
1063#define DO_UXTH(N) ((uint16_t)N)
1064#define DO_UXTS(N) ((uint32_t)N)
1065
1066DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
1067DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
1068DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
1069DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
1070DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
1071DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
1072
1073DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
1074DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
1075DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
1076DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
1077DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
1078DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
1079
1080#define DO_ABS(N) (N < 0 ? -N : N)
1081
1082DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
1083DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
1084DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
1085DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
1086
1087#define DO_NEG(N) (-N)
1088
1089DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
1090DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
1091DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
1092DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
1093
1094DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
1095DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
1096DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
1097
1098DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
1099DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
1100
1101DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
1102
1103DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
1104DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
1105DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
1106DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
1107
1108#define DO_SQABS(X) \
1109 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
1110 x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })
1111
1112DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
1113DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
1114DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
1115DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)
1116
1117#define DO_SQNEG(X) \
1118 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
1119 x_ == min_ ? -min_ - 1 : -x_; })
1120
1121DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
1122DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
1123DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
1124DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)
1125
1126DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
1127DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
1128
1129/* Three-operand expander, unpredicated, in which the third operand is "wide".
1130 */
1131#define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
1132void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1133{ \
1134 intptr_t i, opr_sz = simd_oprsz(desc); \
1135 for (i = 0; i < opr_sz; ) { \
1136 TYPEW mm = *(TYPEW *)(vm + i); \
1137 do { \
1138 TYPE nn = *(TYPE *)(vn + H(i)); \
1139 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
1140 i += sizeof(TYPE); \
1141 } while (i & 7); \
1142 } \
1143}
1144
1145DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
1146DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
1147DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
1148
1149DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
1150DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
1151DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
1152
1153DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
1154DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
1155DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
1156
1157#undef DO_ZZW
1158
1159#undef DO_CLS_B
1160#undef DO_CLS_H
1161#undef DO_CLZ_B
1162#undef DO_CLZ_H
1163#undef DO_CNOT
1164#undef DO_FABS
1165#undef DO_FNEG
1166#undef DO_ABS
1167#undef DO_NEG
1168#undef DO_ZPZ
1169#undef DO_ZPZ_D
1170
1171/*
1172 * Three-operand expander, unpredicated, in which the two inputs are
1173 * selected from the top or bottom half of the wide column.
1174 */
1175#define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1176void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1177{ \
1178 intptr_t i, opr_sz = simd_oprsz(desc); \
1179 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1180 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1181 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1182 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1183 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1184 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
1185 } \
1186}
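/*
 * Illustrative note (not in the upstream file): sel1/sel2 are 0 to take
 * the "bottom" (even-numbered) narrow elements and sizeof(TYPEN) to take
 * the "top" (odd-numbered) ones, so e.g. the bottom form of a widening
 * add from 8 to 16 bits reduces to:
 */
static inline int16_t example_saddl_bottom(const int8_t *n, const int8_t *m,
                                           intptr_t j)
{
    /* Widen the even-numbered byte elements and add. */
    return (int16_t)n[2 * j] + (int16_t)m[2 * j];
}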
1187
1188DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1189DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1190DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, , H1_4, DO_ADD)
1191
1192DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1193DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1194DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, , H1_4, DO_SUB)
1195
1196DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1197DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1198DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, , H1_4, DO_ABD)
1199
1200DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1201DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1202DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, , H1_4, DO_ADD)
1203
1204DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1205DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1206DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, , H1_4, DO_SUB)
1207
1208DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1209DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1210DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, , H1_4, DO_ABD)
1211
1212DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1213DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1214DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, , H1_4, DO_MUL)
1215
1216DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1217DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1218DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, , H1_4, DO_MUL)
1219
1220/* Note that the multiply cannot overflow, but the doubling can. */
1221static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
1222{
1223 int16_t val = n * m;
1224 return DO_SQADD_H(val, val);
1225}
1226
1227static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
1228{
1229 int32_t val = n * m;
1230 return DO_SQADD_S(val, val);
1231}
1232
1233static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
1234{
1235 int64_t val = n * m;
1236 return do_sqadd_d(val, val);
1237}
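/*
 * Worked example (illustrative): the operands of do_sqdmull_h were
 * widened from 8 bits, so the product always fits in 16 bits, but
 * doubling can still saturate:
 *   do_sqdmull_h(-128, -128) -> 16384 + 16384 saturates to INT16_MAX.
 */
static inline int16_t example_sqdmull_h_saturate(void)
{
    return do_sqdmull_h(-128, -128);   /* INT16_MAX */
}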
1238
1239DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
1240DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
1241DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, , H1_4, do_sqdmull_d)
1242
1243#undef DO_ZZZ_TB
1244
1245#define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1246void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1247{ \
1248 intptr_t i, opr_sz = simd_oprsz(desc); \
1249 int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1250 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1251 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
1252 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1253 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
1254 } \
1255}
1256
1257DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
1258DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
1259DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, , H1_4, DO_ADD)
1260
1261DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
1262DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
1263DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, , H1_4, DO_SUB)
1264
1265DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
1266DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
1267DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, , H1_4, DO_ADD)
1268
1269DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
1270DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
1271DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, , H1_4, DO_SUB)
1272
1273#undef DO_ZZZ_WTB
1274
1275#define DO_ZZZ_NTB(NAME, TYPE, H, OP) \
1276void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1277{ \
1278 intptr_t i, opr_sz = simd_oprsz(desc); \
1279 intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
1280 intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
1281 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1282 TYPE nn = *(TYPE *)(vn + H(i + sel1)); \
1283 TYPE mm = *(TYPE *)(vm + H(i + sel2)); \
1284 *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \
1285 } \
1286}
1287
1288DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
1289DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
1290DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
1291DO_ZZZ_NTB(sve2_eoril_d, uint64_t, , DO_EOR)
1292
1293#undef DO_ZZZ_NTB
1294
1295#define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
1296void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1297{ \
1298 intptr_t i, opr_sz = simd_oprsz(desc); \
1299 intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \
1300 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1301 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1302 TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \
1303 TYPEW aa = *(TYPEW *)(va + HW(i)); \
1304 *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \
1305 } \
1306}
1307
1308DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
1309DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
1310DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, , H1_4, DO_ABD)
1311
1312DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
1313DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
1314DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, , H1_4, DO_ABD)
1315
1316DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
1317DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
1318DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, , H1_4, DO_MUL)
1319
1320DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
1321DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
1322DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, , H1_4, DO_MUL)
1323
1324#define DO_NMUL(N, M) -(N * M)
1325
1326DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
1327DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
1328DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, , H1_4, DO_NMUL)
1329
1330DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
1331DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
1332DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, , H1_4, DO_NMUL)
1333
1334#undef DO_ZZZW_ACC
1335
1336#define DO_XTNB(NAME, TYPE, OP) \
1337void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1338{ \
1339 intptr_t i, opr_sz = simd_oprsz(desc); \
1340 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1341 TYPE nn = *(TYPE *)(vn + i); \
1342 nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4); \
1343 *(TYPE *)(vd + i) = nn; \
1344 } \
1345}
1346
1347#define DO_XTNT(NAME, TYPE, TYPEN, H, OP) \
1348void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1349{ \
1350 intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \
1351 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1352 TYPE nn = *(TYPE *)(vn + i); \
1353 *(TYPEN *)(vd + i + odd) = OP(nn); \
1354 } \
1355}
1356
1357#define DO_SQXTN_H(n) do_sat_bhs(n, INT8_MIN, INT8_MAX)
1358#define DO_SQXTN_S(n) do_sat_bhs(n, INT16_MIN, INT16_MAX)
1359#define DO_SQXTN_D(n) do_sat_bhs(n, INT32_MIN, INT32_MAX)
1360
1361DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
1362DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
1363DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)
1364
1365DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
1366DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
1367DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)
1368
1369#define DO_UQXTN_H(n) do_sat_bhs(n, 0, UINT8_MAX)
1370#define DO_UQXTN_S(n) do_sat_bhs(n, 0, UINT16_MAX)
1371#define DO_UQXTN_D(n) do_sat_bhs(n, 0, UINT32_MAX)
1372
1373DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
1374DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
1375DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)
1376
1377DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
1378DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
1379DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)
1380
1381DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
1382DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
1383DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)
1384
1385DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
1386DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
1387DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)
1388
1389#undef DO_XTNB
1390#undef DO_XTNT
1391
1392void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1393{
1394 intptr_t i, opr_sz = simd_oprsz(desc);
1395 int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
1396 uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1397 uint32_t *a = va, *n = vn;
1398 uint64_t *d = vd, *m = vm;
1399
1400 for (i = 0; i < opr_sz / 8; ++i) {
1401 uint32_t e1 = a[2 * i + H4(0)];
1402 uint32_t e2 = n[2 * i + sel] ^ inv;
1403 uint64_t c = extract64(m[i], 32, 1);
1404 /* Compute and store the entire 33-bit result at once. */
1405 d[i] = c + e1 + e2;
1406 }
1407}
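/*
 * Worked example (illustrative): with e1 = e2 = 0xffffffff and an
 * incoming carry of 1, the 64-bit store is 0x1ffffffff -- the low 32
 * bits are the sum and bit 32 is the carry consumed by the next
 * iteration, so no separate carry flag is needed.
 */
static inline uint64_t example_adcl_33bit(void)
{
    uint32_t e1 = 0xffffffffu, e2 = 0xffffffffu;
    uint64_t c = 1;
    return c + e1 + e2;   /* 0x1ffffffff */
}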
1408
1409void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
1410{
1411 intptr_t i, opr_sz = simd_oprsz(desc);
1412 int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
1413 uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1414 uint64_t *d = vd, *a = va, *n = vn, *m = vm;
1415
1416 for (i = 0; i < opr_sz / 8; i += 2) {
1417 Int128 e1 = int128_make64(a[i]);
1418 Int128 e2 = int128_make64(n[i + sel] ^ inv);
1419 Int128 c = int128_make64(m[i + 1] & 1);
1420 Int128 r = int128_add(int128_add(e1, e2), c);
1421 d[i + 0] = int128_getlo(r);
1422 d[i + 1] = int128_gethi(r);
1423 }
1424}
1425
1426#define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
1427void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1428{ \
1429 intptr_t i, opr_sz = simd_oprsz(desc); \
1430 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1431 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1432 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1433 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1434 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1435 TYPEW aa = *(TYPEW *)(va + HW(i)); \
1436 *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm)); \
1437 } \
1438}
1439
1440DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
1441 do_sqdmull_h, DO_SQADD_H)
1442DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1443 do_sqdmull_s, DO_SQADD_S)
1444DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, , H1_4,
1445 do_sqdmull_d, do_sqadd_d)
1446
1447DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
1448 do_sqdmull_h, DO_SQSUB_H)
1449DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
1450 do_sqdmull_s, DO_SQSUB_S)
1451DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, , H1_4,
1452 do_sqdmull_d, do_sqsub_d)
1453
1454#undef DO_SQDMLAL
1455
1456#define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
1457void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1458{ \
1459 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
1460 int rot = simd_data(desc); \
1461 int sel_a = rot & 1, sel_b = sel_a ^ 1; \
1462 bool sub_r = rot == 1 || rot == 2; \
1463 bool sub_i = rot >= 2; \
1464 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1465 for (i = 0; i < opr_sz; i += 2) { \
1466 TYPE elt1_a = n[H(i + sel_a)]; \
1467 TYPE elt2_a = m[H(i + sel_a)]; \
1468 TYPE elt2_b = m[H(i + sel_b)]; \
1469 d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r); \
1470 d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i); \
1471 } \
1472}
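/*
 * Derived summary (illustrative, not part of the upstream file): the
 * rotation from simd_data(desc) decodes as
 *   rot 0: sel_a = 0, sub_r = 0, sub_i = 0
 *   rot 1: sel_a = 1, sub_r = 1, sub_i = 0
 *   rot 2: sel_a = 0, sub_r = 1, sub_i = 1
 *   rot 3: sel_a = 1, sub_r = 0, sub_i = 1
 * i.e. odd rotations use the odd (imaginary) half of N, and the signs of
 * the partial products follow the complex rotation.
 */
static inline void example_cmla_rot_decode(int rot, int *sel_a,
                                           bool *sub_r, bool *sub_i)
{
    *sel_a = rot & 1;
    *sub_r = rot == 1 || rot == 2;
    *sub_i = rot >= 2;
}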
1473
1474#define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))
1475
1476DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
1477DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
1478DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
1479DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, , DO_CMLA)
1480
1481#define DO_SQRDMLAH_B(N, M, A, S) \
1482 do_sqrdmlah_b(N, M, A, S, true)
1483#define DO_SQRDMLAH_H(N, M, A, S) \
1484 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
1485#define DO_SQRDMLAH_S(N, M, A, S) \
1486 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
1487#define DO_SQRDMLAH_D(N, M, A, S) \
1488 do_sqrdmlah_d(N, M, A, S, true)
1489
1490DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
1491DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
1492DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
1493DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, , DO_SQRDMLAH_D)
1494
1495#undef DO_CMLA
1496#undef DO_CMLA_FUNC
1497#undef DO_SQRDMLAH_B
1498#undef DO_SQRDMLAH_H
1499#undef DO_SQRDMLAH_S
1500#undef DO_SQRDMLAH_D
1501
1502#define DO_BITPERM(NAME, TYPE, OP) \
1503void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1504{ \
1505 intptr_t i, opr_sz = simd_oprsz(desc); \
1506 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
1507 TYPE nn = *(TYPE *)(vn + i); \
1508 TYPE mm = *(TYPE *)(vm + i); \
1509 *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8); \
1510 } \
1511}
1512
1513static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
1514{
1515 uint64_t res = 0;
1516 int db, rb = 0;
1517
1518 for (db = 0; db < n; ++db) {
1519 if ((mask >> db) & 1) {
1520 res |= ((data >> db) & 1) << rb;
1521 ++rb;
1522 }
1523 }
1524 return res;
1525}
1526
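/* For example, bitextract(0xda, 0x55, 8) gathers the data bits at the
 * positions selected by the mask (bits 0, 2, 4 and 6 of 0xda, i.e.
 * 0, 0, 1, 1) into the low bits of the result, giving 0x0c.  This is
 * the per-element behaviour of BEXT; bitdeposit below performs the
 * inverse scatter used by BDEP.
 */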
1527DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
1528DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
1529DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
1530DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
1531
1532static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
1533{
1534 uint64_t res = 0;
1535 int rb, db = 0;
1536
1537 for (rb = 0; rb < n; ++rb) {
1538 if ((mask >> rb) & 1) {
1539 res |= ((data >> db) & 1) << rb;
1540 ++db;
1541 }
1542 }
1543 return res;
1544}
1545
1546DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
1547DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
1548DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
1549DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
1550
1551static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
1552{
1553 uint64_t resm = 0, resu = 0;
1554 int db, rbm = 0, rbu = 0;
1555
1556 for (db = 0; db < n; ++db) {
1557 uint64_t val = (data >> db) & 1;
1558 if ((mask >> db) & 1) {
1559 resm |= val << rbm++;
1560 } else {
1561 resu |= val << rbu++;
1562 }
1563 }
1564
1565 return resm | (resu << rbm);
1566}
1567
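/* For example, bitgroup(0xda, 0x55, 8) packs the data bits selected by
 * the mask (0, 0, 1, 1 from bits 0, 2, 4, 6) into the low half of the
 * result and the remaining bits (1, 1, 0, 1 from bits 1, 3, 5, 7) into
 * the high half, giving 0xbc.
 */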
1568DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
1569DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
1570DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
1571DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)
1572
1573#undef DO_BITPERM
1574
ed4a6387
RH
1575#define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP) \
1576void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1577{ \
1578 intptr_t i, opr_sz = simd_oprsz(desc); \
1579 int sub_r = simd_data(desc); \
1580 if (sub_r) { \
1581 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1582 TYPE acc_r = *(TYPE *)(vn + H(i)); \
1583 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
1584 TYPE el2_r = *(TYPE *)(vm + H(i)); \
1585 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
1586 acc_r = ADD_OP(acc_r, el2_i); \
1587 acc_i = SUB_OP(acc_i, el2_r); \
1588 *(TYPE *)(vd + H(i)) = acc_r; \
1589 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
1590 } \
1591 } else { \
1592 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1593 TYPE acc_r = *(TYPE *)(vn + H(i)); \
1594 TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
1595 TYPE el2_r = *(TYPE *)(vm + H(i)); \
1596 TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
1597 acc_r = SUB_OP(acc_r, el2_i); \
1598 acc_i = ADD_OP(acc_i, el2_r); \
1599 *(TYPE *)(vd + H(i)) = acc_r; \
1600 *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i; \
1601 } \
1602 } \
1603}
1604
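/* When sub_r is set, the imaginary element of the second operand is
 * added to the real accumulator and the real element subtracted from
 * the imaginary accumulator, i.e. the second operand is effectively
 * multiplied by -i before the add; otherwise it is multiplied by +i.
 */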
1605DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
1606DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
1607DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
1608DO_CADD(sve2_cadd_d, int64_t, , DO_ADD, DO_SUB)
1609
1610DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
1611DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
1612DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
1613DO_CADD(sve2_sqcadd_d, int64_t, , do_sqadd_d, do_sqsub_d)
1614
1615#undef DO_CADD
1616
4269fef1
RH
1617#define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN) \
1618void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1619{ \
1620 intptr_t i, opr_sz = simd_oprsz(desc); \
1621 intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \
1622 int shift = simd_data(desc) >> 1; \
1623 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1624 TYPEW nn = *(TYPEN *)(vn + HN(i + sel)); \
1625 *(TYPEW *)(vd + HW(i)) = nn << shift; \
1626 } \
1627}
1628
1629DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
1630DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
1631DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, , H1_4)
1632
1633DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
1634DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
1635DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, , H1_4)
1636
1637#undef DO_ZZI_SHLL
1638
047cec97
RH
1639/* Two-operand reduction expander, controlled by a predicate.
1640 * The difference between TYPERED and TYPERET has to do with
1641 * sign-extension. E.g. for SMAX, TYPERED must be signed,
1642 * but TYPERET must be unsigned so that a 32-bit value is
1643 * not sign-extended to the ABI uint64_t return type.
1644 */
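/* For instance, if the SMAX reduction over 32-bit elements yields -5,
 * returning it through a uint32_t TYPERET produces 0x00000000fffffffb
 * in the uint64_t ABI return value, rather than the sign-extended
 * 0xfffffffffffffffb that an int32_t return type would give.
 */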
1645/* ??? If we were to vectorize this by hand the reduction ordering
1646 * would change. For integer operands, this is perfectly fine.
1647 */
1648#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
1649uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
1650{ \
1651 intptr_t i, opr_sz = simd_oprsz(desc); \
1652 TYPERED ret = INIT; \
1653 for (i = 0; i < opr_sz; ) { \
1654 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1655 do { \
1656 if (pg & 1) { \
1657 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
1658 ret = OP(ret, nn); \
1659 } \
1660 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
1661 } while (i & 15); \
1662 } \
1663 return (TYPERET)ret; \
1664}
1665
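/* The governing predicate holds one bit per byte of vector data, so
 * the loop above loads 16 predicate bits at a time (one uint16_t per
 * 16 bytes of vector) and shifts PG right by the element size in bytes
 * after each element, keeping that element's least significant
 * predicate bit in bit 0 for the (pg & 1) test.
 */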
1666#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
1667uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
1668{ \
1669 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1670 TYPEE *n = vn; \
1671 uint8_t *pg = vg; \
1672 TYPER ret = INIT; \
1673 for (i = 0; i < opr_sz; i += 1) { \
1674 if (pg[H1(i)] & 1) { \
1675 TYPEE nn = n[i]; \
1676 ret = OP(ret, nn); \
1677 } \
1678 } \
1679 return ret; \
1680}
1681
1682DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
1683DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
1684DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
1685DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
1686
1687DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
1688DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
1689DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
1690DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
1691
1692DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
1693DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
1694DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
1695DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
1696
1697DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1698DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1699DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1700
1701DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
1702DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
1703DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
1704DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
1705
1706DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
1707DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
1708DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
1709DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
1710
1711DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
1712DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
1713DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
1714DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
1715
1716DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
1717DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
1718DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
1719DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
1720
1721DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
1722DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
1723DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
1724DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
1725
1726#undef DO_VPZ
1727#undef DO_VPZ_D
1728
6e6a157d
RH
1729/* Two vector operand, one scalar operand, unpredicated. */
1730#define DO_ZZI(NAME, TYPE, OP) \
1731void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
1732{ \
1733 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
1734 TYPE s = s64, *d = vd, *n = vn; \
1735 for (i = 0; i < opr_sz; ++i) { \
1736 d[i] = OP(n[i], s); \
1737 } \
1738}
1739
1740#define DO_SUBR(X, Y) (Y - X)
1741
1742DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
1743DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
1744DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
1745DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
1746
1747DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
1748DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
1749DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
1750DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
1751
1752DO_ZZI(sve_smini_b, int8_t, DO_MIN)
1753DO_ZZI(sve_smini_h, int16_t, DO_MIN)
1754DO_ZZI(sve_smini_s, int32_t, DO_MIN)
1755DO_ZZI(sve_smini_d, int64_t, DO_MIN)
1756
1757DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
1758DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
1759DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
1760DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
1761
1762DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
1763DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
1764DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
1765DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
1766
1767#undef DO_ZZI
1768
f97cfd59
RH
1769#undef DO_AND
1770#undef DO_ORR
1771#undef DO_EOR
1772#undef DO_BIC
1773#undef DO_ADD
1774#undef DO_SUB
1775#undef DO_MAX
1776#undef DO_MIN
1777#undef DO_ABD
1778#undef DO_MUL
1779#undef DO_DIV
27721dbb
RH
1780#undef DO_ASR
1781#undef DO_LSR
1782#undef DO_LSL
6e6a157d 1783#undef DO_SUBR
f97cfd59 1784
028e2a7b
RH
1785/* Similar to the ARM LastActiveElement pseudocode function, except the
1786 result is multiplied by the element size. This includes the not found
1787 indication; e.g. not found for esz=3 is -8. */
1788static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
1789{
1790 uint64_t mask = pred_esz_masks[esz];
1791 intptr_t i = words;
1792
1793 do {
1794 uint64_t this_g = g[--i] & mask;
1795 if (this_g) {
1796 return i * 64 + (63 - clz64(this_g));
1797 }
1798 } while (i > 0);
1799 return (intptr_t)-1 << esz;
1800}
1801
86300b5d 1802uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
028e2a7b 1803{
86300b5d 1804 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
028e2a7b
RH
1805 uint32_t flags = PREDTEST_INIT;
1806 uint64_t *d = vd, *g = vg;
1807 intptr_t i = 0;
1808
1809 do {
1810 uint64_t this_d = d[i];
1811 uint64_t this_g = g[i];
1812
1813 if (this_g) {
1814 if (!(flags & 4)) {
1815 /* Set in D the first bit of G. */
1816 this_d |= this_g & -this_g;
1817 d[i] = this_d;
1818 }
1819 flags = iter_predtest_fwd(this_d, this_g, flags);
1820 }
1821 } while (++i < words);
1822
1823 return flags;
1824}
1825
1826uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
1827{
86300b5d
RH
1828 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
1829 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
028e2a7b
RH
1830 uint32_t flags = PREDTEST_INIT;
1831 uint64_t *d = vd, *g = vg, esz_mask;
1832 intptr_t i, next;
1833
1834 next = last_active_element(vd, words, esz) + (1 << esz);
1835 esz_mask = pred_esz_masks[esz];
1836
1837 /* Similar to the pseudocode for pnext, but scaled by ESZ
1838 so that we find the correct bit. */
1839 if (next < words * 64) {
1840 uint64_t mask = -1;
1841
1842 if (next & 63) {
1843 mask = ~((1ull << (next & 63)) - 1);
1844 next &= -64;
1845 }
1846 do {
1847 uint64_t this_g = g[next / 64] & esz_mask & mask;
1848 if (this_g != 0) {
1849 next = (next & -64) + ctz64(this_g);
1850 break;
1851 }
1852 next += 64;
1853 mask = -1;
1854 } while (next < words * 64);
1855 }
1856
1857 i = 0;
1858 do {
1859 uint64_t this_d = 0;
1860 if (i == next / 64) {
1861 this_d = 1ull << (next & 63);
1862 }
1863 d[i] = this_d;
1864 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
1865 } while (++i < words);
1866
1867 return flags;
1868}
ccd841c3 1869
60245996
RH
1870/*
1871 * Copy Zn into Zd, and store zero into inactive elements.
1872 * If inv, store zeros into the active elements.
ccd841c3 1873 */
68459864
RH
1874void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1875{
1876 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
60245996 1877 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
68459864
RH
1878 uint64_t *d = vd, *n = vn;
1879 uint8_t *pg = vg;
60245996 1880
68459864 1881 for (i = 0; i < opr_sz; i += 1) {
60245996 1882 d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
68459864
RH
1883 }
1884}
1885
1886void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1887{
1888 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
60245996 1889 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
68459864
RH
1890 uint64_t *d = vd, *n = vn;
1891 uint8_t *pg = vg;
60245996 1892
68459864 1893 for (i = 0; i < opr_sz; i += 1) {
60245996 1894 d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
68459864
RH
1895 }
1896}
1897
1898void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
1899{
1900 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
60245996 1901 uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
68459864
RH
1902 uint64_t *d = vd, *n = vn;
1903 uint8_t *pg = vg;
60245996 1904
68459864 1905 for (i = 0; i < opr_sz; i += 1) {
60245996 1906 d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
68459864
RH
1907 }
1908}
1909
1910void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1911{
1912 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1913 uint64_t *d = vd, *n = vn;
1914 uint8_t *pg = vg;
60245996
RH
1915 uint8_t inv = simd_data(desc);
1916
68459864 1917 for (i = 0; i < opr_sz; i += 1) {
60245996 1918 d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
68459864
RH
1919 }
1920}
1921
ccd841c3
RH
1922/* Three-operand expander, immediate operand, controlled by a predicate.
1923 */
1924#define DO_ZPZI(NAME, TYPE, H, OP) \
1925void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1926{ \
1927 intptr_t i, opr_sz = simd_oprsz(desc); \
1928 TYPE imm = simd_data(desc); \
1929 for (i = 0; i < opr_sz; ) { \
1930 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1931 do { \
1932 if (pg & 1) { \
1933 TYPE nn = *(TYPE *)(vn + H(i)); \
1934 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
1935 } \
1936 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1937 } while (i & 15); \
1938 } \
1939}
1940
1941/* Similarly, specialized for 64-bit operands. */
1942#define DO_ZPZI_D(NAME, TYPE, OP) \
1943void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1944{ \
1945 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1946 TYPE *d = vd, *n = vn; \
1947 TYPE imm = simd_data(desc); \
1948 uint8_t *pg = vg; \
1949 for (i = 0; i < opr_sz; i += 1) { \
1950 if (pg[H1(i)] & 1) { \
1951 TYPE nn = n[i]; \
1952 d[i] = OP(nn, imm); \
1953 } \
1954 } \
1955}
1956
1957#define DO_SHR(N, M) (N >> M)
1958#define DO_SHL(N, M) (N << M)
1959
1960/* Arithmetic shift right for division. This rounds negative numbers
1961 toward zero as per signed division. Therefore before shifting,
1962 when N is negative, add 2**M-1. */
1963#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
1964
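/* For example, DO_ASRD(-7, 2) computes (-7 + 3) >> 2 = -1, matching
 * -7 / 4 truncated toward zero, whereas a plain arithmetic shift
 * would yield -2.
 */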
46d111b2
RH
1965static inline uint64_t do_urshr(uint64_t x, unsigned sh)
1966{
1967 if (likely(sh < 64)) {
1968 return (x >> sh) + ((x >> (sh - 1)) & 1);
1969 } else if (sh == 64) {
1970 return x >> 63;
1971 } else {
1972 return 0;
1973 }
1974}
1975
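/* For example, do_urshr(7, 2) yields (7 >> 2) + ((7 >> 1) & 1) = 2,
 * i.e. 7/4 rounded to nearest with ties rounded up, while
 * do_urshr(5, 2) yields 1.
 */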
81fd3e6e
RH
1976static inline int64_t do_srshr(int64_t x, unsigned sh)
1977{
1978 if (likely(sh < 64)) {
1979 return (x >> sh) + ((x >> (sh - 1)) & 1);
1980 } else {
1981 /* Rounding the sign bit always produces 0. */
1982 return 0;
1983 }
1984}
1985
ccd841c3
RH
1986DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
1987DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
1988DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
1989DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
1990
1991DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
1992DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
1993DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
1994DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
1995
1996DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
1997DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
1998DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
1999DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
2000
2001DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
2002DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
2003DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
2004DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
2005
ccd841c3
RH
2006#undef DO_ASRD
2007#undef DO_ZPZI
2008#undef DO_ZPZI_D
96a36e4a 2009
46d111b2
RH
2010#define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \
2011void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2012{ \
2013 intptr_t i, opr_sz = simd_oprsz(desc); \
2014 int shift = simd_data(desc); \
2015 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2016 TYPEW nn = *(TYPEW *)(vn + i); \
2017 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift); \
2018 } \
2019}
2020
2021#define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP) \
2022void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2023{ \
2024 intptr_t i, opr_sz = simd_oprsz(desc); \
2025 int shift = simd_data(desc); \
2026 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2027 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
2028 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift); \
2029 } \
2030}
2031
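/* The "bottom" forms narrow into the even-numbered elements of the
 * destination and zero the odd-numbered ones; the "top" forms write
 * only the odd-numbered elements, leaving the even-numbered ones
 * unchanged.
 */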
2032DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
2033DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
2034DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)
2035
2036DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
2037DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
2038DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t, , H1_4, DO_SHR)
2039
2040DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
2041DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
2042DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)
2043
2044DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
2045DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
2046DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t, , H1_4, do_urshr)
2047
81fd3e6e
RH
2048#define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
2049#define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
2050#define DO_SQSHRUN_D(x, sh) \
2051 do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)
2052
2053DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
2054DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
2055DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)
2056
2057DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
2058DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
2059DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t, , H1_4, DO_SQSHRUN_D)
2060
2061#define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
2062#define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
2063#define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)
2064
2065DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
2066DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
2067DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)
2068
2069DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
2070DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
2071DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t, , H1_4, DO_SQRSHRUN_D)
2072
743bb147
RH
2073#define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
2074#define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
2075#define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)
2076
2077DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
2078DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
2079DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)
2080
2081DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
2082DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
2083DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t, , H1_4, DO_SQSHRN_D)
2084
2085#define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
2086#define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
2087#define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)
2088
2089DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
2090DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
2091DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)
2092
2093DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
2094DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
2095DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t, , H1_4, DO_SQRSHRN_D)
2096
c13418da
RH
2097#define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
2098#define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
2099#define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)
2100
2101DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
2102DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
2103DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)
2104
2105DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
2106DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
2107DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t, , H1_4, DO_UQSHRN_D)
2108
2109#define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
2110#define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
2111#define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)
2112
2113DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
2114DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
2115DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)
2116
2117DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
2118DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
2119DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t, , H1_4, DO_UQRSHRN_D)
2120
46d111b2
RH
2121#undef DO_SHRNB
2122#undef DO_SHRNT
2123
40d5ea50
SL
2124#define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP) \
2125void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2126{ \
2127 intptr_t i, opr_sz = simd_oprsz(desc); \
2128 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2129 TYPEW nn = *(TYPEW *)(vn + i); \
2130 TYPEW mm = *(TYPEW *)(vm + i); \
2131 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT); \
2132 } \
2133}
2134
2135#define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP) \
2136void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2137{ \
2138 intptr_t i, opr_sz = simd_oprsz(desc); \
2139 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
2140 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
2141 TYPEW mm = *(TYPEW *)(vm + HW(i)); \
2142 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \
2143 } \
2144}
2145
2146#define DO_ADDHN(N, M, SH) ((N + M) >> SH)
0ea3ff02 2147#define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
40d5ea50
SL
2148
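/* For example, narrowing 16 bits to 8: with N + M = 0x1280, DO_ADDHN
 * returns 0x12, while DO_RADDHN adds the rounding constant 0x80 first
 * and returns 0x13.
 */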
2149DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN)
2150DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN)
2151DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)
2152
2153DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
2154DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
2155DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32, , H1_4, DO_ADDHN)
2156
0ea3ff02
SL
2157DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
2158DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
2159DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)
2160
2161DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
2162DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
2163DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32, , H1_4, DO_RADDHN)
2164
2165#undef DO_RADDHN
40d5ea50
SL
2166#undef DO_ADDHN
2167
2168#undef DO_BINOPNB
2169
96a36e4a
RH
2170/* Fully general four-operand expander, controlled by a predicate.
2171 */
2172#define DO_ZPZZZ(NAME, TYPE, H, OP) \
2173void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
2174 void *vg, uint32_t desc) \
2175{ \
2176 intptr_t i, opr_sz = simd_oprsz(desc); \
2177 for (i = 0; i < opr_sz; ) { \
2178 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2179 do { \
2180 if (pg & 1) { \
2181 TYPE nn = *(TYPE *)(vn + H(i)); \
2182 TYPE mm = *(TYPE *)(vm + H(i)); \
2183 TYPE aa = *(TYPE *)(va + H(i)); \
2184 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
2185 } \
2186 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2187 } while (i & 15); \
2188 } \
2189}
2190
2191/* Similarly, specialized for 64-bit operands. */
2192#define DO_ZPZZZ_D(NAME, TYPE, OP) \
2193void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
2194 void *vg, uint32_t desc) \
2195{ \
2196 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
2197 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
2198 uint8_t *pg = vg; \
2199 for (i = 0; i < opr_sz; i += 1) { \
2200 if (pg[H1(i)] & 1) { \
2201 TYPE aa = a[i], nn = n[i], mm = m[i]; \
2202 d[i] = OP(aa, nn, mm); \
2203 } \
2204 } \
2205}
2206
2207#define DO_MLA(A, N, M) (A + N * M)
2208#define DO_MLS(A, N, M) (A - N * M)
2209
2210DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
2211DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
2212
2213DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
2214DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
2215
2216DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
2217DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
2218
2219DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
2220DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
2221
2222#undef DO_MLA
2223#undef DO_MLS
2224#undef DO_ZPZZZ
2225#undef DO_ZPZZZ_D
9a56c9c3
RH
2226
2227void HELPER(sve_index_b)(void *vd, uint32_t start,
2228 uint32_t incr, uint32_t desc)
2229{
2230 intptr_t i, opr_sz = simd_oprsz(desc);
2231 uint8_t *d = vd;
2232 for (i = 0; i < opr_sz; i += 1) {
2233 d[H1(i)] = start + i * incr;
2234 }
2235}
2236
2237void HELPER(sve_index_h)(void *vd, uint32_t start,
2238 uint32_t incr, uint32_t desc)
2239{
2240 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2241 uint16_t *d = vd;
2242 for (i = 0; i < opr_sz; i += 1) {
2243 d[H2(i)] = start + i * incr;
2244 }
2245}
2246
2247void HELPER(sve_index_s)(void *vd, uint32_t start,
2248 uint32_t incr, uint32_t desc)
2249{
2250 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2251 uint32_t *d = vd;
2252 for (i = 0; i < opr_sz; i += 1) {
2253 d[H4(i)] = start + i * incr;
2254 }
2255}
2256
2257void HELPER(sve_index_d)(void *vd, uint64_t start,
2258 uint64_t incr, uint32_t desc)
2259{
2260 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2261 uint64_t *d = vd;
2262 for (i = 0; i < opr_sz; i += 1) {
2263 d[i] = start + i * incr;
2264 }
2265}
4b242d9c
RH
2266
2267void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
2268{
2269 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2270 uint32_t sh = simd_data(desc);
2271 uint32_t *d = vd, *n = vn, *m = vm;
2272 for (i = 0; i < opr_sz; i += 1) {
2273 d[i] = n[i] + (m[i] << sh);
2274 }
2275}
2276
2277void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
2278{
2279 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2280 uint64_t sh = simd_data(desc);
2281 uint64_t *d = vd, *n = vn, *m = vm;
2282 for (i = 0; i < opr_sz; i += 1) {
2283 d[i] = n[i] + (m[i] << sh);
2284 }
2285}
2286
2287void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
2288{
2289 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2290 uint64_t sh = simd_data(desc);
2291 uint64_t *d = vd, *n = vn, *m = vm;
2292 for (i = 0; i < opr_sz; i += 1) {
2293 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
2294 }
2295}
2296
2297void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
2298{
2299 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2300 uint64_t sh = simd_data(desc);
2301 uint64_t *d = vd, *n = vn, *m = vm;
2302 for (i = 0; i < opr_sz; i += 1) {
2303 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
2304 }
2305}
0762cd42
RH
2306
2307void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
2308{
2309 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2310 static const uint16_t coeff[] = {
2311 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
2312 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
2313 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
2314 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
2315 };
2316 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2317 uint16_t *d = vd, *n = vn;
2318
2319 for (i = 0; i < opr_sz; i++) {
2320 uint16_t nn = n[i];
2321 intptr_t idx = extract32(nn, 0, 5);
2322 uint16_t exp = extract32(nn, 5, 5);
2323 d[i] = coeff[idx] | (exp << 10);
2324 }
2325}
2326
2327void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
2328{
2329 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2330 static const uint32_t coeff[] = {
2331 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
2332 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
2333 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
2334 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
2335 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
2336 0x1ef532, 0x20b051, 0x227043, 0x243516,
2337 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
2338 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
2339 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
2340 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
2341 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
2342 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
2343 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
2344 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
2345 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
2346 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
2347 };
2348 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2349 uint32_t *d = vd, *n = vn;
2350
2351 for (i = 0; i < opr_sz; i++) {
2352 uint32_t nn = n[i];
2353 intptr_t idx = extract32(nn, 0, 6);
2354 uint32_t exp = extract32(nn, 6, 8);
2355 d[i] = coeff[idx] | (exp << 23);
2356 }
2357}
2358
2359void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
2360{
2361 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2362 static const uint64_t coeff[] = {
2363 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
2364 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
2365 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
2366 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
2367 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
2368 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
2369 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
2370 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
2371 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
2372 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
2373 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
2374 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
2375 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
2376 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
2377 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
2378 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
2379 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
2380 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
2381 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
2382 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
2383 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
2384 0xFA7C1819E90D8ull,
2385 };
2386 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2387 uint64_t *d = vd, *n = vn;
2388
2389 for (i = 0; i < opr_sz; i++) {
2390 uint64_t nn = n[i];
2391 intptr_t idx = extract32(nn, 0, 6);
2392 uint64_t exp = extract32(nn, 6, 11);
2393 d[i] = coeff[idx] | (exp << 52);
2394 }
2395}
a1f233f2
RH
2396
2397void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
2398{
2399 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
2400 uint16_t *d = vd, *n = vn, *m = vm;
2401 for (i = 0; i < opr_sz; i += 1) {
2402 uint16_t nn = n[i];
2403 uint16_t mm = m[i];
2404 if (mm & 1) {
2405 nn = float16_one;
2406 }
2407 d[i] = nn ^ (mm & 2) << 14;
2408 }
2409}
2410
2411void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
2412{
2413 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
2414 uint32_t *d = vd, *n = vn, *m = vm;
2415 for (i = 0; i < opr_sz; i += 1) {
2416 uint32_t nn = n[i];
2417 uint32_t mm = m[i];
2418 if (mm & 1) {
2419 nn = float32_one;
2420 }
2421 d[i] = nn ^ (mm & 2) << 30;
2422 }
2423}
2424
2425void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
2426{
2427 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2428 uint64_t *d = vd, *n = vn, *m = vm;
2429 for (i = 0; i < opr_sz; i += 1) {
2430 uint64_t nn = n[i];
2431 uint64_t mm = m[i];
2432 if (mm & 1) {
2433 nn = float64_one;
2434 }
2435 d[i] = nn ^ (mm & 2) << 62;
2436 }
2437}
24e82e68
RH
2438
2439/*
2440 * Signed saturating addition with scalar operand.
2441 */
2442
2443void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2444{
2445 intptr_t i, oprsz = simd_oprsz(desc);
2446
2447 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
4f07fbeb 2448 *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
24e82e68
RH
2449 }
2450}
2451
2452void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2453{
2454 intptr_t i, oprsz = simd_oprsz(desc);
2455
2456 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
4f07fbeb 2457 *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
24e82e68
RH
2458 }
2459}
2460
2461void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2462{
2463 intptr_t i, oprsz = simd_oprsz(desc);
2464
2465 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
4f07fbeb 2466 *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
24e82e68
RH
2467 }
2468}
2469
2470void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
2471{
2472 intptr_t i, oprsz = simd_oprsz(desc);
2473
2474 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
4f07fbeb 2475 *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
24e82e68
RH
2476 }
2477}
2478
2479/*
2480 * Unsigned saturating addition with scalar operand.
2481 */
2482
2483void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
2484{
2485 intptr_t i, oprsz = simd_oprsz(desc);
2486
2487 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
4f07fbeb 2488 *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
24e82e68
RH
2489 }
2490}
2491
2492void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
2493{
2494 intptr_t i, oprsz = simd_oprsz(desc);
2495
2496 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
4f07fbeb 2497 *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
24e82e68
RH
2498 }
2499}
2500
2501void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
2502{
2503 intptr_t i, oprsz = simd_oprsz(desc);
2504
2505 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
4f07fbeb 2506 *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
24e82e68
RH
2507 }
2508}
2509
2510void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2511{
2512 intptr_t i, oprsz = simd_oprsz(desc);
2513
2514 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
4f07fbeb 2515 *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
24e82e68
RH
2516 }
2517}
2518
2519void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
2520{
2521 intptr_t i, oprsz = simd_oprsz(desc);
2522
2523 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
4f07fbeb 2524 *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
24e82e68
RH
2525 }
2526}
f25a2361
RH
2527
2528/* Two operand predicated copy immediate with merge. All valid immediates
2529 * can fit within 17 signed bits in the simd_data field.
2530 */
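/* expand_pred_b() widens each governing predicate bit to a byte-wide
 * mask; e.g. a predicate byte of 0x05 yields 0x0000000000ff00ff, so
 * bytes 0 and 2 of that 64-bit chunk take MM and the rest keep NN.
 */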
2531void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
2532 uint64_t mm, uint32_t desc)
2533{
2534 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2535 uint64_t *d = vd, *n = vn;
2536 uint8_t *pg = vg;
2537
2538 mm = dup_const(MO_8, mm);
2539 for (i = 0; i < opr_sz; i += 1) {
2540 uint64_t nn = n[i];
2541 uint64_t pp = expand_pred_b(pg[H1(i)]);
2542 d[i] = (mm & pp) | (nn & ~pp);
2543 }
2544}
2545
2546void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
2547 uint64_t mm, uint32_t desc)
2548{
2549 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2550 uint64_t *d = vd, *n = vn;
2551 uint8_t *pg = vg;
2552
2553 mm = dup_const(MO_16, mm);
2554 for (i = 0; i < opr_sz; i += 1) {
2555 uint64_t nn = n[i];
2556 uint64_t pp = expand_pred_h(pg[H1(i)]);
2557 d[i] = (mm & pp) | (nn & ~pp);
2558 }
2559}
2560
2561void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
2562 uint64_t mm, uint32_t desc)
2563{
2564 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2565 uint64_t *d = vd, *n = vn;
2566 uint8_t *pg = vg;
2567
2568 mm = dup_const(MO_32, mm);
2569 for (i = 0; i < opr_sz; i += 1) {
2570 uint64_t nn = n[i];
2571 uint64_t pp = expand_pred_s(pg[H1(i)]);
2572 d[i] = (mm & pp) | (nn & ~pp);
2573 }
2574}
2575
2576void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
2577 uint64_t mm, uint32_t desc)
2578{
2579 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2580 uint64_t *d = vd, *n = vn;
2581 uint8_t *pg = vg;
2582
2583 for (i = 0; i < opr_sz; i += 1) {
2584 uint64_t nn = n[i];
2585 d[i] = (pg[H1(i)] & 1 ? mm : nn);
2586 }
2587}
2588
2589void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
2590{
2591 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2592 uint64_t *d = vd;
2593 uint8_t *pg = vg;
2594
2595 val = dup_const(MO_8, val);
2596 for (i = 0; i < opr_sz; i += 1) {
2597 d[i] = val & expand_pred_b(pg[H1(i)]);
2598 }
2599}
2600
2601void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
2602{
2603 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2604 uint64_t *d = vd;
2605 uint8_t *pg = vg;
2606
2607 val = dup_const(MO_16, val);
2608 for (i = 0; i < opr_sz; i += 1) {
2609 d[i] = val & expand_pred_h(pg[H1(i)]);
2610 }
2611}
2612
2613void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
2614{
2615 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2616 uint64_t *d = vd;
2617 uint8_t *pg = vg;
2618
2619 val = dup_const(MO_32, val);
2620 for (i = 0; i < opr_sz; i += 1) {
2621 d[i] = val & expand_pred_s(pg[H1(i)]);
2622 }
2623}
2624
2625void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
2626{
2627 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2628 uint64_t *d = vd;
2629 uint8_t *pg = vg;
2630
2631 for (i = 0; i < opr_sz; i += 1) {
2632 d[i] = (pg[H1(i)] & 1 ? val : 0);
2633 }
2634}
b94f8f60 2635
b4cd95d2 2636/* Big-endian hosts need to frob the byte indices. If the copy
b94f8f60
RH
2637 * happens to be 8-byte aligned, then no frobbing is necessary.
2638 */
2639static void swap_memmove(void *vd, void *vs, size_t n)
2640{
2641 uintptr_t d = (uintptr_t)vd;
2642 uintptr_t s = (uintptr_t)vs;
2643 uintptr_t o = (d | s | n) & 7;
2644 size_t i;
2645
2646#ifndef HOST_WORDS_BIGENDIAN
2647 o = 0;
2648#endif
2649 switch (o) {
2650 case 0:
2651 memmove(vd, vs, n);
2652 break;
2653
2654 case 4:
2655 if (d < s || d >= s + n) {
2656 for (i = 0; i < n; i += 4) {
2657 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2658 }
2659 } else {
2660 for (i = n; i > 0; ) {
2661 i -= 4;
2662 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
2663 }
2664 }
2665 break;
2666
2667 case 2:
2668 case 6:
2669 if (d < s || d >= s + n) {
2670 for (i = 0; i < n; i += 2) {
2671 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2672 }
2673 } else {
2674 for (i = n; i > 0; ) {
2675 i -= 2;
2676 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
2677 }
2678 }
2679 break;
2680
2681 default:
2682 if (d < s || d >= s + n) {
2683 for (i = 0; i < n; i++) {
2684 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2685 }
2686 } else {
2687 for (i = n; i > 0; ) {
2688 i -= 1;
2689 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
2690 }
2691 }
2692 break;
2693 }
2694}
2695
9123aeb6
RH
2696/* Similarly for memset of 0. */
2697static void swap_memzero(void *vd, size_t n)
2698{
2699 uintptr_t d = (uintptr_t)vd;
2700 uintptr_t o = (d | n) & 7;
2701 size_t i;
2702
2703 /* Usually, the first bit of a predicate is set, so N is 0. */
2704 if (likely(n == 0)) {
2705 return;
2706 }
2707
2708#ifndef HOST_WORDS_BIGENDIAN
2709 o = 0;
2710#endif
2711 switch (o) {
2712 case 0:
2713 memset(vd, 0, n);
2714 break;
2715
2716 case 4:
2717 for (i = 0; i < n; i += 4) {
2718 *(uint32_t *)H1_4(d + i) = 0;
2719 }
2720 break;
2721
2722 case 2:
2723 case 6:
2724 for (i = 0; i < n; i += 2) {
2725 *(uint16_t *)H1_2(d + i) = 0;
2726 }
2727 break;
2728
2729 default:
2730 for (i = 0; i < n; i++) {
2731 *(uint8_t *)H1(d + i) = 0;
2732 }
2733 break;
2734 }
2735}
2736
b94f8f60
RH
2737void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
2738{
2739 intptr_t opr_sz = simd_oprsz(desc);
2740 size_t n_ofs = simd_data(desc);
2741 size_t n_siz = opr_sz - n_ofs;
2742
2743 if (vd != vm) {
2744 swap_memmove(vd, vn + n_ofs, n_siz);
2745 swap_memmove(vd + n_siz, vm, n_ofs);
2746 } else if (vd != vn) {
2747 swap_memmove(vd + n_siz, vd, n_ofs);
2748 swap_memmove(vd, vn + n_ofs, n_siz);
2749 } else {
2750 /* vd == vn == vm. Need temp space. */
2751 ARMVectorReg tmp;
2752 swap_memmove(&tmp, vm, n_ofs);
2753 swap_memmove(vd, vd + n_ofs, n_siz);
2754 memcpy(vd + n_siz, &tmp, n_ofs);
2755 }
2756}
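/* For example, with a 16-byte vector and n_ofs == 3 the result is
 * bytes 3..15 of Zn followed by bytes 0..2 of Zm, i.e. the two
 * operands concatenated and shifted down by the element offset.
 */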
30562ab7
RH
2757
2758#define DO_INSR(NAME, TYPE, H) \
2759void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
2760{ \
2761 intptr_t opr_sz = simd_oprsz(desc); \
2762 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
2763 *(TYPE *)(vd + H(0)) = val; \
2764}
2765
2766DO_INSR(sve_insr_b, uint8_t, H1)
2767DO_INSR(sve_insr_h, uint16_t, H1_2)
2768DO_INSR(sve_insr_s, uint32_t, H1_4)
2769DO_INSR(sve_insr_d, uint64_t, )
2770
2771#undef DO_INSR
2772
2773void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
2774{
2775 intptr_t i, j, opr_sz = simd_oprsz(desc);
2776 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2777 uint64_t f = *(uint64_t *)(vn + i);
2778 uint64_t b = *(uint64_t *)(vn + j);
2779 *(uint64_t *)(vd + i) = bswap64(b);
2780 *(uint64_t *)(vd + j) = bswap64(f);
2781 }
2782}
2783
30562ab7
RH
2784void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
2785{
2786 intptr_t i, j, opr_sz = simd_oprsz(desc);
2787 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2788 uint64_t f = *(uint64_t *)(vn + i);
2789 uint64_t b = *(uint64_t *)(vn + j);
2790 *(uint64_t *)(vd + i) = hswap64(b);
2791 *(uint64_t *)(vd + j) = hswap64(f);
2792 }
2793}
2794
2795void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
2796{
2797 intptr_t i, j, opr_sz = simd_oprsz(desc);
2798 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2799 uint64_t f = *(uint64_t *)(vn + i);
2800 uint64_t b = *(uint64_t *)(vn + j);
2801 *(uint64_t *)(vd + i) = rol64(b, 32);
2802 *(uint64_t *)(vd + j) = rol64(f, 32);
2803 }
2804}
2805
2806void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
2807{
2808 intptr_t i, j, opr_sz = simd_oprsz(desc);
2809 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
2810 uint64_t f = *(uint64_t *)(vn + i);
2811 uint64_t b = *(uint64_t *)(vn + j);
2812 *(uint64_t *)(vd + i) = b;
2813 *(uint64_t *)(vd + j) = f;
2814 }
2815}
2816
2817#define DO_TBL(NAME, TYPE, H) \
2818void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2819{ \
2820 intptr_t i, opr_sz = simd_oprsz(desc); \
2821 uintptr_t elem = opr_sz / sizeof(TYPE); \
2822 TYPE *d = vd, *n = vn, *m = vm; \
2823 ARMVectorReg tmp; \
2824 if (unlikely(vd == vn)) { \
2825 n = memcpy(&tmp, vn, opr_sz); \
2826 } \
2827 for (i = 0; i < elem; i++) { \
2828 TYPE j = m[H(i)]; \
2829 d[H(i)] = j < elem ? n[H(j)] : 0; \
2830 } \
2831}
2832
2833DO_TBL(sve_tbl_b, uint8_t, H1)
2834DO_TBL(sve_tbl_h, uint16_t, H2)
2835DO_TBL(sve_tbl_s, uint32_t, H4)
2836DO_TBL(sve_tbl_d, uint64_t, )
2837
2838#undef DO_TBL
2839
2840#define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
2841void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2842{ \
2843 intptr_t i, opr_sz = simd_oprsz(desc); \
2844 TYPED *d = vd; \
2845 TYPES *n = vn; \
2846 ARMVectorReg tmp; \
2847 if (unlikely(vn - vd < opr_sz)) { \
2848 n = memcpy(&tmp, n, opr_sz / 2); \
2849 } \
2850 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
2851 d[HD(i)] = n[HS(i)]; \
2852 } \
2853}
2854
2855DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
2856DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
2857DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)
2858
2859DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
2860DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
2861DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
2862
2863#undef DO_UNPK
d731d8cb
RH
2864
2865/* Mask of bits included in the even numbered predicates of width esz.
2866 * We also use this for expand_bits/compress_bits, and so extend the
2867 * same pattern out to 16-bit units.
2868 */
2869static const uint64_t even_bit_esz_masks[5] = {
2870 0x5555555555555555ull,
2871 0x3333333333333333ull,
2872 0x0f0f0f0f0f0f0f0full,
2873 0x00ff00ff00ff00ffull,
2874 0x0000ffff0000ffffull,
2875};
2876
2877/* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
2878 * For N==0, this corresponds to the operation that in qemu/bitops.h
2879 * we call half_shuffle64; this algorithm is from Hacker's Delight,
2880 * section 7-2 Shuffling Bits.
2881 */
2882static uint64_t expand_bits(uint64_t x, int n)
2883{
2884 int i;
2885
2886 x &= 0xffffffffu;
2887 for (i = 4; i >= n; i--) {
2888 int sh = 1 << i;
2889 x = ((x << sh) | x) & even_bit_esz_masks[i];
2890 }
2891 return x;
2892}
2893
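/* For example, expand_bits(0b1011, 0) == 0b01000101: each bit is
 * zero-extended to a 2-bit unit.  compress_bits() below performs the
 * inverse, so compress_bits(0b01000101, 0) == 0b1011.
 */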
2894/* Compress units of 2**(N+1) bits to units of 2**N bits.
2895 * For N==0, this corresponds to the operation that in qemu/bitops.h
2896 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
2897 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
2898 */
2899static uint64_t compress_bits(uint64_t x, int n)
2900{
2901 int i;
2902
2903 for (i = n; i <= 4; i++) {
2904 int sh = 1 << i;
2905 x &= even_bit_esz_masks[i];
2906 x = (x >> sh) | x;
2907 }
2908 return x & 0xffffffffu;
2909}
2910
2911void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
2912{
f9b0fcce
RH
2913 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2914 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
2915 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
8e7fefed 2916 int esize = 1 << esz;
d731d8cb
RH
2917 uint64_t *d = vd;
2918 intptr_t i;
2919
2920 if (oprsz <= 8) {
2921 uint64_t nn = *(uint64_t *)vn;
2922 uint64_t mm = *(uint64_t *)vm;
2923 int half = 4 * oprsz;
2924
2925 nn = extract64(nn, high * half, half);
2926 mm = extract64(mm, high * half, half);
2927 nn = expand_bits(nn, esz);
2928 mm = expand_bits(mm, esz);
8e7fefed 2929 d[0] = nn | (mm << esize);
d731d8cb 2930 } else {
8e7fefed 2931 ARMPredicateReg tmp;
d731d8cb
RH
2932
2933 /* We produce output faster than we consume input.
2934 Therefore we must be mindful of possible overlap. */
8e7fefed
RH
2935 if (vd == vn) {
2936 vn = memcpy(&tmp, vn, oprsz);
2937 if (vd == vm) {
2938 vm = vn;
2939 }
2940 } else if (vd == vm) {
2941 vm = memcpy(&tmp, vm, oprsz);
d731d8cb
RH
2942 }
2943 if (high) {
2944 high = oprsz >> 1;
2945 }
2946
8e7fefed 2947 if ((oprsz & 7) == 0) {
d731d8cb
RH
2948 uint32_t *n = vn, *m = vm;
2949 high >>= 2;
2950
8e7fefed 2951 for (i = 0; i < oprsz / 8; i++) {
d731d8cb
RH
2952 uint64_t nn = n[H4(high + i)];
2953 uint64_t mm = m[H4(high + i)];
2954
2955 nn = expand_bits(nn, esz);
2956 mm = expand_bits(mm, esz);
8e7fefed 2957 d[i] = nn | (mm << esize);
d731d8cb
RH
2958 }
2959 } else {
2960 uint8_t *n = vn, *m = vm;
2961 uint16_t *d16 = vd;
2962
2963 for (i = 0; i < oprsz / 2; i++) {
2964 uint16_t nn = n[H1(high + i)];
2965 uint16_t mm = m[H1(high + i)];
2966
2967 nn = expand_bits(nn, esz);
2968 mm = expand_bits(mm, esz);
8e7fefed 2969 d16[H2(i)] = nn | (mm << esize);
d731d8cb
RH
2970 }
2971 }
2972 }
2973}
2974
2975void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
2976{
f9b0fcce
RH
2977 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
2978 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
2979 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
d731d8cb
RH
2980 uint64_t *d = vd, *n = vn, *m = vm;
2981 uint64_t l, h;
2982 intptr_t i;
2983
2984 if (oprsz <= 8) {
2985 l = compress_bits(n[0] >> odd, esz);
2986 h = compress_bits(m[0] >> odd, esz);
226e6c04 2987 d[0] = l | (h << (4 * oprsz));
d731d8cb
RH
2988 } else {
2989 ARMPredicateReg tmp_m;
2990 intptr_t oprsz_16 = oprsz / 16;
2991
2992 if ((vm - vd) < (uintptr_t)oprsz) {
2993 m = memcpy(&tmp_m, vm, oprsz);
2994 }
2995
2996 for (i = 0; i < oprsz_16; i++) {
2997 l = n[2 * i + 0];
2998 h = n[2 * i + 1];
2999 l = compress_bits(l >> odd, esz);
3000 h = compress_bits(h >> odd, esz);
226e6c04 3001 d[i] = l | (h << 32);
d731d8cb
RH
3002 }
3003
226e6c04
RH
3004 /*
3005 * For VL which is not a multiple of 512, the results from M do not
3006 * align nicely with the uint64_t for D. Put the aligned results
3007 * from M into TMP_M and then copy it into place afterward.
3008 */
d731d8cb 3009 if (oprsz & 15) {
226e6c04
RH
3010 int final_shift = (oprsz & 15) * 2;
3011
3012 l = n[2 * i + 0];
3013 h = n[2 * i + 1];
3014 l = compress_bits(l >> odd, esz);
3015 h = compress_bits(h >> odd, esz);
3016 d[i] = l | (h << final_shift);
d731d8cb
RH
3017
3018 for (i = 0; i < oprsz_16; i++) {
3019 l = m[2 * i + 0];
3020 h = m[2 * i + 1];
3021 l = compress_bits(l >> odd, esz);
3022 h = compress_bits(h >> odd, esz);
226e6c04 3023 tmp_m.p[i] = l | (h << 32);
d731d8cb 3024 }
226e6c04
RH
3025 l = m[2 * i + 0];
3026 h = m[2 * i + 1];
3027 l = compress_bits(l >> odd, esz);
3028 h = compress_bits(h >> odd, esz);
3029 tmp_m.p[i] = l | (h << final_shift);
d731d8cb
RH
3030
3031 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
3032 } else {
3033 for (i = 0; i < oprsz_16; i++) {
3034 l = m[2 * i + 0];
3035 h = m[2 * i + 1];
3036 l = compress_bits(l >> odd, esz);
3037 h = compress_bits(h >> odd, esz);
226e6c04 3038 d[oprsz_16 + i] = l | (h << 32);
d731d8cb
RH
3039 }
3040 }
3041 }
3042}
3043
3044void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
3045{
f9b0fcce
RH
3046 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3047 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3048 int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
d731d8cb
RH
3049 uint64_t *d = vd, *n = vn, *m = vm;
3050 uint64_t mask;
3051 int shr, shl;
3052 intptr_t i;
3053
3054 shl = 1 << esz;
3055 shr = 0;
3056 mask = even_bit_esz_masks[esz];
3057 if (odd) {
3058 mask <<= shl;
3059 shr = shl;
3060 shl = 0;
3061 }
3062
3063 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
3064 uint64_t nn = (n[i] & mask) >> shr;
3065 uint64_t mm = (m[i] & mask) << shl;
3066 d[i] = nn + mm;
3067 }
3068}
3069
3070/* Reverse units of 2**N bits. */
3071static uint64_t reverse_bits_64(uint64_t x, int n)
3072{
3073 int i, sh;
3074
3075 x = bswap64(x);
3076 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3077 uint64_t mask = even_bit_esz_masks[i];
3078 x = ((x & mask) << sh) | ((x >> sh) & mask);
3079 }
3080 return x;
3081}
3082
3083static uint8_t reverse_bits_8(uint8_t x, int n)
3084{
3085 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
3086 int i, sh;
3087
3088 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
3089 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
3090 }
3091 return x;
3092}
3093
3094void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
3095{
70acaafe
RH
3096 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3097 int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
d731d8cb
RH
3098 intptr_t i, oprsz_2 = oprsz / 2;
3099
3100 if (oprsz <= 8) {
3101 uint64_t l = *(uint64_t *)vn;
3102 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
3103 *(uint64_t *)vd = l;
3104 } else if ((oprsz & 15) == 0) {
3105 for (i = 0; i < oprsz_2; i += 8) {
3106 intptr_t ih = oprsz - 8 - i;
3107 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
3108 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
3109 *(uint64_t *)(vd + i) = h;
3110 *(uint64_t *)(vd + ih) = l;
3111 }
3112 } else {
3113 for (i = 0; i < oprsz_2; i += 1) {
3114 intptr_t il = H1(i);
3115 intptr_t ih = H1(oprsz - 1 - i);
3116 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
3117 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
3118 *(uint8_t *)(vd + il) = h;
3119 *(uint8_t *)(vd + ih) = l;
3120 }
3121 }
3122}
3123
3124void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
3125{
70acaafe
RH
3126 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3127 intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
d731d8cb
RH
3128 uint64_t *d = vd;
3129 intptr_t i;
3130
3131 if (oprsz <= 8) {
3132 uint64_t nn = *(uint64_t *)vn;
3133 int half = 4 * oprsz;
3134
3135 nn = extract64(nn, high * half, half);
3136 nn = expand_bits(nn, 0);
3137 d[0] = nn;
3138 } else {
3139 ARMPredicateReg tmp_n;
3140
3141 /* We produce output faster than we consume input.
3142 Therefore we must be mindful of possible overlap. */
3143 if ((vn - vd) < (uintptr_t)oprsz) {
3144 vn = memcpy(&tmp_n, vn, oprsz);
3145 }
3146 if (high) {
3147 high = oprsz >> 1;
3148 }
3149
fd911a21 3150 if ((oprsz & 7) == 0) {
d731d8cb
RH
3151 uint32_t *n = vn;
3152 high >>= 2;
3153
fd911a21 3154 for (i = 0; i < oprsz / 8; i++) {
d731d8cb
RH
3155 uint64_t nn = n[H4(high + i)];
3156 d[i] = expand_bits(nn, 0);
3157 }
3158 } else {
3159 uint16_t *d16 = vd;
3160 uint8_t *n = vn;
3161
3162 for (i = 0; i < oprsz / 2; i++) {
3163 uint16_t nn = n[H1(high + i)];
3164 d16[H2(i)] = expand_bits(nn, 0);
3165 }
3166 }
3167 }
3168}
234b48e9
RH
3169
3170#define DO_ZIP(NAME, TYPE, H) \
3171void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3172{ \
3173 intptr_t oprsz = simd_oprsz(desc); \
3174 intptr_t i, oprsz_2 = oprsz / 2; \
3175 ARMVectorReg tmp_n, tmp_m; \
3176 /* We produce output faster than we consume input. \
3177 Therefore we must be mindful of possible overlap. */ \
3178 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
3179 vn = memcpy(&tmp_n, vn, oprsz_2); \
3180 } \
3181 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3182 vm = memcpy(&tmp_m, vm, oprsz_2); \
3183 } \
3184 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
3185 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
3186 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
3187 } \
3188}
3189
3190DO_ZIP(sve_zip_b, uint8_t, H1)
3191DO_ZIP(sve_zip_h, uint16_t, H1_2)
3192DO_ZIP(sve_zip_s, uint32_t, H1_4)
3193DO_ZIP(sve_zip_d, uint64_t, )
3194
3195#define DO_UZP(NAME, TYPE, H) \
3196void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3197{ \
3198 intptr_t oprsz = simd_oprsz(desc); \
3199 intptr_t oprsz_2 = oprsz / 2; \
3200 intptr_t odd_ofs = simd_data(desc); \
3201 intptr_t i; \
3202 ARMVectorReg tmp_m; \
3203 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3204 vm = memcpy(&tmp_m, vm, oprsz); \
3205 } \
3206 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
3207 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs)); \
3208 } \
3209 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
3210 *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
3211 } \
3212}
3213
3214DO_UZP(sve_uzp_b, uint8_t, H1)
3215DO_UZP(sve_uzp_h, uint16_t, H1_2)
3216DO_UZP(sve_uzp_s, uint32_t, H1_4)
3217DO_UZP(sve_uzp_d, uint64_t, )
3218
3219#define DO_TRN(NAME, TYPE, H) \
3220void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3221{ \
3222 intptr_t oprsz = simd_oprsz(desc); \
3223 intptr_t odd_ofs = simd_data(desc); \
3224 intptr_t i; \
3225 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
3226 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
3227 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
3228 *(TYPE *)(vd + H(i + 0)) = ae; \
3229 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
3230 } \
3231}
3232
3233DO_TRN(sve_trn_b, uint8_t, H1)
3234DO_TRN(sve_trn_h, uint16_t, H1_2)
3235DO_TRN(sve_trn_s, uint32_t, H1_4)
3236DO_TRN(sve_trn_d, uint64_t, )
3237
3238#undef DO_ZIP
3239#undef DO_UZP
3240#undef DO_TRN
3ca879ae
RH
3241
3242void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
3243{
3244 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
3245 uint32_t *d = vd, *n = vn;
3246 uint8_t *pg = vg;
3247
3248 for (i = j = 0; i < opr_sz; i++) {
3249 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
3250 d[H4(j)] = n[H4(i)];
3251 j++;
3252 }
3253 }
3254 for (; j < opr_sz; j++) {
3255 d[H4(j)] = 0;
3256 }
3257}
3258
3259void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
3260{
3261 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
3262 uint64_t *d = vd, *n = vn;
3263 uint8_t *pg = vg;
3264
3265 for (i = j = 0; i < opr_sz; i++) {
3266 if (pg[H1(i)] & 1) {
3267 d[j] = n[i];
3268 j++;
3269 }
3270 }
3271 for (; j < opr_sz; j++) {
3272 d[j] = 0;
3273 }
3274}
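/*
 * Illustration: with word elements { a, b, c, d } and predicate
 * { 0, 1, 0, 1 }, sve_compact_s produces { b, d, 0, 0 }: active
 * elements are packed toward the low end and the rest is zeroed.
 */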
ef23cb72
RH
3275
3276/* Similar to the ARM LastActiveElement pseudocode function, except the
3277 * result is multiplied by the element size. This includes the not found
3278 * indication; e.g. not found for esz=3 is -8.
3279 */
3280int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
3281{
2acbfbe4
RH
3282 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
3283 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
ef23cb72 3284
2acbfbe4 3285 return last_active_element(vg, words, esz);
ef23cb72 3286}
b48ff240
RH
3287
3288void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
3289{
3290 intptr_t opr_sz = simd_oprsz(desc) / 8;
3291 int esz = simd_data(desc);
3292 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
3293 intptr_t i, first_i, last_i;
3294 ARMVectorReg tmp;
3295
3296 first_i = last_i = 0;
3297 first_g = last_g = 0;
3298
3299 /* Find the extent of the active elements within VG. */
3300 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
3301 pg = *(uint64_t *)(vg + i) & mask;
3302 if (pg) {
3303 if (last_g == 0) {
3304 last_g = pg;
3305 last_i = i;
3306 }
3307 first_g = pg;
3308 first_i = i;
3309 }
3310 }
3311
3312 len = 0;
3313 if (first_g != 0) {
3314 first_i = first_i * 8 + ctz64(first_g);
3315 last_i = last_i * 8 + 63 - clz64(last_g);
3316 len = last_i - first_i + (1 << esz);
3317 if (vd == vm) {
3318 vm = memcpy(&tmp, vm, opr_sz * 8);
3319 }
3320 swap_memmove(vd, vn + first_i, len);
3321 }
3322 swap_memmove(vd + len, vm, opr_sz * 8 - len);
3323}
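/*
 * That is, the bytes of VN from the first through the last active
 * element (inclusive) move to the bottom of VD, and the remainder is
 * filled from the start of VM.  E.g. with word elements
 * n = { a, b, c, d }, m = { e, f, g, h } and predicate { 0, 1, 1, 0 },
 * the result is { b, c, e, f }.
 */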
d3fe4a29
RH
3324
3325void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
3326 void *vg, uint32_t desc)
3327{
3328 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3329 uint64_t *d = vd, *n = vn, *m = vm;
3330 uint8_t *pg = vg;
3331
3332 for (i = 0; i < opr_sz; i += 1) {
3333 uint64_t nn = n[i], mm = m[i];
3334 uint64_t pp = expand_pred_b(pg[H1(i)]);
3335 d[i] = (nn & pp) | (mm & ~pp);
3336 }
3337}
3338
3339void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
3340 void *vg, uint32_t desc)
3341{
3342 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3343 uint64_t *d = vd, *n = vn, *m = vm;
3344 uint8_t *pg = vg;
3345
3346 for (i = 0; i < opr_sz; i += 1) {
3347 uint64_t nn = n[i], mm = m[i];
3348 uint64_t pp = expand_pred_h(pg[H1(i)]);
3349 d[i] = (nn & pp) | (mm & ~pp);
3350 }
3351}
3352
3353void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
3354 void *vg, uint32_t desc)
3355{
3356 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3357 uint64_t *d = vd, *n = vn, *m = vm;
3358 uint8_t *pg = vg;
3359
3360 for (i = 0; i < opr_sz; i += 1) {
3361 uint64_t nn = n[i], mm = m[i];
3362 uint64_t pp = expand_pred_s(pg[H1(i)]);
3363 d[i] = (nn & pp) | (mm & ~pp);
3364 }
3365}
3366
3367void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
3368 void *vg, uint32_t desc)
3369{
3370 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
3371 uint64_t *d = vd, *n = vn, *m = vm;
3372 uint8_t *pg = vg;
3373
3374 for (i = 0; i < opr_sz; i += 1) {
3375 uint64_t nn = n[i], mm = m[i];
3376 d[i] = (pg[H1(i)] & 1 ? nn : mm);
3377 }
3378}
757f9cff
RH
3379
3380/* Two operand comparison controlled by a predicate.
3381 * ??? It is very tempting to want to be able to expand this inline
3382 * with x86 instructions, e.g.
3383 *
3384 * vcmpeqw zm, zn, %ymm0
3385 * vpmovmskb %ymm0, %eax
3386 * and $0x5555, %eax
3387 * and pg, %eax
3388 *
3389 * or even aarch64, e.g.
3390 *
3391 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
3392 * cmeq v0.8h, zn, zm
3393 * and v0.8h, v0.8h, mask
3394 * addv h0, v0.8h
3395 * and v0.8b, pg
3396 *
3397 * However, coming up with an abstraction that allows vector inputs and
3398 * a scalar output, and also handles the byte-ordering of sub-uint64_t
3399 * scalar outputs, is tricky.
3400 */
3401#define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
3402uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3403{ \
3404 intptr_t opr_sz = simd_oprsz(desc); \
3405 uint32_t flags = PREDTEST_INIT; \
3406 intptr_t i = opr_sz; \
3407 do { \
3408 uint64_t out = 0, pg; \
3409 do { \
3410 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3411 TYPE nn = *(TYPE *)(vn + H(i)); \
3412 TYPE mm = *(TYPE *)(vm + H(i)); \
3413 out |= nn OP mm; \
3414 } while (i & 63); \
3415 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3416 out &= pg; \
3417 *(uint64_t *)(vd + (i >> 3)) = out; \
3418 flags = iter_predtest_bwd(out, pg, flags); \
3419 } while (i > 0); \
3420 return flags; \
3421}
3422
3423#define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
3424 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3425#define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
3426 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3427#define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
3428 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3429#define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
3430 DO_CMP_PPZZ(NAME, TYPE, OP, , 0x0101010101010101ull)
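/*
 * Note on the masks above: an SVE predicate holds one bit per byte of
 * vector data, and only the lowest of an element's predicate bits is
 * significant.  OUT accumulates one result bit per element in exactly
 * that position, so the per-size masks (every bit, every 2nd, every
 * 4th, every 8th) select the meaningful predicate bits.
 */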
3431
3432DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
3433DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
3434DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
3435DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
3436
3437DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
3438DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
3439DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
3440DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
3441
3442DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
3443DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
3444DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
3445DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
3446
3447DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
3448DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
3449DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
3450DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
3451
3452DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
3453DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
3454DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
3455DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
3456
3457DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
3458DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
3459DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
3460DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
3461
3462#undef DO_CMP_PPZZ_B
3463#undef DO_CMP_PPZZ_H
3464#undef DO_CMP_PPZZ_S
3465#undef DO_CMP_PPZZ_D
3466#undef DO_CMP_PPZZ
3467
3468/* Similar, but the second source is "wide". */
3469#define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
3470uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3471{ \
3472 intptr_t opr_sz = simd_oprsz(desc); \
3473 uint32_t flags = PREDTEST_INIT; \
3474 intptr_t i = opr_sz; \
3475 do { \
3476 uint64_t out = 0, pg; \
3477 do { \
3478 TYPEW mm = *(TYPEW *)(vm + i - 8); \
3479 do { \
3480 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3481 TYPE nn = *(TYPE *)(vn + H(i)); \
3482 out |= nn OP mm; \
3483 } while (i & 7); \
3484 } while (i & 63); \
3485 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3486 out &= pg; \
3487 *(uint64_t *)(vd + (i >> 3)) = out; \
3488 flags = iter_predtest_bwd(out, pg, flags); \
3489 } while (i > 0); \
3490 return flags; \
3491}
3492
3493#define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
3494 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
3495#define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
3496 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
3497#define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
3498 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
3499
df4e0010
RH
3500DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
3501DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
3502DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
757f9cff 3503
df4e0010
RH
3504DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
3505DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
3506DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
757f9cff
RH
3507
3508DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
3509DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
3510DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
3511
3512DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
3513DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
3514DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
3515
3516DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
3517DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
3518DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
3519
3520DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
3521DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
3522DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
3523
3524DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
3525DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
3526DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
3527
3528DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
3529DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
3530DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
3531
3532DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
3533DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
3534DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
3535
3536DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
3537DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
3538DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
3539
3540#undef DO_CMP_PPZW_B
3541#undef DO_CMP_PPZW_H
3542#undef DO_CMP_PPZW_S
3543#undef DO_CMP_PPZW
38cadeba
RH
3544
3545/* Similar, but the second source is immediate. */
3546#define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
3547uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
3548{ \
3549 intptr_t opr_sz = simd_oprsz(desc); \
3550 uint32_t flags = PREDTEST_INIT; \
3551 TYPE mm = simd_data(desc); \
3552 intptr_t i = opr_sz; \
3553 do { \
3554 uint64_t out = 0, pg; \
3555 do { \
3556 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3557 TYPE nn = *(TYPE *)(vn + H(i)); \
3558 out |= nn OP mm; \
3559 } while (i & 63); \
3560 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3561 out &= pg; \
3562 *(uint64_t *)(vd + (i >> 3)) = out; \
3563 flags = iter_predtest_bwd(out, pg, flags); \
3564 } while (i > 0); \
3565 return flags; \
3566}
3567
3568#define DO_CMP_PPZI_B(NAME, TYPE, OP) \
3569 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3570#define DO_CMP_PPZI_H(NAME, TYPE, OP) \
3571 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3572#define DO_CMP_PPZI_S(NAME, TYPE, OP) \
3573 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3574#define DO_CMP_PPZI_D(NAME, TYPE, OP) \
3575 DO_CMP_PPZI(NAME, TYPE, OP, , 0x0101010101010101ull)
3576
3577DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
3578DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
3579DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
3580DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
3581
3582DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
3583DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
3584DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
3585DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
3586
3587DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
3588DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
3589DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
3590DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
3591
3592DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
3593DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
3594DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
3595DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
3596
3597DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
3598DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
3599DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
3600DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
3601
3602DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
3603DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
3604DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
3605DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
3606
3607DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
3608DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
3609DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
3610DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
3611
3612DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
3613DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
3614DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
3615DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
3616
3617DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
3618DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
3619DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
3620DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
3621
3622DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
3623DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
3624DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
3625DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
3626
3627#undef DO_CMP_PPZI_B
3628#undef DO_CMP_PPZI_H
3629#undef DO_CMP_PPZI_S
3630#undef DO_CMP_PPZI_D
3631#undef DO_CMP_PPZI
35da316f
RH
3632
3633/* Similar to the ARM LastActive pseudocode function. */
3634static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
3635{
3636 intptr_t i;
3637
3638 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
3639 uint64_t pg = *(uint64_t *)(vg + i);
3640 if (pg) {
3641 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
3642 }
3643 }
3644 return 0;
3645}
3646
3647/* Compute a mask into RETB that is true for all G, up to and including
3648 * (if after) or excluding (if !after) the first G & N.
3649 * Return true if BRK found.
3650 */
3651static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
3652 bool brk, bool after)
3653{
3654 uint64_t b;
3655
3656 if (brk) {
3657 b = 0;
3658 } else if ((g & n) == 0) {
3659 /* For all G, no N are set; break not found. */
3660 b = g;
3661 } else {
3662 /* Break somewhere in N. Locate it. */
3663 b = g & n; /* guard true, pred true */
3664 b = b & -b; /* first such */
3665 if (after) {
3666 b = b | (b - 1); /* break after same */
3667 } else {
3668 b = b - 1; /* break before same */
3669 }
3670 brk = true;
3671 }
3672
3673 *retb = b;
3674 return brk;
3675}
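/*
 * Worked example: with g = 0xff and n = 0x10 (first active bit at
 * position 4), the "after" form yields b = 0x1f (bits 0-4 set) and
 * the "!after" form yields b = 0x0f (bits 0-3).  Once a break has
 * been found, every later word produces b = 0.
 */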
3676
3677/* Compute a zeroing BRK. */
3678static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
3679 intptr_t oprsz, bool after)
3680{
3681 bool brk = false;
3682 intptr_t i;
3683
3684 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3685 uint64_t this_b, this_g = g[i];
3686
3687 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3688 d[i] = this_b & this_g;
3689 }
3690}
3691
3692/* Likewise, but also compute flags. */
3693static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
3694 intptr_t oprsz, bool after)
3695{
3696 uint32_t flags = PREDTEST_INIT;
3697 bool brk = false;
3698 intptr_t i;
3699
3700 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3701 uint64_t this_b, this_d, this_g = g[i];
3702
3703 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3704 d[i] = this_d = this_b & this_g;
3705 flags = iter_predtest_fwd(this_d, this_g, flags);
3706 }
3707 return flags;
3708}
3709
3710/* Compute a merging BRK. */
3711static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
3712 intptr_t oprsz, bool after)
3713{
3714 bool brk = false;
3715 intptr_t i;
3716
3717 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
3718 uint64_t this_b, this_g = g[i];
3719
3720 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3721 d[i] = (this_b & this_g) | (d[i] & ~this_g);
3722 }
3723}
3724
3725/* Likewise, but also compute flags. */
3726static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
3727 intptr_t oprsz, bool after)
3728{
3729 uint32_t flags = PREDTEST_INIT;
3730 bool brk = false;
3731 intptr_t i;
3732
3733 for (i = 0; i < oprsz / 8; ++i) {
3734 uint64_t this_b, this_d = d[i], this_g = g[i];
3735
3736 brk = compute_brk(&this_b, n[i], this_g, brk, after);
3737 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
3738 flags = iter_predtest_fwd(this_d, this_g, flags);
3739 }
3740 return flags;
3741}
3742
3743static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
3744{
3745 /* It is quicker to zero the whole predicate than loop on OPRSZ.
3746 * The compiler should turn this into 4 64-bit integer stores.
3747 */
3748 memset(d, 0, sizeof(ARMPredicateReg));
3749 return PREDTEST_INIT;
3750}
3751
3752void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
3753 uint32_t pred_desc)
3754{
04c774a2 3755 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3756 if (last_active_pred(vn, vg, oprsz)) {
3757 compute_brk_z(vd, vm, vg, oprsz, true);
3758 } else {
3759 do_zero(vd, oprsz);
3760 }
3761}
3762
3763uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
3764 uint32_t pred_desc)
3765{
04c774a2 3766 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3767 if (last_active_pred(vn, vg, oprsz)) {
3768 return compute_brks_z(vd, vm, vg, oprsz, true);
3769 } else {
3770 return do_zero(vd, oprsz);
3771 }
3772}
3773
3774void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
3775 uint32_t pred_desc)
3776{
04c774a2 3777 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3778 if (last_active_pred(vn, vg, oprsz)) {
3779 compute_brk_z(vd, vm, vg, oprsz, false);
3780 } else {
3781 do_zero(vd, oprsz);
3782 }
3783}
3784
3785uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
3786 uint32_t pred_desc)
3787{
04c774a2 3788 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3789 if (last_active_pred(vn, vg, oprsz)) {
3790 return compute_brks_z(vd, vm, vg, oprsz, false);
3791 } else {
3792 return do_zero(vd, oprsz);
3793 }
3794}
3795
3796void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3797{
04c774a2 3798 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3799 compute_brk_z(vd, vn, vg, oprsz, true);
3800}
3801
3802uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3803{
04c774a2 3804 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3805 return compute_brks_z(vd, vn, vg, oprsz, true);
3806}
3807
3808void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3809{
04c774a2 3810 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3811 compute_brk_z(vd, vn, vg, oprsz, false);
3812}
3813
3814uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3815{
04c774a2 3816 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3817 return compute_brks_z(vd, vn, vg, oprsz, false);
3818}
3819
3820void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3821{
04c774a2 3822 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3823 compute_brk_m(vd, vn, vg, oprsz, true);
3824}
3825
3826uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3827{
04c774a2 3828 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3829 return compute_brks_m(vd, vn, vg, oprsz, true);
3830}
3831
3832void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3833{
04c774a2 3834 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3835 compute_brk_m(vd, vn, vg, oprsz, false);
3836}
3837
3838uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3839{
04c774a2 3840 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3841 return compute_brks_m(vd, vn, vg, oprsz, false);
3842}
3843
3844void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3845{
04c774a2 3846 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3847 if (!last_active_pred(vn, vg, oprsz)) {
3848 do_zero(vd, oprsz);
3849 }
3850}
3851
3852/* As if PredTest(Ones(PL), D, esz). */
3853static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
3854 uint64_t esz_mask)
3855{
3856 uint32_t flags = PREDTEST_INIT;
3857 intptr_t i;
3858
3859 for (i = 0; i < oprsz / 8; i++) {
3860 flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
3861 }
3862 if (oprsz & 7) {
3863 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
3864 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
3865 }
3866 return flags;
3867}
3868
3869uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
3870{
04c774a2 3871 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
35da316f
RH
3872 if (last_active_pred(vn, vg, oprsz)) {
3873 return predtest_ones(vd, oprsz, -1);
3874 } else {
3875 return do_zero(vd, oprsz);
3876 }
3877}
9ee3a611
RH
3878
3879uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
3880{
f556a201
RH
3881 intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
3882 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
9ee3a611
RH
3883 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
3884 intptr_t i;
3885
f556a201 3886 for (i = 0; i < words; ++i) {
9ee3a611
RH
3887 uint64_t t = n[i] & g[i] & mask;
3888 sum += ctpop64(t);
3889 }
3890 return sum;
3891}
caf1cefc 3892
34688dbc 3893uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
caf1cefc 3894{
e610906c
RH
3895 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3896 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
caf1cefc
RH
3897 uint64_t esz_mask = pred_esz_masks[esz];
3898 ARMPredicateReg *d = vd;
3899 uint32_t flags;
3900 intptr_t i;
3901
3902 /* Begin with a zero predicate register. */
3903 flags = do_zero(d, oprsz);
3904 if (count == 0) {
3905 return flags;
3906 }
3907
caf1cefc
RH
3908 /* Set all of the requested bits. */
3909 for (i = 0; i < count / 64; ++i) {
3910 d->p[i] = esz_mask;
3911 }
3912 if (count & 63) {
3913 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
3914 }
3915
3916 return predtest_ones(d, oprsz, esz_mask);
3917}
c4e7c493 3918
34688dbc
RH
3919uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
3920{
3921 intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
3922 intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
3923 uint64_t esz_mask = pred_esz_masks[esz];
3924 ARMPredicateReg *d = vd;
3925 intptr_t i, invcount, oprbits;
3926 uint64_t bits;
3927
3928 if (count == 0) {
3929 return do_zero(d, oprsz);
3930 }
3931
3932 oprbits = oprsz * 8;
3933 tcg_debug_assert(count <= oprbits);
3934
3935 bits = esz_mask;
3936 if (oprbits & 63) {
3937 bits &= MAKE_64BIT_MASK(0, oprbits & 63);
3938 }
3939
3940 invcount = oprbits - count;
3941 for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
3942 d->p[i] = bits;
3943 bits = esz_mask;
3944 }
3945
3946 d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);
3947
3948 while (--i >= 0) {
3949 d->p[i] = 0;
3950 }
3951
3952 return predtest_ones(d, oprsz, esz_mask);
3953}
3954
23fbe79f
RH
3955/* Recursive reduction with a given binary function;
3956 * Cf. the ARM ARM function ReducePredicated.
3957 *
3958 * While it would be possible to write this without the DATA temporary,
3959 * it is much simpler to process the predicate register this way.
3960 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
3961 * little to gain with a more complex non-recursive form.
3962 */
3963#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
3964static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
3965{ \
3966 if (n == 1) { \
3967 return *data; \
3968 } else { \
3969 uintptr_t half = n / 2; \
3970 TYPE lo = NAME##_reduce(data, status, half); \
3971 TYPE hi = NAME##_reduce(data + half, status, half); \
3972 return TYPE##_##FUNC(lo, hi, status); \
3973 } \
3974} \
3975uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
3976{ \
c648c9b7 3977 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc); \
23fbe79f
RH
3978 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
3979 for (i = 0; i < oprsz; ) { \
3980 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
3981 do { \
3982 TYPE nn = *(TYPE *)(vn + H(i)); \
3983 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
3984 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
3985 } while (i & 15); \
3986 } \
3987 for (; i < maxsz; i += sizeof(TYPE)) { \
3988 *(TYPE *)((void *)data + i) = IDENT; \
3989 } \
3990 return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
3991}
3992
3993DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
3994DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
3995DO_REDUCE(sve_faddv_d, float64, , add, float64_zero)
3996
3997/* Identity is floatN_default_nan, without the function call. */
3998DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
3999DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
4000DO_REDUCE(sve_fminnmv_d, float64, , minnum, 0x7FF8000000000000ULL)
4001
4002DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
4003DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
4004DO_REDUCE(sve_fmaxnmv_d, float64, , maxnum, 0x7FF8000000000000ULL)
4005
4006DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
4007DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
4008DO_REDUCE(sve_fminv_d, float64, , min, float64_infinity)
4009
4010DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
4011DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
4012DO_REDUCE(sve_fmaxv_d, float64, , max, float64_chs(float64_infinity))
4013
4014#undef DO_REDUCE
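/*
 * Illustration of the reduction tree: with maxsz covering 8 elements,
 * sve_faddv_s computes ((d0+d1) + (d2+d3)) + ((d4+d5) + (d6+d7)),
 * where inactive and trailing lanes hold the identity value IDENT.
 */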
4015
7f9ddf64
RH
4016uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
4017 void *status, uint32_t desc)
4018{
4019 intptr_t i = 0, opr_sz = simd_oprsz(desc);
4020 float16 result = nn;
4021
4022 do {
4023 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4024 do {
4025 if (pg & 1) {
4026 float16 mm = *(float16 *)(vm + H1_2(i));
4027 result = float16_add(result, mm, status);
4028 }
4029 i += sizeof(float16), pg >>= sizeof(float16);
4030 } while (i & 15);
4031 } while (i < opr_sz);
4032
4033 return result;
4034}
4035
4036uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
4037 void *status, uint32_t desc)
4038{
4039 intptr_t i = 0, opr_sz = simd_oprsz(desc);
4040 float32 result = nn;
4041
4042 do {
4043 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4044 do {
4045 if (pg & 1) {
4046 float32 mm = *(float32 *)(vm + H1_2(i));
4047 result = float32_add(result, mm, status);
4048 }
4049 i += sizeof(float32), pg >>= sizeof(float32);
4050 } while (i & 15);
4051 } while (i < opr_sz);
4052
4053 return result;
4054}
4055
4056uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
4057 void *status, uint32_t desc)
4058{
4059 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
4060 uint64_t *m = vm;
4061 uint8_t *pg = vg;
4062
4063 for (i = 0; i < opr_sz; i++) {
4064 if (pg[H1(i)] & 1) {
4065 nn = float64_add(nn, m[i], status);
4066 }
4067 }
4068
4069 return nn;
4070}
4071
ec3b87c2
RH
4072/* Fully general three-operand expander, controlled by a predicate,
4073 * with the extra float_status parameter.
4074 */
4075#define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
4076void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
4077 void *status, uint32_t desc) \
4078{ \
4079 intptr_t i = simd_oprsz(desc); \
4080 uint64_t *g = vg; \
4081 do { \
4082 uint64_t pg = g[(i - 1) >> 6]; \
4083 do { \
4084 i -= sizeof(TYPE); \
4085 if (likely((pg >> (i & 63)) & 1)) { \
4086 TYPE nn = *(TYPE *)(vn + H(i)); \
4087 TYPE mm = *(TYPE *)(vm + H(i)); \
4088 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
4089 } \
4090 } while (i & 63); \
4091 } while (i != 0); \
4092}
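/*
 * The iteration pattern above recurs throughout this file: one 64-bit
 * predicate word governs 64 bytes of vector data (one bit per byte),
 * so the outer loop walks the vector from the top down in chunks of
 * up to 64 bytes, loading a single predicate word per chunk, while
 * the inner loop tests the bit at each element's byte offset.
 */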
4093
4094DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
4095DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
4096DO_ZPZZ_FP(sve_fadd_d, uint64_t, , float64_add)
4097
4098DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
4099DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
4100DO_ZPZZ_FP(sve_fsub_d, uint64_t, , float64_sub)
4101
4102DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
4103DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
4104DO_ZPZZ_FP(sve_fmul_d, uint64_t, , float64_mul)
4105
4106DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
4107DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
4108DO_ZPZZ_FP(sve_fdiv_d, uint64_t, , float64_div)
4109
4110DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
4111DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
4112DO_ZPZZ_FP(sve_fmin_d, uint64_t, , float64_min)
4113
4114DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
4115DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
4116DO_ZPZZ_FP(sve_fmax_d, uint64_t, , float64_max)
4117
4118DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
4119DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
4120DO_ZPZZ_FP(sve_fminnum_d, uint64_t, , float64_minnum)
4121
4122DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
4123DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
4124DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, , float64_maxnum)
4125
4126static inline float16 abd_h(float16 a, float16 b, float_status *s)
4127{
4128 return float16_abs(float16_sub(a, b, s));
4129}
4130
4131static inline float32 abd_s(float32 a, float32 b, float_status *s)
4132{
4133 return float32_abs(float32_sub(a, b, s));
4134}
4135
4136static inline float64 abd_d(float64 a, float64 b, float_status *s)
4137{
4138 return float64_abs(float64_sub(a, b, s));
4139}
4140
4141DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
4142DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
4143DO_ZPZZ_FP(sve_fabd_d, uint64_t, , abd_d)
4144
4145static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
4146{
4147 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
4148 return float64_scalbn(a, b_int, s);
4149}
4150
4151DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
4152DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
4153DO_ZPZZ_FP(sve_fscalbn_d, int64_t, , scalbn_d)
4154
4155DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
4156DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
4157DO_ZPZZ_FP(sve_fmulx_d, uint64_t, , helper_vfp_mulxd)
4158
4159#undef DO_ZPZZ_FP
4160
cc48affe
RH
4161/* Three-operand expander, with one scalar operand, controlled by
4162 * a predicate, with the extra float_status parameter.
4163 */
4164#define DO_ZPZS_FP(NAME, TYPE, H, OP) \
4165void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
4166 void *status, uint32_t desc) \
4167{ \
4168 intptr_t i = simd_oprsz(desc); \
4169 uint64_t *g = vg; \
4170 TYPE mm = scalar; \
4171 do { \
4172 uint64_t pg = g[(i - 1) >> 6]; \
4173 do { \
4174 i -= sizeof(TYPE); \
4175 if (likely((pg >> (i & 63)) & 1)) { \
4176 TYPE nn = *(TYPE *)(vn + H(i)); \
4177 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
4178 } \
4179 } while (i & 63); \
4180 } while (i != 0); \
4181}
4182
4183DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
4184DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
4185DO_ZPZS_FP(sve_fadds_d, float64, , float64_add)
4186
4187DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
4188DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
4189DO_ZPZS_FP(sve_fsubs_d, float64, , float64_sub)
4190
4191DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
4192DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
4193DO_ZPZS_FP(sve_fmuls_d, float64, , float64_mul)
4194
4195static inline float16 subr_h(float16 a, float16 b, float_status *s)
4196{
4197 return float16_sub(b, a, s);
4198}
4199
4200static inline float32 subr_s(float32 a, float32 b, float_status *s)
4201{
4202 return float32_sub(b, a, s);
4203}
4204
4205static inline float64 subr_d(float64 a, float64 b, float_status *s)
4206{
4207 return float64_sub(b, a, s);
4208}
4209
4210DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
4211DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
4212DO_ZPZS_FP(sve_fsubrs_d, float64, , subr_d)
4213
4214DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
4215DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
4216DO_ZPZS_FP(sve_fmaxnms_d, float64, , float64_maxnum)
4217
4218DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
4219DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
4220DO_ZPZS_FP(sve_fminnms_d, float64, , float64_minnum)
4221
4222DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
4223DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
4224DO_ZPZS_FP(sve_fmaxs_d, float64, , float64_max)
4225
4226DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
4227DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
4228DO_ZPZS_FP(sve_fmins_d, float64, , float64_min)
4229
8092c6a3
RH
4230/* Fully general two-operand expander, controlled by a predicate,
4231 * with the extra float_status parameter.
4232 */
4233#define DO_ZPZ_FP(NAME, TYPE, H, OP) \
4234void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
4235{ \
4236 intptr_t i = simd_oprsz(desc); \
4237 uint64_t *g = vg; \
4238 do { \
4239 uint64_t pg = g[(i - 1) >> 6]; \
4240 do { \
4241 i -= sizeof(TYPE); \
4242 if (likely((pg >> (i & 63)) & 1)) { \
4243 TYPE nn = *(TYPE *)(vn + H(i)); \
4244 *(TYPE *)(vd + H(i)) = OP(nn, status); \
4245 } \
4246 } while (i & 63); \
4247 } while (i != 0); \
4248}
4249
46d33d1e
RH
4250/* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
4251 * FZ16. When converting from fp16, this affects flushing input denormals;
4252 * when converting to fp16, this affects flushing output denormals.
4253 */
4254static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
4255{
c120391c 4256 bool save = get_flush_inputs_to_zero(fpst);
46d33d1e
RH
4257 float32 ret;
4258
4259 set_flush_inputs_to_zero(false, fpst);
4260 ret = float16_to_float32(f, true, fpst);
4261 set_flush_inputs_to_zero(save, fpst);
4262 return ret;
4263}
4264
4265static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
4266{
c120391c 4267 bool save = get_flush_inputs_to_zero(fpst);
46d33d1e
RH
4268 float64 ret;
4269
4270 set_flush_inputs_to_zero(false, fpst);
4271 ret = float16_to_float64(f, true, fpst);
4272 set_flush_inputs_to_zero(save, fpst);
4273 return ret;
4274}
4275
4276static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
4277{
c120391c 4278 bool save = get_flush_to_zero(fpst);
46d33d1e
RH
4279 float16 ret;
4280
4281 set_flush_to_zero(false, fpst);
4282 ret = float32_to_float16(f, true, fpst);
4283 set_flush_to_zero(save, fpst);
4284 return ret;
4285}
4286
4287static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
4288{
c120391c 4289 bool save = get_flush_to_zero(fpst);
46d33d1e
RH
4290 float16 ret;
4291
4292 set_flush_to_zero(false, fpst);
4293 ret = float64_to_float16(f, true, fpst);
4294 set_flush_to_zero(save, fpst);
4295 return ret;
4296}
4297
df4de1af
RH
4298static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
4299{
4300 if (float16_is_any_nan(f)) {
4301 float_raise(float_flag_invalid, s);
4302 return 0;
4303 }
4304 return float16_to_int16_round_to_zero(f, s);
4305}
4306
4307static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
4308{
4309 if (float16_is_any_nan(f)) {
4310 float_raise(float_flag_invalid, s);
4311 return 0;
4312 }
4313 return float16_to_int64_round_to_zero(f, s);
4314}
4315
4316static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
4317{
4318 if (float32_is_any_nan(f)) {
4319 float_raise(float_flag_invalid, s);
4320 return 0;
4321 }
4322 return float32_to_int64_round_to_zero(f, s);
4323}
4324
4325static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
4326{
4327 if (float64_is_any_nan(f)) {
4328 float_raise(float_flag_invalid, s);
4329 return 0;
4330 }
4331 return float64_to_int64_round_to_zero(f, s);
4332}
4333
4334static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
4335{
4336 if (float16_is_any_nan(f)) {
4337 float_raise(float_flag_invalid, s);
4338 return 0;
4339 }
4340 return float16_to_uint16_round_to_zero(f, s);
4341}
4342
4343static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
4344{
4345 if (float16_is_any_nan(f)) {
4346 float_raise(float_flag_invalid, s);
4347 return 0;
4348 }
4349 return float16_to_uint64_round_to_zero(f, s);
4350}
4351
4352static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
4353{
4354 if (float32_is_any_nan(f)) {
4355 float_raise(float_flag_invalid, s);
4356 return 0;
4357 }
4358 return float32_to_uint64_round_to_zero(f, s);
4359}
4360
4361static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
4362{
4363 if (float64_is_any_nan(f)) {
4364 float_raise(float_flag_invalid, s);
4365 return 0;
4366 }
4367 return float64_to_uint64_round_to_zero(f, s);
4368}
4369
46d33d1e
RH
4370DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
4371DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
4372DO_ZPZ_FP(sve_fcvt_dh, uint64_t, , sve_f64_to_f16)
4373DO_ZPZ_FP(sve_fcvt_hd, uint64_t, , sve_f16_to_f64)
4374DO_ZPZ_FP(sve_fcvt_ds, uint64_t, , float64_to_float32)
4375DO_ZPZ_FP(sve_fcvt_sd, uint64_t, , float32_to_float64)
4376
df4de1af
RH
4377DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
4378DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
4379DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
4380DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, , vfp_float16_to_int64_rtz)
4381DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, , vfp_float32_to_int64_rtz)
4382DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, , helper_vfp_tosizd)
4383DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, , vfp_float64_to_int64_rtz)
4384
4385DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
4386DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
4387DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
4388DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, , vfp_float16_to_uint64_rtz)
4389DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, , vfp_float32_to_uint64_rtz)
4390DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, , helper_vfp_touizd)
4391DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, , vfp_float64_to_uint64_rtz)
4392
cda3c753
RH
4393DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
4394DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
4395DO_ZPZ_FP(sve_frint_d, uint64_t, , helper_rintd)
4396
4397DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
4398DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
4399DO_ZPZ_FP(sve_frintx_d, uint64_t, , float64_round_to_int)
4400
ec5b375b
RH
4401DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
4402DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
4403DO_ZPZ_FP(sve_frecpx_d, uint64_t, , helper_frecpx_f64)
4404
4405DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
4406DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
4407DO_ZPZ_FP(sve_fsqrt_d, uint64_t, , float64_sqrt)
4408
8092c6a3
RH
4409DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
4410DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
4411DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
4412DO_ZPZ_FP(sve_scvt_sd, uint64_t, , int32_to_float64)
4413DO_ZPZ_FP(sve_scvt_dh, uint64_t, , int64_to_float16)
4414DO_ZPZ_FP(sve_scvt_ds, uint64_t, , int64_to_float32)
4415DO_ZPZ_FP(sve_scvt_dd, uint64_t, , int64_to_float64)
4416
4417DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
4418DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
4419DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
4420DO_ZPZ_FP(sve_ucvt_sd, uint64_t, , uint32_to_float64)
4421DO_ZPZ_FP(sve_ucvt_dh, uint64_t, , uint64_to_float16)
4422DO_ZPZ_FP(sve_ucvt_ds, uint64_t, , uint64_to_float32)
4423DO_ZPZ_FP(sve_ucvt_dd, uint64_t, , uint64_to_float64)
4424
4425#undef DO_ZPZ_FP
4426
08975da9
RH
4427static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
4428 float_status *status, uint32_t desc,
6ceabaad
RH
4429 uint16_t neg1, uint16_t neg3)
4430{
4431 intptr_t i = simd_oprsz(desc);
6ceabaad
RH
4432 uint64_t *g = vg;
4433
4434 do {
4435 uint64_t pg = g[(i - 1) >> 6];
4436 do {
4437 i -= 2;
4438 if (likely((pg >> (i & 63)) & 1)) {
4439 float16 e1, e2, e3, r;
4440
4441 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
4442 e2 = *(uint16_t *)(vm + H1_2(i));
4443 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
08975da9 4444 r = float16_muladd(e1, e2, e3, 0, status);
6ceabaad
RH
4445 *(uint16_t *)(vd + H1_2(i)) = r;
4446 }
4447 } while (i & 63);
4448 } while (i != 0);
4449}
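/*
 * NEG1 and NEG3 are sign-bit masks XORed into the first multiplicand
 * and the addend respectively, which lets the four flavours below
 * share one loop:  fmla d = a + n*m,  fmls d = a - n*m,
 * fnmla d = -a - n*m,  fnmls d = -a + n*m.
 */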
4450
08975da9
RH
4451void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4452 void *vg, void *status, uint32_t desc)
6ceabaad 4453{
08975da9 4454 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
6ceabaad
RH
4455}
4456
08975da9
RH
4457void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4458 void *vg, void *status, uint32_t desc)
6ceabaad 4459{
08975da9 4460 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
6ceabaad
RH
4461}
4462
08975da9
RH
4463void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4464 void *vg, void *status, uint32_t desc)
6ceabaad 4465{
08975da9 4466 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
6ceabaad
RH
4467}
4468
08975da9
RH
4469void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4470 void *vg, void *status, uint32_t desc)
6ceabaad 4471{
08975da9 4472 do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
6ceabaad
RH
4473}
4474
08975da9
RH
4475static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
4476 float_status *status, uint32_t desc,
6ceabaad
RH
4477 uint32_t neg1, uint32_t neg3)
4478{
4479 intptr_t i = simd_oprsz(desc);
6ceabaad
RH
4480 uint64_t *g = vg;
4481
4482 do {
4483 uint64_t pg = g[(i - 1) >> 6];
4484 do {
4485 i -= 4;
4486 if (likely((pg >> (i & 63)) & 1)) {
4487 float32 e1, e2, e3, r;
4488
4489 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
4490 e2 = *(uint32_t *)(vm + H1_4(i));
4491 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
08975da9 4492 r = float32_muladd(e1, e2, e3, 0, status);
6ceabaad
RH
4493 *(uint32_t *)(vd + H1_4(i)) = r;
4494 }
4495 } while (i & 63);
4496 } while (i != 0);
4497}
4498
08975da9
RH
4499void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4500 void *vg, void *status, uint32_t desc)
6ceabaad 4501{
08975da9 4502 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
6ceabaad
RH
4503}
4504
08975da9
RH
4505void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4506 void *vg, void *status, uint32_t desc)
6ceabaad 4507{
08975da9 4508 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
6ceabaad
RH
4509}
4510
08975da9
RH
4511void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4512 void *vg, void *status, uint32_t desc)
6ceabaad 4513{
08975da9 4514 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
6ceabaad
RH
4515}
4516
08975da9
RH
4517void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4518 void *vg, void *status, uint32_t desc)
6ceabaad 4519{
08975da9 4520 do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
6ceabaad
RH
4521}
4522
08975da9
RH
4523static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
4524 float_status *status, uint32_t desc,
6ceabaad
RH
4525 uint64_t neg1, uint64_t neg3)
4526{
4527 intptr_t i = simd_oprsz(desc);
6ceabaad
RH
4528 uint64_t *g = vg;
4529
4530 do {
4531 uint64_t pg = g[(i - 1) >> 6];
4532 do {
4533 i -= 8;
4534 if (likely((pg >> (i & 63)) & 1)) {
4535 float64 e1, e2, e3, r;
4536
4537 e1 = *(uint64_t *)(vn + i) ^ neg1;
4538 e2 = *(uint64_t *)(vm + i);
4539 e3 = *(uint64_t *)(va + i) ^ neg3;
08975da9 4540 r = float64_muladd(e1, e2, e3, 0, status);
6ceabaad
RH
4541 *(uint64_t *)(vd + i) = r;
4542 }
4543 } while (i & 63);
4544 } while (i != 0);
4545}
4546
08975da9
RH
4547void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4548 void *vg, void *status, uint32_t desc)
6ceabaad 4549{
08975da9 4550 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
6ceabaad
RH
4551}
4552
08975da9
RH
4553void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4554 void *vg, void *status, uint32_t desc)
6ceabaad 4555{
08975da9 4556 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
6ceabaad
RH
4557}
4558
08975da9
RH
4559void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4560 void *vg, void *status, uint32_t desc)
6ceabaad 4561{
08975da9 4562 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
6ceabaad
RH
4563}
4564
08975da9
RH
4565void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4566 void *vg, void *status, uint32_t desc)
6ceabaad 4567{
08975da9 4568 do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
6ceabaad
RH
4569}
4570
abfdefd5
RH
4571/* Two operand floating-point comparison controlled by a predicate.
4572 * Unlike the integer version, we are not allowed to optimistically
4573 * compare operands, since the comparison may have side effects wrt
4574 * the FPSR.
4575 */
4576#define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
4577void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
4578 void *status, uint32_t desc) \
4579{ \
4580 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
4581 uint64_t *d = vd, *g = vg; \
4582 do { \
4583 uint64_t out = 0, pg = g[j]; \
4584 do { \
4585 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
4586 if (likely((pg >> (i & 63)) & 1)) { \
4587 TYPE nn = *(TYPE *)(vn + H(i)); \
4588 TYPE mm = *(TYPE *)(vm + H(i)); \
4589 out |= OP(TYPE, nn, mm, status); \
4590 } \
4591 } while (i & 63); \
4592 d[j--] = out; \
4593 } while (i > 0); \
4594}
4595
4596#define DO_FPCMP_PPZZ_H(NAME, OP) \
4597 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
4598#define DO_FPCMP_PPZZ_S(NAME, OP) \
4599 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
4600#define DO_FPCMP_PPZZ_D(NAME, OP) \
4601 DO_FPCMP_PPZZ(NAME##_d, float64, , OP)
4602
4603#define DO_FPCMP_PPZZ_ALL(NAME, OP) \
4604 DO_FPCMP_PPZZ_H(NAME, OP) \
4605 DO_FPCMP_PPZZ_S(NAME, OP) \
4606 DO_FPCMP_PPZZ_D(NAME, OP)
4607
4608#define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
4609#define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
4d2e2a03
RH
4610#define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
4611#define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
abfdefd5
RH
4612#define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
4613#define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
4614#define DO_FCMUO(TYPE, X, Y, ST) \
4615 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
4616#define DO_FACGE(TYPE, X, Y, ST) \
4617 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
4618#define DO_FACGT(TYPE, X, Y, ST) \
4619 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
4620
4621DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
4622DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
4623DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
4624DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
4625DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
4626DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
4627DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
4628
4629#undef DO_FPCMP_PPZZ_ALL
4630#undef DO_FPCMP_PPZZ_D
4631#undef DO_FPCMP_PPZZ_S
4632#undef DO_FPCMP_PPZZ_H
4633#undef DO_FPCMP_PPZZ
4634
4d2e2a03
RH
4635/* One operand floating-point comparison against zero, controlled
4636 * by a predicate.
4637 */
4638#define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
4639void HELPER(NAME)(void *vd, void *vn, void *vg, \
4640 void *status, uint32_t desc) \
4641{ \
4642 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
4643 uint64_t *d = vd, *g = vg; \
4644 do { \
4645 uint64_t out = 0, pg = g[j]; \
4646 do { \
4647 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
4648 if ((pg >> (i & 63)) & 1) { \
4649 TYPE nn = *(TYPE *)(vn + H(i)); \
4650 out |= OP(TYPE, nn, 0, status); \
4651 } \
4652 } while (i & 63); \
4653 d[j--] = out; \
4654 } while (i > 0); \
4655}
4656
4657#define DO_FPCMP_PPZ0_H(NAME, OP) \
4658 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
4659#define DO_FPCMP_PPZ0_S(NAME, OP) \
4660 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
4661#define DO_FPCMP_PPZ0_D(NAME, OP) \
4662 DO_FPCMP_PPZ0(NAME##_d, float64, , OP)
4663
4664#define DO_FPCMP_PPZ0_ALL(NAME, OP) \
4665 DO_FPCMP_PPZ0_H(NAME, OP) \
4666 DO_FPCMP_PPZ0_S(NAME, OP) \
4667 DO_FPCMP_PPZ0_D(NAME, OP)
4668
4669DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
4670DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
4671DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
4672DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
4673DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
4674DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
4675
67fcd9ad
RH
4676/* FP Trig Multiply-Add. */
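/*
 * The coefficient tables below hold polynomial coefficients for the
 * sine series (entries 0-7: 1, -1/6, 1/120, ...) and the cosine
 * series (entries 8-15: 1, -1/2, 1/24, ...), truncated per precision;
 * a negative multiplicand selects the second half via "xx += 8".
 */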
4677
4678void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
4679{
4680 static const float16 coeff[16] = {
4681 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4682 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4683 };
4684 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
4685 intptr_t x = simd_data(desc);
4686 float16 *d = vd, *n = vn, *m = vm;
4687 for (i = 0; i < opr_sz; i++) {
4688 float16 mm = m[i];
4689 intptr_t xx = x;
4690 if (float16_is_neg(mm)) {
4691 mm = float16_abs(mm);
4692 xx += 8;
4693 }
4694 d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
4695 }
4696}
4697
4698void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
4699{
4700 static const float32 coeff[16] = {
4701 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
4702 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
4703 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
4704 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
4705 };
4706 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
4707 intptr_t x = simd_data(desc);
4708 float32 *d = vd, *n = vn, *m = vm;
4709 for (i = 0; i < opr_sz; i++) {
4710 float32 mm = m[i];
4711 intptr_t xx = x;
4712 if (float32_is_neg(mm)) {
4713 mm = float32_abs(mm);
4714 xx += 8;
4715 }
4716 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
4717 }
4718}
4719
4720void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
4721{
4722 static const float64 coeff[16] = {
4723 0x3ff0000000000000ull, 0xbfc5555555555543ull,
4724 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
4725 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
4726 0x3de5d8408868552full, 0x0000000000000000ull,
4727 0x3ff0000000000000ull, 0xbfe0000000000000ull,
4728 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
4729 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
4730 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
4731 };
4732 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
4733 intptr_t x = simd_data(desc);
4734 float64 *d = vd, *n = vn, *m = vm;
4735 for (i = 0; i < opr_sz; i++) {
4736 float64 mm = m[i];
4737 intptr_t xx = x;
4738 if (float64_is_neg(mm)) {
4739 mm = float64_abs(mm);
4740 xx += 8;
4741 }
4742 d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
4743 }
4744}
4745
76a9d9cd
RH
4746/*
4747 * FP Complex Add
4748 */
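/*
 * ROT (simd_data) selects the rotation applied to the second operand,
 * as set up via the sign masks below: rot = 0 adds M rotated by +90
 * degrees (real -= m.imag, imag += m.real); rot = 1 adds M rotated by
 * 270 degrees (real += m.imag, imag -= m.real).
 */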
4749
4750void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
4751 void *vs, uint32_t desc)
4752{
4753 intptr_t j, i = simd_oprsz(desc);
4754 uint64_t *g = vg;
4755 float16 neg_imag = float16_set_sign(0, simd_data(desc));
4756 float16 neg_real = float16_chs(neg_imag);
4757
4758 do {
4759 uint64_t pg = g[(i - 1) >> 6];
4760 do {
4761 float16 e0, e1, e2, e3;
4762
4763 /* I holds the real index; J holds the imag index. */
4764 j = i - sizeof(float16);
4765 i -= 2 * sizeof(float16);
4766
4767 e0 = *(float16 *)(vn + H1_2(i));
4768 e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
4769 e2 = *(float16 *)(vn + H1_2(j));
4770 e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
4771
4772 if (likely((pg >> (i & 63)) & 1)) {
4773 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
4774 }
4775 if (likely((pg >> (j & 63)) & 1)) {
4776 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
4777 }
4778 } while (i & 63);
4779 } while (i != 0);
4780}
4781
4782void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
4783 void *vs, uint32_t desc)
4784{
4785 intptr_t j, i = simd_oprsz(desc);
4786 uint64_t *g = vg;
4787 float32 neg_imag = float32_set_sign(0, simd_data(desc));
4788 float32 neg_real = float32_chs(neg_imag);
4789
4790 do {
4791 uint64_t pg = g[(i - 1) >> 6];
4792 do {
4793 float32 e0, e1, e2, e3;
4794
4795 /* I holds the real index; J holds the imag index. */
4796 j = i - sizeof(float32);
4797 i -= 2 * sizeof(float32);
4798
4799 e0 = *(float32 *)(vn + H1_2(i));
4800 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
4801 e2 = *(float32 *)(vn + H1_2(j));
4802 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
4803
4804 if (likely((pg >> (i & 63)) & 1)) {
4805 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
4806 }
4807 if (likely((pg >> (j & 63)) & 1)) {
4808 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
4809 }
4810 } while (i & 63);
4811 } while (i != 0);
4812}
4813
4814void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
4815 void *vs, uint32_t desc)
4816{
4817 intptr_t j, i = simd_oprsz(desc);
4818 uint64_t *g = vg;
4819 float64 neg_imag = float64_set_sign(0, simd_data(desc));
4820 float64 neg_real = float64_chs(neg_imag);
4821
4822 do {
4823 uint64_t pg = g[(i - 1) >> 6];
4824 do {
4825 float64 e0, e1, e2, e3;
4826
4827 /* I holds the real index; J holds the imag index. */
4828 j = i - sizeof(float64);
4829 i -= 2 * sizeof(float64);
4830
4831 e0 = *(float64 *)(vn + H1_2(i));
4832 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
4833 e2 = *(float64 *)(vn + H1_2(j));
4834 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
4835
4836 if (likely((pg >> (i & 63)) & 1)) {
4837 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
4838 }
4839 if (likely((pg >> (j & 63)) & 1)) {
4840 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
4841 }
4842 } while (i & 63);
4843 } while (i != 0);
4844}
4845
05f48bab
RH
4846/*
4847 * FP Complex Multiply
4848 */
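/*
 * Summary of how ROT maps onto the swap/sign controls used below,
 * with N and M the complex sources and D the accumulator:
 *   rot 0:  d.real += n.real * m.real;  d.imag += n.real * m.imag
 *   rot 1:  d.real -= n.imag * m.imag;  d.imag += n.imag * m.real
 *   rot 2:  d.real -= n.real * m.real;  d.imag -= n.real * m.imag
 *   rot 3:  d.real += n.imag * m.imag;  d.imag -= n.imag * m.real
 * i.e. the four 90-degree rotations of FCMLA.
 */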
4849
08975da9
RH
4850void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
4851 void *vg, void *status, uint32_t desc)
05f48bab
RH
4852{
4853 intptr_t j, i = simd_oprsz(desc);
08975da9 4854 unsigned rot = simd_data(desc);
05f48bab
RH
4855 bool flip = rot & 1;
4856 float16 neg_imag, neg_real;
05f48bab
RH
4857 uint64_t *g = vg;
4858
4859 neg_imag = float16_set_sign(0, (rot & 2) != 0);
4860 neg_real = float16_set_sign(0, rot == 1 || rot == 2);
4861
4862 do {
4863 uint64_t pg = g[(i - 1) >> 6];
4864 do {
4865 float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
4866
4867 /* I holds the real index; J holds the imag index. */
4868 j = i - sizeof(float16);
4869 i -= 2 * sizeof(float16);
4870
4871 nr = *(float16 *)(vn + H1_2(i));
4872 ni = *(float16 *)(vn + H1_2(j));
4873 mr = *(float16 *)(vm + H1_2(i));
4874 mi = *(float16 *)(vm + H1_2(j));
4875
4876 e2 = (flip ? ni : nr);
4877 e1 = (flip ? mi : mr) ^ neg_real;
4878 e4 = e2;
4879 e3 = (flip ? mr : mi) ^ neg_imag;
4880
4881 if (likely((pg >> (i & 63)) & 1)) {
4882 d = *(float16 *)(va + H1_2(i));
08975da9 4883 d = float16_muladd(e2, e1, d, 0, status);
05f48bab
RH
4884 *(float16 *)(vd + H1_2(i)) = d;
4885 }
4886 if (likely((pg >> (j & 63)) & 1)) {
4887 d = *(float16 *)(va + H1_2(j));
08975da9 4888 d = float16_muladd(e4, e3, d, 0, status);
05f48bab
RH
4889 *(float16 *)(vd + H1_2(j)) = d;
4890 }
4891 } while (i & 63);
4892 } while (i != 0);
4893}
4894
08975da9
RH
4895void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
4896 void *vg, void *status, uint32_t desc)
05f48bab
RH
4897{
4898 intptr_t j, i = simd_oprsz(desc);
08975da9 4899 unsigned rot = simd_data(desc);
05f48bab
RH
4900 bool flip = rot & 1;
4901 float32 neg_imag, neg_real;
05f48bab
RH
4902 uint64_t *g = vg;
4903
4904 neg_imag = float32_set_sign(0, (rot & 2) != 0);
4905 neg_real = float32_set_sign(0, rot == 1 || rot == 2);
4906
4907 do {
4908 uint64_t pg = g[(i - 1) >> 6];
4909 do {
4910 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
4911
4912 /* I holds the real index; J holds the imag index. */
4913 j = i - sizeof(float32);
4914 i -= 2 * sizeof(float32);
4915
4916 nr = *(float32 *)(vn + H1_2(i));
4917 ni = *(float32 *)(vn + H1_2(j));
4918 mr = *(float32 *)(vm + H1_2(i));
4919 mi = *(float32 *)(vm + H1_2(j));
4920
4921 e2 = (flip ? ni : nr);
4922 e1 = (flip ? mi : mr) ^ neg_real;
4923 e4 = e2;
4924 e3 = (flip ? mr : mi) ^ neg_imag;
4925
4926 if (likely((pg >> (i & 63)) & 1)) {
4927 d = *(float32 *)(va + H1_2(i));
08975da9 4928 d = float32_muladd(e2, e1, d, 0, status);
05f48bab
RH
4929 *(float32 *)(vd + H1_2(i)) = d;
4930 }
4931 if (likely((pg >> (j & 63)) & 1)) {
4932 d = *(float32 *)(va + H1_2(j));
08975da9 4933 d = float32_muladd(e4, e3, d, 0, status);
05f48bab
RH
4934 *(float32 *)(vd + H1_2(j)) = d;
4935 }
4936 } while (i & 63);
4937 } while (i != 0);
4938}
4939
08975da9
RH
4940void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
4941 void *vg, void *status, uint32_t desc)
05f48bab
RH
4942{
4943 intptr_t j, i = simd_oprsz(desc);
08975da9 4944 unsigned rot = simd_data(desc);
05f48bab
RH
4945 bool flip = rot & 1;
4946 float64 neg_imag, neg_real;
05f48bab
RH
4947 uint64_t *g = vg;
4948
4949 neg_imag = float64_set_sign(0, (rot & 2) != 0);
4950 neg_real = float64_set_sign(0, rot == 1 || rot == 2);
4951
4952 do {
4953 uint64_t pg = g[(i - 1) >> 6];
4954 do {
4955 float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
4956
4957 /* I holds the real index; J holds the imag index. */
4958 j = i - sizeof(float64);
4959 i -= 2 * sizeof(float64);
4960
4961 nr = *(float64 *)(vn + H1_2(i));
4962 ni = *(float64 *)(vn + H1_2(j));
4963 mr = *(float64 *)(vm + H1_2(i));
4964 mi = *(float64 *)(vm + H1_2(j));
4965
4966 e2 = (flip ? ni : nr);
4967 e1 = (flip ? mi : mr) ^ neg_real;
4968 e4 = e2;
4969 e3 = (flip ? mr : mi) ^ neg_imag;
4970
4971 if (likely((pg >> (i & 63)) & 1)) {
4972 d = *(float64 *)(va + H1_2(i));
08975da9 4973 d = float64_muladd(e2, e1, d, 0, status);
05f48bab
RH
4974 *(float64 *)(vd + H1_2(i)) = d;
4975 }
4976 if (likely((pg >> (j & 63)) & 1)) {
4977 d = *(float64 *)(va + H1_2(j));
08975da9 4978 d = float64_muladd(e4, e3, d, 0, status);
05f48bab
RH
4979 *(float64 *)(vd + H1_2(j)) = d;
4980 }
4981 } while (i & 63);
4982 } while (i != 0);
4983}
4984
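/*
 * A minimal scalar sketch of the rotation handling used by the three
 * fcmla helpers above, with plain float arithmetic standing in for
 * float16/32/64_muladd and the predicate test omitted, so rounding
 * mode, FP flags, fusing and NaN handling are ignored.  The name
 * example_fcmla_step is illustrative only.
 */
static inline void example_fcmla_step(float *dr, float *di,
                                      float nr, float ni,
                                      float mr, float mi, unsigned rot)
{
    /* rot = 0,1,2,3 selects a rotation of 0, 90, 180 or 270 degrees. */
    bool flip = rot & 1;
    float e2 = flip ? ni : nr;
    float e1 = (flip ? mi : mr) * ((rot == 1 || rot == 2) ? -1.0f : 1.0f);
    float e3 = (flip ? mr : mi) * ((rot & 2) ? -1.0f : 1.0f);

    *dr += e2 * e1;   /* accumulate into the real element */
    *di += e2 * e3;   /* accumulate into the imaginary element */
}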
c4e7c493
RH
4985/*
4986 * Load contiguous data, protected by a governing predicate.
4987 */
9123aeb6
RH
4988
4989/*
cf4a49b7
RH
4990 * Load one element into @vd + @reg_off from @host.
4991 * The controlling predicate is known to be true.
9123aeb6 4992 */
cf4a49b7 4993typedef void sve_ldst1_host_fn(void *vd, intptr_t reg_off, void *host);
9123aeb6
RH
4994
4995/*
4996 * Load one element into @vd + @reg_off from (@env, @vaddr, @ra).
4997 * The controlling predicate is known to be true.
4998 */
6799ce7b
RH
4999typedef void sve_ldst1_tlb_fn(CPUARMState *env, void *vd, intptr_t reg_off,
5000 target_ulong vaddr, uintptr_t retaddr);
9123aeb6
RH
5001
5002/*
5003 * Generate the above primitives.
5004 */
5005
5006#define DO_LD_HOST(NAME, H, TYPEE, TYPEM, HOST) \
cf4a49b7
RH
5007static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
5008{ \
5009 TYPEM val = HOST(host); \
5010 *(TYPEE *)(vd + H(reg_off)) = val; \
9123aeb6
RH
5011}
5012
0fa476c1
RH
5013#define DO_ST_HOST(NAME, H, TYPEE, TYPEM, HOST) \
5014static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
5015{ HOST(host, (TYPEM)*(TYPEE *)(vd + H(reg_off))); }
5016
6799ce7b 5017#define DO_LD_TLB(NAME, H, TYPEE, TYPEM, TLB) \
9123aeb6 5018static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
6799ce7b 5019 target_ulong addr, uintptr_t ra) \
9123aeb6 5020{ \
c4af8ba1
RH
5021 *(TYPEE *)(vd + H(reg_off)) = \
5022 (TYPEM)TLB(env, useronly_clean_ptr(addr), ra); \
9123aeb6 5023}
6799ce7b
RH
5024
5025#define DO_ST_TLB(NAME, H, TYPEE, TYPEM, TLB) \
9123aeb6 5026static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
6799ce7b 5027 target_ulong addr, uintptr_t ra) \
9123aeb6 5028{ \
c4af8ba1
RH
5029 TLB(env, useronly_clean_ptr(addr), \
5030 (TYPEM)*(TYPEE *)(vd + H(reg_off)), ra); \
9123aeb6 5031}
9123aeb6
RH
5032
5033#define DO_LD_PRIM_1(NAME, H, TE, TM) \
5034 DO_LD_HOST(NAME, H, TE, TM, ldub_p) \
6799ce7b 5035 DO_LD_TLB(NAME, H, TE, TM, cpu_ldub_data_ra)
9123aeb6
RH
5036
5037DO_LD_PRIM_1(ld1bb, H1, uint8_t, uint8_t)
5038DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t)
5039DO_LD_PRIM_1(ld1bhs, H1_2, uint16_t, int8_t)
5040DO_LD_PRIM_1(ld1bsu, H1_4, uint32_t, uint8_t)
5041DO_LD_PRIM_1(ld1bss, H1_4, uint32_t, int8_t)
5042DO_LD_PRIM_1(ld1bdu, , uint64_t, uint8_t)
5043DO_LD_PRIM_1(ld1bds, , uint64_t, int8_t)
5044
6799ce7b 5045#define DO_ST_PRIM_1(NAME, H, TE, TM) \
0fa476c1 5046 DO_ST_HOST(st1##NAME, H, TE, TM, stb_p) \
6799ce7b
RH
5047 DO_ST_TLB(st1##NAME, H, TE, TM, cpu_stb_data_ra)
5048
5049DO_ST_PRIM_1(bb, H1, uint8_t, uint8_t)
5050DO_ST_PRIM_1(bh, H1_2, uint16_t, uint8_t)
5051DO_ST_PRIM_1(bs, H1_4, uint32_t, uint8_t)
5052DO_ST_PRIM_1(bd, , uint64_t, uint8_t)
9123aeb6 5053
6799ce7b
RH
5054#define DO_LD_PRIM_2(NAME, H, TE, TM, LD) \
5055 DO_LD_HOST(ld1##NAME##_be, H, TE, TM, LD##_be_p) \
5056 DO_LD_HOST(ld1##NAME##_le, H, TE, TM, LD##_le_p) \
5057 DO_LD_TLB(ld1##NAME##_be, H, TE, TM, cpu_##LD##_be_data_ra) \
5058 DO_LD_TLB(ld1##NAME##_le, H, TE, TM, cpu_##LD##_le_data_ra)
9123aeb6 5059
6799ce7b 5060#define DO_ST_PRIM_2(NAME, H, TE, TM, ST) \
0fa476c1
RH
5061 DO_ST_HOST(st1##NAME##_be, H, TE, TM, ST##_be_p) \
5062 DO_ST_HOST(st1##NAME##_le, H, TE, TM, ST##_le_p) \
6799ce7b
RH
5063 DO_ST_TLB(st1##NAME##_be, H, TE, TM, cpu_##ST##_be_data_ra) \
5064 DO_ST_TLB(st1##NAME##_le, H, TE, TM, cpu_##ST##_le_data_ra)
9123aeb6 5065
6799ce7b
RH
5066DO_LD_PRIM_2(hh, H1_2, uint16_t, uint16_t, lduw)
5067DO_LD_PRIM_2(hsu, H1_4, uint32_t, uint16_t, lduw)
5068DO_LD_PRIM_2(hss, H1_4, uint32_t, int16_t, lduw)
5069DO_LD_PRIM_2(hdu, , uint64_t, uint16_t, lduw)
5070DO_LD_PRIM_2(hds, , uint64_t, int16_t, lduw)
9123aeb6 5071
6799ce7b
RH
5072DO_ST_PRIM_2(hh, H1_2, uint16_t, uint16_t, stw)
5073DO_ST_PRIM_2(hs, H1_4, uint32_t, uint16_t, stw)
5074DO_ST_PRIM_2(hd, , uint64_t, uint16_t, stw)
9123aeb6 5075
6799ce7b
RH
5076DO_LD_PRIM_2(ss, H1_4, uint32_t, uint32_t, ldl)
5077DO_LD_PRIM_2(sdu, , uint64_t, uint32_t, ldl)
5078DO_LD_PRIM_2(sds, , uint64_t, int32_t, ldl)
9123aeb6 5079
6799ce7b
RH
5080DO_ST_PRIM_2(ss, H1_4, uint32_t, uint32_t, stl)
5081DO_ST_PRIM_2(sd, , uint64_t, uint32_t, stl)
5082
5083DO_LD_PRIM_2(dd, , uint64_t, uint64_t, ldq)
5084DO_ST_PRIM_2(dd, , uint64_t, uint64_t, stq)
9123aeb6
RH
5085
5086#undef DO_LD_TLB
6799ce7b 5087#undef DO_ST_TLB
9123aeb6
RH
5088#undef DO_LD_HOST
5089#undef DO_LD_PRIM_1
6799ce7b 5090#undef DO_ST_PRIM_1
9123aeb6 5091#undef DO_LD_PRIM_2
6799ce7b 5092#undef DO_ST_PRIM_2
9123aeb6
RH
5093
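/*
 * For reference, a hand expansion of one generated primitive,
 * DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t): load one byte from
 * host memory and zero-extend it into a 16-bit vector element, with
 * the host-endian fixup H1_2 placing the element correctly within its
 * 64-bit chunk.  Shown only as an illustration of the macro output;
 * the matching sve_ld1bhu_tlb variant is generated the same way.
 *
 *   static void sve_ld1bhu_host(void *vd, intptr_t reg_off, void *host)
 *   {
 *       uint8_t val = ldub_p(host);
 *       *(uint16_t *)(vd + H1_2(reg_off)) = val;
 *   }
 */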
5094/*
5095 * Skip through a sequence of inactive elements in the guarding predicate @vg,
5096 * beginning at @reg_off bounded by @reg_max. Return the offset of the active
5097 * element >= @reg_off, or @reg_max if there were no active elements at all.
5098 */
5099static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
5100 intptr_t reg_max, int esz)
5101{
5102 uint64_t pg_mask = pred_esz_masks[esz];
5103 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
5104
5105 /* In normal usage, the first element is active. */
5106 if (likely(pg & 1)) {
5107 return reg_off;
5108 }
5109
5110 if (pg == 0) {
5111 reg_off &= -64;
5112 do {
5113 reg_off += 64;
5114 if (unlikely(reg_off >= reg_max)) {
5115 /* The entire predicate was false. */
5116 return reg_max;
5117 }
5118 pg = vg[reg_off >> 6] & pg_mask;
5119 } while (pg == 0);
5120 }
5121 reg_off += ctz64(pg);
5122
5123 /* We should never see an out of range predicate bit set. */
5124 tcg_debug_assert(reg_off < reg_max);
5125 return reg_off;
5126}
5127
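/*
 * Worked example with hypothetical predicate contents: for esz == 2
 * (32-bit elements) the predicate bits sit at every 4th bit position,
 * so pred_esz_masks[2] keeps only those bits.  With vg[0] == 0x10000
 * and reg_off == 0, bit 0 is clear but pg is non-zero, so the search
 * falls through to ctz64(pg) == 16 and returns byte offset 16, i.e.
 * the fifth 32-bit element is the first active one.
 */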
b4cd95d2
RH
5128/*
5129 * Resolve the guest virtual address to info->host and info->flags.
5130 * If @nofault, return false if the page is invalid, otherwise
5131 * exit via page fault exception.
5132 */
5133
5134typedef struct {
5135 void *host;
5136 int flags;
5137 MemTxAttrs attrs;
5138} SVEHostPage;
5139
5140static bool sve_probe_page(SVEHostPage *info, bool nofault,
5141 CPUARMState *env, target_ulong addr,
5142 int mem_off, MMUAccessType access_type,
5143 int mmu_idx, uintptr_t retaddr)
5144{
5145 int flags;
5146
5147 addr += mem_off;
c4af8ba1
RH
5148
5149 /*
5150 * User-only currently always issues with TBI. See the comment
5151 * above useronly_clean_ptr. Usually we clean this top byte away
5152 * during translation, but we can't do that for e.g. vector + imm
5153 * addressing modes.
5154 *
5155 * We currently always enable TBI for user-only, and do not provide
5156 * a way to turn it off. So clean the pointer unconditionally here,
5157 * rather than look it up here, or pass it down from above.
5158 */
5159 addr = useronly_clean_ptr(addr);
5160
b4cd95d2
RH
5161 flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault,
5162 &info->host, retaddr);
5163 info->flags = flags;
5164
5165 if (flags & TLB_INVALID_MASK) {
5166 g_assert(nofault);
5167 return false;
5168 }
5169
5170 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
5171 info->host -= mem_off;
5172
5173#ifdef CONFIG_USER_ONLY
5174 memset(&info->attrs, 0, sizeof(info->attrs));
5175#else
5176 /*
5177 * Find the iotlbentry for addr and return the transaction attributes.
5178 * This *must* be present in the TLB because we just found the mapping.
5179 */
5180 {
5181 uintptr_t index = tlb_index(env, mmu_idx, addr);
5182
5183# ifdef CONFIG_DEBUG_TCG
5184 CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
5185 target_ulong comparator = (access_type == MMU_DATA_LOAD
5186 ? entry->addr_read
5187 : tlb_addr_write(entry));
5188 g_assert(tlb_hit(comparator, addr));
5189# endif
5190
5191 CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
5192 info->attrs = iotlbentry->attrs;
5193 }
5194#endif
5195
5196 return true;
5197}
5198
5199
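/*
 * A minimal usage sketch (hypothetical helper name): probe the page for
 * a single byte load, taking any fault immediately (nofault == false),
 * then read through the returned host pointer.  This assumes the page
 * turned out to be ordinary host RAM, i.e. info.flags has neither
 * TLB_INVALID_MASK nor TLB_MMIO set, so info.host is dereferenceable.
 */
static inline uint8_t example_probe_and_load_byte(CPUARMState *env,
                                                  target_ulong addr,
                                                  uintptr_t retaddr)
{
    SVEHostPage info;

    sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
                   cpu_mmu_index(env, false), retaddr);
    return ldub_p(info.host);
}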
5200/*
5201 * Analyse contiguous data, protected by a governing predicate.
5202 */
5203
5204typedef enum {
5205 FAULT_NO,
5206 FAULT_FIRST,
5207 FAULT_ALL,
5208} SVEContFault;
5209
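/*
 * These three modes map onto the instruction families handled below:
 * FAULT_ALL for ordinary LD1/ST1 and the multi-register forms, where
 * every active element may fault; FAULT_FIRST for LDFF1, where only
 * the first active element may take a real fault and later problems
 * merely truncate FFR; and FAULT_NO for LDNF1, where no element may
 * fault at all.
 */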
5210typedef struct {
5211 /*
5212 * First and last element wholly contained within the two pages.
5213 * mem_off_first[0] and reg_off_first[0] are always set >= 0.
5214 * reg_off_last[0] may be < 0 if the first element crosses pages.
5215 * All of mem_off_first[1], reg_off_first[1] and reg_off_last[1]
5216 * are set >= 0 only if there are complete elements on a second page.
5217 *
5218 * The reg_off_* offsets are relative to the internal vector register.
5219 * The mem_off_first offset is relative to the memory address; the
5220 * two offsets are different when a load operation extends, a store
5221 * operation truncates, or for multi-register operations.
5222 */
5223 int16_t mem_off_first[2];
5224 int16_t reg_off_first[2];
5225 int16_t reg_off_last[2];
5226
5227 /*
5228 * One element that is misaligned and spans both pages,
5229 * or -1 if there is no such active element.
5230 */
5231 int16_t mem_off_split;
5232 int16_t reg_off_split;
5233
5234 /*
5235 * The byte offset at which the entire operation crosses a page boundary.
5236 * Set >= 0 if and only if the entire operation spans two pages.
5237 */
5238 int16_t page_split;
5239
5240 /* TLB data for the two pages. */
5241 SVEHostPage page[2];
5242} SVEContLdSt;
5243
5244/*
5245 * Find first active element on each page, and a loose bound for the
5246 * final element on each page. Identify any single element that spans
5247 * the page boundary. Return true if there are any active elements.
5248 */
b854fd06
RH
5249static bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr,
5250 uint64_t *vg, intptr_t reg_max,
5251 int esz, int msize)
b4cd95d2
RH
5252{
5253 const int esize = 1 << esz;
5254 const uint64_t pg_mask = pred_esz_masks[esz];
5255 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
5256 intptr_t mem_off_last, mem_off_split;
5257 intptr_t page_split, elt_split;
5258 intptr_t i;
5259
5260 /* Set all of the element indices to -1, and the TLB data to 0. */
5261 memset(info, -1, offsetof(SVEContLdSt, page));
5262 memset(info->page, 0, sizeof(info->page));
5263
5264 /* Gross scan over the entire predicate to find bounds. */
5265 i = 0;
5266 do {
5267 uint64_t pg = vg[i] & pg_mask;
5268 if (pg) {
5269 reg_off_last = i * 64 + 63 - clz64(pg);
5270 if (reg_off_first < 0) {
5271 reg_off_first = i * 64 + ctz64(pg);
5272 }
5273 }
5274 } while (++i * 64 < reg_max);
5275
5276 if (unlikely(reg_off_first < 0)) {
5277 /* No active elements, no pages touched. */
5278 return false;
5279 }
5280 tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
5281
5282 info->reg_off_first[0] = reg_off_first;
5283 info->mem_off_first[0] = (reg_off_first >> esz) * msize;
5284 mem_off_last = (reg_off_last >> esz) * msize;
5285
5286 page_split = -(addr | TARGET_PAGE_MASK);
5287 if (likely(mem_off_last + msize <= page_split)) {
5288 /* The entire operation fits within a single page. */
5289 info->reg_off_last[0] = reg_off_last;
5290 return true;
5291 }
5292
5293 info->page_split = page_split;
5294 elt_split = page_split / msize;
5295 reg_off_split = elt_split << esz;
5296 mem_off_split = elt_split * msize;
5297
5298 /*
5299 * This is the last full element on the first page, but it is not
5300 * necessarily active. If there is no full element, i.e. the first
5301 * active element is the one that's split, this value remains -1.
5302 * It is useful as an iteration bound.
5303 */
5304 if (elt_split != 0) {
5305 info->reg_off_last[0] = reg_off_split - esize;
5306 }
5307
5308 /* Determine if an unaligned element spans the pages. */
5309 if (page_split % msize != 0) {
5310 /* It is helpful to know if the split element is active. */
5311 if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
5312 info->reg_off_split = reg_off_split;
5313 info->mem_off_split = mem_off_split;
5314
5315 if (reg_off_split == reg_off_last) {
5316 /* The page crossing element is last. */
5317 return true;
5318 }
5319 }
5320 reg_off_split += esize;
5321 mem_off_split += msize;
5322 }
5323
5324 /*
5325 * We do want the first active element on the second page, because
5326 * this may affect the address reported in an exception.
5327 */
5328 reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
5329 tcg_debug_assert(reg_off_split <= reg_off_last);
5330 info->reg_off_first[1] = reg_off_split;
5331 info->mem_off_first[1] = (reg_off_split >> esz) * msize;
5332 info->reg_off_last[1] = reg_off_last;
5333 return true;
5334}
5335
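/*
 * Worked example with hypothetical values: a fully-active 4-element
 * vector of 64-bit elements (esz == 3, msize == 8, reg_max == 32)
 * whose base address lies 20 bytes before a page boundary, so
 * page_split == 20.  Elements 0 and 1 fit on the first page, element 2
 * straddles the boundary, element 3 is wholly on the second page.
 * The analysis above then yields:
 *   reg_off_first = { 0, 24 }   mem_off_first = { 0, 24 }
 *   reg_off_last  = { 8, 24 }
 *   reg_off_split = mem_off_split = 16,  page_split = 20
 */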
5336/*
5337 * Resolve the guest virtual addresses to info->page[].
5338 * Control the generation of page faults with @fault. Return false if
5339 * there is no work to do, which can only happen with @fault == FAULT_NO.
5340 */
b854fd06
RH
5341static bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
5342 CPUARMState *env, target_ulong addr,
5343 MMUAccessType access_type, uintptr_t retaddr)
b4cd95d2
RH
5344{
5345 int mmu_idx = cpu_mmu_index(env, false);
5346 int mem_off = info->mem_off_first[0];
5347 bool nofault = fault == FAULT_NO;
5348 bool have_work = true;
5349
5350 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
5351 access_type, mmu_idx, retaddr)) {
5352 /* No work to be done. */
5353 return false;
5354 }
5355
5356 if (likely(info->page_split < 0)) {
5357 /* The entire operation was on the one page. */
5358 return true;
5359 }
5360
5361 /*
5362 * If the second page is invalid, then we want the fault address to be
5363 * the first byte on that page which is accessed.
5364 */
5365 if (info->mem_off_split >= 0) {
5366 /*
5367 * There is an element split across the pages. The fault address
5368 * should be the first byte of the second page.
5369 */
5370 mem_off = info->page_split;
5371 /*
5372 * If the split element is also the first active element
5373 * of the vector, then: For first-fault we should continue
5374 * to generate faults for the second page. For no-fault,
5375 * we have work only if the second page is valid.
5376 */
5377 if (info->mem_off_first[0] < info->mem_off_split) {
5378 nofault = FAULT_FIRST;
5379 have_work = false;
5380 }
5381 } else {
5382 /*
5383 * There is no element split across the pages. The fault address
5384 * should be the first active element on the second page.
5385 */
5386 mem_off = info->mem_off_first[1];
5387 /*
5388 * There must have been one active element on the first page,
5389 * so we're out of first-fault territory.
5390 */
5391 nofault = fault != FAULT_ALL;
5392 }
5393
5394 have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
5395 access_type, mmu_idx, retaddr);
5396 return have_work;
5397}
5398
4bcc3f0f
RH
5399static void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
5400 uint64_t *vg, target_ulong addr,
5401 int esize, int msize, int wp_access,
5402 uintptr_t retaddr)
5403{
5404#ifndef CONFIG_USER_ONLY
5405 intptr_t mem_off, reg_off, reg_last;
5406 int flags0 = info->page[0].flags;
5407 int flags1 = info->page[1].flags;
5408
5409 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
5410 return;
5411 }
5412
5413 /* Indicate that watchpoints are handled. */
5414 info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
5415 info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
5416
5417 if (flags0 & TLB_WATCHPOINT) {
5418 mem_off = info->mem_off_first[0];
5419 reg_off = info->reg_off_first[0];
5420 reg_last = info->reg_off_last[0];
5421
5422 while (reg_off <= reg_last) {
5423 uint64_t pg = vg[reg_off >> 6];
5424 do {
5425 if ((pg >> (reg_off & 63)) & 1) {
5426 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5427 msize, info->page[0].attrs,
5428 wp_access, retaddr);
5429 }
5430 reg_off += esize;
5431 mem_off += msize;
5432 } while (reg_off <= reg_last && (reg_off & 63));
5433 }
5434 }
5435
5436 mem_off = info->mem_off_split;
5437 if (mem_off >= 0) {
5438 cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
5439 info->page[0].attrs, wp_access, retaddr);
5440 }
5441
5442 mem_off = info->mem_off_first[1];
5443 if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
5444 reg_off = info->reg_off_first[1];
5445 reg_last = info->reg_off_last[1];
5446
5447 do {
5448 uint64_t pg = vg[reg_off >> 6];
5449 do {
5450 if ((pg >> (reg_off & 63)) & 1) {
5451 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
5452 msize, info->page[1].attrs,
5453 wp_access, retaddr);
5454 }
5455 reg_off += esize;
5456 mem_off += msize;
5457 } while (reg_off & 63);
5458 } while (reg_off <= reg_last);
5459 }
5460#endif
5461}
5462
4c3310c7
RH
5463static void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
5464 uint64_t *vg, target_ulong addr, int esize,
5465 int msize, uint32_t mtedesc, uintptr_t ra)
206adacf
RH
5466{
5467 intptr_t mem_off, reg_off, reg_last;
5468
5469 /* Process the page only if MemAttr == Tagged. */
5470 if (arm_tlb_mte_tagged(&info->page[0].attrs)) {
5471 mem_off = info->mem_off_first[0];
5472 reg_off = info->reg_off_first[0];
5473 reg_last = info->reg_off_split;
5474 if (reg_last < 0) {
5475 reg_last = info->reg_off_last[0];
5476 }
5477
5478 do {
5479 uint64_t pg = vg[reg_off >> 6];
5480 do {
5481 if ((pg >> (reg_off & 63)) & 1) {
4c3310c7 5482 mte_check(env, mtedesc, addr + mem_off, ra);
206adacf
RH
5483 }
5484 reg_off += esize;
5485 mem_off += msize;
5486 } while (reg_off <= reg_last && (reg_off & 63));
5487 } while (reg_off <= reg_last);
5488 }
5489
5490 mem_off = info->mem_off_first[1];
5491 if (mem_off >= 0 && arm_tlb_mte_tagged(&info->page[1].attrs)) {
5492 reg_off = info->reg_off_first[1];
5493 reg_last = info->reg_off_last[1];
5494
5495 do {
5496 uint64_t pg = vg[reg_off >> 6];
5497 do {
5498 if ((pg >> (reg_off & 63)) & 1) {
4c3310c7 5499 mte_check(env, mtedesc, addr + mem_off, ra);
206adacf
RH
5500 }
5501 reg_off += esize;
5502 mem_off += msize;
5503 } while (reg_off & 63);
5504 } while (reg_off <= reg_last);
5505 }
5506}
5507
9123aeb6 5508/*
5c9b8458 5509 * Common helper for all contiguous 1,2,3,4-register predicated loads.
9123aeb6 5510 */
b854fd06 5511static inline QEMU_ALWAYS_INLINE
5c9b8458 5512void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
b854fd06 5513 uint32_t desc, const uintptr_t retaddr,
206adacf 5514 const int esz, const int msz, const int N, uint32_t mtedesc,
b854fd06 5515 sve_ldst1_host_fn *host_fn,
4c3310c7 5516 sve_ldst1_tlb_fn *tlb_fn)
b854fd06 5517{
ba080b86 5518 const unsigned rd = simd_data(desc);
9123aeb6 5519 const intptr_t reg_max = simd_oprsz(desc);
b854fd06
RH
5520 intptr_t reg_off, reg_last, mem_off;
5521 SVEContLdSt info;
9123aeb6 5522 void *host;
5c9b8458 5523 int flags, i;
9123aeb6 5524
b854fd06 5525 /* Find the active elements. */
5c9b8458 5526 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
9123aeb6 5527 /* The entire predicate was false; no load occurs. */
5c9b8458
RH
5528 for (i = 0; i < N; ++i) {
5529 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5530 }
9123aeb6
RH
5531 return;
5532 }
9123aeb6 5533
b854fd06
RH
5534 /* Probe the page(s). Exit with exception for any invalid page. */
5535 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
9123aeb6 5536
4bcc3f0f 5537 /* Handle watchpoints for all active elements. */
5c9b8458 5538 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
4bcc3f0f
RH
5539 BP_MEM_READ, retaddr);
5540
206adacf
RH
5541 /*
5542 * Handle mte checks for all active elements.
5543 * Since TBI must be set for MTE, !mtedesc => !mte_active.
5544 */
4c3310c7
RH
5545 if (mtedesc) {
5546 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
5547 mtedesc, retaddr);
206adacf 5548 }
4bcc3f0f 5549
b854fd06
RH
5550 flags = info.page[0].flags | info.page[1].flags;
5551 if (unlikely(flags != 0)) {
9123aeb6 5552#ifdef CONFIG_USER_ONLY
b854fd06 5553 g_assert_not_reached();
9123aeb6 5554#else
b854fd06 5555 /*
4bcc3f0f 5556 * At least one page includes MMIO.
b854fd06
RH
5557 * Any bus operation can fail with cpu_transaction_failed,
5558 * which for ARM will raise SyncExternal. Perform the load
5559 * into scratch memory to preserve register state until the end.
5560 */
5c9b8458 5561 ARMVectorReg scratch[4] = { };
b854fd06 5562
b854fd06
RH
5563 mem_off = info.mem_off_first[0];
5564 reg_off = info.reg_off_first[0];
5565 reg_last = info.reg_off_last[1];
5566 if (reg_last < 0) {
5567 reg_last = info.reg_off_split;
5568 if (reg_last < 0) {
5569 reg_last = info.reg_off_last[0];
9123aeb6
RH
5570 }
5571 }
5572
b854fd06
RH
5573 do {
5574 uint64_t pg = vg[reg_off >> 6];
5575 do {
5576 if ((pg >> (reg_off & 63)) & 1) {
5c9b8458
RH
5577 for (i = 0; i < N; ++i) {
5578 tlb_fn(env, &scratch[i], reg_off,
5579 addr + mem_off + (i << msz), retaddr);
5580 }
b854fd06
RH
5581 }
5582 reg_off += 1 << esz;
5c9b8458 5583 mem_off += N << msz;
b854fd06
RH
5584 } while (reg_off & 63);
5585 } while (reg_off <= reg_last);
5586
5c9b8458
RH
5587 for (i = 0; i < N; ++i) {
5588 memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
5589 }
b854fd06 5590 return;
9123aeb6 5591#endif
b854fd06
RH
5592 }
5593
5594 /* The entire operation is in RAM, on valid pages. */
5595
5c9b8458
RH
5596 for (i = 0; i < N; ++i) {
5597 memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
5598 }
5599
b854fd06
RH
5600 mem_off = info.mem_off_first[0];
5601 reg_off = info.reg_off_first[0];
5602 reg_last = info.reg_off_last[0];
5603 host = info.page[0].host;
5604
5605 while (reg_off <= reg_last) {
5606 uint64_t pg = vg[reg_off >> 6];
5607 do {
5608 if ((pg >> (reg_off & 63)) & 1) {
5c9b8458
RH
5609 for (i = 0; i < N; ++i) {
5610 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5611 host + mem_off + (i << msz));
5612 }
b854fd06
RH
5613 }
5614 reg_off += 1 << esz;
5c9b8458 5615 mem_off += N << msz;
b854fd06
RH
5616 } while (reg_off <= reg_last && (reg_off & 63));
5617 }
9123aeb6 5618
b854fd06
RH
5619 /*
5620 * Use the slow path to manage the cross-page misalignment.
5621 * But we know this is RAM and cannot trap.
5622 */
5623 mem_off = info.mem_off_split;
5624 if (unlikely(mem_off >= 0)) {
5c9b8458
RH
5625 reg_off = info.reg_off_split;
5626 for (i = 0; i < N; ++i) {
5627 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
5628 addr + mem_off + (i << msz), retaddr);
5629 }
b854fd06
RH
5630 }
5631
5632 mem_off = info.mem_off_first[1];
5633 if (unlikely(mem_off >= 0)) {
5634 reg_off = info.reg_off_first[1];
5635 reg_last = info.reg_off_last[1];
5636 host = info.page[1].host;
5637
5638 do {
5639 uint64_t pg = vg[reg_off >> 6];
5640 do {
5641 if ((pg >> (reg_off & 63)) & 1) {
5c9b8458
RH
5642 for (i = 0; i < N; ++i) {
5643 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
5644 host + mem_off + (i << msz));
5645 }
b854fd06
RH
5646 }
5647 reg_off += 1 << esz;
5c9b8458 5648 mem_off += N << msz;
b854fd06
RH
5649 } while (reg_off & 63);
5650 } while (reg_off <= reg_last);
5651 }
c4e7c493
RH
5652}
5653
206adacf
RH
5654static inline QEMU_ALWAYS_INLINE
5655void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
5656 uint32_t desc, const uintptr_t ra,
5657 const int esz, const int msz, const int N,
5658 sve_ldst1_host_fn *host_fn,
5659 sve_ldst1_tlb_fn *tlb_fn)
5660{
5661 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5662 int bit55 = extract64(addr, 55, 1);
5663
5664 /* Remove mtedesc from the normal sve descriptor. */
5665 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
5666
5667 /* Perform gross MTE suppression early. */
5668 if (!tbi_check(desc, bit55) ||
5669 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
5670 mtedesc = 0;
5671 }
5672
4c3310c7 5673 sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
9123aeb6
RH
5674}
5675
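/*
 * A small sketch of the descriptor layout assumed above (helper name
 * illustrative only): the ordinary SVE simd descriptor occupies the
 * low SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT bits, and the MTE descriptor
 * is packed into the bits above it, so both travel in one uint32_t.
 */
static inline void example_split_mte_desc(uint32_t desc,
                                          uint32_t *sve_desc,
                                          uint32_t *mtedesc)
{
    *mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    *sve_desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
}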
206adacf
RH
5676#define DO_LD1_1(NAME, ESZ) \
5677void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \
5678 target_ulong addr, uint32_t desc) \
5679{ \
5680 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \
4c3310c7 5681 sve_##NAME##_host, sve_##NAME##_tlb); \
206adacf
RH
5682} \
5683void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \
5684 target_ulong addr, uint32_t desc) \
5685{ \
5686 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \
5687 sve_##NAME##_host, sve_##NAME##_tlb); \
5688}
5689
5690#define DO_LD1_2(NAME, ESZ, MSZ) \
5691void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \
5692 target_ulong addr, uint32_t desc) \
5693{ \
5694 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
4c3310c7 5695 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
206adacf
RH
5696} \
5697void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \
5698 target_ulong addr, uint32_t desc) \
5699{ \
5700 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
4c3310c7 5701 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
206adacf
RH
5702} \
5703void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
4c3310c7 5704 target_ulong addr, uint32_t desc) \
206adacf
RH
5705{ \
5706 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
5707 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
5708} \
5709void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
4c3310c7 5710 target_ulong addr, uint32_t desc) \
206adacf
RH
5711{ \
5712 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
5713 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
9123aeb6
RH
5714}
5715
5c9b8458
RH
5716DO_LD1_1(ld1bb, MO_8)
5717DO_LD1_1(ld1bhu, MO_16)
5718DO_LD1_1(ld1bhs, MO_16)
5719DO_LD1_1(ld1bsu, MO_32)
5720DO_LD1_1(ld1bss, MO_32)
5721DO_LD1_1(ld1bdu, MO_64)
5722DO_LD1_1(ld1bds, MO_64)
9123aeb6 5723
5c9b8458
RH
5724DO_LD1_2(ld1hh, MO_16, MO_16)
5725DO_LD1_2(ld1hsu, MO_32, MO_16)
5726DO_LD1_2(ld1hss, MO_32, MO_16)
5727DO_LD1_2(ld1hdu, MO_64, MO_16)
5728DO_LD1_2(ld1hds, MO_64, MO_16)
9123aeb6 5729
5c9b8458
RH
5730DO_LD1_2(ld1ss, MO_32, MO_32)
5731DO_LD1_2(ld1sdu, MO_64, MO_32)
5732DO_LD1_2(ld1sds, MO_64, MO_32)
9123aeb6 5733
5c9b8458 5734DO_LD1_2(ld1dd, MO_64, MO_64)
9123aeb6
RH
5735
5736#undef DO_LD1_1
5737#undef DO_LD1_2
5738
206adacf
RH
5739#define DO_LDN_1(N) \
5740void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \
5741 target_ulong addr, uint32_t desc) \
5742{ \
5743 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \
4c3310c7 5744 sve_ld1bb_host, sve_ld1bb_tlb); \
206adacf
RH
5745} \
5746void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \
5747 target_ulong addr, uint32_t desc) \
5748{ \
5749 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \
5750 sve_ld1bb_host, sve_ld1bb_tlb); \
f27d4dc2
RH
5751}
5752
206adacf
RH
5753#define DO_LDN_2(N, SUFF, ESZ) \
5754void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \
5755 target_ulong addr, uint32_t desc) \
5756{ \
5757 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
4c3310c7 5758 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
206adacf
RH
5759} \
5760void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \
5761 target_ulong addr, uint32_t desc) \
5762{ \
5763 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
4c3310c7 5764 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
206adacf
RH
5765} \
5766void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \
5767 target_ulong addr, uint32_t desc) \
5768{ \
5769 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
5770 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
5771} \
5772void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \
5773 target_ulong addr, uint32_t desc) \
5774{ \
5775 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
5776 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
c4e7c493
RH
5777}
5778
f27d4dc2
RH
5779DO_LDN_1(2)
5780DO_LDN_1(3)
5781DO_LDN_1(4)
c4e7c493 5782
5c9b8458
RH
5783DO_LDN_2(2, hh, MO_16)
5784DO_LDN_2(3, hh, MO_16)
5785DO_LDN_2(4, hh, MO_16)
c4e7c493 5786
5c9b8458
RH
5787DO_LDN_2(2, ss, MO_32)
5788DO_LDN_2(3, ss, MO_32)
5789DO_LDN_2(4, ss, MO_32)
c4e7c493 5790
5c9b8458
RH
5791DO_LDN_2(2, dd, MO_64)
5792DO_LDN_2(3, dd, MO_64)
5793DO_LDN_2(4, dd, MO_64)
c4e7c493 5794
f27d4dc2
RH
5795#undef DO_LDN_1
5796#undef DO_LDN_2
e2654d75
RH
5797
5798/*
5799 * Load contiguous data, first-fault and no-fault.
9123aeb6
RH
5800 *
5801 * For user-only, one could argue that we should hold the mmap_lock during
5802 * the operation so that there is no race between page_check_range and the
5803 * load operation. However, unmapping pages out from under a running thread
5804 * is extraordinarily unlikely. This theoretical race condition also affects
5805 * linux-user/ in its get_user/put_user macros.
5806 *
5807 * TODO: Construct some helpers, written in assembly, that interact with
5808 * handle_cpu_signal to produce memory ops which can properly report errors
5809 * without racing.
e2654d75
RH
5810 */
5811
e2654d75
RH
5812/* Fault on byte I. All bits in FFR from I are cleared. The vector
5813 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
5814 * option, which leaves subsequent data unchanged.
5815 */
5816static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
5817{
5818 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
5819
5820 if (i & 63) {
5821 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
5822 i = ROUND_UP(i, 64);
5823 }
5824 for (; i < oprsz; i += 64) {
5825 ffr[i / 64] = 0;
5826 }
5827}
5828
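/*
 * Worked example with hypothetical values: for a 256-bit vector
 * (oprsz == 32) that faults at byte offset 20, record_fault(env, 20, 32)
 * keeps FFR bits 0..19 via MAKE_64BIT_MASK(0, 20) and clears bits 20
 * and above in that word; i is then rounded up to 64, which already
 * exceeds oprsz, so no further FFR words are touched.  Elements before
 * the fault keep their loaded values; elements at and beyond it are
 * left unchanged, per the MERGE choice described above.
 */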
9123aeb6 5829/*
c647673c 5830 * Common helper for all contiguous no-fault and first-fault loads.
9123aeb6 5831 */
c647673c
RH
5832static inline QEMU_ALWAYS_INLINE
5833void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
aa13f7c3 5834 uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
c647673c
RH
5835 const int esz, const int msz, const SVEContFault fault,
5836 sve_ldst1_host_fn *host_fn,
5837 sve_ldst1_tlb_fn *tlb_fn)
5838{
ba080b86 5839 const unsigned rd = simd_data(desc);
500d0484 5840 void *vd = &env->vfp.zregs[rd];
9123aeb6 5841 const intptr_t reg_max = simd_oprsz(desc);
c647673c
RH
5842 intptr_t reg_off, mem_off, reg_last;
5843 SVEContLdSt info;
5844 int flags;
9123aeb6
RH
5845 void *host;
5846
c647673c
RH
5847 /* Find the active elements. */
5848 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
9123aeb6
RH
5849 /* The entire predicate was false; no load occurs. */
5850 memset(vd, 0, reg_max);
5851 return;
5852 }
c647673c 5853 reg_off = info.reg_off_first[0];
9123aeb6 5854
c647673c
RH
5855 /* Probe the page(s). */
5856 if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
5857 /* Fault on first element. */
5858 tcg_debug_assert(fault == FAULT_NO);
5859 memset(vd, 0, reg_max);
5860 goto do_fault;
5861 }
5862
5863 mem_off = info.mem_off_first[0];
5864 flags = info.page[0].flags;
5865
aa13f7c3
RH
5866 /*
5867 * Disable MTE checking if the Tagged bit is not set. Since TBI must
5868 * be set within MTEDESC for MTE, !mtedesc => !mte_active.
5869 */
5870 if (arm_tlb_mte_tagged(&info.page[0].attrs)) {
5871 mtedesc = 0;
5872 }
5873
c647673c 5874 if (fault == FAULT_FIRST) {
aa13f7c3
RH
5875 /* Trapping mte check for the first-fault element. */
5876 if (mtedesc) {
bd47b61c 5877 mte_check(env, mtedesc, addr + mem_off, retaddr);
aa13f7c3
RH
5878 }
5879
c647673c
RH
5880 /*
5881 * Special handling of the first active element,
5882 * if it crosses a page boundary or is MMIO.
5883 */
5884 bool is_split = mem_off == info.mem_off_split;
c647673c
RH
5885 if (unlikely(flags != 0) || unlikely(is_split)) {
5886 /*
5887 * Use the slow path for cross-page handling.
5888 * Might trap for MMIO or watchpoints.
5889 */
5890 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
5891
5892 /* After any fault, zero the other elements. */
9123aeb6 5893 swap_memzero(vd, reg_off);
c647673c
RH
5894 reg_off += 1 << esz;
5895 mem_off += 1 << msz;
5896 swap_memzero(vd + reg_off, reg_max - reg_off);
5897
5898 if (is_split) {
5899 goto second_page;
5900 }
5901 } else {
5902 memset(vd, 0, reg_max);
5903 }
5904 } else {
5905 memset(vd, 0, reg_max);
5906 if (unlikely(mem_off == info.mem_off_split)) {
5907 /* The first active element crosses a page boundary. */
5908 flags |= info.page[1].flags;
5909 if (unlikely(flags & TLB_MMIO)) {
5910 /* Some page is MMIO, see below. */
5911 goto do_fault;
5912 }
5913 if (unlikely(flags & TLB_WATCHPOINT) &&
5914 (cpu_watchpoint_address_matches
5915 (env_cpu(env), addr + mem_off, 1 << msz)
5916 & BP_MEM_READ)) {
5917 /* Watchpoint hit, see below. */
5918 goto do_fault;
5919 }
d304d280 5920 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
aa13f7c3
RH
5921 goto do_fault;
5922 }
c647673c
RH
5923 /*
5924 * Use the slow path for cross-page handling.
5925 * This is RAM, without a watchpoint, and will not trap.
5926 */
5927 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
5928 goto second_page;
9123aeb6
RH
5929 }
5930 }
5931
9123aeb6 5932 /*
c647673c
RH
5933 * From this point on, all memory operations are MemSingleNF.
5934 *
5935 * Per the MemSingleNF pseudocode, a no-fault load from Device memory
5936 * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
5937 *
5938 * Unfortunately we do not have access to the memory attributes from the
5939 * PTE to tell Device memory from Normal memory. So we make a mostly
5940 * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
5941 * This gives the right answer for the common cases of "Normal memory,
5942 * backed by host RAM" and "Device memory, backed by MMIO".
5943 * The architecture allows us to suppress an NF load and return
5944 * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
5945 * case of "Normal memory, backed by MMIO" is permitted. The case we
5946 * get wrong is "Device memory, backed by host RAM", for which we
5947 * should return (UNKNOWN, FAULT) for but do not.
5948 *
5949 * Similarly, CPU_BP breakpoints would raise exceptions, and so
5950 * return (UNKNOWN, FAULT). For simplicity, we consider gdb and
5951 * architectural breakpoints the same.
9123aeb6 5952 */
c647673c
RH
5953 if (unlikely(flags & TLB_MMIO)) {
5954 goto do_fault;
9123aeb6 5955 }
9123aeb6 5956
c647673c
RH
5957 reg_last = info.reg_off_last[0];
5958 host = info.page[0].host;
9123aeb6 5959
c647673c
RH
5960 do {
5961 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
cf4a49b7 5962 do {
c647673c
RH
5963 if ((pg >> (reg_off & 63)) & 1) {
5964 if (unlikely(flags & TLB_WATCHPOINT) &&
5965 (cpu_watchpoint_address_matches
5966 (env_cpu(env), addr + mem_off, 1 << msz)
5967 & BP_MEM_READ)) {
5968 goto do_fault;
5969 }
d304d280 5970 if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
aa13f7c3
RH
5971 goto do_fault;
5972 }
c647673c
RH
5973 host_fn(vd, reg_off, host + mem_off);
5974 }
cf4a49b7 5975 reg_off += 1 << esz;
c647673c
RH
5976 mem_off += 1 << msz;
5977 } while (reg_off <= reg_last && (reg_off & 63));
5978 } while (reg_off <= reg_last);
9123aeb6 5979
c647673c
RH
5980 /*
5981 * MemSingleNF is allowed to fail for any reason. We have special
5982 * code above to handle the first element crossing a page boundary.
5983 * As an implementation choice, decline to handle a cross-page element
5984 * in any other position.
5985 */
5986 reg_off = info.reg_off_split;
5987 if (reg_off >= 0) {
5988 goto do_fault;
5989 }
9123aeb6 5990
c647673c
RH
5991 second_page:
5992 reg_off = info.reg_off_first[1];
5993 if (likely(reg_off < 0)) {
5994 /* No active elements on the second page. All done. */
9123aeb6
RH
5995 return;
5996 }
9123aeb6 5997
9123aeb6 5998 /*
c647673c
RH
5999 * MemSingleNF is allowed to fail for any reason. As an implementation
6000 * choice, decline to handle elements on the second page. This should
6001 * be low frequency as the guest walks through memory -- the next
6002 * iteration of the guest's loop should be aligned on the page boundary,
6003 * and then all following iterations will stay aligned.
9123aeb6 6004 */
9123aeb6 6005
c647673c 6006 do_fault:
9123aeb6
RH
6007 record_fault(env, reg_off, reg_max);
6008}
6009
aa13f7c3
RH
6010static inline QEMU_ALWAYS_INLINE
6011void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
6012 uint32_t desc, const uintptr_t retaddr,
6013 const int esz, const int msz, const SVEContFault fault,
6014 sve_ldst1_host_fn *host_fn,
6015 sve_ldst1_tlb_fn *tlb_fn)
6016{
6017 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6018 int bit55 = extract64(addr, 55, 1);
6019
6020 /* Remove mtedesc from the normal sve descriptor. */
6021 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6022
6023 /* Perform gross MTE suppression early. */
6024 if (!tbi_check(desc, bit55) ||
6025 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
6026 mtedesc = 0;
6027 }
6028
6029 sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
6030 esz, msz, fault, host_fn, tlb_fn);
6031}
6032
6033#define DO_LDFF1_LDNF1_1(PART, ESZ) \
9123aeb6
RH
6034void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \
6035 target_ulong addr, uint32_t desc) \
e2654d75 6036{ \
aa13f7c3 6037 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
c647673c 6038 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
e2654d75 6039} \
9123aeb6
RH
6040void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \
6041 target_ulong addr, uint32_t desc) \
e2654d75 6042{ \
aa13f7c3
RH
6043 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
6044 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6045} \
6046void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \
6047 target_ulong addr, uint32_t desc) \
6048{ \
6049 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
6050 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
6051} \
6052void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \
6053 target_ulong addr, uint32_t desc) \
6054{ \
6055 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
c647673c 6056 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
e2654d75
RH
6057}
6058
aa13f7c3 6059#define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
7d0a57a2
RH
6060void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \
6061 target_ulong addr, uint32_t desc) \
e2654d75 6062{ \
aa13f7c3 6063 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
c647673c 6064 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
9123aeb6 6065} \
7d0a57a2
RH
6066void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \
6067 target_ulong addr, uint32_t desc) \
9123aeb6 6068{ \
aa13f7c3 6069 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
c647673c 6070 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
7d0a57a2
RH
6071} \
6072void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \
6073 target_ulong addr, uint32_t desc) \
6074{ \
aa13f7c3 6075 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
c647673c 6076 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
7d0a57a2
RH
6077} \
6078void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \
6079 target_ulong addr, uint32_t desc) \
6080{ \
aa13f7c3 6081 sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
c647673c 6082 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
aa13f7c3
RH
6083} \
6084void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
6085 target_ulong addr, uint32_t desc) \
6086{ \
6087 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6088 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6089} \
6090void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
6091 target_ulong addr, uint32_t desc) \
6092{ \
6093 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6094 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
6095} \
6096void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
6097 target_ulong addr, uint32_t desc) \
6098{ \
6099 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
6100 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
6101} \
6102void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
6103 target_ulong addr, uint32_t desc) \
6104{ \
6105 sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
6106 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
e2654d75
RH
6107}
6108
c647673c
RH
6109DO_LDFF1_LDNF1_1(bb, MO_8)
6110DO_LDFF1_LDNF1_1(bhu, MO_16)
6111DO_LDFF1_LDNF1_1(bhs, MO_16)
6112DO_LDFF1_LDNF1_1(bsu, MO_32)
6113DO_LDFF1_LDNF1_1(bss, MO_32)
6114DO_LDFF1_LDNF1_1(bdu, MO_64)
6115DO_LDFF1_LDNF1_1(bds, MO_64)
e2654d75 6116
c647673c
RH
6117DO_LDFF1_LDNF1_2(hh, MO_16, MO_16)
6118DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
6119DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
6120DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
6121DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)
e2654d75 6122
c647673c
RH
6123DO_LDFF1_LDNF1_2(ss, MO_32, MO_32)
6124DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
6125DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)
e2654d75 6126
c647673c 6127DO_LDFF1_LDNF1_2(dd, MO_64, MO_64)
e2654d75 6128
9123aeb6
RH
6129#undef DO_LDFF1_LDNF1_1
6130#undef DO_LDFF1_LDNF1_2
1a039c7e 6131
9fd46c83 6132/*
0fa476c1 6133 * Common helper for all contiguous 1,2,3,4-register predicated stores.
9fd46c83 6134 */
0fa476c1
RH
6135
6136static inline QEMU_ALWAYS_INLINE
71b9f394
RH
6137void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
6138 uint32_t desc, const uintptr_t retaddr,
6139 const int esz, const int msz, const int N, uint32_t mtedesc,
0fa476c1 6140 sve_ldst1_host_fn *host_fn,
4c3310c7 6141 sve_ldst1_tlb_fn *tlb_fn)
9fd46c83 6142{
ba080b86 6143 const unsigned rd = simd_data(desc);
0fa476c1
RH
6144 const intptr_t reg_max = simd_oprsz(desc);
6145 intptr_t reg_off, reg_last, mem_off;
6146 SVEContLdSt info;
6147 void *host;
6148 int i, flags;
1a039c7e 6149
0fa476c1
RH
6150 /* Find the active elements. */
6151 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
6152 /* The entire predicate was false; no store occurs. */
6153 return;
9fd46c83 6154 }
1a039c7e 6155
0fa476c1
RH
6156 /* Probe the page(s). Exit with exception for any invalid page. */
6157 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);
1a039c7e 6158
0fa476c1
RH
6159 /* Handle watchpoints for all active elements. */
6160 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
6161 BP_MEM_WRITE, retaddr);
6162
71b9f394
RH
6163 /*
6164 * Handle mte checks for all active elements.
6165 * Since TBI must be set for MTE, !mtedesc => !mte_active.
6166 */
4c3310c7
RH
6167 if (mtedesc) {
6168 sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
6169 mtedesc, retaddr);
71b9f394 6170 }
0fa476c1
RH
6171
6172 flags = info.page[0].flags | info.page[1].flags;
6173 if (unlikely(flags != 0)) {
6174#ifdef CONFIG_USER_ONLY
6175 g_assert_not_reached();
6176#else
6177 /*
6178 * At least one page includes MMIO.
6179 * Any bus operation can fail with cpu_transaction_failed,
6180 * which for ARM will raise SyncExternal. We cannot avoid
6181 * this fault and will leave with the store incomplete.
6182 */
6183 mem_off = info.mem_off_first[0];
6184 reg_off = info.reg_off_first[0];
6185 reg_last = info.reg_off_last[1];
6186 if (reg_last < 0) {
6187 reg_last = info.reg_off_split;
6188 if (reg_last < 0) {
6189 reg_last = info.reg_off_last[0];
9fd46c83 6190 }
0fa476c1
RH
6191 }
6192
6193 do {
6194 uint64_t pg = vg[reg_off >> 6];
6195 do {
6196 if ((pg >> (reg_off & 63)) & 1) {
6197 for (i = 0; i < N; ++i) {
6198 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6199 addr + mem_off + (i << msz), retaddr);
6200 }
6201 }
6202 reg_off += 1 << esz;
6203 mem_off += N << msz;
6204 } while (reg_off & 63);
6205 } while (reg_off <= reg_last);
6206 return;
6207#endif
1a039c7e 6208 }
1a039c7e 6209
0fa476c1
RH
6210 mem_off = info.mem_off_first[0];
6211 reg_off = info.reg_off_first[0];
6212 reg_last = info.reg_off_last[0];
6213 host = info.page[0].host;
1a039c7e 6214
0fa476c1
RH
6215 while (reg_off <= reg_last) {
6216 uint64_t pg = vg[reg_off >> 6];
9fd46c83 6217 do {
0fa476c1
RH
6218 if ((pg >> (reg_off & 63)) & 1) {
6219 for (i = 0; i < N; ++i) {
6220 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6221 host + mem_off + (i << msz));
6222 }
9fd46c83 6223 }
0fa476c1
RH
6224 reg_off += 1 << esz;
6225 mem_off += N << msz;
6226 } while (reg_off <= reg_last && (reg_off & 63));
1a039c7e 6227 }
1a039c7e 6228
0fa476c1
RH
6229 /*
6230 * Use the slow path to manage the cross-page misalignment.
6231 * But we know this is RAM and cannot trap.
6232 */
6233 mem_off = info.mem_off_split;
6234 if (unlikely(mem_off >= 0)) {
6235 reg_off = info.reg_off_split;
6236 for (i = 0; i < N; ++i) {
6237 tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
6238 addr + mem_off + (i << msz), retaddr);
6239 }
6240 }
6241
6242 mem_off = info.mem_off_first[1];
6243 if (unlikely(mem_off >= 0)) {
6244 reg_off = info.reg_off_first[1];
6245 reg_last = info.reg_off_last[1];
6246 host = info.page[1].host;
1a039c7e 6247
9fd46c83 6248 do {
0fa476c1
RH
6249 uint64_t pg = vg[reg_off >> 6];
6250 do {
6251 if ((pg >> (reg_off & 63)) & 1) {
6252 for (i = 0; i < N; ++i) {
6253 host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
6254 host + mem_off + (i << msz));
6255 }
6256 }
6257 reg_off += 1 << esz;
6258 mem_off += N << msz;
6259 } while (reg_off & 63);
6260 } while (reg_off <= reg_last);
1a039c7e 6261 }
9fd46c83
RH
6262}
6263
71b9f394
RH
6264static inline QEMU_ALWAYS_INLINE
6265void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
6266 uint32_t desc, const uintptr_t ra,
6267 const int esz, const int msz, const int N,
6268 sve_ldst1_host_fn *host_fn,
6269 sve_ldst1_tlb_fn *tlb_fn)
6270{
6271 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6272 int bit55 = extract64(addr, 55, 1);
6273
6274 /* Remove mtedesc from the normal sve descriptor. */
6275 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6276
6277 /* Perform gross MTE suppression early. */
6278 if (!tbi_check(desc, bit55) ||
6279 tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
6280 mtedesc = 0;
6281 }
6282
4c3310c7 6283 sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
1a039c7e 6284}
f6dbf62a 6285
71b9f394
RH
6286#define DO_STN_1(N, NAME, ESZ) \
6287void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \
6288 target_ulong addr, uint32_t desc) \
6289{ \
6290 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \
4c3310c7 6291 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
71b9f394
RH
6292} \
6293void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \
6294 target_ulong addr, uint32_t desc) \
6295{ \
6296 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \
6297 sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
6298}
6299
6300#define DO_STN_2(N, NAME, ESZ, MSZ) \
6301void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \
6302 target_ulong addr, uint32_t desc) \
6303{ \
6304 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
4c3310c7 6305 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
71b9f394
RH
6306} \
6307void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \
6308 target_ulong addr, uint32_t desc) \
6309{ \
6310 sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
4c3310c7 6311 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
71b9f394
RH
6312} \
6313void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
6314 target_ulong addr, uint32_t desc) \
6315{ \
6316 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
6317 sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
6318} \
6319void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
6320 target_ulong addr, uint32_t desc) \
6321{ \
6322 sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
6323 sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
0fa476c1
RH
6324}
6325
6326DO_STN_1(1, bb, MO_8)
6327DO_STN_1(1, bh, MO_16)
6328DO_STN_1(1, bs, MO_32)
6329DO_STN_1(1, bd, MO_64)
6330DO_STN_1(2, bb, MO_8)
6331DO_STN_1(3, bb, MO_8)
6332DO_STN_1(4, bb, MO_8)
6333
6334DO_STN_2(1, hh, MO_16, MO_16)
6335DO_STN_2(1, hs, MO_32, MO_16)
6336DO_STN_2(1, hd, MO_64, MO_16)
6337DO_STN_2(2, hh, MO_16, MO_16)
6338DO_STN_2(3, hh, MO_16, MO_16)
6339DO_STN_2(4, hh, MO_16, MO_16)
6340
6341DO_STN_2(1, ss, MO_32, MO_32)
6342DO_STN_2(1, sd, MO_64, MO_32)
6343DO_STN_2(2, ss, MO_32, MO_32)
6344DO_STN_2(3, ss, MO_32, MO_32)
6345DO_STN_2(4, ss, MO_32, MO_32)
6346
6347DO_STN_2(1, dd, MO_64, MO_64)
6348DO_STN_2(2, dd, MO_64, MO_64)
6349DO_STN_2(3, dd, MO_64, MO_64)
6350DO_STN_2(4, dd, MO_64, MO_64)
9fd46c83
RH
6351
6352#undef DO_STN_1
6353#undef DO_STN_2
6354
d4f75f25
RH
6355/*
6356 * Loads with a vector index.
6357 */
673e9fa6 6358
d4f75f25
RH
6359/*
6360 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
6361 */
6362typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
6363
6364static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
6365{
6366 return *(uint32_t *)(reg + H1_4(reg_ofs));
673e9fa6
RH
6367}
6368
d4f75f25
RH
6369static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
6370{
6371 return *(int32_t *)(reg + H1_4(reg_ofs));
6372}
6373
6374static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
6375{
6376 return (uint32_t)*(uint64_t *)(reg + reg_ofs);
6377}
6378
6379static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
6380{
6381 return (int32_t)*(uint64_t *)(reg + reg_ofs);
6382}
6383
6384static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
6385{
6386 return *(uint64_t *)(reg + reg_ofs);
673e9fa6
RH
6387}
6388
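/*
 * A minimal sketch of how the gather helpers below turn one offset
 * element into an address (illustrative name only): fetch the element
 * at reg_off from the offset vector, zero- or sign-extending as the
 * chosen accessor dictates, then shift by the scale encoded in the
 * descriptor and add the scalar base.
 */
static inline target_ulong example_gather_addr(void *vm, intptr_t reg_off,
                                               target_ulong base, int scale,
                                               zreg_off_fn *off_fn)
{
    return base + (off_fn(vm, reg_off) << scale);
}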
10a85e2c
RH
6389static inline QEMU_ALWAYS_INLINE
6390void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6391 target_ulong base, uint32_t desc, uintptr_t retaddr,
d28d12f0
RH
6392 uint32_t mtedesc, int esize, int msize,
6393 zreg_off_fn *off_fn,
10a85e2c
RH
6394 sve_ldst1_host_fn *host_fn,
6395 sve_ldst1_tlb_fn *tlb_fn)
d4f75f25 6396{
10a85e2c
RH
6397 const int mmu_idx = cpu_mmu_index(env, false);
6398 const intptr_t reg_max = simd_oprsz(desc);
ba080b86 6399 const int scale = simd_data(desc);
10a85e2c
RH
6400 ARMVectorReg scratch;
6401 intptr_t reg_off;
6402 SVEHostPage info, info2;
d4f75f25 6403
10a85e2c
RH
6404 memset(&scratch, 0, reg_max);
6405 reg_off = 0;
6406 do {
6407 uint64_t pg = vg[reg_off >> 6];
d4f75f25
RH
6408 do {
6409 if (likely(pg & 1)) {
10a85e2c
RH
6410 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6411 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
6412
6413 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
6414 mmu_idx, retaddr);
6415
6416 if (likely(in_page >= msize)) {
6417 if (unlikely(info.flags & TLB_WATCHPOINT)) {
6418 cpu_check_watchpoint(env_cpu(env), addr, msize,
6419 info.attrs, BP_MEM_READ, retaddr);
6420 }
d28d12f0 6421 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
bd47b61c 6422 mte_check(env, mtedesc, addr, retaddr);
d28d12f0 6423 }
10a85e2c
RH
6424 host_fn(&scratch, reg_off, info.host);
6425 } else {
6426 /* Element crosses the page boundary. */
6427 sve_probe_page(&info2, false, env, addr + in_page, 0,
6428 MMU_DATA_LOAD, mmu_idx, retaddr);
6429 if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
6430 cpu_check_watchpoint(env_cpu(env), addr,
6431 msize, info.attrs,
6432 BP_MEM_READ, retaddr);
6433 }
d28d12f0 6434 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
bd47b61c 6435 mte_check(env, mtedesc, addr, retaddr);
d28d12f0 6436 }
10a85e2c
RH
6437 tlb_fn(env, &scratch, reg_off, addr, retaddr);
6438 }
d4f75f25 6439 }
10a85e2c
RH
6440 reg_off += esize;
6441 pg >>= esize;
6442 } while (reg_off & 63);
6443 } while (reg_off < reg_max);
d4f75f25
RH
6444
6445 /* Wait until all exceptions have been raised to write back. */
10a85e2c 6446 memcpy(vd, &scratch, reg_max);
d4f75f25
RH
6447}
6448
d28d12f0
RH
6449static inline QEMU_ALWAYS_INLINE
6450void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6451 target_ulong base, uint32_t desc, uintptr_t retaddr,
6452 int esize, int msize, zreg_off_fn *off_fn,
6453 sve_ldst1_host_fn *host_fn,
6454 sve_ldst1_tlb_fn *tlb_fn)
6455{
6456 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6457 /* Remove mtedesc from the normal sve descriptor. */
6458 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6459
6460 /*
6461 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6462 * offset base entirely over the address space hole to change the
6463 * pointer tag, or change the bit55 selector. So we could here
6464 * examine TBI + TCMA like we do for sve_ldN_r_mte().
6465 */
6466 sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6467 esize, msize, off_fn, host_fn, tlb_fn);
6468}
6469
10a85e2c
RH
6470#define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
6471void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
6472 void *vm, target_ulong base, uint32_t desc) \
6473{ \
d28d12f0 6474 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
10a85e2c 6475 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
d28d12f0
RH
6476} \
6477void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6478 void *vm, target_ulong base, uint32_t desc) \
6479{ \
6480 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
6481 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
10a85e2c 6482}
d4f75f25 6483
10a85e2c
RH
6484#define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
6485void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
6486 void *vm, target_ulong base, uint32_t desc) \
6487{ \
d28d12f0 6488 sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
10a85e2c 6489 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
d28d12f0
RH
6490} \
6491void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6492 void *vm, target_ulong base, uint32_t desc) \
6493{ \
6494 sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
6495 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
10a85e2c
RH
6496}
6497
6498DO_LD1_ZPZ_S(bsu, zsu, MO_8)
6499DO_LD1_ZPZ_S(bsu, zss, MO_8)
6500DO_LD1_ZPZ_D(bdu, zsu, MO_8)
6501DO_LD1_ZPZ_D(bdu, zss, MO_8)
6502DO_LD1_ZPZ_D(bdu, zd, MO_8)
6503
6504DO_LD1_ZPZ_S(bss, zsu, MO_8)
6505DO_LD1_ZPZ_S(bss, zss, MO_8)
6506DO_LD1_ZPZ_D(bds, zsu, MO_8)
6507DO_LD1_ZPZ_D(bds, zss, MO_8)
6508DO_LD1_ZPZ_D(bds, zd, MO_8)
6509
6510DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
6511DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
6512DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
6513DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
6514DO_LD1_ZPZ_D(hdu_le, zd, MO_16)
6515
6516DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
6517DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
6518DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
6519DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
6520DO_LD1_ZPZ_D(hdu_be, zd, MO_16)
6521
6522DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
6523DO_LD1_ZPZ_S(hss_le, zss, MO_16)
6524DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
6525DO_LD1_ZPZ_D(hds_le, zss, MO_16)
6526DO_LD1_ZPZ_D(hds_le, zd, MO_16)
6527
6528DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
6529DO_LD1_ZPZ_S(hss_be, zss, MO_16)
6530DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
6531DO_LD1_ZPZ_D(hds_be, zss, MO_16)
6532DO_LD1_ZPZ_D(hds_be, zd, MO_16)
6533
6534DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
6535DO_LD1_ZPZ_S(ss_le, zss, MO_32)
6536DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
6537DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
6538DO_LD1_ZPZ_D(sdu_le, zd, MO_32)
6539
6540DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
6541DO_LD1_ZPZ_S(ss_be, zss, MO_32)
6542DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
6543DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
6544DO_LD1_ZPZ_D(sdu_be, zd, MO_32)
6545
6546DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
6547DO_LD1_ZPZ_D(sds_le, zss, MO_32)
6548DO_LD1_ZPZ_D(sds_le, zd, MO_32)
6549
6550DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
6551DO_LD1_ZPZ_D(sds_be, zss, MO_32)
6552DO_LD1_ZPZ_D(sds_be, zd, MO_32)
6553
6554DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
6555DO_LD1_ZPZ_D(dd_le, zss, MO_64)
6556DO_LD1_ZPZ_D(dd_le, zd, MO_64)
6557
6558DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
6559DO_LD1_ZPZ_D(dd_be, zss, MO_64)
6560DO_LD1_ZPZ_D(dd_be, zd, MO_64)
d4f75f25
RH
6561
6562#undef DO_LD1_ZPZ_S
6563#undef DO_LD1_ZPZ_D
673e9fa6 6564
ed67eb7f
RH
6565/* First fault loads with a vector index. */
6566
116347ce 6567/*
50de9b78 6568 * Common helpers for all gather first-faulting loads.
116347ce 6569 */
50de9b78
RH
6570
6571static inline QEMU_ALWAYS_INLINE
6572void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6573 target_ulong base, uint32_t desc, uintptr_t retaddr,
d28d12f0
RH
6574 uint32_t mtedesc, const int esz, const int msz,
6575 zreg_off_fn *off_fn,
50de9b78
RH
6576 sve_ldst1_host_fn *host_fn,
6577 sve_ldst1_tlb_fn *tlb_fn)
116347ce 6578{
50de9b78 6579 const int mmu_idx = cpu_mmu_index(env, false);
ba080b86
RH
6580 const intptr_t reg_max = simd_oprsz(desc);
6581 const int scale = simd_data(desc);
50de9b78
RH
6582 const int esize = 1 << esz;
6583 const int msize = 1 << msz;
50de9b78
RH
6584 intptr_t reg_off;
6585 SVEHostPage info;
6586 target_ulong addr, in_page;
6587
6588 /* Skip to the first true predicate. */
6589 reg_off = find_next_active(vg, 0, reg_max, esz);
6590 if (unlikely(reg_off >= reg_max)) {
6591 /* The entire predicate was false; no load occurs. */
6592 memset(vd, 0, reg_max);
6593 return;
6594 }
6595
6596 /*
6597 * Probe the first element, allowing faults.
6598 */
6599 addr = base + (off_fn(vm, reg_off) << scale);
d28d12f0 6600 if (mtedesc) {
bd47b61c 6601 mte_check(env, mtedesc, addr, retaddr);
d28d12f0 6602 }
50de9b78 6603 tlb_fn(env, vd, reg_off, addr, retaddr);
ed67eb7f 6604
    /*
     * The first element loaded successfully (or we would have faulted out
     * above); zero every other element, so that elements at and beyond any
     * later, suppressed fault read back as zero.
     */
6606 swap_memzero(vd, reg_off);
6607 reg_off += esize;
6608 swap_memzero(vd + reg_off, reg_max - reg_off);
116347ce 6609
6610 /*
6611 * Probe the remaining elements, not allowing faults.
6612 */
6613 while (reg_off < reg_max) {
6614 uint64_t pg = vg[reg_off >> 6];
6615 do {
6616 if (likely((pg >> (reg_off & 63)) & 1)) {
6617 addr = base + (off_fn(vm, reg_off) << scale);
6618 in_page = -(addr | TARGET_PAGE_MASK);
116347ce 6619
6620 if (unlikely(in_page < msize)) {
6621 /* Stop if the element crosses a page boundary. */
6622 goto fault;
6623 }
ed67eb7f 6624
6625 sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
6626 mmu_idx, retaddr);
6627 if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
6628 goto fault;
6629 }
6630 if (unlikely(info.flags & TLB_WATCHPOINT) &&
6631 (cpu_watchpoint_address_matches
6632 (env_cpu(env), addr, msize) & BP_MEM_READ)) {
6633 goto fault;
6634 }
6635 if (mtedesc &&
6636 arm_tlb_mte_tagged(&info.attrs) &&
d304d280 6637 !mte_probe(env, mtedesc, addr)) {
6638 goto fault;
6639 }
116347ce 6640
50de9b78 6641 host_fn(vd, reg_off, info.host);
116347ce 6642 }
6643 reg_off += esize;
6644 } while (reg_off & 63);
116347ce 6645 }
50de9b78 6646 return;
116347ce 6647
6648 fault:
6649 record_fault(env, reg_off, reg_max);
6650}
6651
6652static inline QEMU_ALWAYS_INLINE
6653void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6654 target_ulong base, uint32_t desc, uintptr_t retaddr,
6655 const int esz, const int msz,
6656 zreg_off_fn *off_fn,
6657 sve_ldst1_host_fn *host_fn,
6658 sve_ldst1_tlb_fn *tlb_fn)
6659{
6660 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6661 /* Remove mtedesc from the normal sve descriptor. */
6662 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6663
    /*
     * ??? TODO: with the 32-bit offset extractions, base + ofs cannot carry
     * base across the address-space hole, so it can change neither the
     * pointer tag nor the bit-55 select.  We could therefore examine
     * TBI + TCMA here, as sve_ldN_r_mte() does, and skip MTE checking when
     * the access is untagged or unchecked.
     */
6670 sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6671 esz, msz, off_fn, host_fn, tlb_fn);
6672}
6673
6674#define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \
6675void HELPER(sve_ldff##MEM##_##OFS) \
6676 (CPUARMState *env, void *vd, void *vg, \
6677 void *vm, target_ulong base, uint32_t desc) \
6678{ \
6679 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \
6680 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6681} \
6682void HELPER(sve_ldff##MEM##_##OFS##_mte) \
6683 (CPUARMState *env, void *vd, void *vg, \
6684 void *vm, target_ulong base, uint32_t desc) \
6685{ \
6686 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \
6687 off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6688}
6689
6690#define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \
6691void HELPER(sve_ldff##MEM##_##OFS) \
6692 (CPUARMState *env, void *vd, void *vg, \
6693 void *vm, target_ulong base, uint32_t desc) \
6694{ \
6695 sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \
6696 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6697} \
6698void HELPER(sve_ldff##MEM##_##OFS##_mte) \
6699 (CPUARMState *env, void *vd, void *vg, \
6700 void *vm, target_ulong base, uint32_t desc) \
6701{ \
6702 sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \
6703 off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
6704}
6705
6706DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
6707DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
6708DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
6709DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
6710DO_LDFF1_ZPZ_D(bdu, zd, MO_8)
6711
6712DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
6713DO_LDFF1_ZPZ_S(bss, zss, MO_8)
6714DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
6715DO_LDFF1_ZPZ_D(bds, zss, MO_8)
6716DO_LDFF1_ZPZ_D(bds, zd, MO_8)
6717
6718DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
6719DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
6720DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
6721DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
6722DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)
6723
6724DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
6725DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
6726DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
6727DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
6728DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)
6729
6730DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
6731DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
6732DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
6733DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
6734DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)
6735
6736DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
6737DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
6738DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
6739DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
6740DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)
6741
6742DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32)
6743DO_LDFF1_ZPZ_S(ss_le, zss, MO_32)
6744DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
6745DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
6746DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)
6747
6748DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32)
6749DO_LDFF1_ZPZ_S(ss_be, zss, MO_32)
6750DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
6751DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
6752DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)
6753
6754DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
6755DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
6756DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)
6757
6758DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
6759DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
6760DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)
6761
6762DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
6763DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
6764DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)
6765
6766DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
6767DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
6768DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
ed67eb7f 6769
6770/* Stores with a vector index. */
6771
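/*
 * Editorial note on the structure of sve_st1_z below: it runs in two passes.
 * The first pass probes every active element (and both pages of any element
 * that crosses a page boundary), so that faults, watchpoints and MTE check
 * failures are all raised before memory is modified; the second pass then
 * performs the stores, reusing the host addresses cached for the common case
 * of an element in RAM.
 */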
6772static inline QEMU_ALWAYS_INLINE
6773void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6774 target_ulong base, uint32_t desc, uintptr_t retaddr,
6775 uint32_t mtedesc, int esize, int msize,
6776 zreg_off_fn *off_fn,
6777 sve_ldst1_host_fn *host_fn,
6778 sve_ldst1_tlb_fn *tlb_fn)
78cf1b88 6779{
6780 const int mmu_idx = cpu_mmu_index(env, false);
6781 const intptr_t reg_max = simd_oprsz(desc);
ba080b86 6782 const int scale = simd_data(desc);
6783 void *host[ARM_MAX_VQ * 4];
6784 intptr_t reg_off, i;
6785 SVEHostPage info, info2;
f6dbf62a 6786
6787 /*
6788 * Probe all of the elements for host addresses and flags.
6789 */
6790 i = reg_off = 0;
6791 do {
6792 uint64_t pg = vg[reg_off >> 6];
78cf1b88 6793 do {
6794 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6795 target_ulong in_page = -(addr | TARGET_PAGE_MASK);
f6dbf62a 6796
6797 host[i] = NULL;
6798 if (likely((pg >> (reg_off & 63)) & 1)) {
6799 if (likely(in_page >= msize)) {
6800 sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
6801 mmu_idx, retaddr);
6802 host[i] = info.host;
6803 } else {
6804 /*
6805 * Element crosses the page boundary.
6806 * Probe both pages, but do not record the host address,
6807 * so that we use the slow path.
6808 */
6809 sve_probe_page(&info, false, env, addr, 0,
6810 MMU_DATA_STORE, mmu_idx, retaddr);
6811 sve_probe_page(&info2, false, env, addr + in_page, 0,
6812 MMU_DATA_STORE, mmu_idx, retaddr);
6813 info.flags |= info2.flags;
6814 }
f6dbf62a 6815
6816 if (unlikely(info.flags & TLB_WATCHPOINT)) {
6817 cpu_check_watchpoint(env_cpu(env), addr, msize,
6818 info.attrs, BP_MEM_WRITE, retaddr);
6819 }
6820
6821 if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
bd47b61c 6822 mte_check(env, mtedesc, addr, retaddr);
d28d12f0 6823 }
6824 }
6825 i += 1;
6826 reg_off += esize;
6827 } while (reg_off & 63);
6828 } while (reg_off < reg_max);
6829
6830 /*
6831 * Now that we have recognized all exceptions except SyncExternal
6832 * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
6833 *
6834 * Note for the common case of an element in RAM, not crossing a page
6835 * boundary, we have stored the host address in host[]. This doubles
6836 * as a first-level check against the predicate, since only enabled
6837 * elements have non-null host addresses.
6838 */
6839 i = reg_off = 0;
6840 do {
6841 void *h = host[i];
6842 if (likely(h != NULL)) {
6843 host_fn(vd, reg_off, h);
6844 } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
6845 target_ulong addr = base + (off_fn(vm, reg_off) << scale);
6846 tlb_fn(env, vd, reg_off, addr, retaddr);
78cf1b88 6847 }
6848 i += 1;
6849 reg_off += esize;
6850 } while (reg_off < reg_max);
78cf1b88 6851}
f6dbf62a 6852
6853static inline QEMU_ALWAYS_INLINE
6854void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
6855 target_ulong base, uint32_t desc, uintptr_t retaddr,
6856 int esize, int msize, zreg_off_fn *off_fn,
6857 sve_ldst1_host_fn *host_fn,
6858 sve_ldst1_tlb_fn *tlb_fn)
6859{
6860 uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6861 /* Remove mtedesc from the normal sve descriptor. */
6862 desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
6863
    /*
     * ??? TODO: with the 32-bit offset extractions, base + ofs cannot carry
     * base across the address-space hole, so it can change neither the
     * pointer tag nor the bit-55 select.  We could therefore examine
     * TBI + TCMA here, as sve_ldN_r_mte() does, and skip MTE checking when
     * the access is untagged or unchecked.
     */
6870 sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
6871 esize, msize, off_fn, host_fn, tlb_fn);
6872}
6873
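/*
 * For the store helpers the MEM suffix needs no signedness, since stores
 * truncate: "bs" stores the low byte of each 32-bit element, "sd_le" the low
 * 32 bits of each 64-bit element little-endian, "dd_be" whole 64-bit elements
 * big-endian, and so on.  The OFS suffixes match the loads above.
 * (Editorial summary inferred from the helper names.)
 */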
6874#define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \
6875void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
88a660a4 6876 void *vm, target_ulong base, uint32_t desc) \
6877{ \
6878 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ, \
6879 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
6880} \
6881void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6882 void *vm, target_ulong base, uint32_t desc) \
6883{ \
6884 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ, \
6885 off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
78cf1b88 6886}
f6dbf62a 6887
6888#define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \
6889void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
88a660a4 6890 void *vm, target_ulong base, uint32_t desc) \
6891{ \
6892 sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ, \
6893 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
6894} \
6895void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
6896 void *vm, target_ulong base, uint32_t desc) \
6897{ \
6898 sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ, \
6899 off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
6900}
6901
6902DO_ST1_ZPZ_S(bs, zsu, MO_8)
6903DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
6904DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
6905DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
6906DO_ST1_ZPZ_S(ss_be, zsu, MO_32)
6907
6908DO_ST1_ZPZ_S(bs, zss, MO_8)
6909DO_ST1_ZPZ_S(hs_le, zss, MO_16)
6910DO_ST1_ZPZ_S(hs_be, zss, MO_16)
6911DO_ST1_ZPZ_S(ss_le, zss, MO_32)
6912DO_ST1_ZPZ_S(ss_be, zss, MO_32)
6913
6914DO_ST1_ZPZ_D(bd, zsu, MO_8)
6915DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
6916DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
6917DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
6918DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
6919DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
6920DO_ST1_ZPZ_D(dd_be, zsu, MO_64)
6921
6922DO_ST1_ZPZ_D(bd, zss, MO_8)
6923DO_ST1_ZPZ_D(hd_le, zss, MO_16)
6924DO_ST1_ZPZ_D(hd_be, zss, MO_16)
6925DO_ST1_ZPZ_D(sd_le, zss, MO_32)
6926DO_ST1_ZPZ_D(sd_be, zss, MO_32)
6927DO_ST1_ZPZ_D(dd_le, zss, MO_64)
6928DO_ST1_ZPZ_D(dd_be, zss, MO_64)
6929
6930DO_ST1_ZPZ_D(bd, zd, MO_8)
6931DO_ST1_ZPZ_D(hd_le, zd, MO_16)
6932DO_ST1_ZPZ_D(hd_be, zd, MO_16)
6933DO_ST1_ZPZ_D(sd_le, zd, MO_32)
6934DO_ST1_ZPZ_D(sd_be, zd, MO_32)
6935DO_ST1_ZPZ_D(dd_le, zd, MO_64)
6936DO_ST1_ZPZ_D(dd_be, zd, MO_64)
6937
6938#undef DO_ST1_ZPZ_S
6939#undef DO_ST1_ZPZ_D
6940
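/*
 * SVE2 bitwise ternary operations, applied 64 bits at a time: EOR3 is a
 * three-way exclusive-or, BCAX is n ^ (m & ~k), and the BSL family selects
 * between n and m under control of k, with BSL1N, BSL2N and NBSL inverting
 * the first source, the second source, or the result respectively.
 * (Editorial summary; the per-helper expressions below are authoritative.)
 */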
6941void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
6942{
6943 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
6944 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
6945
6946 for (i = 0; i < opr_sz; ++i) {
6947 d[i] = n[i] ^ m[i] ^ k[i];
6948 }
6949}
6950
6951void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
6952{
6953 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
6954 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
6955
6956 for (i = 0; i < opr_sz; ++i) {
6957 d[i] = n[i] ^ (m[i] & ~k[i]);
6958 }
6959}
6960
6961void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
6962{
6963 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
6964 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
6965
6966 for (i = 0; i < opr_sz; ++i) {
6967 d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
6968 }
6969}
6970
6971void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
6972{
6973 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
6974 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
6975
6976 for (i = 0; i < opr_sz; ++i) {
6977 d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
6978 }
6979}
6980
6981void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
6982{
6983 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
6984 uint64_t *d = vd, *n = vn, *m = vm, *k = vk;
6985
6986 for (i = 0; i < opr_sz; ++i) {
6987 d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
6988 }
6989}
6990
6991/*
6992 * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n.
6993 * See hasless(v,1) from
6994 * https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
6995 */
6996static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
6997{
6998 int bits = 8 << esz;
6999 uint64_t ones = dup_const(esz, 1);
7000 uint64_t signs = ones << (bits - 1);
7001 uint64_t cmp0, cmp1;
7002
7003 cmp1 = dup_const(esz, n);
7004 cmp0 = cmp1 ^ m0;
7005 cmp1 = cmp1 ^ m1;
7006 cmp0 = (cmp0 - ones) & ~cmp0;
7007 cmp1 = (cmp1 - ones) & ~cmp1;
7008 return (cmp0 | cmp1) & signs;
7009}
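/*
 * Worked example, for illustration (byte elements): with n = 0x55 and
 * m0 = 0x1122334455667788ull, cmp0 = dup(0x55) ^ m0 contains a zero byte
 * exactly where m0 held 0x55, so (cmp0 - ones) & ~cmp0 has a sign bit set
 * and the function reports a match.  If neither m0 nor m1 contains the
 * value, neither cmp has a zero byte, the expression is zero for both, and
 * no match is reported.
 */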
7010
7011static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
7012 uint32_t desc, int esz, bool nmatch)
7013{
7014 uint16_t esz_mask = pred_esz_masks[esz];
7015 intptr_t opr_sz = simd_oprsz(desc);
7016 uint32_t flags = PREDTEST_INIT;
7017 intptr_t i, j, k;
7018
7019 for (i = 0; i < opr_sz; i += 16) {
7020 uint64_t m0 = *(uint64_t *)(vm + i);
7021 uint64_t m1 = *(uint64_t *)(vm + i + 8);
7022 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
7023 uint16_t out = 0;
7024
7025 for (j = 0; j < 16; j += 8) {
7026 uint64_t n = *(uint64_t *)(vn + i + j);
7027
7028 for (k = 0; k < 8; k += 1 << esz) {
7029 if (pg & (1 << (j + k))) {
7030 bool o = do_match2(n >> (k * 8), m0, m1, esz);
7031 out |= (o ^ nmatch) << (j + k);
7032 }
7033 }
7034 }
7035 *(uint16_t *)(vd + H1_2(i >> 3)) = out;
7036 flags = iter_predtest_fwd(out, pg, flags);
7037 }
7038 return flags;
7039}
7040
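/*
 * These instantiate the SVE2 MATCH and NMATCH predicate helpers: for each
 * active element of Zn, the corresponding Pd bit is set when that element
 * does (MATCH) or does not (NMATCH) occur anywhere within the 16-byte
 * segment of Zm containing it, and the returned NZCV flags are the usual
 * predicate test over the result.  (Editorial summary of do_match above.)
 */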
7041#define DO_PPZZ_MATCH(NAME, ESZ, INV) \
7042uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
7043{ \
7044 return do_match(vd, vn, vm, vg, desc, ESZ, INV); \
7045}
7046
7047DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
7048DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)
7049
7050DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
7051DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)
7052
7053#undef DO_PPZZ_MATCH