1/*
2 * ARM SVE Operations
3 *
4 * Copyright (c) 2018 Linaro, Ltd.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18 */
19
20#include "qemu/osdep.h"
21#include "cpu.h"
 22#include "internals.h"
23#include "exec/exec-all.h"
24#include "exec/cpu_ldst.h"
25#include "exec/helper-proto.h"
26#include "tcg/tcg-gvec-desc.h"
 27#include "fpu/softfloat.h"
 28#include "tcg/tcg.h"
29
30
31/* Note that vector data is stored in host-endian 64-bit chunks,
 32 so addressing units smaller than that need a host-endian fixup. */
33#ifdef HOST_WORDS_BIGENDIAN
34#define H1(x) ((x) ^ 7)
35#define H1_2(x) ((x) ^ 6)
36#define H1_4(x) ((x) ^ 4)
37#define H2(x) ((x) ^ 3)
38#define H4(x) ((x) ^ 1)
39#else
40#define H1(x) (x)
41#define H1_2(x) (x)
42#define H1_4(x) (x)
43#define H2(x) (x)
44#define H4(x) (x)
45#endif
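/* For example, on a big-endian host H1(0) == 7: byte element 0 lives in the
 * least significant byte of the first 64-bit chunk, which such a host stores
 * at byte offset 7.  On little-endian hosts all of these macros are the
 * identity and no fixup is applied.
 */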
46
47/* Return a value for NZCV as per the ARM PredTest pseudofunction.
48 *
49 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
50 * and bit 0 set if C is set. Compare the definitions of these variables
51 * within CPUARMState.
52 */
53
54/* For no G bits set, NZCV = C. */
55#define PREDTEST_INIT 1
56
57/* This is an iterative function, called for each Pd and Pg word
58 * moving forward.
59 */
60static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
61{
62 if (likely(g)) {
63 /* Compute N from first D & G.
64 Use bit 2 to signal first G bit seen. */
65 if (!(flags & 4)) {
66 flags |= ((d & (g & -g)) != 0) << 31;
67 flags |= 4;
68 }
69
70 /* Accumulate Z from each D & G. */
71 flags |= ((d & g) != 0) << 1;
72
73 /* Compute C from last !(D & G). Replace previous. */
74 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
75 }
76 return flags;
77}
78
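/* Worked example: iter_predtest_fwd(0x5, 0x7, PREDTEST_INIT) yields
 * 0x80000006: N (bit 31) is set because the first active element is set
 * in D, bit 1 is set because some active element is set (so Z is clear),
 * C (bit 0) is clear because the last active element is also set in D,
 * and bit 2 is merely the internal "first G bit seen" marker.
 */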
79/* This is an iterative function, called for each Pd and Pg word
80 * moving backward.
81 */
82static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
83{
84 if (likely(g)) {
 85 /* Compute C from first (i.e. last) !(D & G).
86 Use bit 2 to signal first G bit seen. */
87 if (!(flags & 4)) {
88 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
89 flags |= (d & pow2floor(g)) == 0;
90 }
91
92 /* Accumulate Z from each D & G. */
93 flags |= ((d & g) != 0) << 1;
94
 95 /* Compute N from last (i.e. first) D & G. Replace previous. */
96 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
97 }
98 return flags;
99}
100
101/* The same for a single word predicate. */
102uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
103{
104 return iter_predtest_fwd(d, g, PREDTEST_INIT);
105}
106
107/* The same for a multi-word predicate. */
108uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
109{
110 uint32_t flags = PREDTEST_INIT;
111 uint64_t *d = vd, *g = vg;
112 uintptr_t i = 0;
113
114 do {
115 flags = iter_predtest_fwd(d[i], g[i], flags);
116 } while (++i < words);
117
118 return flags;
119}
 120
121/* Expand active predicate bits to bytes, for byte elements.
122 * for (i = 0; i < 256; ++i) {
123 * unsigned long m = 0;
124 * for (j = 0; j < 8; j++) {
125 * if ((i >> j) & 1) {
126 * m |= 0xfful << (j << 3);
127 * }
128 * }
129 * printf("0x%016lx,\n", m);
130 * }
131 */
132static inline uint64_t expand_pred_b(uint8_t byte)
133{
134 static const uint64_t word[256] = {
135 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
136 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
137 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
138 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
139 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
140 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
141 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
142 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
143 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
144 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
145 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
146 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
147 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
148 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
149 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
150 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
151 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
152 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
153 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
154 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
155 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
156 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
157 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
158 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
159 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
160 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
161 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
162 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
163 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
164 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
165 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
166 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
167 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
168 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
169 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
170 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
171 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
172 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
173 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
174 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
175 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
176 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
177 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
178 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
179 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
180 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
181 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
182 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
183 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
184 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
185 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
186 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
187 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
188 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
189 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
190 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
191 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
192 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
193 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
194 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
195 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
196 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
197 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
198 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
199 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
200 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
201 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
202 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
203 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
204 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
205 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
206 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
207 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
208 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
209 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
210 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
211 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
212 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
213 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
214 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
215 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
216 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
217 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
218 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
219 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
220 0xffffffffffffffff,
221 };
222 return word[byte];
223}
224
225/* Similarly for half-word elements.
226 * for (i = 0; i < 256; ++i) {
227 * unsigned long m = 0;
228 * if (i & 0xaa) {
229 * continue;
230 * }
231 * for (j = 0; j < 8; j += 2) {
232 * if ((i >> j) & 1) {
233 * m |= 0xfffful << (j << 3);
234 * }
235 * }
236 * printf("[0x%x] = 0x%016lx,\n", i, m);
237 * }
238 */
239static inline uint64_t expand_pred_h(uint8_t byte)
240{
241 static const uint64_t word[] = {
242 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
243 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
244 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
245 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
246 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
247 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
248 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
249 [0x55] = 0xffffffffffffffff,
250 };
251 return word[byte & 0x55];
252}
253
254/* Similarly for single word elements. */
255static inline uint64_t expand_pred_s(uint8_t byte)
256{
257 static const uint64_t word[] = {
258 [0x01] = 0x00000000ffffffffull,
259 [0x10] = 0xffffffff00000000ull,
260 [0x11] = 0xffffffffffffffffull,
261 };
262 return word[byte & 0x11];
263}
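/* For example, expand_pred_b(0x21) == 0x0000ff00000000ff (predicate bits 0
 * and 5 become byte masks), and expand_pred_h(0x05) == 0x00000000ffffffff
 * (the two low half-word predicate bits each cover two bytes).
 */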
264
265/* Swap 16-bit words within a 32-bit word. */
266static inline uint32_t hswap32(uint32_t h)
267{
268 return rol32(h, 16);
269}
270
271/* Swap 16-bit words within a 64-bit word. */
272static inline uint64_t hswap64(uint64_t h)
273{
274 uint64_t m = 0x0000ffff0000ffffull;
275 h = rol64(h, 32);
276 return ((h & m) << 16) | ((h >> 16) & m);
277}
278
279/* Swap 32-bit words within a 64-bit word. */
280static inline uint64_t wswap64(uint64_t h)
281{
282 return rol64(h, 32);
283}
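/* For example, hswap64(0x0011223344556677) == 0x6677445522330011 and
 * wswap64(0x0011223344556677) == 0x4455667700112233: the 16-bit or 32-bit
 * units are reversed while the bytes within each unit keep their order.
 */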
284
285#define LOGICAL_PPPP(NAME, FUNC) \
286void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
287{ \
288 uintptr_t opr_sz = simd_oprsz(desc); \
289 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
290 uintptr_t i; \
291 for (i = 0; i < opr_sz / 8; ++i) { \
292 d[i] = FUNC(n[i], m[i], g[i]); \
293 } \
294}
295
296#define DO_AND(N, M, G) (((N) & (M)) & (G))
297#define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
298#define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
299#define DO_ORR(N, M, G) (((N) | (M)) & (G))
300#define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
301#define DO_NOR(N, M, G) (~((N) | (M)) & (G))
302#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
303#define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
304
305LOGICAL_PPPP(sve_and_pppp, DO_AND)
306LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
307LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
308LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
309LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
310LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
311LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
312LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
313
314#undef DO_AND
315#undef DO_BIC
316#undef DO_EOR
317#undef DO_ORR
318#undef DO_ORN
319#undef DO_NOR
320#undef DO_NAND
321#undef DO_SEL
322#undef LOGICAL_PPPP
 323
324/* Fully general three-operand expander, controlled by a predicate.
325 * This is complicated by the host-endian storage of the register file.
326 */
327/* ??? I don't expect the compiler could ever vectorize this itself.
328 * With some tables we can convert bit masks to byte masks, and with
329 * extra care wrt byte/word ordering we could use gcc generic vectors
330 * and do 16 bytes at a time.
331 */
332#define DO_ZPZZ(NAME, TYPE, H, OP) \
333void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
334{ \
335 intptr_t i, opr_sz = simd_oprsz(desc); \
336 for (i = 0; i < opr_sz; ) { \
337 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
338 do { \
339 if (pg & 1) { \
340 TYPE nn = *(TYPE *)(vn + H(i)); \
341 TYPE mm = *(TYPE *)(vm + H(i)); \
342 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
343 } \
344 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
345 } while (i & 15); \
346 } \
347}
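/* In the expansion above, I is a byte offset that advances by sizeof(TYPE)
 * while PG shifts right by the same amount, so each predicate bit lines up
 * with the lowest byte of one element; the inner loop consumes the 16 bytes
 * covered by one 16-bit predicate chunk before the next chunk is loaded.
 */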
348
349/* Similarly, specialized for 64-bit operands. */
350#define DO_ZPZZ_D(NAME, TYPE, OP) \
351void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
352{ \
353 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
354 TYPE *d = vd, *n = vn, *m = vm; \
355 uint8_t *pg = vg; \
356 for (i = 0; i < opr_sz; i += 1) { \
357 if (pg[H1(i)] & 1) { \
358 TYPE nn = n[i], mm = m[i]; \
359 d[i] = OP(nn, mm); \
360 } \
361 } \
362}
363
364#define DO_AND(N, M) (N & M)
365#define DO_EOR(N, M) (N ^ M)
366#define DO_ORR(N, M) (N | M)
367#define DO_BIC(N, M) (N & ~M)
368#define DO_ADD(N, M) (N + M)
369#define DO_SUB(N, M) (N - M)
370#define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
371#define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
372#define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
373#define DO_MUL(N, M) (N * M)
374
375
376/*
377 * We must avoid the C undefined behaviour cases: division by
378 * zero and signed division of INT_MIN by -1. Both of these
379 * have architecturally defined required results for Arm.
380 * We special case all signed divisions by -1 to avoid having
381 * to deduce the minimum integer for the type involved.
382 */
383#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
384#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
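/* For example, DO_UDIV(5, 0) == 0, and DO_SDIV(n, -1) == -n, so the
 * INT_MIN / -1 case never reaches the host's division instruction; the
 * negation wraps back to INT_MIN in practice, which is the result the
 * architecture requires.
 */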
385
386DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
387DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
388DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
389DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
390
391DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
392DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
393DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
394DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
395
396DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
397DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
398DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
399DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
400
401DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
402DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
403DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
404DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
405
406DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
407DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
408DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
409DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
410
411DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
412DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
413DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
414DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
415
416DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
417DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
418DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
419DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
420
421DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
422DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
423DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
424DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
425
426DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
427DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
428DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
429DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
430
431DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
432DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
433DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
434DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
435
436DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
437DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
438DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
439DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
440
441DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
442DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
443DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
444DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
445
446/* Because the computation type is at least twice as large as required,
447 these work for both signed and unsigned source types. */
448static inline uint8_t do_mulh_b(int32_t n, int32_t m)
449{
450 return (n * m) >> 8;
451}
452
453static inline uint16_t do_mulh_h(int32_t n, int32_t m)
454{
455 return (n * m) >> 16;
456}
457
458static inline uint32_t do_mulh_s(int64_t n, int64_t m)
459{
460 return (n * m) >> 32;
461}
462
463static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
464{
465 uint64_t lo, hi;
466 muls64(&lo, &hi, n, m);
467 return hi;
468}
469
470static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
471{
472 uint64_t lo, hi;
473 mulu64(&lo, &hi, n, m);
474 return hi;
475}
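/* For example, when instantiated for unsigned bytes, 0xff * 0xff arrives
 * zero-extended as 255 * 255 = 65025 and do_mulh_b returns 0xfe; for
 * signed bytes the same inputs arrive sign-extended as -1 * -1 and the
 * helper returns 0, so one helper serves both SMULH and UMULH.
 */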
476
477DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
478DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
479DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
480DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
481
482DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
483DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
484DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
485DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
486
487DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
488DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
489DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
490DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
491
492DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
493DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
 494
495DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
496DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
 497
498/* Note that all bits of the shift are significant
499 and not modulo the element size. */
500#define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
501#define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
502#define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
503
504DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
 505DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
 506DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)
 507
 508DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
 509DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
 510DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)
 511
 512DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
 513DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
514DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
515
516DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
517DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
518DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
519
520#undef DO_ZPZZ
521#undef DO_ZPZZ_D
 522
523/* Three-operand expander, controlled by a predicate, in which the
524 * third operand is "wide". That is, for D = N op M, the same 64-bit
525 * value of M is used with all of the narrower values of N.
526 */
527#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
528void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
529{ \
530 intptr_t i, opr_sz = simd_oprsz(desc); \
531 for (i = 0; i < opr_sz; ) { \
532 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
533 TYPEW mm = *(TYPEW *)(vm + i); \
534 do { \
535 if (pg & 1) { \
536 TYPE nn = *(TYPE *)(vn + H(i)); \
537 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
538 } \
539 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
540 } while (i & 7); \
541 } \
542}
543
544DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
545DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
546DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
547
548DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
549DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
550DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
551
552DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
553DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
554DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
555
556#undef DO_ZPZW
557
558/* Fully general two-operand expander, controlled by a predicate.
559 */
560#define DO_ZPZ(NAME, TYPE, H, OP) \
561void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
562{ \
563 intptr_t i, opr_sz = simd_oprsz(desc); \
564 for (i = 0; i < opr_sz; ) { \
565 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
566 do { \
567 if (pg & 1) { \
568 TYPE nn = *(TYPE *)(vn + H(i)); \
569 *(TYPE *)(vd + H(i)) = OP(nn); \
570 } \
571 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
572 } while (i & 15); \
573 } \
574}
575
576/* Similarly, specialized for 64-bit operands. */
577#define DO_ZPZ_D(NAME, TYPE, OP) \
578void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
579{ \
580 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
581 TYPE *d = vd, *n = vn; \
582 uint8_t *pg = vg; \
583 for (i = 0; i < opr_sz; i += 1) { \
584 if (pg[H1(i)] & 1) { \
585 TYPE nn = n[i]; \
586 d[i] = OP(nn); \
587 } \
588 } \
589}
590
591#define DO_CLS_B(N) (clrsb32(N) - 24)
592#define DO_CLS_H(N) (clrsb32(N) - 16)
593
594DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
595DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
596DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
597DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
598
599#define DO_CLZ_B(N) (clz32(N) - 24)
600#define DO_CLZ_H(N) (clz32(N) - 16)
601
602DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
603DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
604DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
605DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
606
607DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
608DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
609DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
610DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
611
612#define DO_CNOT(N) (N == 0)
613
614DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
615DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
616DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
617DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
618
619#define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
620
621DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
622DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
623DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
624
625#define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
626
627DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
628DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
629DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
630
631#define DO_NOT(N) (~N)
632
633DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
634DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
635DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
636DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
637
638#define DO_SXTB(N) ((int8_t)N)
639#define DO_SXTH(N) ((int16_t)N)
640#define DO_SXTS(N) ((int32_t)N)
641#define DO_UXTB(N) ((uint8_t)N)
642#define DO_UXTH(N) ((uint16_t)N)
643#define DO_UXTS(N) ((uint32_t)N)
644
645DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
646DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
647DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
648DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
649DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
650DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
651
652DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
653DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
654DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
655DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
656DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
657DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
658
659#define DO_ABS(N) (N < 0 ? -N : N)
660
661DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
662DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
663DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
664DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
665
666#define DO_NEG(N) (-N)
667
668DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
669DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
670DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
671DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
672
673DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
674DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
675DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
676
677DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
678DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
679
680DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
681
682DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
683DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
684DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
685DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
686
687/* Three-operand expander, unpredicated, in which the third operand is "wide".
688 */
689#define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
690void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
691{ \
692 intptr_t i, opr_sz = simd_oprsz(desc); \
693 for (i = 0; i < opr_sz; ) { \
694 TYPEW mm = *(TYPEW *)(vm + i); \
695 do { \
696 TYPE nn = *(TYPE *)(vn + H(i)); \
697 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
698 i += sizeof(TYPE); \
699 } while (i & 7); \
700 } \
701}
702
703DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
704DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
705DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
706
707DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
708DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
709DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
710
711DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
712DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
713DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
714
715#undef DO_ZZW
716
717#undef DO_CLS_B
718#undef DO_CLS_H
719#undef DO_CLZ_B
720#undef DO_CLZ_H
721#undef DO_CNOT
722#undef DO_FABS
723#undef DO_FNEG
724#undef DO_ABS
725#undef DO_NEG
726#undef DO_ZPZ
727#undef DO_ZPZ_D
728
729/* Two-operand reduction expander, controlled by a predicate.
730 * The difference between TYPERED and TYPERET has to do with
731 * sign-extension. E.g. for SMAX, TYPERED must be signed,
732 * but TYPERET must be unsigned so that e.g. a 32-bit value
733 * is not sign-extended to the ABI uint64_t return type.
734 */
735/* ??? If we were to vectorize this by hand the reduction ordering
736 * would change. For integer operands, this is perfectly fine.
737 */
738#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
739uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
740{ \
741 intptr_t i, opr_sz = simd_oprsz(desc); \
742 TYPERED ret = INIT; \
743 for (i = 0; i < opr_sz; ) { \
744 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
745 do { \
746 if (pg & 1) { \
747 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
748 ret = OP(ret, nn); \
749 } \
750 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
751 } while (i & 15); \
752 } \
753 return (TYPERET)ret; \
754}
755
756#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
757uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
758{ \
759 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
760 TYPEE *n = vn; \
761 uint8_t *pg = vg; \
762 TYPER ret = INIT; \
763 for (i = 0; i < opr_sz; i += 1) { \
764 if (pg[H1(i)] & 1) { \
765 TYPEE nn = n[i]; \
766 ret = OP(ret, nn); \
767 } \
768 } \
769 return ret; \
770}
771
772DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
773DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
774DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
775DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
776
777DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
778DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
779DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
780DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
781
782DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
783DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
784DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
785DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
786
787DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
788DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
789DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
790
791DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
792DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
793DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
794DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
795
796DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
797DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
798DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
799DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
800
801DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
802DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
803DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
804DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
805
806DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
807DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
808DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
809DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
810
811DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
812DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
813DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
814DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
815
816#undef DO_VPZ
817#undef DO_VPZ_D
818
819/* Two vector operand, one scalar operand, unpredicated. */
820#define DO_ZZI(NAME, TYPE, OP) \
821void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
822{ \
823 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
824 TYPE s = s64, *d = vd, *n = vn; \
825 for (i = 0; i < opr_sz; ++i) { \
826 d[i] = OP(n[i], s); \
827 } \
828}
829
830#define DO_SUBR(X, Y) (Y - X)
831
832DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
833DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
834DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
835DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
836
837DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
838DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
839DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
840DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
841
842DO_ZZI(sve_smini_b, int8_t, DO_MIN)
843DO_ZZI(sve_smini_h, int16_t, DO_MIN)
844DO_ZZI(sve_smini_s, int32_t, DO_MIN)
845DO_ZZI(sve_smini_d, int64_t, DO_MIN)
846
847DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
848DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
849DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
850DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
851
852DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
853DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
854DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
855DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
856
857#undef DO_ZZI
858
859#undef DO_AND
860#undef DO_ORR
861#undef DO_EOR
862#undef DO_BIC
863#undef DO_ADD
864#undef DO_SUB
865#undef DO_MAX
866#undef DO_MIN
867#undef DO_ABD
868#undef DO_MUL
869#undef DO_DIV
870#undef DO_ASR
871#undef DO_LSR
872#undef DO_LSL
 873#undef DO_SUBR
 874
875/* Similar to the ARM LastActiveElement pseudocode function, except the
876 result is multiplied by the element size. This includes the not found
877 indication; e.g. not found for esz=3 is -8. */
878static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
879{
880 uint64_t mask = pred_esz_masks[esz];
881 intptr_t i = words;
882
883 do {
884 uint64_t this_g = g[--i] & mask;
885 if (this_g) {
886 return i * 64 + (63 - clz64(this_g));
887 }
888 } while (i > 0);
889 return (intptr_t)-1 << esz;
890}
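/* For example, with esz = 2 (word elements) and only the predicate bit for
 * element 1 set (bit 4 of g[0]), this returns 4, i.e. element index 1
 * scaled by the 4-byte element size; with no active bits it returns -4.
 */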
891
892uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t words)
893{
894 uint32_t flags = PREDTEST_INIT;
895 uint64_t *d = vd, *g = vg;
896 intptr_t i = 0;
897
898 do {
899 uint64_t this_d = d[i];
900 uint64_t this_g = g[i];
901
902 if (this_g) {
903 if (!(flags & 4)) {
904 /* Set in D the first bit of G. */
905 this_d |= this_g & -this_g;
906 d[i] = this_d;
907 }
908 flags = iter_predtest_fwd(this_d, this_g, flags);
909 }
910 } while (++i < words);
911
912 return flags;
913}
914
915uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
916{
917 intptr_t words = extract32(pred_desc, 0, SIMD_OPRSZ_BITS);
918 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
919 uint32_t flags = PREDTEST_INIT;
920 uint64_t *d = vd, *g = vg, esz_mask;
921 intptr_t i, next;
922
923 next = last_active_element(vd, words, esz) + (1 << esz);
924 esz_mask = pred_esz_masks[esz];
925
926 /* Similar to the pseudocode for pnext, but scaled by ESZ
927 so that we find the correct bit. */
928 if (next < words * 64) {
929 uint64_t mask = -1;
930
931 if (next & 63) {
932 mask = ~((1ull << (next & 63)) - 1);
933 next &= -64;
934 }
935 do {
936 uint64_t this_g = g[next / 64] & esz_mask & mask;
937 if (this_g != 0) {
938 next = (next & -64) + ctz64(this_g);
939 break;
940 }
941 next += 64;
942 mask = -1;
943 } while (next < words * 64);
944 }
945
946 i = 0;
947 do {
948 uint64_t this_d = 0;
949 if (i == next / 64) {
950 this_d = 1ull << (next & 63);
951 }
952 d[i] = this_d;
953 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
954 } while (++i < words);
955
956 return flags;
957}
958
959/* Store zero into every active element of Zd. We will use this for two
960 * and three-operand predicated instructions for which logic dictates a
961 * zero result. In particular, logical shift by element size, which is
962 * otherwise undefined on the host.
963 *
964 * For element sizes smaller than uint64_t, we use tables to expand
965 * the N bits of the controlling predicate to a byte mask, and clear
966 * those bytes.
967 */
968void HELPER(sve_clr_b)(void *vd, void *vg, uint32_t desc)
969{
970 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
971 uint64_t *d = vd;
972 uint8_t *pg = vg;
973 for (i = 0; i < opr_sz; i += 1) {
974 d[i] &= ~expand_pred_b(pg[H1(i)]);
975 }
976}
977
978void HELPER(sve_clr_h)(void *vd, void *vg, uint32_t desc)
979{
980 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
981 uint64_t *d = vd;
982 uint8_t *pg = vg;
983 for (i = 0; i < opr_sz; i += 1) {
984 d[i] &= ~expand_pred_h(pg[H1(i)]);
985 }
986}
987
988void HELPER(sve_clr_s)(void *vd, void *vg, uint32_t desc)
989{
990 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
991 uint64_t *d = vd;
992 uint8_t *pg = vg;
993 for (i = 0; i < opr_sz; i += 1) {
994 d[i] &= ~expand_pred_s(pg[H1(i)]);
995 }
996}
997
998void HELPER(sve_clr_d)(void *vd, void *vg, uint32_t desc)
999{
1000 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1001 uint64_t *d = vd;
1002 uint8_t *pg = vg;
1003 for (i = 0; i < opr_sz; i += 1) {
1004 if (pg[H1(i)] & 1) {
1005 d[i] = 0;
1006 }
1007 }
1008}
1009
1010/* Copy Zn into Zd, and store zero into inactive elements. */
1011void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1012{
1013 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1014 uint64_t *d = vd, *n = vn;
1015 uint8_t *pg = vg;
1016 for (i = 0; i < opr_sz; i += 1) {
1017 d[i] = n[i] & expand_pred_b(pg[H1(i)]);
1018 }
1019}
1020
1021void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1022{
1023 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1024 uint64_t *d = vd, *n = vn;
1025 uint8_t *pg = vg;
1026 for (i = 0; i < opr_sz; i += 1) {
1027 d[i] = n[i] & expand_pred_h(pg[H1(i)]);
1028 }
1029}
1030
1031void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
1032{
1033 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1034 uint64_t *d = vd, *n = vn;
1035 uint8_t *pg = vg;
1036 for (i = 0; i < opr_sz; i += 1) {
1037 d[i] = n[i] & expand_pred_s(pg[H1(i)]);
1038 }
1039}
1040
1041void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1042{
1043 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1044 uint64_t *d = vd, *n = vn;
1045 uint8_t *pg = vg;
1046 for (i = 0; i < opr_sz; i += 1) {
 1047 d[i] = n[i] & -(uint64_t)(pg[H1(i)] & 1);
1048 }
1049}
1050
1051/* Three-operand expander, immediate operand, controlled by a predicate.
1052 */
1053#define DO_ZPZI(NAME, TYPE, H, OP) \
1054void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1055{ \
1056 intptr_t i, opr_sz = simd_oprsz(desc); \
1057 TYPE imm = simd_data(desc); \
1058 for (i = 0; i < opr_sz; ) { \
1059 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1060 do { \
1061 if (pg & 1) { \
1062 TYPE nn = *(TYPE *)(vn + H(i)); \
1063 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
1064 } \
1065 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1066 } while (i & 15); \
1067 } \
1068}
1069
1070/* Similarly, specialized for 64-bit operands. */
1071#define DO_ZPZI_D(NAME, TYPE, OP) \
1072void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1073{ \
1074 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1075 TYPE *d = vd, *n = vn; \
1076 TYPE imm = simd_data(desc); \
1077 uint8_t *pg = vg; \
1078 for (i = 0; i < opr_sz; i += 1) { \
1079 if (pg[H1(i)] & 1) { \
1080 TYPE nn = n[i]; \
1081 d[i] = OP(nn, imm); \
1082 } \
1083 } \
1084}
1085
1086#define DO_SHR(N, M) (N >> M)
1087#define DO_SHL(N, M) (N << M)
1088
1089/* Arithmetic shift right for division. This rounds negative numbers
1090 toward zero as per signed division. Therefore before shifting,
1091 when N is negative, add 2**M-1. */
1092#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
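/* For example, DO_ASRD(-5, 1) first adds 1 and then shifts, giving -2,
 * whereas a plain arithmetic shift would give -3; positive values such as
 * DO_ASRD(5, 1) == 2 are unaffected by the bias.
 */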
1093
1094DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
1095DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
1096DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
1097DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
1098
1099DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
1100DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
1101DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
1102DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
1103
1104DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
1105DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
1106DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
1107DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
1108
1109DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
1110DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
1111DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
1112DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
1113
1114#undef DO_SHR
1115#undef DO_SHL
1116#undef DO_ASRD
1117#undef DO_ZPZI
1118#undef DO_ZPZI_D
1119
1120/* Fully general four-operand expander, controlled by a predicate.
1121 */
1122#define DO_ZPZZZ(NAME, TYPE, H, OP) \
1123void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1124 void *vg, uint32_t desc) \
1125{ \
1126 intptr_t i, opr_sz = simd_oprsz(desc); \
1127 for (i = 0; i < opr_sz; ) { \
1128 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1129 do { \
1130 if (pg & 1) { \
1131 TYPE nn = *(TYPE *)(vn + H(i)); \
1132 TYPE mm = *(TYPE *)(vm + H(i)); \
1133 TYPE aa = *(TYPE *)(va + H(i)); \
1134 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
1135 } \
1136 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1137 } while (i & 15); \
1138 } \
1139}
1140
1141/* Similarly, specialized for 64-bit operands. */
1142#define DO_ZPZZZ_D(NAME, TYPE, OP) \
1143void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1144 void *vg, uint32_t desc) \
1145{ \
1146 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1147 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
1148 uint8_t *pg = vg; \
1149 for (i = 0; i < opr_sz; i += 1) { \
1150 if (pg[H1(i)] & 1) { \
1151 TYPE aa = a[i], nn = n[i], mm = m[i]; \
1152 d[i] = OP(aa, nn, mm); \
1153 } \
1154 } \
1155}
1156
1157#define DO_MLA(A, N, M) (A + N * M)
1158#define DO_MLS(A, N, M) (A - N * M)
1159
1160DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
1161DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
1162
1163DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
1164DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
1165
1166DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
1167DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
1168
1169DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
1170DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
1171
1172#undef DO_MLA
1173#undef DO_MLS
1174#undef DO_ZPZZZ
1175#undef DO_ZPZZZ_D
1176
1177void HELPER(sve_index_b)(void *vd, uint32_t start,
1178 uint32_t incr, uint32_t desc)
1179{
1180 intptr_t i, opr_sz = simd_oprsz(desc);
1181 uint8_t *d = vd;
1182 for (i = 0; i < opr_sz; i += 1) {
1183 d[H1(i)] = start + i * incr;
1184 }
1185}
1186
1187void HELPER(sve_index_h)(void *vd, uint32_t start,
1188 uint32_t incr, uint32_t desc)
1189{
1190 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1191 uint16_t *d = vd;
1192 for (i = 0; i < opr_sz; i += 1) {
1193 d[H2(i)] = start + i * incr;
1194 }
1195}
1196
1197void HELPER(sve_index_s)(void *vd, uint32_t start,
1198 uint32_t incr, uint32_t desc)
1199{
1200 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1201 uint32_t *d = vd;
1202 for (i = 0; i < opr_sz; i += 1) {
1203 d[H4(i)] = start + i * incr;
1204 }
1205}
1206
1207void HELPER(sve_index_d)(void *vd, uint64_t start,
1208 uint64_t incr, uint32_t desc)
1209{
1210 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1211 uint64_t *d = vd;
1212 for (i = 0; i < opr_sz; i += 1) {
1213 d[i] = start + i * incr;
1214 }
1215}
1216
1217void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
1218{
1219 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1220 uint32_t sh = simd_data(desc);
1221 uint32_t *d = vd, *n = vn, *m = vm;
1222 for (i = 0; i < opr_sz; i += 1) {
1223 d[i] = n[i] + (m[i] << sh);
1224 }
1225}
1226
1227void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
1228{
1229 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1230 uint64_t sh = simd_data(desc);
1231 uint64_t *d = vd, *n = vn, *m = vm;
1232 for (i = 0; i < opr_sz; i += 1) {
1233 d[i] = n[i] + (m[i] << sh);
1234 }
1235}
1236
1237void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
1238{
1239 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1240 uint64_t sh = simd_data(desc);
1241 uint64_t *d = vd, *n = vn, *m = vm;
1242 for (i = 0; i < opr_sz; i += 1) {
1243 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
1244 }
1245}
1246
1247void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
1248{
1249 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1250 uint64_t sh = simd_data(desc);
1251 uint64_t *d = vd, *n = vn, *m = vm;
1252 for (i = 0; i < opr_sz; i += 1) {
1253 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
1254 }
1255}
1256
1257void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
1258{
1259 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1260 static const uint16_t coeff[] = {
1261 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
1262 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
1263 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
1264 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
1265 };
1266 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1267 uint16_t *d = vd, *n = vn;
1268
1269 for (i = 0; i < opr_sz; i++) {
1270 uint16_t nn = n[i];
1271 intptr_t idx = extract32(nn, 0, 5);
1272 uint16_t exp = extract32(nn, 5, 5);
1273 d[i] = coeff[idx] | (exp << 10);
1274 }
1275}
1276
1277void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
1278{
1279 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1280 static const uint32_t coeff[] = {
1281 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
1282 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
1283 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
1284 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
1285 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
1286 0x1ef532, 0x20b051, 0x227043, 0x243516,
1287 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
1288 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
1289 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
1290 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
1291 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
1292 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
1293 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
1294 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
1295 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
1296 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
1297 };
1298 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1299 uint32_t *d = vd, *n = vn;
1300
1301 for (i = 0; i < opr_sz; i++) {
1302 uint32_t nn = n[i];
1303 intptr_t idx = extract32(nn, 0, 6);
1304 uint32_t exp = extract32(nn, 6, 8);
1305 d[i] = coeff[idx] | (exp << 23);
1306 }
1307}
1308
1309void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
1310{
1311 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1312 static const uint64_t coeff[] = {
1313 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
1314 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
1315 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
1316 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
1317 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
1318 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
1319 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
1320 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
1321 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
1322 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
1323 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
1324 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
1325 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
1326 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
1327 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
1328 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
1329 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
1330 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
1331 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
1332 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
1333 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
1334 0xFA7C1819E90D8ull,
1335 };
1336 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1337 uint64_t *d = vd, *n = vn;
1338
1339 for (i = 0; i < opr_sz; i++) {
1340 uint64_t nn = n[i];
1341 intptr_t idx = extract32(nn, 0, 6);
1342 uint64_t exp = extract32(nn, 6, 11);
1343 d[i] = coeff[idx] | (exp << 52);
1344 }
1345}
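/* In the FEXPA helpers above, the low bits of each input index a table
 * holding the fraction bits of 2^(idx/32) (half) or 2^(idx/64) (single,
 * double), and the remaining input bits are copied straight into the
 * exponent field, assembling the coefficient used by FEXPA's exponential
 * approximation.
 */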
1346
1347void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
1348{
1349 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1350 uint16_t *d = vd, *n = vn, *m = vm;
1351 for (i = 0; i < opr_sz; i += 1) {
1352 uint16_t nn = n[i];
1353 uint16_t mm = m[i];
1354 if (mm & 1) {
1355 nn = float16_one;
1356 }
1357 d[i] = nn ^ (mm & 2) << 14;
1358 }
1359}
1360
1361void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
1362{
1363 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1364 uint32_t *d = vd, *n = vn, *m = vm;
1365 for (i = 0; i < opr_sz; i += 1) {
1366 uint32_t nn = n[i];
1367 uint32_t mm = m[i];
1368 if (mm & 1) {
1369 nn = float32_one;
1370 }
1371 d[i] = nn ^ (mm & 2) << 30;
1372 }
1373}
1374
1375void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
1376{
1377 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1378 uint64_t *d = vd, *n = vn, *m = vm;
1379 for (i = 0; i < opr_sz; i += 1) {
1380 uint64_t nn = n[i];
1381 uint64_t mm = m[i];
1382 if (mm & 1) {
1383 nn = float64_one;
1384 }
1385 d[i] = nn ^ (mm & 2) << 62;
1386 }
1387}
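/* In the FTSSEL helpers above, bit 0 of the second operand selects the
 * value 1.0 in place of the first operand, and bit 1 supplies the sign of
 * the result: (mm & 2) shifted up lands exactly on the floating-point
 * sign bit for each element size.
 */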
1388
1389/*
1390 * Signed saturating addition with scalar operand.
1391 */
1392
1393void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1394{
1395 intptr_t i, oprsz = simd_oprsz(desc);
1396
1397 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
1398 int r = *(int8_t *)(a + i) + b;
1399 if (r > INT8_MAX) {
1400 r = INT8_MAX;
1401 } else if (r < INT8_MIN) {
1402 r = INT8_MIN;
1403 }
1404 *(int8_t *)(d + i) = r;
1405 }
1406}
1407
1408void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1409{
1410 intptr_t i, oprsz = simd_oprsz(desc);
1411
1412 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
1413 int r = *(int16_t *)(a + i) + b;
1414 if (r > INT16_MAX) {
1415 r = INT16_MAX;
1416 } else if (r < INT16_MIN) {
1417 r = INT16_MIN;
1418 }
1419 *(int16_t *)(d + i) = r;
1420 }
1421}
1422
1423void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1424{
1425 intptr_t i, oprsz = simd_oprsz(desc);
1426
1427 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
1428 int64_t r = *(int32_t *)(a + i) + b;
1429 if (r > INT32_MAX) {
1430 r = INT32_MAX;
1431 } else if (r < INT32_MIN) {
1432 r = INT32_MIN;
1433 }
1434 *(int32_t *)(d + i) = r;
1435 }
1436}
1437
1438void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
1439{
1440 intptr_t i, oprsz = simd_oprsz(desc);
1441
1442 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
1443 int64_t ai = *(int64_t *)(a + i);
1444 int64_t r = ai + b;
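        /* Overflow check without widening: if AI and B have the same
           sign but R differs from them, the signed addition overflowed. */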
1445 if (((r ^ ai) & ~(ai ^ b)) < 0) {
1446 /* Signed overflow. */
1447 r = (r < 0 ? INT64_MAX : INT64_MIN);
1448 }
1449 *(int64_t *)(d + i) = r;
1450 }
1451}
1452
1453/*
1454 * Unsigned saturating addition with scalar operand.
1455 */
1456
1457void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1458{
1459 intptr_t i, oprsz = simd_oprsz(desc);
1460
1461 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
1462 int r = *(uint8_t *)(a + i) + b;
1463 if (r > UINT8_MAX) {
1464 r = UINT8_MAX;
1465 } else if (r < 0) {
1466 r = 0;
1467 }
1468 *(uint8_t *)(d + i) = r;
1469 }
1470}
1471
1472void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1473{
1474 intptr_t i, oprsz = simd_oprsz(desc);
1475
1476 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
1477 int r = *(uint16_t *)(a + i) + b;
1478 if (r > UINT16_MAX) {
1479 r = UINT16_MAX;
1480 } else if (r < 0) {
1481 r = 0;
1482 }
1483 *(uint16_t *)(d + i) = r;
1484 }
1485}
1486
1487void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1488{
1489 intptr_t i, oprsz = simd_oprsz(desc);
1490
1491 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
1492 int64_t r = *(uint32_t *)(a + i) + b;
1493 if (r > UINT32_MAX) {
1494 r = UINT32_MAX;
1495 } else if (r < 0) {
1496 r = 0;
1497 }
1498 *(uint32_t *)(d + i) = r;
1499 }
1500}
1501
1502void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1503{
1504 intptr_t i, oprsz = simd_oprsz(desc);
1505
1506 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1507 uint64_t r = *(uint64_t *)(a + i) + b;
1508 if (r < b) {
1509 r = UINT64_MAX;
1510 }
1511 *(uint64_t *)(d + i) = r;
1512 }
1513}
1514
1515void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1516{
1517 intptr_t i, oprsz = simd_oprsz(desc);
1518
1519 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1520 uint64_t ai = *(uint64_t *)(a + i);
1521 *(uint64_t *)(d + i) = (ai < b ? 0 : ai - b);
1522 }
1523}
1524
1525/* Two operand predicated copy immediate with merge. All valid immediates
1526 * can fit within 17 signed bits in the simd_data field.
1527 */
1528void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
1529 uint64_t mm, uint32_t desc)
1530{
1531 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1532 uint64_t *d = vd, *n = vn;
1533 uint8_t *pg = vg;
1534
1535 mm = dup_const(MO_8, mm);
1536 for (i = 0; i < opr_sz; i += 1) {
1537 uint64_t nn = n[i];
1538 uint64_t pp = expand_pred_b(pg[H1(i)]);
1539 d[i] = (mm & pp) | (nn & ~pp);
1540 }
1541}
1542
1543void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
1544 uint64_t mm, uint32_t desc)
1545{
1546 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1547 uint64_t *d = vd, *n = vn;
1548 uint8_t *pg = vg;
1549
1550 mm = dup_const(MO_16, mm);
1551 for (i = 0; i < opr_sz; i += 1) {
1552 uint64_t nn = n[i];
1553 uint64_t pp = expand_pred_h(pg[H1(i)]);
1554 d[i] = (mm & pp) | (nn & ~pp);
1555 }
1556}
1557
1558void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
1559 uint64_t mm, uint32_t desc)
1560{
1561 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1562 uint64_t *d = vd, *n = vn;
1563 uint8_t *pg = vg;
1564
1565 mm = dup_const(MO_32, mm);
1566 for (i = 0; i < opr_sz; i += 1) {
1567 uint64_t nn = n[i];
1568 uint64_t pp = expand_pred_s(pg[H1(i)]);
1569 d[i] = (mm & pp) | (nn & ~pp);
1570 }
1571}
1572
1573void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
1574 uint64_t mm, uint32_t desc)
1575{
1576 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1577 uint64_t *d = vd, *n = vn;
1578 uint8_t *pg = vg;
1579
1580 for (i = 0; i < opr_sz; i += 1) {
1581 uint64_t nn = n[i];
1582 d[i] = (pg[H1(i)] & 1 ? mm : nn);
1583 }
1584}
1585
1586void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
1587{
1588 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1589 uint64_t *d = vd;
1590 uint8_t *pg = vg;
1591
1592 val = dup_const(MO_8, val);
1593 for (i = 0; i < opr_sz; i += 1) {
1594 d[i] = val & expand_pred_b(pg[H1(i)]);
1595 }
1596}
1597
1598void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
1599{
1600 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1601 uint64_t *d = vd;
1602 uint8_t *pg = vg;
1603
1604 val = dup_const(MO_16, val);
1605 for (i = 0; i < opr_sz; i += 1) {
1606 d[i] = val & expand_pred_h(pg[H1(i)]);
1607 }
1608}
1609
1610void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
1611{
1612 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1613 uint64_t *d = vd;
1614 uint8_t *pg = vg;
1615
1616 val = dup_const(MO_32, val);
1617 for (i = 0; i < opr_sz; i += 1) {
1618 d[i] = val & expand_pred_s(pg[H1(i)]);
1619 }
1620}
1621
1622void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
1623{
1624 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1625 uint64_t *d = vd;
1626 uint8_t *pg = vg;
1627
1628 for (i = 0; i < opr_sz; i += 1) {
1629 d[i] = (pg[H1(i)] & 1 ? val : 0);
1630 }
1631}
 1632
 1633/* Big-endian hosts need to frob the byte indices. If the copy
1634 * happens to be 8-byte aligned, then no frobbing necessary.
1635 */
1636static void swap_memmove(void *vd, void *vs, size_t n)
1637{
1638 uintptr_t d = (uintptr_t)vd;
1639 uintptr_t s = (uintptr_t)vs;
1640 uintptr_t o = (d | s | n) & 7;
1641 size_t i;
1642
1643#ifndef HOST_WORDS_BIGENDIAN
1644 o = 0;
1645#endif
1646 switch (o) {
1647 case 0:
1648 memmove(vd, vs, n);
1649 break;
1650
1651 case 4:
1652 if (d < s || d >= s + n) {
1653 for (i = 0; i < n; i += 4) {
1654 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1655 }
1656 } else {
1657 for (i = n; i > 0; ) {
1658 i -= 4;
1659 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1660 }
1661 }
1662 break;
1663
1664 case 2:
1665 case 6:
1666 if (d < s || d >= s + n) {
1667 for (i = 0; i < n; i += 2) {
1668 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1669 }
1670 } else {
1671 for (i = n; i > 0; ) {
1672 i -= 2;
1673 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1674 }
1675 }
1676 break;
1677
1678 default:
1679 if (d < s || d >= s + n) {
1680 for (i = 0; i < n; i++) {
1681 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1682 }
1683 } else {
1684 for (i = n; i > 0; ) {
1685 i -= 1;
1686 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1687 }
1688 }
1689 break;
1690 }
1691}
1692
1693/* Similarly for memset of 0. */
1694static void swap_memzero(void *vd, size_t n)
1695{
1696 uintptr_t d = (uintptr_t)vd;
1697 uintptr_t o = (d | n) & 7;
1698 size_t i;
1699
1700 /* Usually, the first bit of a predicate is set, so N is 0. */
1701 if (likely(n == 0)) {
1702 return;
1703 }
1704
1705#ifndef HOST_WORDS_BIGENDIAN
1706 o = 0;
1707#endif
1708 switch (o) {
1709 case 0:
1710 memset(vd, 0, n);
1711 break;
1712
1713 case 4:
1714 for (i = 0; i < n; i += 4) {
1715 *(uint32_t *)H1_4(d + i) = 0;
1716 }
1717 break;
1718
1719 case 2:
1720 case 6:
1721 for (i = 0; i < n; i += 2) {
1722 *(uint16_t *)H1_2(d + i) = 0;
1723 }
1724 break;
1725
1726 default:
1727 for (i = 0; i < n; i++) {
1728 *(uint8_t *)H1(d + i) = 0;
1729 }
1730 break;
1731 }
1732}
1733
1734void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
1735{
1736 intptr_t opr_sz = simd_oprsz(desc);
1737 size_t n_ofs = simd_data(desc);
1738 size_t n_siz = opr_sz - n_ofs;
1739
1740 if (vd != vm) {
1741 swap_memmove(vd, vn + n_ofs, n_siz);
1742 swap_memmove(vd + n_siz, vm, n_ofs);
1743 } else if (vd != vn) {
1744 swap_memmove(vd + n_siz, vd, n_ofs);
1745 swap_memmove(vd, vn + n_ofs, n_siz);
1746 } else {
1747 /* vd == vn == vm. Need temp space. */
1748 ARMVectorReg tmp;
1749 swap_memmove(&tmp, vm, n_ofs);
1750 swap_memmove(vd, vd + n_ofs, n_siz);
1751 memcpy(vd + n_siz, &tmp, n_ofs);
1752 }
1753}
1754
1755#define DO_INSR(NAME, TYPE, H) \
1756void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
1757{ \
1758 intptr_t opr_sz = simd_oprsz(desc); \
1759 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
1760 *(TYPE *)(vd + H(0)) = val; \
1761}
1762
1763DO_INSR(sve_insr_b, uint8_t, H1)
1764DO_INSR(sve_insr_h, uint16_t, H1_2)
1765DO_INSR(sve_insr_s, uint32_t, H1_4)
1766DO_INSR(sve_insr_d, uint64_t, )
1767
1768#undef DO_INSR
1769
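/* A brief sketch of the approach taken by the REV helpers below: the
 * vector is walked as 64-bit chunks from both ends, each pair of chunks
 * is exchanged, and every chunk is then reversed internally at the
 * element granularity -- bswap64 for bytes, hswap64 for halfwords, a
 * rotate by 32 for words, and nothing further for doublewords.
 */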
1770void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
1771{
1772 intptr_t i, j, opr_sz = simd_oprsz(desc);
1773 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1774 uint64_t f = *(uint64_t *)(vn + i);
1775 uint64_t b = *(uint64_t *)(vn + j);
1776 *(uint64_t *)(vd + i) = bswap64(b);
1777 *(uint64_t *)(vd + j) = bswap64(f);
1778 }
1779}
1780
1781void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
1782{
1783 intptr_t i, j, opr_sz = simd_oprsz(desc);
1784 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1785 uint64_t f = *(uint64_t *)(vn + i);
1786 uint64_t b = *(uint64_t *)(vn + j);
1787 *(uint64_t *)(vd + i) = hswap64(b);
1788 *(uint64_t *)(vd + j) = hswap64(f);
1789 }
1790}
1791
1792void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
1793{
1794 intptr_t i, j, opr_sz = simd_oprsz(desc);
1795 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1796 uint64_t f = *(uint64_t *)(vn + i);
1797 uint64_t b = *(uint64_t *)(vn + j);
1798 *(uint64_t *)(vd + i) = rol64(b, 32);
1799 *(uint64_t *)(vd + j) = rol64(f, 32);
1800 }
1801}
1802
1803void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
1804{
1805 intptr_t i, j, opr_sz = simd_oprsz(desc);
1806 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1807 uint64_t f = *(uint64_t *)(vn + i);
1808 uint64_t b = *(uint64_t *)(vn + j);
1809 *(uint64_t *)(vd + i) = b;
1810 *(uint64_t *)(vd + j) = f;
1811 }
1812}
1813
1814#define DO_TBL(NAME, TYPE, H) \
1815void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1816{ \
1817 intptr_t i, opr_sz = simd_oprsz(desc); \
1818 uintptr_t elem = opr_sz / sizeof(TYPE); \
1819 TYPE *d = vd, *n = vn, *m = vm; \
1820 ARMVectorReg tmp; \
1821 if (unlikely(vd == vn)) { \
1822 n = memcpy(&tmp, vn, opr_sz); \
1823 } \
1824 for (i = 0; i < elem; i++) { \
1825 TYPE j = m[H(i)]; \
1826 d[H(i)] = j < elem ? n[H(j)] : 0; \
1827 } \
1828}
1829
1830DO_TBL(sve_tbl_b, uint8_t, H1)
1831DO_TBL(sve_tbl_h, uint16_t, H2)
1832DO_TBL(sve_tbl_s, uint32_t, H4)
1833DO_TBL(sve_tbl_d, uint64_t, )
1834
 1835#undef DO_TBL
1836
1837#define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
1838void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1839{ \
1840 intptr_t i, opr_sz = simd_oprsz(desc); \
1841 TYPED *d = vd; \
1842 TYPES *n = vn; \
1843 ARMVectorReg tmp; \
1844 if (unlikely(vn - vd < opr_sz)) { \
1845 n = memcpy(&tmp, n, opr_sz / 2); \
1846 } \
1847 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
1848 d[HD(i)] = n[HS(i)]; \
1849 } \
1850}
1851
1852DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
1853DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
1854DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)
1855
1856DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
1857DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
1858DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
1859
1860#undef DO_UNPK
1861
1862/* Mask of bits included in the even numbered predicates of width esz.
1863 * We also use this for expand_bits/compress_bits, and so extend the
1864 * same pattern out to 16-bit units.
1865 */
1866static const uint64_t even_bit_esz_masks[5] = {
1867 0x5555555555555555ull,
1868 0x3333333333333333ull,
1869 0x0f0f0f0f0f0f0f0full,
1870 0x00ff00ff00ff00ffull,
1871 0x0000ffff0000ffffull,
1872};
1873
1874/* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
1875 * For N==0, this corresponds to the operation that in qemu/bitops.h
1876 * we call half_shuffle64; this algorithm is from Hacker's Delight,
1877 * section 7-2 Shuffling Bits.
1878 */
1879static uint64_t expand_bits(uint64_t x, int n)
1880{
1881 int i;
1882
1883 x &= 0xffffffffu;
1884 for (i = 4; i >= n; i--) {
1885 int sh = 1 << i;
1886 x = ((x << sh) | x) & even_bit_esz_masks[i];
1887 }
1888 return x;
1889}
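/* A worked example: for N == 0 each source bit lands at twice its
 * original bit number, with zeros interleaved in the odd positions,
 *
 *     expand_bits(0b1011, 0) == 0b01000101
 *
 * and for N == 1 the 2-bit units move as a whole,
 *
 *     expand_bits(0b1110, 1) == 0b00110010
 */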
1890
1891/* Compress units of 2**(N+1) bits to units of 2**N bits.
1892 * For N==0, this corresponds to the operation that in qemu/bitops.h
1893 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
1894 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
1895 */
1896static uint64_t compress_bits(uint64_t x, int n)
1897{
1898 int i;
1899
1900 for (i = n; i <= 4; i++) {
1901 int sh = 1 << i;
1902 x &= even_bit_esz_masks[i];
1903 x = (x >> sh) | x;
1904 }
1905 return x & 0xffffffffu;
1906}
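/* For values that fit in the low 32 bits this is the exact inverse of
 * expand_bits; continuing the example above,
 *
 *     compress_bits(0b01000101, 0) == 0b1011
 */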
1907
1908void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1909{
1910 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1911 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1912 intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1913 uint64_t *d = vd;
1914 intptr_t i;
1915
1916 if (oprsz <= 8) {
1917 uint64_t nn = *(uint64_t *)vn;
1918 uint64_t mm = *(uint64_t *)vm;
1919 int half = 4 * oprsz;
1920
1921 nn = extract64(nn, high * half, half);
1922 mm = extract64(mm, high * half, half);
1923 nn = expand_bits(nn, esz);
1924 mm = expand_bits(mm, esz);
1925 d[0] = nn + (mm << (1 << esz));
1926 } else {
1927 ARMPredicateReg tmp_n, tmp_m;
1928
1929 /* We produce output faster than we consume input.
1930 Therefore we must be mindful of possible overlap. */
1931 if ((vn - vd) < (uintptr_t)oprsz) {
1932 vn = memcpy(&tmp_n, vn, oprsz);
1933 }
1934 if ((vm - vd) < (uintptr_t)oprsz) {
1935 vm = memcpy(&tmp_m, vm, oprsz);
1936 }
1937 if (high) {
1938 high = oprsz >> 1;
1939 }
1940
1941 if ((high & 3) == 0) {
1942 uint32_t *n = vn, *m = vm;
1943 high >>= 2;
1944
1945 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
1946 uint64_t nn = n[H4(high + i)];
1947 uint64_t mm = m[H4(high + i)];
1948
1949 nn = expand_bits(nn, esz);
1950 mm = expand_bits(mm, esz);
1951 d[i] = nn + (mm << (1 << esz));
1952 }
1953 } else {
1954 uint8_t *n = vn, *m = vm;
1955 uint16_t *d16 = vd;
1956
1957 for (i = 0; i < oprsz / 2; i++) {
1958 uint16_t nn = n[H1(high + i)];
1959 uint16_t mm = m[H1(high + i)];
1960
1961 nn = expand_bits(nn, esz);
1962 mm = expand_bits(mm, esz);
1963 d16[H2(i)] = nn + (mm << (1 << esz));
1964 }
1965 }
1966 }
1967}
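/* A small example of the short (oprsz <= 8) path above, with esz == 0 and
 * the low halves selected (the ZIP1 case): for nn == 0b11 and mm == 0b01,
 * expand_bits gives 0b0101 and 0b0001; shifting the M half left by one and
 * adding yields 0b0111, i.e. the element-wise interleave n0, m0, n1, m1.
 * The addition acts as an OR because the two expanded bit sets never
 * overlap.
 */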
1968
1969void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1970{
1971 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1972 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1973 int odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1) << esz;
1974 uint64_t *d = vd, *n = vn, *m = vm;
1975 uint64_t l, h;
1976 intptr_t i;
1977
1978 if (oprsz <= 8) {
1979 l = compress_bits(n[0] >> odd, esz);
1980 h = compress_bits(m[0] >> odd, esz);
1981 d[0] = extract64(l + (h << (4 * oprsz)), 0, 8 * oprsz);
1982 } else {
1983 ARMPredicateReg tmp_m;
1984 intptr_t oprsz_16 = oprsz / 16;
1985
1986 if ((vm - vd) < (uintptr_t)oprsz) {
1987 m = memcpy(&tmp_m, vm, oprsz);
1988 }
1989
1990 for (i = 0; i < oprsz_16; i++) {
1991 l = n[2 * i + 0];
1992 h = n[2 * i + 1];
1993 l = compress_bits(l >> odd, esz);
1994 h = compress_bits(h >> odd, esz);
1995 d[i] = l + (h << 32);
1996 }
1997
1998 /* For VL which is not a power of 2, the results from M do not
1999 align nicely with the uint64_t for D. Put the aligned results
2000 from M into TMP_M and then copy it into place afterward. */
2001 if (oprsz & 15) {
2002 d[i] = compress_bits(n[2 * i] >> odd, esz);
2003
2004 for (i = 0; i < oprsz_16; i++) {
2005 l = m[2 * i + 0];
2006 h = m[2 * i + 1];
2007 l = compress_bits(l >> odd, esz);
2008 h = compress_bits(h >> odd, esz);
2009 tmp_m.p[i] = l + (h << 32);
2010 }
2011 tmp_m.p[i] = compress_bits(m[2 * i] >> odd, esz);
2012
2013 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
2014 } else {
2015 for (i = 0; i < oprsz_16; i++) {
2016 l = m[2 * i + 0];
2017 h = m[2 * i + 1];
2018 l = compress_bits(l >> odd, esz);
2019 h = compress_bits(h >> odd, esz);
2020 d[oprsz_16 + i] = l + (h << 32);
2021 }
2022 }
2023 }
2024}
2025
2026void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
2027{
2028 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2029 uintptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2030 bool odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
2031 uint64_t *d = vd, *n = vn, *m = vm;
2032 uint64_t mask;
2033 int shr, shl;
2034 intptr_t i;
2035
2036 shl = 1 << esz;
2037 shr = 0;
2038 mask = even_bit_esz_masks[esz];
2039 if (odd) {
2040 mask <<= shl;
2041 shr = shl;
2042 shl = 0;
2043 }
2044
2045 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2046 uint64_t nn = (n[i] & mask) >> shr;
2047 uint64_t mm = (m[i] & mask) << shl;
2048 d[i] = nn + mm;
2049 }
2050}
2051
2052/* Reverse units of 2**N bits. */
2053static uint64_t reverse_bits_64(uint64_t x, int n)
2054{
2055 int i, sh;
2056
2057 x = bswap64(x);
2058 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2059 uint64_t mask = even_bit_esz_masks[i];
2060 x = ((x & mask) << sh) | ((x >> sh) & mask);
2061 }
2062 return x;
2063}
2064
2065static uint8_t reverse_bits_8(uint8_t x, int n)
2066{
2067 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
2068 int i, sh;
2069
2070 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2071 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
2072 }
2073 return x;
2074}
2075
2076void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
2077{
2078 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2079 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2080 intptr_t i, oprsz_2 = oprsz / 2;
2081
2082 if (oprsz <= 8) {
2083 uint64_t l = *(uint64_t *)vn;
2084 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
2085 *(uint64_t *)vd = l;
2086 } else if ((oprsz & 15) == 0) {
2087 for (i = 0; i < oprsz_2; i += 8) {
2088 intptr_t ih = oprsz - 8 - i;
2089 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
2090 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
2091 *(uint64_t *)(vd + i) = h;
2092 *(uint64_t *)(vd + ih) = l;
2093 }
2094 } else {
2095 for (i = 0; i < oprsz_2; i += 1) {
2096 intptr_t il = H1(i);
2097 intptr_t ih = H1(oprsz - 1 - i);
2098 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
2099 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
2100 *(uint8_t *)(vd + il) = h;
2101 *(uint8_t *)(vd + ih) = l;
2102 }
2103 }
2104}
2105
2106void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
2107{
2108 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2109 intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
2110 uint64_t *d = vd;
2111 intptr_t i;
2112
2113 if (oprsz <= 8) {
2114 uint64_t nn = *(uint64_t *)vn;
2115 int half = 4 * oprsz;
2116
2117 nn = extract64(nn, high * half, half);
2118 nn = expand_bits(nn, 0);
2119 d[0] = nn;
2120 } else {
2121 ARMPredicateReg tmp_n;
2122
2123 /* We produce output faster than we consume input.
2124 Therefore we must be mindful of possible overlap. */
2125 if ((vn - vd) < (uintptr_t)oprsz) {
2126 vn = memcpy(&tmp_n, vn, oprsz);
2127 }
2128 if (high) {
2129 high = oprsz >> 1;
2130 }
2131
2132 if ((high & 3) == 0) {
2133 uint32_t *n = vn;
2134 high >>= 2;
2135
2136 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2137 uint64_t nn = n[H4(high + i)];
2138 d[i] = expand_bits(nn, 0);
2139 }
2140 } else {
2141 uint16_t *d16 = vd;
2142 uint8_t *n = vn;
2143
2144 for (i = 0; i < oprsz / 2; i++) {
2145 uint16_t nn = n[H1(high + i)];
2146 d16[H2(i)] = expand_bits(nn, 0);
2147 }
2148 }
2149 }
2150}
2151
2152#define DO_ZIP(NAME, TYPE, H) \
2153void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2154{ \
2155 intptr_t oprsz = simd_oprsz(desc); \
2156 intptr_t i, oprsz_2 = oprsz / 2; \
2157 ARMVectorReg tmp_n, tmp_m; \
2158 /* We produce output faster than we consume input. \
2159 Therefore we must be mindful of possible overlap. */ \
2160 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
2161 vn = memcpy(&tmp_n, vn, oprsz_2); \
2162 } \
2163 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2164 vm = memcpy(&tmp_m, vm, oprsz_2); \
2165 } \
2166 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2167 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
2168 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
2169 } \
2170}
2171
2172DO_ZIP(sve_zip_b, uint8_t, H1)
2173DO_ZIP(sve_zip_h, uint16_t, H1_2)
2174DO_ZIP(sve_zip_s, uint32_t, H1_4)
2175DO_ZIP(sve_zip_d, uint64_t, )
2176
2177#define DO_UZP(NAME, TYPE, H) \
2178void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2179{ \
2180 intptr_t oprsz = simd_oprsz(desc); \
2181 intptr_t oprsz_2 = oprsz / 2; \
2182 intptr_t odd_ofs = simd_data(desc); \
2183 intptr_t i; \
2184 ARMVectorReg tmp_m; \
2185 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2186 vm = memcpy(&tmp_m, vm, oprsz); \
2187 } \
2188 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2189 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs)); \
2190 } \
2191 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2192 *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
2193 } \
2194}
2195
2196DO_UZP(sve_uzp_b, uint8_t, H1)
2197DO_UZP(sve_uzp_h, uint16_t, H1_2)
2198DO_UZP(sve_uzp_s, uint32_t, H1_4)
2199DO_UZP(sve_uzp_d, uint64_t, )
2200
2201#define DO_TRN(NAME, TYPE, H) \
2202void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2203{ \
2204 intptr_t oprsz = simd_oprsz(desc); \
2205 intptr_t odd_ofs = simd_data(desc); \
2206 intptr_t i; \
2207 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
2208 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
2209 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
2210 *(TYPE *)(vd + H(i + 0)) = ae; \
2211 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
2212 } \
2213}
2214
2215DO_TRN(sve_trn_b, uint8_t, H1)
2216DO_TRN(sve_trn_h, uint16_t, H1_2)
2217DO_TRN(sve_trn_s, uint32_t, H1_4)
2218DO_TRN(sve_trn_d, uint64_t, )
2219
2220#undef DO_ZIP
2221#undef DO_UZP
2222#undef DO_TRN
2223
2224void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
2225{
2226 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
2227 uint32_t *d = vd, *n = vn;
2228 uint8_t *pg = vg;
2229
2230 for (i = j = 0; i < opr_sz; i++) {
2231 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
2232 d[H4(j)] = n[H4(i)];
2233 j++;
2234 }
2235 }
2236 for (; j < opr_sz; j++) {
2237 d[H4(j)] = 0;
2238 }
2239}
2240
2241void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
2242{
2243 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
2244 uint64_t *d = vd, *n = vn;
2245 uint8_t *pg = vg;
2246
2247 for (i = j = 0; i < opr_sz; i++) {
2248 if (pg[H1(i)] & 1) {
2249 d[j] = n[i];
2250 j++;
2251 }
2252 }
2253 for (; j < opr_sz; j++) {
2254 d[j] = 0;
2255 }
2256}
2257
2258/* Similar to the ARM LastActiveElement pseudocode function, except the
2259 * result is multiplied by the element size. This includes the not found
2260 * indication; e.g. not found for esz=3 is -8.
2261 */
2262int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
2263{
2264 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2265 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2266
2267 return last_active_element(vg, DIV_ROUND_UP(oprsz, 8), esz);
2268}
2269
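/* SPLICE: copy the bytes of N spanning the first through the last active
 * element of the governing predicate, then fill the remainder of the
 * destination from the start of M (or all of M when no element is active).
 */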
2270void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
2271{
2272 intptr_t opr_sz = simd_oprsz(desc) / 8;
2273 int esz = simd_data(desc);
2274 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
2275 intptr_t i, first_i, last_i;
2276 ARMVectorReg tmp;
2277
2278 first_i = last_i = 0;
2279 first_g = last_g = 0;
2280
2281 /* Find the extent of the active elements within VG. */
2282 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
2283 pg = *(uint64_t *)(vg + i) & mask;
2284 if (pg) {
2285 if (last_g == 0) {
2286 last_g = pg;
2287 last_i = i;
2288 }
2289 first_g = pg;
2290 first_i = i;
2291 }
2292 }
2293
2294 len = 0;
2295 if (first_g != 0) {
2296 first_i = first_i * 8 + ctz64(first_g);
2297 last_i = last_i * 8 + 63 - clz64(last_g);
2298 len = last_i - first_i + (1 << esz);
2299 if (vd == vm) {
2300 vm = memcpy(&tmp, vm, opr_sz * 8);
2301 }
2302 swap_memmove(vd, vn + first_i, len);
2303 }
2304 swap_memmove(vd + len, vm, opr_sz * 8 - len);
2305}
2306
2307void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
2308 void *vg, uint32_t desc)
2309{
2310 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2311 uint64_t *d = vd, *n = vn, *m = vm;
2312 uint8_t *pg = vg;
2313
2314 for (i = 0; i < opr_sz; i += 1) {
2315 uint64_t nn = n[i], mm = m[i];
2316 uint64_t pp = expand_pred_b(pg[H1(i)]);
2317 d[i] = (nn & pp) | (mm & ~pp);
2318 }
2319}
2320
2321void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
2322 void *vg, uint32_t desc)
2323{
2324 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2325 uint64_t *d = vd, *n = vn, *m = vm;
2326 uint8_t *pg = vg;
2327
2328 for (i = 0; i < opr_sz; i += 1) {
2329 uint64_t nn = n[i], mm = m[i];
2330 uint64_t pp = expand_pred_h(pg[H1(i)]);
2331 d[i] = (nn & pp) | (mm & ~pp);
2332 }
2333}
2334
2335void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
2336 void *vg, uint32_t desc)
2337{
2338 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2339 uint64_t *d = vd, *n = vn, *m = vm;
2340 uint8_t *pg = vg;
2341
2342 for (i = 0; i < opr_sz; i += 1) {
2343 uint64_t nn = n[i], mm = m[i];
2344 uint64_t pp = expand_pred_s(pg[H1(i)]);
2345 d[i] = (nn & pp) | (mm & ~pp);
2346 }
2347}
2348
2349void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
2350 void *vg, uint32_t desc)
2351{
2352 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2353 uint64_t *d = vd, *n = vn, *m = vm;
2354 uint8_t *pg = vg;
2355
2356 for (i = 0; i < opr_sz; i += 1) {
2357 uint64_t nn = n[i], mm = m[i];
2358 d[i] = (pg[H1(i)] & 1 ? nn : mm);
2359 }
2360}
2361
2362/* Two operand comparison controlled by a predicate.
2363 * ??? It is very tempting to want to be able to expand this inline
2364 * with x86 instructions, e.g.
2365 *
2366 * vcmpeqw zm, zn, %ymm0
2367 * vpmovmskb %ymm0, %eax
2368 * and $0x5555, %eax
2369 * and pg, %eax
2370 *
2371 * or even aarch64, e.g.
2372 *
2373 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
2374 * cmeq v0.8h, zn, zm
2375 * and v0.8h, v0.8h, mask
2376 * addv h0, v0.8h
2377 * and v0.8b, pg
2378 *
2379 * However, coming up with an abstraction that allows vector inputs and
2380 * a scalar output, and also handles the byte-ordering of sub-uint64_t
2381 * scalar outputs, is tricky.
2382 */
2383#define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
2384uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2385{ \
2386 intptr_t opr_sz = simd_oprsz(desc); \
2387 uint32_t flags = PREDTEST_INIT; \
2388 intptr_t i = opr_sz; \
2389 do { \
2390 uint64_t out = 0, pg; \
2391 do { \
2392 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2393 TYPE nn = *(TYPE *)(vn + H(i)); \
2394 TYPE mm = *(TYPE *)(vm + H(i)); \
2395 out |= nn OP mm; \
2396 } while (i & 63); \
2397 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2398 out &= pg; \
2399 *(uint64_t *)(vd + (i >> 3)) = out; \
2400 flags = iter_predtest_bwd(out, pg, flags); \
2401 } while (i > 0); \
2402 return flags; \
2403}
2404
2405#define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
2406 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2407#define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
2408 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2409#define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
2410 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2411#define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
2412 DO_CMP_PPZZ(NAME, TYPE, OP, , 0x0101010101010101ull)
2413
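/* The MASK constants above mirror the predicate layout: one predicate bit
 * per byte of vector data, with an element governed by the bit of its
 * lowest-numbered byte.  Hence every bit is significant for bytes, every
 * second bit for halfwords (0x5555...), every fourth for words (0x1111...)
 * and every eighth for doublewords (0x0101...), and the "out <<=
 * sizeof(TYPE)" step above deposits each comparison result at its
 * element's byte offset.
 */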
2414DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
2415DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
2416DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
2417DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
2418
2419DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
2420DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
2421DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
2422DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
2423
2424DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
2425DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
2426DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
2427DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
2428
2429DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
2430DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
2431DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
2432DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
2433
2434DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
2435DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
2436DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
2437DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
2438
2439DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
2440DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
2441DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
2442DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
2443
2444#undef DO_CMP_PPZZ_B
2445#undef DO_CMP_PPZZ_H
2446#undef DO_CMP_PPZZ_S
2447#undef DO_CMP_PPZZ_D
2448#undef DO_CMP_PPZZ
2449
2450/* Similar, but the second source is "wide". */
2451#define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
2452uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2453{ \
2454 intptr_t opr_sz = simd_oprsz(desc); \
2455 uint32_t flags = PREDTEST_INIT; \
2456 intptr_t i = opr_sz; \
2457 do { \
2458 uint64_t out = 0, pg; \
2459 do { \
2460 TYPEW mm = *(TYPEW *)(vm + i - 8); \
2461 do { \
2462 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2463 TYPE nn = *(TYPE *)(vn + H(i)); \
2464 out |= nn OP mm; \
2465 } while (i & 7); \
2466 } while (i & 63); \
2467 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2468 out &= pg; \
2469 *(uint64_t *)(vd + (i >> 3)) = out; \
2470 flags = iter_predtest_bwd(out, pg, flags); \
2471 } while (i > 0); \
2472 return flags; \
2473}
2474
2475#define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
2476 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
2477#define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
2478 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
2479#define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
2480 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
2481
2482DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
2483DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
2484DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
757f9cff 2485
2486DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
2487DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
2488DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
2489
2490DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
2491DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
2492DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
2493
2494DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
2495DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
2496DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
2497
2498DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
2499DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
2500DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
2501
2502DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
2503DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
2504DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
2505
2506DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
2507DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
2508DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
2509
2510DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
2511DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
2512DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
2513
2514DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
2515DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
2516DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
2517
2518DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
2519DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
2520DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
2521
2522#undef DO_CMP_PPZW_B
2523#undef DO_CMP_PPZW_H
2524#undef DO_CMP_PPZW_S
2525#undef DO_CMP_PPZW
2526
2527/* Similar, but the second source is immediate. */
2528#define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
2529uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2530{ \
2531 intptr_t opr_sz = simd_oprsz(desc); \
2532 uint32_t flags = PREDTEST_INIT; \
2533 TYPE mm = simd_data(desc); \
2534 intptr_t i = opr_sz; \
2535 do { \
2536 uint64_t out = 0, pg; \
2537 do { \
2538 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2539 TYPE nn = *(TYPE *)(vn + H(i)); \
2540 out |= nn OP mm; \
2541 } while (i & 63); \
2542 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2543 out &= pg; \
2544 *(uint64_t *)(vd + (i >> 3)) = out; \
2545 flags = iter_predtest_bwd(out, pg, flags); \
2546 } while (i > 0); \
2547 return flags; \
2548}
2549
2550#define DO_CMP_PPZI_B(NAME, TYPE, OP) \
2551 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2552#define DO_CMP_PPZI_H(NAME, TYPE, OP) \
2553 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2554#define DO_CMP_PPZI_S(NAME, TYPE, OP) \
2555 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2556#define DO_CMP_PPZI_D(NAME, TYPE, OP) \
2557 DO_CMP_PPZI(NAME, TYPE, OP, , 0x0101010101010101ull)
2558
2559DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
2560DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
2561DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
2562DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
2563
2564DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
2565DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
2566DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
2567DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
2568
2569DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
2570DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
2571DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
2572DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
2573
2574DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
2575DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
2576DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
2577DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
2578
2579DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
2580DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
2581DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
2582DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
2583
2584DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
2585DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
2586DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
2587DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
2588
2589DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
2590DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
2591DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
2592DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
2593
2594DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
2595DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
2596DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
2597DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
2598
2599DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
2600DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
2601DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
2602DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
2603
2604DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
2605DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
2606DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
2607DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
2608
2609#undef DO_CMP_PPZI_B
2610#undef DO_CMP_PPZI_H
2611#undef DO_CMP_PPZI_S
2612#undef DO_CMP_PPZI_D
2613#undef DO_CMP_PPZI
2614
2615/* Similar to the ARM LastActive pseudocode function. */
2616static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
2617{
2618 intptr_t i;
2619
2620 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
2621 uint64_t pg = *(uint64_t *)(vg + i);
2622 if (pg) {
2623 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
2624 }
2625 }
2626 return 0;
2627}
2628
2629/* Compute a mask into RETB that is true for all G, up to and including
2630 * (if after) or excluding (if !after) the first G & N.
2631 * Return true if BRK found.
2632 */
2633static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
2634 bool brk, bool after)
2635{
2636 uint64_t b;
2637
2638 if (brk) {
2639 b = 0;
2640 } else if ((g & n) == 0) {
2641 /* For all G, no N are set; break not found. */
2642 b = g;
2643 } else {
2644 /* Break somewhere in N. Locate it. */
2645 b = g & n; /* guard true, pred true */
2646 b = b & -b; /* first such */
2647 if (after) {
2648 b = b | (b - 1); /* break after same */
2649 } else {
2650 b = b - 1; /* break before same */
2651 }
2652 brk = true;
2653 }
2654
2655 *retb = b;
2656 return brk;
2657}
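/* A small example, with G == 0b1111, N == 0b0100 and no break pending:
 * G & N == 0b0100 marks the first active N element, so B becomes 0b0111
 * when AFTER (true up to and including that element, as for BRKA) or
 * 0b0011 when !AFTER (true strictly before it, as for BRKB), and TRUE is
 * returned so that all subsequent words produce B == 0.
 */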
2658
2659/* Compute a zeroing BRK. */
2660static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
2661 intptr_t oprsz, bool after)
2662{
2663 bool brk = false;
2664 intptr_t i;
2665
2666 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2667 uint64_t this_b, this_g = g[i];
2668
2669 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2670 d[i] = this_b & this_g;
2671 }
2672}
2673
2674/* Likewise, but also compute flags. */
2675static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
2676 intptr_t oprsz, bool after)
2677{
2678 uint32_t flags = PREDTEST_INIT;
2679 bool brk = false;
2680 intptr_t i;
2681
2682 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2683 uint64_t this_b, this_d, this_g = g[i];
2684
2685 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2686 d[i] = this_d = this_b & this_g;
2687 flags = iter_predtest_fwd(this_d, this_g, flags);
2688 }
2689 return flags;
2690}
2691
2692/* Compute a merging BRK. */
2693static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
2694 intptr_t oprsz, bool after)
2695{
2696 bool brk = false;
2697 intptr_t i;
2698
2699 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2700 uint64_t this_b, this_g = g[i];
2701
2702 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2703 d[i] = (this_b & this_g) | (d[i] & ~this_g);
2704 }
2705}
2706
2707/* Likewise, but also compute flags. */
2708static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
2709 intptr_t oprsz, bool after)
2710{
2711 uint32_t flags = PREDTEST_INIT;
2712 bool brk = false;
2713 intptr_t i;
2714
2715 for (i = 0; i < oprsz / 8; ++i) {
2716 uint64_t this_b, this_d = d[i], this_g = g[i];
2717
2718 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2719 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
2720 flags = iter_predtest_fwd(this_d, this_g, flags);
2721 }
2722 return flags;
2723}
2724
2725static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
2726{
2727 /* It is quicker to zero the whole predicate than loop on OPRSZ.
2728 * The compiler should turn this into 4 64-bit integer stores.
2729 */
2730 memset(d, 0, sizeof(ARMPredicateReg));
2731 return PREDTEST_INIT;
2732}
2733
2734void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
2735 uint32_t pred_desc)
2736{
2737 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2738 if (last_active_pred(vn, vg, oprsz)) {
2739 compute_brk_z(vd, vm, vg, oprsz, true);
2740 } else {
2741 do_zero(vd, oprsz);
2742 }
2743}
2744
2745uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
2746 uint32_t pred_desc)
2747{
2748 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2749 if (last_active_pred(vn, vg, oprsz)) {
2750 return compute_brks_z(vd, vm, vg, oprsz, true);
2751 } else {
2752 return do_zero(vd, oprsz);
2753 }
2754}
2755
2756void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
2757 uint32_t pred_desc)
2758{
2759 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2760 if (last_active_pred(vn, vg, oprsz)) {
2761 compute_brk_z(vd, vm, vg, oprsz, false);
2762 } else {
2763 do_zero(vd, oprsz);
2764 }
2765}
2766
2767uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
2768 uint32_t pred_desc)
2769{
2770 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2771 if (last_active_pred(vn, vg, oprsz)) {
2772 return compute_brks_z(vd, vm, vg, oprsz, false);
2773 } else {
2774 return do_zero(vd, oprsz);
2775 }
2776}
2777
2778void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2779{
2780 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2781 compute_brk_z(vd, vn, vg, oprsz, true);
2782}
2783
2784uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2785{
2786 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2787 return compute_brks_z(vd, vn, vg, oprsz, true);
2788}
2789
2790void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2791{
2792 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2793 compute_brk_z(vd, vn, vg, oprsz, false);
2794}
2795
2796uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2797{
2798 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2799 return compute_brks_z(vd, vn, vg, oprsz, false);
2800}
2801
2802void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2803{
2804 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2805 compute_brk_m(vd, vn, vg, oprsz, true);
2806}
2807
2808uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2809{
2810 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2811 return compute_brks_m(vd, vn, vg, oprsz, true);
2812}
2813
2814void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2815{
2816 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2817 compute_brk_m(vd, vn, vg, oprsz, false);
2818}
2819
2820uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2821{
2822 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2823 return compute_brks_m(vd, vn, vg, oprsz, false);
2824}
2825
2826void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2827{
2828 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2829
2830 if (!last_active_pred(vn, vg, oprsz)) {
2831 do_zero(vd, oprsz);
2832 }
2833}
2834
2835/* As if PredTest(Ones(PL), D, esz). */
2836static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
2837 uint64_t esz_mask)
2838{
2839 uint32_t flags = PREDTEST_INIT;
2840 intptr_t i;
2841
2842 for (i = 0; i < oprsz / 8; i++) {
2843 flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
2844 }
2845 if (oprsz & 7) {
2846 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
2847 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
2848 }
2849 return flags;
2850}
2851
2852uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2853{
2854 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2855
2856 if (last_active_pred(vn, vg, oprsz)) {
2857 return predtest_ones(vd, oprsz, -1);
2858 } else {
2859 return do_zero(vd, oprsz);
2860 }
2861}
2862
2863uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
2864{
2865 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2866 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2867 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
2868 intptr_t i;
2869
2870 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2871 uint64_t t = n[i] & g[i] & mask;
2872 sum += ctpop64(t);
2873 }
2874 return sum;
2875}
2876
2877uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc)
2878{
2879 uintptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2880 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2881 uint64_t esz_mask = pred_esz_masks[esz];
2882 ARMPredicateReg *d = vd;
2883 uint32_t flags;
2884 intptr_t i;
2885
2886 /* Begin with a zero predicate register. */
2887 flags = do_zero(d, oprsz);
2888 if (count == 0) {
2889 return flags;
2890 }
2891
2892 /* Set all of the requested bits. */
2893 for (i = 0; i < count / 64; ++i) {
2894 d->p[i] = esz_mask;
2895 }
2896 if (count & 63) {
2897 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
2898 }
2899
2900 return predtest_ones(d, oprsz, esz_mask);
2901}
c4e7c493 2902
 2903/* Recursive reduction of a vector with a binary function;
 2904 * cf. the ARM ARM function ReducePredicated.
2905 *
2906 * While it would be possible to write this without the DATA temporary,
2907 * it is much simpler to process the predicate register this way.
2908 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
2909 * little to gain with a more complex non-recursive form.
2910 */
2911#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
2912static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
2913{ \
2914 if (n == 1) { \
2915 return *data; \
2916 } else { \
2917 uintptr_t half = n / 2; \
2918 TYPE lo = NAME##_reduce(data, status, half); \
2919 TYPE hi = NAME##_reduce(data + half, status, half); \
2920 return TYPE##_##FUNC(lo, hi, status); \
2921 } \
2922} \
2923uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
2924{ \
2925 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_maxsz(desc); \
2926 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
2927 for (i = 0; i < oprsz; ) { \
2928 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2929 do { \
2930 TYPE nn = *(TYPE *)(vn + H(i)); \
2931 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
2932 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2933 } while (i & 15); \
2934 } \
2935 for (; i < maxsz; i += sizeof(TYPE)) { \
2936 *(TYPE *)((void *)data + i) = IDENT; \
2937 } \
2938 return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
2939}
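/* As a sketch of the reduction order, assuming the element count passed in
 * via maxsz is a power of two (consistent with the depth-7 bound above):
 * for eight elements the result is
 *
 *     ((d0 op d1) op (d2 op d3)) op ((d4 op d5) op (d6 op d7))
 *
 * with inactive lanes and any padding beyond the operation size already
 * replaced by IDENT, so they cannot perturb the result.
 */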
2940
2941DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
2942DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
2943DO_REDUCE(sve_faddv_d, float64, , add, float64_zero)
2944
2945/* Identity is floatN_default_nan, without the function call. */
2946DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
2947DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
2948DO_REDUCE(sve_fminnmv_d, float64, , minnum, 0x7FF8000000000000ULL)
2949
2950DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
2951DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
2952DO_REDUCE(sve_fmaxnmv_d, float64, , maxnum, 0x7FF8000000000000ULL)
2953
2954DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
2955DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
2956DO_REDUCE(sve_fminv_d, float64, , min, float64_infinity)
2957
2958DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
2959DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
2960DO_REDUCE(sve_fmaxv_d, float64, , max, float64_chs(float64_infinity))
2961
2962#undef DO_REDUCE
2963
2964uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
2965 void *status, uint32_t desc)
2966{
2967 intptr_t i = 0, opr_sz = simd_oprsz(desc);
2968 float16 result = nn;
2969
2970 do {
2971 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2972 do {
2973 if (pg & 1) {
2974 float16 mm = *(float16 *)(vm + H1_2(i));
2975 result = float16_add(result, mm, status);
2976 }
2977 i += sizeof(float16), pg >>= sizeof(float16);
2978 } while (i & 15);
2979 } while (i < opr_sz);
2980
2981 return result;
2982}
2983
2984uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
2985 void *status, uint32_t desc)
2986{
2987 intptr_t i = 0, opr_sz = simd_oprsz(desc);
2988 float32 result = nn;
2989
2990 do {
2991 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2992 do {
2993 if (pg & 1) {
2994 float32 mm = *(float32 *)(vm + H1_2(i));
2995 result = float32_add(result, mm, status);
2996 }
2997 i += sizeof(float32), pg >>= sizeof(float32);
2998 } while (i & 15);
2999 } while (i < opr_sz);
3000
3001 return result;
3002}
3003
3004uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
3005 void *status, uint32_t desc)
3006{
3007 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
3008 uint64_t *m = vm;
3009 uint8_t *pg = vg;
3010
3011 for (i = 0; i < opr_sz; i++) {
3012 if (pg[H1(i)] & 1) {
3013 nn = float64_add(nn, m[i], status);
3014 }
3015 }
3016
3017 return nn;
3018}
3019
3020/* Fully general three-operand expander, controlled by a predicate,
 3021 * with the extra float_status parameter.
3022 */
3023#define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
3024void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
3025 void *status, uint32_t desc) \
3026{ \
3027 intptr_t i = simd_oprsz(desc); \
3028 uint64_t *g = vg; \
3029 do { \
3030 uint64_t pg = g[(i - 1) >> 6]; \
3031 do { \
3032 i -= sizeof(TYPE); \
3033 if (likely((pg >> (i & 63)) & 1)) { \
3034 TYPE nn = *(TYPE *)(vn + H(i)); \
3035 TYPE mm = *(TYPE *)(vm + H(i)); \
3036 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
3037 } \
3038 } while (i & 63); \
3039 } while (i != 0); \
3040}
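/* A note on the predicate addressing used above: i counts down in bytes,
 * g[(i - 1) >> 6] is the 64-bit predicate word covering the element about
 * to be processed, and because predicates carry one bit per byte of vector
 * data, testing bit (i & 63) after the "i -= sizeof(TYPE)" step consults
 * exactly the bit that governs the element starting at byte offset i.
 */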
3041
3042DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
3043DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
3044DO_ZPZZ_FP(sve_fadd_d, uint64_t, , float64_add)
3045
3046DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
3047DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
3048DO_ZPZZ_FP(sve_fsub_d, uint64_t, , float64_sub)
3049
3050DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
3051DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
3052DO_ZPZZ_FP(sve_fmul_d, uint64_t, , float64_mul)
3053
3054DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
3055DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
3056DO_ZPZZ_FP(sve_fdiv_d, uint64_t, , float64_div)
3057
3058DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
3059DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
3060DO_ZPZZ_FP(sve_fmin_d, uint64_t, , float64_min)
3061
3062DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
3063DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
3064DO_ZPZZ_FP(sve_fmax_d, uint64_t, , float64_max)
3065
3066DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
3067DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
3068DO_ZPZZ_FP(sve_fminnum_d, uint64_t, , float64_minnum)
3069
3070DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
3071DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
3072DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, , float64_maxnum)
3073
3074static inline float16 abd_h(float16 a, float16 b, float_status *s)
3075{
3076 return float16_abs(float16_sub(a, b, s));
3077}
3078
3079static inline float32 abd_s(float32 a, float32 b, float_status *s)
3080{
3081 return float32_abs(float32_sub(a, b, s));
3082}
3083
3084static inline float64 abd_d(float64 a, float64 b, float_status *s)
3085{
3086 return float64_abs(float64_sub(a, b, s));
3087}
3088
3089DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
3090DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
3091DO_ZPZZ_FP(sve_fabd_d, uint64_t, , abd_d)
3092
3093static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
3094{
3095 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
3096 return float64_scalbn(a, b_int, s);
3097}
3098
3099DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
3100DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
3101DO_ZPZZ_FP(sve_fscalbn_d, int64_t, , scalbn_d)
3102
3103DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
3104DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
3105DO_ZPZZ_FP(sve_fmulx_d, uint64_t, , helper_vfp_mulxd)
3106
3107#undef DO_ZPZZ_FP
3108
3109/* Three-operand expander, with one scalar operand, controlled by
3110 * a predicate, with the extra float_status parameter.
3111 */
3112#define DO_ZPZS_FP(NAME, TYPE, H, OP) \
3113void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
3114 void *status, uint32_t desc) \
3115{ \
3116 intptr_t i = simd_oprsz(desc); \
3117 uint64_t *g = vg; \
3118 TYPE mm = scalar; \
3119 do { \
3120 uint64_t pg = g[(i - 1) >> 6]; \
3121 do { \
3122 i -= sizeof(TYPE); \
3123 if (likely((pg >> (i & 63)) & 1)) { \
3124 TYPE nn = *(TYPE *)(vn + H(i)); \
3125 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
3126 } \
3127 } while (i & 63); \
3128 } while (i != 0); \
3129}
3130
3131DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
3132DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
3133DO_ZPZS_FP(sve_fadds_d, float64, , float64_add)
3134
3135DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
3136DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
3137DO_ZPZS_FP(sve_fsubs_d, float64, , float64_sub)
3138
3139DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
3140DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
3141DO_ZPZS_FP(sve_fmuls_d, float64, , float64_mul)
3142
3143static inline float16 subr_h(float16 a, float16 b, float_status *s)
3144{
3145 return float16_sub(b, a, s);
3146}
3147
3148static inline float32 subr_s(float32 a, float32 b, float_status *s)
3149{
3150 return float32_sub(b, a, s);
3151}
3152
3153static inline float64 subr_d(float64 a, float64 b, float_status *s)
3154{
3155 return float64_sub(b, a, s);
3156}
3157
3158DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
3159DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
3160DO_ZPZS_FP(sve_fsubrs_d, float64, , subr_d)
3161
3162DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
3163DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
3164DO_ZPZS_FP(sve_fmaxnms_d, float64, , float64_maxnum)
3165
3166DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
3167DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
3168DO_ZPZS_FP(sve_fminnms_d, float64, , float64_minnum)
3169
3170DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
3171DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
3172DO_ZPZS_FP(sve_fmaxs_d, float64, , float64_max)
3173
3174DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
3175DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
3176DO_ZPZS_FP(sve_fmins_d, float64, , float64_min)
3177
3178/* Fully general two-operand expander, controlled by a predicate,
 3179 * with the extra float_status parameter.
3180 */
3181#define DO_ZPZ_FP(NAME, TYPE, H, OP) \
3182void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
3183{ \
3184 intptr_t i = simd_oprsz(desc); \
3185 uint64_t *g = vg; \
3186 do { \
3187 uint64_t pg = g[(i - 1) >> 6]; \
3188 do { \
3189 i -= sizeof(TYPE); \
3190 if (likely((pg >> (i & 63)) & 1)) { \
3191 TYPE nn = *(TYPE *)(vn + H(i)); \
3192 *(TYPE *)(vd + H(i)) = OP(nn, status); \
3193 } \
3194 } while (i & 63); \
3195 } while (i != 0); \
3196}
3197
3198/* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
3199 * FZ16. When converting from fp16, this affects flushing input denormals;
3200 * when converting to fp16, this affects flushing output denormals.
3201 */
3202static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
3203{
3204 flag save = get_flush_inputs_to_zero(fpst);
3205 float32 ret;
3206
3207 set_flush_inputs_to_zero(false, fpst);
3208 ret = float16_to_float32(f, true, fpst);
3209 set_flush_inputs_to_zero(save, fpst);
3210 return ret;
3211}
3212
3213static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
3214{
3215 flag save = get_flush_inputs_to_zero(fpst);
3216 float64 ret;
3217
3218 set_flush_inputs_to_zero(false, fpst);
3219 ret = float16_to_float64(f, true, fpst);
3220 set_flush_inputs_to_zero(save, fpst);
3221 return ret;
3222}
3223
3224static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
3225{
3226 flag save = get_flush_to_zero(fpst);
3227 float16 ret;
3228
3229 set_flush_to_zero(false, fpst);
3230 ret = float32_to_float16(f, true, fpst);
3231 set_flush_to_zero(save, fpst);
3232 return ret;
3233}
3234
3235static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
3236{
3237 flag save = get_flush_to_zero(fpst);
3238 float16 ret;
3239
3240 set_flush_to_zero(false, fpst);
3241 ret = float64_to_float16(f, true, fpst);
3242 set_flush_to_zero(save, fpst);
3243 return ret;
3244}
3245
3246static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
3247{
3248 if (float16_is_any_nan(f)) {
3249 float_raise(float_flag_invalid, s);
3250 return 0;
3251 }
3252 return float16_to_int16_round_to_zero(f, s);
3253}
3254
3255static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
3256{
3257 if (float16_is_any_nan(f)) {
3258 float_raise(float_flag_invalid, s);
3259 return 0;
3260 }
3261 return float16_to_int64_round_to_zero(f, s);
3262}
3263
3264static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
3265{
3266 if (float32_is_any_nan(f)) {
3267 float_raise(float_flag_invalid, s);
3268 return 0;
3269 }
3270 return float32_to_int64_round_to_zero(f, s);
3271}
3272
3273static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
3274{
3275 if (float64_is_any_nan(f)) {
3276 float_raise(float_flag_invalid, s);
3277 return 0;
3278 }
3279 return float64_to_int64_round_to_zero(f, s);
3280}
3281
3282static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
3283{
3284 if (float16_is_any_nan(f)) {
3285 float_raise(float_flag_invalid, s);
3286 return 0;
3287 }
3288 return float16_to_uint16_round_to_zero(f, s);
3289}
3290
3291static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
3292{
3293 if (float16_is_any_nan(f)) {
3294 float_raise(float_flag_invalid, s);
3295 return 0;
3296 }
3297 return float16_to_uint64_round_to_zero(f, s);
3298}
3299
3300static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
3301{
3302 if (float32_is_any_nan(f)) {
3303 float_raise(float_flag_invalid, s);
3304 return 0;
3305 }
3306 return float32_to_uint64_round_to_zero(f, s);
3307}
3308
3309static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
3310{
3311 if (float64_is_any_nan(f)) {
3312 float_raise(float_flag_invalid, s);
3313 return 0;
3314 }
3315 return float64_to_uint64_round_to_zero(f, s);
3316}
3317
3318DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
3319DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
3320DO_ZPZ_FP(sve_fcvt_dh, uint64_t, , sve_f64_to_f16)
3321DO_ZPZ_FP(sve_fcvt_hd, uint64_t, , sve_f16_to_f64)
3322DO_ZPZ_FP(sve_fcvt_ds, uint64_t, , float64_to_float32)
3323DO_ZPZ_FP(sve_fcvt_sd, uint64_t, , float32_to_float64)
3324
3325DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
3326DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
3327DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
3328DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, , vfp_float16_to_int64_rtz)
3329DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, , vfp_float32_to_int64_rtz)
3330DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, , helper_vfp_tosizd)
3331DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, , vfp_float64_to_int64_rtz)
3332
3333DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
3334DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
3335DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
3336DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, , vfp_float16_to_uint64_rtz)
3337DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, , vfp_float32_to_uint64_rtz)
3338DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, , helper_vfp_touizd)
3339DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, , vfp_float64_to_uint64_rtz)
3340
3341DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
3342DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
3343DO_ZPZ_FP(sve_frint_d, uint64_t, , helper_rintd)
3344
3345DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
3346DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
3347DO_ZPZ_FP(sve_frintx_d, uint64_t, , float64_round_to_int)
3348
3349DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
3350DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
3351DO_ZPZ_FP(sve_frecpx_d, uint64_t, , helper_frecpx_f64)
3352
3353DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
3354DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
3355DO_ZPZ_FP(sve_fsqrt_d, uint64_t, , float64_sqrt)
3356
3357DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
3358DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
3359DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
3360DO_ZPZ_FP(sve_scvt_sd, uint64_t, , int32_to_float64)
3361DO_ZPZ_FP(sve_scvt_dh, uint64_t, , int64_to_float16)
3362DO_ZPZ_FP(sve_scvt_ds, uint64_t, , int64_to_float32)
3363DO_ZPZ_FP(sve_scvt_dd, uint64_t, , int64_to_float64)
3364
3365DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
3366DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
3367DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
3368DO_ZPZ_FP(sve_ucvt_sd, uint64_t, , uint32_to_float64)
3369DO_ZPZ_FP(sve_ucvt_dh, uint64_t, , uint64_to_float16)
3370DO_ZPZ_FP(sve_ucvt_ds, uint64_t, , uint64_to_float32)
3371DO_ZPZ_FP(sve_ucvt_dd, uint64_t, , uint64_to_float64)
3372
3373#undef DO_ZPZ_FP
3374
3375/* 4-operand predicated multiply-add. This requires 7 operands to pass
3376 * "properly", so we need to encode some of the registers into DESC.
3377 */
3378QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT + 20 > 32);
3379
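/* The register numbers are recovered from DESC by the extract32() calls
 * below, i.e. the packing is rd at SIMD_DATA_SHIFT, then rn, rm and ra in
 * successive 5-bit fields; the build-time assert above checks that these
 * 20 bits still fit beside the operation size.
 */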
3380static void do_fmla_zpzzz_h(CPUARMState *env, void *vg, uint32_t desc,
3381 uint16_t neg1, uint16_t neg3)
3382{
3383 intptr_t i = simd_oprsz(desc);
3384 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3385 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3386 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3387 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3388 void *vd = &env->vfp.zregs[rd];
3389 void *vn = &env->vfp.zregs[rn];
3390 void *vm = &env->vfp.zregs[rm];
3391 void *va = &env->vfp.zregs[ra];
3392 uint64_t *g = vg;
3393
3394 do {
3395 uint64_t pg = g[(i - 1) >> 6];
3396 do {
3397 i -= 2;
3398 if (likely((pg >> (i & 63)) & 1)) {
3399 float16 e1, e2, e3, r;
3400
3401 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
3402 e2 = *(uint16_t *)(vm + H1_2(i));
3403 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
52a339b1 3404 r = float16_muladd(e1, e2, e3, 0, &env->vfp.fp_status_f16);
3405 *(uint16_t *)(vd + H1_2(i)) = r;
3406 }
3407 } while (i & 63);
3408 } while (i != 0);
3409}
3410
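/* The four entry points below select the variant purely through the two
 * sign-bit XOR masks: with r = (n ^ neg1) * m + (a ^ neg3), FMLA negates
 * nothing, FMLS negates n (giving a - n * m), FNMLA negates both n and a,
 * and FNMLS negates only a.
 */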
3411void HELPER(sve_fmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3412{
3413 do_fmla_zpzzz_h(env, vg, desc, 0, 0);
3414}
3415
3416void HELPER(sve_fmls_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3417{
3418 do_fmla_zpzzz_h(env, vg, desc, 0x8000, 0);
3419}
3420
3421void HELPER(sve_fnmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3422{
3423 do_fmla_zpzzz_h(env, vg, desc, 0x8000, 0x8000);
3424}
3425
3426void HELPER(sve_fnmls_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3427{
3428 do_fmla_zpzzz_h(env, vg, desc, 0, 0x8000);
3429}
3430
3431static void do_fmla_zpzzz_s(CPUARMState *env, void *vg, uint32_t desc,
3432 uint32_t neg1, uint32_t neg3)
3433{
3434 intptr_t i = simd_oprsz(desc);
3435 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3436 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3437 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3438 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3439 void *vd = &env->vfp.zregs[rd];
3440 void *vn = &env->vfp.zregs[rn];
3441 void *vm = &env->vfp.zregs[rm];
3442 void *va = &env->vfp.zregs[ra];
3443 uint64_t *g = vg;
3444
3445 do {
3446 uint64_t pg = g[(i - 1) >> 6];
3447 do {
3448 i -= 4;
3449 if (likely((pg >> (i & 63)) & 1)) {
3450 float32 e1, e2, e3, r;
3451
3452 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
3453 e2 = *(uint32_t *)(vm + H1_4(i));
3454 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
3455 r = float32_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
3456 *(uint32_t *)(vd + H1_4(i)) = r;
3457 }
3458 } while (i & 63);
3459 } while (i != 0);
3460}
3461
3462void HELPER(sve_fmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3463{
3464 do_fmla_zpzzz_s(env, vg, desc, 0, 0);
3465}
3466
3467void HELPER(sve_fmls_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3468{
3469 do_fmla_zpzzz_s(env, vg, desc, 0x80000000, 0);
3470}
3471
3472void HELPER(sve_fnmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3473{
3474 do_fmla_zpzzz_s(env, vg, desc, 0x80000000, 0x80000000);
3475}
3476
3477void HELPER(sve_fnmls_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3478{
3479 do_fmla_zpzzz_s(env, vg, desc, 0, 0x80000000);
3480}
3481
3482static void do_fmla_zpzzz_d(CPUARMState *env, void *vg, uint32_t desc,
3483 uint64_t neg1, uint64_t neg3)
3484{
3485 intptr_t i = simd_oprsz(desc);
3486 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3487 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3488 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3489 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3490 void *vd = &env->vfp.zregs[rd];
3491 void *vn = &env->vfp.zregs[rn];
3492 void *vm = &env->vfp.zregs[rm];
3493 void *va = &env->vfp.zregs[ra];
3494 uint64_t *g = vg;
3495
3496 do {
3497 uint64_t pg = g[(i - 1) >> 6];
3498 do {
3499 i -= 8;
3500 if (likely((pg >> (i & 63)) & 1)) {
3501 float64 e1, e2, e3, r;
3502
3503 e1 = *(uint64_t *)(vn + i) ^ neg1;
3504 e2 = *(uint64_t *)(vm + i);
3505 e3 = *(uint64_t *)(va + i) ^ neg3;
3506 r = float64_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
3507 *(uint64_t *)(vd + i) = r;
3508 }
3509 } while (i & 63);
3510 } while (i != 0);
3511}
3512
3513void HELPER(sve_fmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3514{
3515 do_fmla_zpzzz_d(env, vg, desc, 0, 0);
3516}
3517
3518void HELPER(sve_fmls_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3519{
3520 do_fmla_zpzzz_d(env, vg, desc, INT64_MIN, 0);
3521}
3522
3523void HELPER(sve_fnmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3524{
3525 do_fmla_zpzzz_d(env, vg, desc, INT64_MIN, INT64_MIN);
3526}
3527
3528void HELPER(sve_fnmls_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3529{
3530 do_fmla_zpzzz_d(env, vg, desc, 0, INT64_MIN);
3531}
3532
3533/* Two operand floating-point comparison controlled by a predicate.
3534 * Unlike the integer version, we are not allowed to optimistically
3535 * compare operands, since the comparison may have side effects wrt
3536 * the FPSR.
3537 */
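/*
 * Each element's boolean result is accumulated into OUT at the bit
 * index equal to the element's byte offset, matching the predicate
 * layout in which predicate bit N governs vector byte N.
 */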
3538#define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
3539void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
3540 void *status, uint32_t desc) \
3541{ \
3542 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
3543 uint64_t *d = vd, *g = vg; \
3544 do { \
3545 uint64_t out = 0, pg = g[j]; \
3546 do { \
3547 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3548 if (likely((pg >> (i & 63)) & 1)) { \
3549 TYPE nn = *(TYPE *)(vn + H(i)); \
3550 TYPE mm = *(TYPE *)(vm + H(i)); \
3551 out |= OP(TYPE, nn, mm, status); \
3552 } \
3553 } while (i & 63); \
3554 d[j--] = out; \
3555 } while (i > 0); \
3556}
3557
3558#define DO_FPCMP_PPZZ_H(NAME, OP) \
3559 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
3560#define DO_FPCMP_PPZZ_S(NAME, OP) \
3561 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
3562#define DO_FPCMP_PPZZ_D(NAME, OP) \
3563 DO_FPCMP_PPZZ(NAME##_d, float64, , OP)
3564
3565#define DO_FPCMP_PPZZ_ALL(NAME, OP) \
3566 DO_FPCMP_PPZZ_H(NAME, OP) \
3567 DO_FPCMP_PPZZ_S(NAME, OP) \
3568 DO_FPCMP_PPZZ_D(NAME, OP)
3569
3570#define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
3571#define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
3572#define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
3573#define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
3574#define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
3575#define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
3576#define DO_FCMUO(TYPE, X, Y, ST) \
3577 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
3578#define DO_FACGE(TYPE, X, Y, ST) \
3579 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
3580#define DO_FACGT(TYPE, X, Y, ST) \
3581 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
3582
3583DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
3584DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
3585DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
3586DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
3587DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
3588DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
3589DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
3590
3591#undef DO_FPCMP_PPZZ_ALL
3592#undef DO_FPCMP_PPZZ_D
3593#undef DO_FPCMP_PPZZ_S
3594#undef DO_FPCMP_PPZZ_H
3595#undef DO_FPCMP_PPZZ
3596
3597/* One operand floating-point comparison against zero, controlled
3598 * by a predicate.
3599 */
3600#define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
3601void HELPER(NAME)(void *vd, void *vn, void *vg, \
3602 void *status, uint32_t desc) \
3603{ \
3604 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
3605 uint64_t *d = vd, *g = vg; \
3606 do { \
3607 uint64_t out = 0, pg = g[j]; \
3608 do { \
3609 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3610 if ((pg >> (i & 63)) & 1) { \
3611 TYPE nn = *(TYPE *)(vn + H(i)); \
3612 out |= OP(TYPE, nn, 0, status); \
3613 } \
3614 } while (i & 63); \
3615 d[j--] = out; \
3616 } while (i > 0); \
3617}
3618
3619#define DO_FPCMP_PPZ0_H(NAME, OP) \
3620 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
3621#define DO_FPCMP_PPZ0_S(NAME, OP) \
3622 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
3623#define DO_FPCMP_PPZ0_D(NAME, OP) \
3624 DO_FPCMP_PPZ0(NAME##_d, float64, , OP)
3625
3626#define DO_FPCMP_PPZ0_ALL(NAME, OP) \
3627 DO_FPCMP_PPZ0_H(NAME, OP) \
3628 DO_FPCMP_PPZ0_S(NAME, OP) \
3629 DO_FPCMP_PPZ0_D(NAME, OP)
3630
3631DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
3632DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
3633DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
3634DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
3635DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
3636DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
3637
3638/* FP Trig Multiply-Add. */
3639
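/*
 * The coeff[] tables below hold the architectural FTMAD coefficients:
 * entries 0-7 are the sine polynomial terms (1, -1/6, 1/120, ...) and
 * entries 8-15 the cosine terms (1, -1/2, 1/24, ...); a negative
 * multiplicand selects the second half by adding 8 to the index.
 */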
3640void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3641{
3642 static const float16 coeff[16] = {
3643 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3644 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3645 };
3646 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
3647 intptr_t x = simd_data(desc);
3648 float16 *d = vd, *n = vn, *m = vm;
3649 for (i = 0; i < opr_sz; i++) {
3650 float16 mm = m[i];
3651 intptr_t xx = x;
3652 if (float16_is_neg(mm)) {
3653 mm = float16_abs(mm);
3654 xx += 8;
3655 }
3656 d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
3657 }
3658}
3659
3660void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3661{
3662 static const float32 coeff[16] = {
3663 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
3664 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
3665 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
3666 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
3667 };
3668 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
3669 intptr_t x = simd_data(desc);
3670 float32 *d = vd, *n = vn, *m = vm;
3671 for (i = 0; i < opr_sz; i++) {
3672 float32 mm = m[i];
3673 intptr_t xx = x;
3674 if (float32_is_neg(mm)) {
3675 mm = float32_abs(mm);
3676 xx += 8;
3677 }
3678 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
3679 }
3680}
3681
3682void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3683{
3684 static const float64 coeff[16] = {
3685 0x3ff0000000000000ull, 0xbfc5555555555543ull,
3686 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
3687 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
3688 0x3de5d8408868552full, 0x0000000000000000ull,
3689 0x3ff0000000000000ull, 0xbfe0000000000000ull,
3690 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
3691 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
3692 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
3693 };
3694 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
3695 intptr_t x = simd_data(desc);
3696 float64 *d = vd, *n = vn, *m = vm;
3697 for (i = 0; i < opr_sz; i++) {
3698 float64 mm = m[i];
3699 intptr_t xx = x;
3700 if (float64_is_neg(mm)) {
3701 mm = float64_abs(mm);
3702 xx += 8;
3703 }
3704 d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
3705 }
3706}
3707
3708/*
3709 * FP Complex Add
3710 */
3711
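/*
 * For FCADD the single data bit selects the rotation; neg_real and
 * neg_imag flip the sign of the Zm element that is added into the
 * real and imaginary lanes respectively, giving the #90 and #270 forms.
 */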
3712void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
3713 void *vs, uint32_t desc)
3714{
3715 intptr_t j, i = simd_oprsz(desc);
3716 uint64_t *g = vg;
3717 float16 neg_imag = float16_set_sign(0, simd_data(desc));
3718 float16 neg_real = float16_chs(neg_imag);
3719
3720 do {
3721 uint64_t pg = g[(i - 1) >> 6];
3722 do {
3723 float16 e0, e1, e2, e3;
3724
3725 /* I holds the real index; J holds the imag index. */
3726 j = i - sizeof(float16);
3727 i -= 2 * sizeof(float16);
3728
3729 e0 = *(float16 *)(vn + H1_2(i));
3730 e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
3731 e2 = *(float16 *)(vn + H1_2(j));
3732 e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
3733
3734 if (likely((pg >> (i & 63)) & 1)) {
3735 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
3736 }
3737 if (likely((pg >> (j & 63)) & 1)) {
3738 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
3739 }
3740 } while (i & 63);
3741 } while (i != 0);
3742}
3743
3744void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
3745 void *vs, uint32_t desc)
3746{
3747 intptr_t j, i = simd_oprsz(desc);
3748 uint64_t *g = vg;
3749 float32 neg_imag = float32_set_sign(0, simd_data(desc));
3750 float32 neg_real = float32_chs(neg_imag);
3751
3752 do {
3753 uint64_t pg = g[(i - 1) >> 6];
3754 do {
3755 float32 e0, e1, e2, e3;
3756
3757 /* I holds the real index; J holds the imag index. */
3758 j = i - sizeof(float32);
3759 i -= 2 * sizeof(float32);
3760
3761 e0 = *(float32 *)(vn + H1_2(i));
3762 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
3763 e2 = *(float32 *)(vn + H1_2(j));
3764 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
3765
3766 if (likely((pg >> (i & 63)) & 1)) {
3767 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
3768 }
3769 if (likely((pg >> (j & 63)) & 1)) {
3770 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
3771 }
3772 } while (i & 63);
3773 } while (i != 0);
3774}
3775
3776void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
3777 void *vs, uint32_t desc)
3778{
3779 intptr_t j, i = simd_oprsz(desc);
3780 uint64_t *g = vg;
3781 float64 neg_imag = float64_set_sign(0, simd_data(desc));
3782 float64 neg_real = float64_chs(neg_imag);
3783
3784 do {
3785 uint64_t pg = g[(i - 1) >> 6];
3786 do {
3787 float64 e0, e1, e2, e3;
3788
3789 /* I holds the real index; J holds the imag index. */
3790 j = i - sizeof(float64);
3791 i -= 2 * sizeof(float64);
3792
3793 e0 = *(float64 *)(vn + H1_2(i));
3794 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
3795 e2 = *(float64 *)(vn + H1_2(j));
3796 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
3797
3798 if (likely((pg >> (i & 63)) & 1)) {
3799 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
3800 }
3801 if (likely((pg >> (j & 63)) & 1)) {
3802 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
3803 }
3804 } while (i & 63);
3805 } while (i != 0);
3806}
3807
3808/*
3809 * FP Complex Multiply
3810 */
3811
3812QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT + 22 > 32);
3813
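/*
 * For FCMLA, rot is the rotation in multiples of 90 degrees: bit 0
 * (flip) selects whether the real or imaginary half of Zn feeds both
 * multiplies, while neg_real/neg_imag supply the sign flips that the
 * chosen rotation requires on the Zm elements.
 */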
3814void HELPER(sve_fcmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3815{
3816 intptr_t j, i = simd_oprsz(desc);
3817 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3818 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3819 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3820 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3821 unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
3822 bool flip = rot & 1;
3823 float16 neg_imag, neg_real;
3824 void *vd = &env->vfp.zregs[rd];
3825 void *vn = &env->vfp.zregs[rn];
3826 void *vm = &env->vfp.zregs[rm];
3827 void *va = &env->vfp.zregs[ra];
3828 uint64_t *g = vg;
3829
3830 neg_imag = float16_set_sign(0, (rot & 2) != 0);
3831 neg_real = float16_set_sign(0, rot == 1 || rot == 2);
3832
3833 do {
3834 uint64_t pg = g[(i - 1) >> 6];
3835 do {
3836 float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
3837
3838 /* I holds the real index; J holds the imag index. */
3839 j = i - sizeof(float16);
3840 i -= 2 * sizeof(float16);
3841
3842 nr = *(float16 *)(vn + H1_2(i));
3843 ni = *(float16 *)(vn + H1_2(j));
3844 mr = *(float16 *)(vm + H1_2(i));
3845 mi = *(float16 *)(vm + H1_2(j));
3846
3847 e2 = (flip ? ni : nr);
3848 e1 = (flip ? mi : mr) ^ neg_real;
3849 e4 = e2;
3850 e3 = (flip ? mr : mi) ^ neg_imag;
3851
3852 if (likely((pg >> (i & 63)) & 1)) {
3853 d = *(float16 *)(va + H1_2(i));
3854 d = float16_muladd(e2, e1, d, 0, &env->vfp.fp_status_f16);
3855 *(float16 *)(vd + H1_2(i)) = d;
3856 }
3857 if (likely((pg >> (j & 63)) & 1)) {
3858 d = *(float16 *)(va + H1_2(j));
3859 d = float16_muladd(e4, e3, d, 0, &env->vfp.fp_status_f16);
3860 *(float16 *)(vd + H1_2(j)) = d;
3861 }
3862 } while (i & 63);
3863 } while (i != 0);
3864}
3865
3866void HELPER(sve_fcmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3867{
3868 intptr_t j, i = simd_oprsz(desc);
3869 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3870 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3871 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3872 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3873 unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
3874 bool flip = rot & 1;
3875 float32 neg_imag, neg_real;
3876 void *vd = &env->vfp.zregs[rd];
3877 void *vn = &env->vfp.zregs[rn];
3878 void *vm = &env->vfp.zregs[rm];
3879 void *va = &env->vfp.zregs[ra];
3880 uint64_t *g = vg;
3881
3882 neg_imag = float32_set_sign(0, (rot & 2) != 0);
3883 neg_real = float32_set_sign(0, rot == 1 || rot == 2);
3884
3885 do {
3886 uint64_t pg = g[(i - 1) >> 6];
3887 do {
3888 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
3889
3890 /* I holds the real index; J holds the imag index. */
3891 j = i - sizeof(float32);
3892 i -= 2 * sizeof(float32);
3893
3894 nr = *(float32 *)(vn + H1_2(i));
3895 ni = *(float32 *)(vn + H1_2(j));
3896 mr = *(float32 *)(vm + H1_2(i));
3897 mi = *(float32 *)(vm + H1_2(j));
3898
3899 e2 = (flip ? ni : nr);
3900 e1 = (flip ? mi : mr) ^ neg_real;
3901 e4 = e2;
3902 e3 = (flip ? mr : mi) ^ neg_imag;
3903
3904 if (likely((pg >> (i & 63)) & 1)) {
3905 d = *(float32 *)(va + H1_2(i));
3906 d = float32_muladd(e2, e1, d, 0, &env->vfp.fp_status);
3907 *(float32 *)(vd + H1_2(i)) = d;
3908 }
3909 if (likely((pg >> (j & 63)) & 1)) {
3910 d = *(float32 *)(va + H1_2(j));
3911 d = float32_muladd(e4, e3, d, 0, &env->vfp.fp_status);
3912 *(float32 *)(vd + H1_2(j)) = d;
3913 }
3914 } while (i & 63);
3915 } while (i != 0);
3916}
3917
3918void HELPER(sve_fcmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3919{
3920 intptr_t j, i = simd_oprsz(desc);
3921 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3922 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3923 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3924 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3925 unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
3926 bool flip = rot & 1;
3927 float64 neg_imag, neg_real;
3928 void *vd = &env->vfp.zregs[rd];
3929 void *vn = &env->vfp.zregs[rn];
3930 void *vm = &env->vfp.zregs[rm];
3931 void *va = &env->vfp.zregs[ra];
3932 uint64_t *g = vg;
3933
3934 neg_imag = float64_set_sign(0, (rot & 2) != 0);
3935 neg_real = float64_set_sign(0, rot == 1 || rot == 2);
3936
3937 do {
3938 uint64_t pg = g[(i - 1) >> 6];
3939 do {
3940 float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
3941
3942 /* I holds the real index; J holds the imag index. */
3943 j = i - sizeof(float64);
3944 i -= 2 * sizeof(float64);
3945
3946 nr = *(float64 *)(vn + H1_2(i));
3947 ni = *(float64 *)(vn + H1_2(j));
3948 mr = *(float64 *)(vm + H1_2(i));
3949 mi = *(float64 *)(vm + H1_2(j));
3950
3951 e2 = (flip ? ni : nr);
3952 e1 = (flip ? mi : mr) ^ neg_real;
3953 e4 = e2;
3954 e3 = (flip ? mr : mi) ^ neg_imag;
3955
3956 if (likely((pg >> (i & 63)) & 1)) {
3957 d = *(float64 *)(va + H1_2(i));
3958 d = float64_muladd(e2, e1, d, 0, &env->vfp.fp_status);
3959 *(float64 *)(vd + H1_2(i)) = d;
3960 }
3961 if (likely((pg >> (j & 63)) & 1)) {
3962 d = *(float64 *)(va + H1_2(j));
3963 d = float64_muladd(e4, e3, d, 0, &env->vfp.fp_status);
3964 *(float64 *)(vd + H1_2(j)) = d;
3965 }
3966 } while (i & 63);
3967 } while (i != 0);
3968}
3969
3970/*
3971 * Load contiguous data, protected by a governing predicate.
3972 */
3973
3974/*
3975 * Load one element into @vd + @reg_off from @host.
3976 * The controlling predicate is known to be true.
3977 */
3978typedef void sve_ldst1_host_fn(void *vd, intptr_t reg_off, void *host);
3979
3980/*
3981 * Load one element into @vd + @reg_off from (@env, @vaddr, @ra).
3982 * The controlling predicate is known to be true.
3983 */
3984typedef void sve_ldst1_tlb_fn(CPUARMState *env, void *vd, intptr_t reg_off,
3985 target_ulong vaddr, uintptr_t retaddr);
3986
3987/*
3988 * Generate the above primitives.
3989 */
3990
3991#define DO_LD_HOST(NAME, H, TYPEE, TYPEM, HOST) \
3992static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
3993{ \
3994 TYPEM val = HOST(host); \
3995 *(TYPEE *)(vd + H(reg_off)) = val; \
3996}
3997
3998#define DO_LD_TLB(NAME, H, TYPEE, TYPEM, TLB) \
3999static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
4000 target_ulong addr, uintptr_t ra) \
4001{ \
4002 *(TYPEE *)(vd + H(reg_off)) = (TYPEM)TLB(env, addr, ra); \
4003}
4004
4005#define DO_ST_TLB(NAME, H, TYPEE, TYPEM, TLB) \
4006static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
4007 target_ulong addr, uintptr_t ra) \
4008{ \
4009 TLB(env, addr, (TYPEM)*(TYPEE *)(vd + H(reg_off)), ra); \
4010}
4011
4012#define DO_LD_PRIM_1(NAME, H, TE, TM) \
4013 DO_LD_HOST(NAME, H, TE, TM, ldub_p) \
4014 DO_LD_TLB(NAME, H, TE, TM, cpu_ldub_data_ra)
4015
4016DO_LD_PRIM_1(ld1bb, H1, uint8_t, uint8_t)
4017DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t)
4018DO_LD_PRIM_1(ld1bhs, H1_2, uint16_t, int8_t)
4019DO_LD_PRIM_1(ld1bsu, H1_4, uint32_t, uint8_t)
4020DO_LD_PRIM_1(ld1bss, H1_4, uint32_t, int8_t)
4021DO_LD_PRIM_1(ld1bdu, , uint64_t, uint8_t)
4022DO_LD_PRIM_1(ld1bds, , uint64_t, int8_t)
4023
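/*
 * For example, DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t) above
 * expands to sve_ld1bhu_host() and sve_ld1bhu_tlb(), each loading one
 * byte and zero-extending it into a 16-bit vector element.
 */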
4024#define DO_ST_PRIM_1(NAME, H, TE, TM) \
4025 DO_ST_TLB(st1##NAME, H, TE, TM, cpu_stb_data_ra)
4026
4027DO_ST_PRIM_1(bb, H1, uint8_t, uint8_t)
4028DO_ST_PRIM_1(bh, H1_2, uint16_t, uint8_t)
4029DO_ST_PRIM_1(bs, H1_4, uint32_t, uint8_t)
4030DO_ST_PRIM_1(bd, , uint64_t, uint8_t)
4031
4032#define DO_LD_PRIM_2(NAME, H, TE, TM, LD) \
4033 DO_LD_HOST(ld1##NAME##_be, H, TE, TM, LD##_be_p) \
4034 DO_LD_HOST(ld1##NAME##_le, H, TE, TM, LD##_le_p) \
4035 DO_LD_TLB(ld1##NAME##_be, H, TE, TM, cpu_##LD##_be_data_ra) \
4036 DO_LD_TLB(ld1##NAME##_le, H, TE, TM, cpu_##LD##_le_data_ra)
4037
4038#define DO_ST_PRIM_2(NAME, H, TE, TM, ST) \
4039 DO_ST_TLB(st1##NAME##_be, H, TE, TM, cpu_##ST##_be_data_ra) \
4040 DO_ST_TLB(st1##NAME##_le, H, TE, TM, cpu_##ST##_le_data_ra)
4041
4042DO_LD_PRIM_2(hh, H1_2, uint16_t, uint16_t, lduw)
4043DO_LD_PRIM_2(hsu, H1_4, uint32_t, uint16_t, lduw)
4044DO_LD_PRIM_2(hss, H1_4, uint32_t, int16_t, lduw)
4045DO_LD_PRIM_2(hdu, , uint64_t, uint16_t, lduw)
4046DO_LD_PRIM_2(hds, , uint64_t, int16_t, lduw)
4047
4048DO_ST_PRIM_2(hh, H1_2, uint16_t, uint16_t, stw)
4049DO_ST_PRIM_2(hs, H1_4, uint32_t, uint16_t, stw)
4050DO_ST_PRIM_2(hd, , uint64_t, uint16_t, stw)
4051
4052DO_LD_PRIM_2(ss, H1_4, uint32_t, uint32_t, ldl)
4053DO_LD_PRIM_2(sdu, , uint64_t, uint32_t, ldl)
4054DO_LD_PRIM_2(sds, , uint64_t, int32_t, ldl)
4055
4056DO_ST_PRIM_2(ss, H1_4, uint32_t, uint32_t, stl)
4057DO_ST_PRIM_2(sd, , uint64_t, uint32_t, stl)
4058
4059DO_LD_PRIM_2(dd, , uint64_t, uint64_t, ldq)
4060DO_ST_PRIM_2(dd, , uint64_t, uint64_t, stq)
4061
4062#undef DO_LD_TLB
4063#undef DO_ST_TLB
4064#undef DO_LD_HOST
4065#undef DO_LD_PRIM_1
4066#undef DO_ST_PRIM_1
4067#undef DO_LD_PRIM_2
4068#undef DO_ST_PRIM_2
4069
4070/*
4071 * Skip through a sequence of inactive elements in the guarding predicate @vg,
4072 * beginning at @reg_off bounded by @reg_max. Return the offset of the active
4073 * element >= @reg_off, or @reg_max if there were no active elements at all.
4074 */
4075static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
4076 intptr_t reg_max, int esz)
4077{
4078 uint64_t pg_mask = pred_esz_masks[esz];
4079 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
4080
4081 /* In normal usage, the first element is active. */
4082 if (likely(pg & 1)) {
4083 return reg_off;
4084 }
4085
4086 if (pg == 0) {
4087 reg_off &= -64;
4088 do {
4089 reg_off += 64;
4090 if (unlikely(reg_off >= reg_max)) {
4091 /* The entire predicate was false. */
4092 return reg_max;
4093 }
4094 pg = vg[reg_off >> 6] & pg_mask;
4095 } while (pg == 0);
4096 }
4097 reg_off += ctz64(pg);
4098
4099 /* We should never see an out of range predicate bit set. */
4100 tcg_debug_assert(reg_off < reg_max);
4101 return reg_off;
4102}
4103
4104/*
4105 * Return the maximum offset <= @mem_max which is still within the page
4106 * referenced by @base + @mem_off.
4107 */
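/*
 * Note that -(addr | TARGET_PAGE_MASK) below yields the number of
 * bytes from addr to the end of the page containing it.
 */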
4108static intptr_t max_for_page(target_ulong base, intptr_t mem_off,
4109 intptr_t mem_max)
4110{
4111 target_ulong addr = base + mem_off;
4112 intptr_t split = -(intptr_t)(addr | TARGET_PAGE_MASK);
4113 return MIN(split, mem_max - mem_off) + mem_off;
4114}
4115
4116/*
4117 * Resolve the guest virtual address to info->host and info->flags.
4118 * If @nofault, return false if the page is invalid, otherwise
4119 * exit via page fault exception.
4120 */
4121
4122typedef struct {
4123 void *host;
4124 int flags;
4125 MemTxAttrs attrs;
4126} SVEHostPage;
4127
4128static bool sve_probe_page(SVEHostPage *info, bool nofault,
4129 CPUARMState *env, target_ulong addr,
4130 int mem_off, MMUAccessType access_type,
4131 int mmu_idx, uintptr_t retaddr)
4132{
4133 int flags;
4134
4135 addr += mem_off;
4136 flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault,
4137 &info->host, retaddr);
4138 info->flags = flags;
4139
4140 if (flags & TLB_INVALID_MASK) {
4141 g_assert(nofault);
4142 return false;
4143 }
4144
4145 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
4146 info->host -= mem_off;
4147
4148#ifdef CONFIG_USER_ONLY
4149 memset(&info->attrs, 0, sizeof(info->attrs));
4150#else
4151 /*
4152 * Find the iotlbentry for addr and return the transaction attributes.
4153 * This *must* be present in the TLB because we just found the mapping.
4154 */
4155 {
4156 uintptr_t index = tlb_index(env, mmu_idx, addr);
4157
4158# ifdef CONFIG_DEBUG_TCG
4159 CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
4160 target_ulong comparator = (access_type == MMU_DATA_LOAD
4161 ? entry->addr_read
4162 : tlb_addr_write(entry));
4163 g_assert(tlb_hit(comparator, addr));
4164# endif
4165
4166 CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
4167 info->attrs = iotlbentry->attrs;
4168 }
4169#endif
4170
4171 return true;
4172}
4173
4174
4175/*
4176 * Analyse contiguous data, protected by a governing predicate.
4177 */
4178
4179typedef enum {
4180 FAULT_NO,
4181 FAULT_FIRST,
4182 FAULT_ALL,
4183} SVEContFault;
4184
4185typedef struct {
4186 /*
4187 * First and last element wholly contained within the two pages.
4188 * mem_off_first[0] and reg_off_first[0] are always set >= 0.
4189 * reg_off_last[0] may be < 0 if the first element crosses pages.
4190 * All of mem_off_first[1], reg_off_first[1] and reg_off_last[1]
4191 * are set >= 0 only if there are complete elements on a second page.
4192 *
4193 * The reg_off_* offsets are relative to the internal vector register.
4194 * The mem_off_first offset is relative to the memory address; the
4195 * two offsets are different when a load operation extends, a store
4196 * operation truncates, or for multi-register operations.
4197 */
4198 int16_t mem_off_first[2];
4199 int16_t reg_off_first[2];
4200 int16_t reg_off_last[2];
4201
4202 /*
4203 * One element that is misaligned and spans both pages,
4204 * or -1 if there is no such active element.
4205 */
4206 int16_t mem_off_split;
4207 int16_t reg_off_split;
4208
4209 /*
4210 * The byte offset at which the entire operation crosses a page boundary.
4211 * Set >= 0 if and only if the entire operation spans two pages.
4212 */
4213 int16_t page_split;
4214
4215 /* TLB data for the two pages. */
4216 SVEHostPage page[2];
4217} SVEContLdSt;
4218
4219/*
4220 * Find first active element on each page, and a loose bound for the
4221 * final element on each page. Identify any single element that spans
4222 * the page boundary. Return true if there are any active elements.
4223 */
4224static bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr,
4225 uint64_t *vg, intptr_t reg_max,
4226 int esz, int msize)
4227{
4228 const int esize = 1 << esz;
4229 const uint64_t pg_mask = pred_esz_masks[esz];
4230 intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
4231 intptr_t mem_off_last, mem_off_split;
4232 intptr_t page_split, elt_split;
4233 intptr_t i;
4234
4235 /* Set all of the element indices to -1, and the TLB data to 0. */
4236 memset(info, -1, offsetof(SVEContLdSt, page));
4237 memset(info->page, 0, sizeof(info->page));
4238
4239 /* Gross scan over the entire predicate to find bounds. */
4240 i = 0;
4241 do {
4242 uint64_t pg = vg[i] & pg_mask;
4243 if (pg) {
4244 reg_off_last = i * 64 + 63 - clz64(pg);
4245 if (reg_off_first < 0) {
4246 reg_off_first = i * 64 + ctz64(pg);
4247 }
4248 }
4249 } while (++i * 64 < reg_max);
4250
4251 if (unlikely(reg_off_first < 0)) {
4252 /* No active elements, no pages touched. */
4253 return false;
4254 }
4255 tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);
4256
4257 info->reg_off_first[0] = reg_off_first;
4258 info->mem_off_first[0] = (reg_off_first >> esz) * msize;
4259 mem_off_last = (reg_off_last >> esz) * msize;
4260
4261 page_split = -(addr | TARGET_PAGE_MASK);
4262 if (likely(mem_off_last + msize <= page_split)) {
4263 /* The entire operation fits within a single page. */
4264 info->reg_off_last[0] = reg_off_last;
4265 return true;
4266 }
4267
4268 info->page_split = page_split;
4269 elt_split = page_split / msize;
4270 reg_off_split = elt_split << esz;
4271 mem_off_split = elt_split * msize;
4272
4273 /*
4274 * This is the last full element on the first page, but it is not
4275 * necessarily active. If there is no full element, i.e. the first
4276 * active element is the one that's split, this value remains -1.
4277 * It is useful as iteration bounds.
4278 */
4279 if (elt_split != 0) {
4280 info->reg_off_last[0] = reg_off_split - esize;
4281 }
4282
4283 /* Determine if an unaligned element spans the pages. */
4284 if (page_split % msize != 0) {
4285 /* It is helpful to know if the split element is active. */
4286 if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
4287 info->reg_off_split = reg_off_split;
4288 info->mem_off_split = mem_off_split;
4289
4290 if (reg_off_split == reg_off_last) {
4291 /* The page crossing element is last. */
4292 return true;
4293 }
4294 }
4295 reg_off_split += esize;
4296 mem_off_split += msize;
4297 }
4298
4299 /*
4300 * We do want the first active element on the second page, because
4301 * this may affect the address reported in an exception.
4302 */
4303 reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
4304 tcg_debug_assert(reg_off_split <= reg_off_last);
4305 info->reg_off_first[1] = reg_off_split;
4306 info->mem_off_first[1] = (reg_off_split >> esz) * msize;
4307 info->reg_off_last[1] = reg_off_last;
4308 return true;
4309}
4310
4311/*
4312 * Resolve the guest virtual addresses to info->page[].
4313 * Control the generation of page faults with @fault. Return false if
4314 * there is no work to do, which can only happen with @fault == FAULT_NO.
4315 */
4316static bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
4317 CPUARMState *env, target_ulong addr,
4318 MMUAccessType access_type, uintptr_t retaddr)
4319{
4320 int mmu_idx = cpu_mmu_index(env, false);
4321 int mem_off = info->mem_off_first[0];
4322 bool nofault = fault == FAULT_NO;
4323 bool have_work = true;
4324
4325 if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
4326 access_type, mmu_idx, retaddr)) {
4327 /* No work to be done. */
4328 return false;
4329 }
4330
4331 if (likely(info->page_split < 0)) {
4332 /* The entire operation was on the one page. */
4333 return true;
4334 }
4335
4336 /*
4337 * If the second page is invalid, then we want the fault address to be
4338 * the first byte on that page which is accessed.
4339 */
4340 if (info->mem_off_split >= 0) {
4341 /*
4342 * There is an element split across the pages. The fault address
4343 * should be the first byte of the second page.
4344 */
4345 mem_off = info->page_split;
4346 /*
4347 * If the split element is also the first active element
4348 * of the vector, then: For first-fault we should continue
4349 * to generate faults for the second page. For no-fault,
4350 * we have work only if the second page is valid.
4351 */
4352 if (info->mem_off_first[0] < info->mem_off_split) {
4353 nofault = FAULT_FIRST;
4354 have_work = false;
4355 }
4356 } else {
4357 /*
4358 * There is no element split across the pages. The fault address
4359 * should be the first active element on the second page.
4360 */
4361 mem_off = info->mem_off_first[1];
4362 /*
4363 * There must have been one active element on the first page,
4364 * so we're out of first-fault territory.
4365 */
4366 nofault = fault != FAULT_ALL;
4367 }
4368
4369 have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
4370 access_type, mmu_idx, retaddr);
4371 return have_work;
4372}
4373
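/*
 * Check watchpoints for all active elements before any data is loaded
 * or stored, then clear TLB_WATCHPOINT from the page flags so that the
 * main load/store loops can treat the pages as ordinary RAM.
 */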
4374static void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
4375 uint64_t *vg, target_ulong addr,
4376 int esize, int msize, int wp_access,
4377 uintptr_t retaddr)
4378{
4379#ifndef CONFIG_USER_ONLY
4380 intptr_t mem_off, reg_off, reg_last;
4381 int flags0 = info->page[0].flags;
4382 int flags1 = info->page[1].flags;
4383
4384 if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
4385 return;
4386 }
4387
4388 /* Indicate that watchpoints are handled. */
4389 info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
4390 info->page[1].flags = flags1 & ~TLB_WATCHPOINT;
4391
4392 if (flags0 & TLB_WATCHPOINT) {
4393 mem_off = info->mem_off_first[0];
4394 reg_off = info->reg_off_first[0];
4395 reg_last = info->reg_off_last[0];
4396
4397 while (reg_off <= reg_last) {
4398 uint64_t pg = vg[reg_off >> 6];
4399 do {
4400 if ((pg >> (reg_off & 63)) & 1) {
4401 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
4402 msize, info->page[0].attrs,
4403 wp_access, retaddr);
4404 }
4405 reg_off += esize;
4406 mem_off += msize;
4407 } while (reg_off <= reg_last && (reg_off & 63));
4408 }
4409 }
4410
4411 mem_off = info->mem_off_split;
4412 if (mem_off >= 0) {
4413 cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
4414 info->page[0].attrs, wp_access, retaddr);
4415 }
4416
4417 mem_off = info->mem_off_first[1];
4418 if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
4419 reg_off = info->reg_off_first[1];
4420 reg_last = info->reg_off_last[1];
4421
4422 do {
4423 uint64_t pg = vg[reg_off >> 6];
4424 do {
4425 if ((pg >> (reg_off & 63)) & 1) {
4426 cpu_check_watchpoint(env_cpu(env), addr + mem_off,
4427 msize, info->page[1].attrs,
4428 wp_access, retaddr);
4429 }
4430 reg_off += esize;
4431 mem_off += msize;
4432 } while (reg_off & 63);
4433 } while (reg_off <= reg_last);
4434 }
4435#endif
4436}
4437
4438/*
4439 * The result of tlb_vaddr_to_host for user-only is just g2h(x),
4440 * which is always non-null. Elide the useless test.
4441 */
4442static inline bool test_host_page(void *host)
4443{
4444#ifdef CONFIG_USER_ONLY
4445 return true;
4446#else
4447 return likely(host != NULL);
4448#endif
4449}
4450
4451/*
4452 * Common helper for all contiguous one-register predicated loads.
4453 */
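/*
 * The work is done in phases: find the active elements, probe the
 * page(s), check watchpoints for the whole access up front, then
 * either fall back to a per-element TLB loop when any page needs
 * special handling (e.g. MMIO) or copy directly from host memory
 * for the common all-RAM case.
 */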
4454static inline QEMU_ALWAYS_INLINE
4455void sve_ld1_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
4456 uint32_t desc, const uintptr_t retaddr,
4457 const int esz, const int msz,
4458 sve_ldst1_host_fn *host_fn,
4459 sve_ldst1_tlb_fn *tlb_fn)
4460{
4461 const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
4462 void *vd = &env->vfp.zregs[rd];
4463 const intptr_t reg_max = simd_oprsz(desc);
4464 intptr_t reg_off, reg_last, mem_off;
4465 SVEContLdSt info;
4466 void *host;
4467 int flags;
4468
4469 /* Find the active elements. */
4470 if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
4471 /* The entire predicate was false; no load occurs. */
4472 memset(vd, 0, reg_max);
4473 return;
4474 }
4475
4476 /* Probe the page(s). Exit with exception for any invalid page. */
4477 sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);
4478
4479 /* Handle watchpoints for all active elements. */
4480 sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, 1 << msz,
4481 BP_MEM_READ, retaddr);
4482
4483 /* TODO: MTE check. */
4484
4485 flags = info.page[0].flags | info.page[1].flags;
4486 if (unlikely(flags != 0)) {
4487#ifdef CONFIG_USER_ONLY
4488 g_assert_not_reached();
4489#else
4490 /*
4491 * At least one page includes MMIO.
4492 * Any bus operation can fail with cpu_transaction_failed,
4493 * which for ARM will raise SyncExternal. Perform the load
4494 * into scratch memory to preserve register state until the end.
4495 */
4496 ARMVectorReg scratch;
4497
4498 memset(&scratch, 0, reg_max);
4499 mem_off = info.mem_off_first[0];
4500 reg_off = info.reg_off_first[0];
4501 reg_last = info.reg_off_last[1];
4502 if (reg_last < 0) {
4503 reg_last = info.reg_off_split;
4504 if (reg_last < 0) {
4505 reg_last = info.reg_off_last[0];
4506 }
4507 }
4508
4509 do {
4510 uint64_t pg = vg[reg_off >> 6];
4511 do {
4512 if ((pg >> (reg_off & 63)) & 1) {
4513 tlb_fn(env, &scratch, reg_off, addr + mem_off, retaddr);
4514 }
4515 reg_off += 1 << esz;
4516 mem_off += 1 << msz;
4517 } while (reg_off & 63);
4518 } while (reg_off <= reg_last);
4519
4520 memcpy(vd, &scratch, reg_max);
4521 return;
4522#endif
4523 }
4524
4525 /* The entire operation is in RAM, on valid pages. */
4526
4527 memset(vd, 0, reg_max);
4528 mem_off = info.mem_off_first[0];
4529 reg_off = info.reg_off_first[0];
4530 reg_last = info.reg_off_last[0];
4531 host = info.page[0].host;
4532
4533 while (reg_off <= reg_last) {
4534 uint64_t pg = vg[reg_off >> 6];
4535 do {
4536 if ((pg >> (reg_off & 63)) & 1) {
4537 host_fn(vd, reg_off, host + mem_off);
4538 }
4539 reg_off += 1 << esz;
4540 mem_off += 1 << msz;
4541 } while (reg_off <= reg_last && (reg_off & 63));
4542 }
4543
4544 /*
4545 * Use the slow path to manage the cross-page misalignment.
4546 * But we know this is RAM and cannot trap.
4547 */
4548 mem_off = info.mem_off_split;
4549 if (unlikely(mem_off >= 0)) {
4550 tlb_fn(env, vd, info.reg_off_split, addr + mem_off, retaddr);
4551 }
4552
4553 mem_off = info.mem_off_first[1];
4554 if (unlikely(mem_off >= 0)) {
4555 reg_off = info.reg_off_first[1];
4556 reg_last = info.reg_off_last[1];
4557 host = info.page[1].host;
4558
4559 do {
4560 uint64_t pg = vg[reg_off >> 6];
4561 do {
4562 if ((pg >> (reg_off & 63)) & 1) {
4563 host_fn(vd, reg_off, host + mem_off);
4564 }
4565 reg_off += 1 << esz;
4566 mem_off += 1 << msz;
4567 } while (reg_off & 63);
4568 } while (reg_off <= reg_last);
4569 }
4570}
4571
4572#define DO_LD1_1(NAME, ESZ) \
4573void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \
4574 target_ulong addr, uint32_t desc) \
4575{ \
4576 sve_ld1_r(env, vg, addr, desc, GETPC(), ESZ, 0, \
4577 sve_##NAME##_host, sve_##NAME##_tlb); \
4578}
4579
4580#define DO_LD1_2(NAME, ESZ, MSZ) \
4581void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \
4582 target_ulong addr, uint32_t desc) \
4583{ \
4584 sve_ld1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, \
4585 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
4586} \
4587void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \
4588 target_ulong addr, uint32_t desc) \
4589{ \
4590 sve_ld1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, \
4591 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
4592}
4593
4594DO_LD1_1(ld1bb, 0)
4595DO_LD1_1(ld1bhu, 1)
4596DO_LD1_1(ld1bhs, 1)
4597DO_LD1_1(ld1bsu, 2)
4598DO_LD1_1(ld1bss, 2)
4599DO_LD1_1(ld1bdu, 3)
4600DO_LD1_1(ld1bds, 3)
4601
4602DO_LD1_2(ld1hh, 1, 1)
4603DO_LD1_2(ld1hsu, 2, 1)
4604DO_LD1_2(ld1hss, 2, 1)
4605DO_LD1_2(ld1hdu, 3, 1)
4606DO_LD1_2(ld1hds, 3, 1)
4607
4608DO_LD1_2(ld1ss, 2, 2)
4609DO_LD1_2(ld1sdu, 3, 2)
4610DO_LD1_2(ld1sds, 3, 2)
4611
4612DO_LD1_2(ld1dd, 3, 3)
4613
4614#undef DO_LD1_1
4615#undef DO_LD1_2
4616
4617/*
4618 * Common helpers for all contiguous 2,3,4-register predicated loads.
4619 */
4620static void sve_ld2_r(CPUARMState *env, void *vg, target_ulong addr,
4621 uint32_t desc, int size, uintptr_t ra,
4622 sve_ldst1_tlb_fn *tlb_fn)
4623{
4624 const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
4625 intptr_t i, oprsz = simd_oprsz(desc);
4626 ARMVectorReg scratch[2] = { };
4627
4628 for (i = 0; i < oprsz; ) {
4629 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4630 do {
4631 if (pg & 1) {
4632 tlb_fn(env, &scratch[0], i, addr, ra);
4633 tlb_fn(env, &scratch[1], i, addr + size, ra);
4634 }
4635 i += size, pg >>= size;
4636 addr += 2 * size;
4637 } while (i & 15);
4638 }
4639
4640 /* Wait until all exceptions have been raised to write back. */
4641 memcpy(&env->vfp.zregs[rd], &scratch[0], oprsz);
4642 memcpy(&env->vfp.zregs[(rd + 1) & 31], &scratch[1], oprsz);
4643}
4644
4645static void sve_ld3_r(CPUARMState *env, void *vg, target_ulong addr,
4646 uint32_t desc, int size, uintptr_t ra,
4647 sve_ldst1_tlb_fn *tlb_fn)
4648{
4649 const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
4650 intptr_t i, oprsz = simd_oprsz(desc);
4651 ARMVectorReg scratch[3] = { };
4652
4653 for (i = 0; i < oprsz; ) {
4654 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4655 do {
4656 if (pg & 1) {
4657 tlb_fn(env, &scratch[0], i, addr, ra);
4658 tlb_fn(env, &scratch[1], i, addr + size, ra);
4659 tlb_fn(env, &scratch[2], i, addr + 2 * size, ra);
4660 }
4661 i += size, pg >>= size;
4662 addr += 3 * size;
4663 } while (i & 15);
4664 }
4665
4666 /* Wait until all exceptions have been raised to write back. */
4667 memcpy(&env->vfp.zregs[rd], &scratch[0], oprsz);
4668 memcpy(&env->vfp.zregs[(rd + 1) & 31], &scratch[1], oprsz);
4669 memcpy(&env->vfp.zregs[(rd + 2) & 31], &scratch[2], oprsz);
4670}
4671
4672static void sve_ld4_r(CPUARMState *env, void *vg, target_ulong addr,
4673 uint32_t desc, int size, uintptr_t ra,
4674 sve_ldst1_tlb_fn *tlb_fn)
4675{
4676 const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
4677 intptr_t i, oprsz = simd_oprsz(desc);
4678 ARMVectorReg scratch[4] = { };
4679
4680 for (i = 0; i < oprsz; ) {
4681 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4682 do {
4683 if (pg & 1) {
4684 tlb_fn(env, &scratch[0], i, addr, ra);
4685 tlb_fn(env, &scratch[1], i, addr + size, ra);
4686 tlb_fn(env, &scratch[2], i, addr + 2 * size, ra);
4687 tlb_fn(env, &scratch[3], i, addr + 3 * size, ra);
4688 }
4689 i += size, pg >>= size;
4690 addr += 4 * size;
4691 } while (i & 15);
4692 }
4693
4694 /* Wait until all exceptions have been raised to write back. */
4695 memcpy(&env->vfp.zregs[rd], &scratch[0], oprsz);
4696 memcpy(&env->vfp.zregs[(rd + 1) & 31], &scratch[1], oprsz);
4697 memcpy(&env->vfp.zregs[(rd + 2) & 31], &scratch[2], oprsz);
4698 memcpy(&env->vfp.zregs[(rd + 3) & 31], &scratch[3], oprsz);
4699}
4700
4701#define DO_LDN_1(N) \
4702void QEMU_FLATTEN HELPER(sve_ld##N##bb_r) \
4703 (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
4704{ \
4705 sve_ld##N##_r(env, vg, addr, desc, 1, GETPC(), sve_ld1bb_tlb); \
4706}
4707
4708#define DO_LDN_2(N, SUFF, SIZE) \
4709void QEMU_FLATTEN HELPER(sve_ld##N##SUFF##_le_r) \
4710 (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
4711{ \
4712 sve_ld##N##_r(env, vg, addr, desc, SIZE, GETPC(), \
4713 sve_ld1##SUFF##_le_tlb); \
4714} \
4715void QEMU_FLATTEN HELPER(sve_ld##N##SUFF##_be_r) \
4716 (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
4717{ \
4718 sve_ld##N##_r(env, vg, addr, desc, SIZE, GETPC(), \
4719 sve_ld1##SUFF##_be_tlb); \
4720}
4721
4722DO_LDN_1(2)
4723DO_LDN_1(3)
4724DO_LDN_1(4)
4725
4726DO_LDN_2(2, hh, 2)
4727DO_LDN_2(3, hh, 2)
4728DO_LDN_2(4, hh, 2)
4729
4730DO_LDN_2(2, ss, 4)
4731DO_LDN_2(3, ss, 4)
4732DO_LDN_2(4, ss, 4)
4733
4734DO_LDN_2(2, dd, 8)
4735DO_LDN_2(3, dd, 8)
4736DO_LDN_2(4, dd, 8)
4737
4738#undef DO_LDN_1
4739#undef DO_LDN_2
4740
4741/*
4742 * Load contiguous data, first-fault and no-fault.
4743 *
4744 * For user-only, one could argue that we should hold the mmap_lock during
4745 * the operation so that there is no race between page_check_range and the
4746 * load operation. However, unmapping pages out from under a running thread
4747 * is extraordinarily unlikely. This theoretical race condition also affects
4748 * linux-user/ in its get_user/put_user macros.
4749 *
4750 * TODO: Construct some helpers, written in assembly, that interact with
4751 * handle_cpu_signal to produce memory ops which can properly report errors
4752 * without racing.
4753 */
4754
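/*
 * A first-fault load may fault only on the first active element; a
 * no-fault load may not fault at all. Elements that cannot be loaded
 * instead clear the corresponding FFR bits via record_fault() below.
 */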
4755/* Fault on byte I. All bits in FFR from I are cleared. The vector
4756 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
4757 * option, which leaves subsequent data unchanged.
4758 */
4759static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
4760{
4761 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
4762
4763 if (i & 63) {
4764 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
4765 i = ROUND_UP(i, 64);
4766 }
4767 for (; i < oprsz; i += 64) {
4768 ffr[i / 64] = 0;
4769 }
4770}
4771
4772/*
4773 * Common helper for all contiguous first-fault loads.
4774 */
4775static void sve_ldff1_r(CPUARMState *env, void *vg, const target_ulong addr,
4776 uint32_t desc, const uintptr_t retaddr,
4777 const int esz, const int msz,
4778 sve_ldst1_host_fn *host_fn,
4779 sve_ldst1_tlb_fn *tlb_fn)
4780{
4781 const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
4782 const int mmu_idx = get_mmuidx(oi);
4783 const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
4784 void *vd = &env->vfp.zregs[rd];
4785 const int diffsz = esz - msz;
4786 const intptr_t reg_max = simd_oprsz(desc);
4787 const intptr_t mem_max = reg_max >> diffsz;
4788 intptr_t split, reg_off, mem_off, i;
4789 void *host;
4790
4791 /* Skip to the first active element. */
4792 reg_off = find_next_active(vg, 0, reg_max, esz);
4793 if (unlikely(reg_off == reg_max)) {
4794 /* The entire predicate was false; no load occurs. */
4795 memset(vd, 0, reg_max);
4796 return;
4797 }
4798 mem_off = reg_off >> diffsz;
4799
4800 /*
4801 * If the (remaining) load is entirely within a single page, then:
4802 * For softmmu, and the tlb hits, then no faults will occur;
4803 * For user-only, either the first load will fault or none will.
4804 * We can thus perform the load directly to the destination and
4805 * Vd will be unmodified on any exception path.
4806 */
4807 split = max_for_page(addr, mem_off, mem_max);
4808 if (likely(split == mem_max)) {
4809 host = tlb_vaddr_to_host(env, addr + mem_off, MMU_DATA_LOAD, mmu_idx);
4810 if (test_host_page(host)) {
4811 i = reg_off;
4812 host -= mem_off;
4813 do {
4814 host_fn(vd, i, host + (i >> diffsz));
4815 i = find_next_active(vg, i + (1 << esz), reg_max, esz);
4816 } while (i < reg_max);
4817 /* After any fault, zero any leading inactive elements. */
4818 swap_memzero(vd, reg_off);
4819 return;
4820 }
4821 }
4822
4823 /*
4824 * Perform one normal read, which will fault or not.
4825 * But it is likely to bring the page into the tlb.
4826 */
4827 tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
4828
4829 /* After any fault, zero any leading predicated false elts. */
4830 swap_memzero(vd, reg_off);
4831 mem_off += 1 << msz;
4832 reg_off += 1 << esz;
4833
4834 /* Try again to read the balance of the page. */
4835 split = max_for_page(addr, mem_off - 1, mem_max);
4836 if (split >= (1 << msz)) {
4837 host = tlb_vaddr_to_host(env, addr + mem_off, MMU_DATA_LOAD, mmu_idx);
4838 if (host) {
4839 host -= mem_off;
4840 do {
4841 host_fn(vd, reg_off, host + mem_off);
4842 reg_off += 1 << esz;
4843 reg_off = find_next_active(vg, reg_off, reg_max, esz);
4844 mem_off = reg_off >> diffsz;
4845 } while (split - mem_off >= (1 << msz));
4846 }
4847 }
4848
4849 record_fault(env, reg_off, reg_max);
4850}
4851
4852/*
4853 * Common helper for all contiguous no-fault loads.
4854 */
4855static void sve_ldnf1_r(CPUARMState *env, void *vg, const target_ulong addr,
4856 uint32_t desc, const int esz, const int msz,
4857 sve_ldst1_host_fn *host_fn)
4858{
4859 const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
4860 void *vd = &env->vfp.zregs[rd];
4861 const int diffsz = esz - msz;
4862 const intptr_t reg_max = simd_oprsz(desc);
4863 const intptr_t mem_max = reg_max >> diffsz;
4864 const int mmu_idx = cpu_mmu_index(env, false);
4865 intptr_t split, reg_off, mem_off;
4866 void *host;
4867
4868#ifdef CONFIG_USER_ONLY
4869 host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD, mmu_idx);
4870 if (likely(page_check_range(addr, mem_max, PAGE_READ) == 0)) {
4871 /* The entire operation is valid and will not fault. */
4872 reg_off = 0;
4873 do {
4874 mem_off = reg_off >> diffsz;
4875 host_fn(vd, reg_off, host + mem_off);
4876 reg_off += 1 << esz;
4877 reg_off = find_next_active(vg, reg_off, reg_max, esz);
4878 } while (reg_off < reg_max);
4879 return;
4880 }
4881#endif
4882
4883 /* There will be no fault, so we may modify in advance. */
4884 memset(vd, 0, reg_max);
4885
4886 /* Skip to the first active element. */
4887 reg_off = find_next_active(vg, 0, reg_max, esz);
4888 if (unlikely(reg_off == reg_max)) {
4889 /* The entire predicate was false; no load occurs. */
4890 return;
4891 }
4892 mem_off = reg_off >> diffsz;
4893
4894#ifdef CONFIG_USER_ONLY
4895 if (page_check_range(addr + mem_off, 1 << msz, PAGE_READ) == 0) {
4896 /* At least one load is valid; take the rest of the page. */
4897 split = max_for_page(addr, mem_off + (1 << msz) - 1, mem_max);
4898 do {
4899 host_fn(vd, reg_off, host + mem_off);
4900 reg_off += 1 << esz;
4901 reg_off = find_next_active(vg, reg_off, reg_max, esz);
4902 mem_off = reg_off >> diffsz;
4903 } while (split - mem_off >= (1 << msz));
4904 }
4905#else
4906 /*
4907 * If the address is not in the TLB, we have no way to bring the
4908 * entry into the TLB without also risking a fault. Note that
4909 * the corollary is that we never load from an address not in RAM.
4910 *
4911 * This last is out of spec, in a weird corner case.
4912 * Per the MemNF/MemSingleNF pseudocode, a NF load from Device memory
4913 * must not actually hit the bus -- it returns UNKNOWN data instead.
4914 * But if you map non-RAM with Normal memory attributes and do a NF
4915 * load then it should access the bus. (Nobody ought actually do this
4916 * in the real world, obviously.)
4917 *
4918 * Then there are the annoying special cases with watchpoints...
4919 * TODO: Add a form of non-faulting loads using cc->tlb_fill(probe=true).
4920 */
4921 host = tlb_vaddr_to_host(env, addr + mem_off, MMU_DATA_LOAD, mmu_idx);
4922 split = max_for_page(addr, mem_off, mem_max);
4923 if (host && split >= (1 << msz)) {
4924 host -= mem_off;
4925 do {
4926 host_fn(vd, reg_off, host + mem_off);
4927 reg_off += 1 << esz;
4928 reg_off = find_next_active(vg, reg_off, reg_max, esz);
4929 mem_off = reg_off >> diffsz;
4930 } while (split - mem_off >= (1 << msz));
4931 }
4932#endif
4933
4934 record_fault(env, reg_off, reg_max);
4935}
4936
4937#define DO_LDFF1_LDNF1_1(PART, ESZ) \
4938void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \
4939 target_ulong addr, uint32_t desc) \
4940{ \
4941 sve_ldff1_r(env, vg, addr, desc, GETPC(), ESZ, 0, \
4942 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
4943} \
4944void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \
4945 target_ulong addr, uint32_t desc) \
4946{ \
4947 sve_ldnf1_r(env, vg, addr, desc, ESZ, 0, sve_ld1##PART##_host); \
4948}
4949
4950#define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
4951void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \
4952 target_ulong addr, uint32_t desc) \
4953{ \
4954 sve_ldff1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, \
4955 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
4956} \
4957void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \
4958 target_ulong addr, uint32_t desc) \
4959{ \
4960 sve_ldnf1_r(env, vg, addr, desc, ESZ, MSZ, sve_ld1##PART##_le_host); \
4961} \
4962void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \
4963 target_ulong addr, uint32_t desc) \
4964{ \
4965 sve_ldff1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, \
4966 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
4967} \
4968void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \
4969 target_ulong addr, uint32_t desc) \
4970{ \
4971 sve_ldnf1_r(env, vg, addr, desc, ESZ, MSZ, sve_ld1##PART##_be_host); \
4972}
4973
4974DO_LDFF1_LDNF1_1(bb, 0)
4975DO_LDFF1_LDNF1_1(bhu, 1)
4976DO_LDFF1_LDNF1_1(bhs, 1)
4977DO_LDFF1_LDNF1_1(bsu, 2)
4978DO_LDFF1_LDNF1_1(bss, 2)
4979DO_LDFF1_LDNF1_1(bdu, 3)
4980DO_LDFF1_LDNF1_1(bds, 3)
e2654d75 4981
4982DO_LDFF1_LDNF1_2(hh, 1, 1)
4983DO_LDFF1_LDNF1_2(hsu, 2, 1)
4984DO_LDFF1_LDNF1_2(hss, 2, 1)
4985DO_LDFF1_LDNF1_2(hdu, 3, 1)
4986DO_LDFF1_LDNF1_2(hds, 3, 1)
e2654d75 4987
4988DO_LDFF1_LDNF1_2(ss, 2, 2)
4989DO_LDFF1_LDNF1_2(sdu, 3, 2)
4990DO_LDFF1_LDNF1_2(sds, 3, 2)
e2654d75 4991
9123aeb6 4992DO_LDFF1_LDNF1_2(dd, 3, 3)
e2654d75 4993
4994#undef DO_LDFF1_LDNF1_1
4995#undef DO_LDFF1_LDNF1_2
1a039c7e 4996
4997/*
4998 * Common helpers for all contiguous 1,2,3,4-register predicated stores.
4999 */
5000static void sve_st1_r(CPUARMState *env, void *vg, target_ulong addr,
5001 uint32_t desc, const uintptr_t ra,
5002 const int esize, const int msize,
5003 sve_ldst1_tlb_fn *tlb_fn)
5004{
5005 const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
5006 intptr_t i, oprsz = simd_oprsz(desc);
5007 void *vd = &env->vfp.zregs[rd];
5008
5009 for (i = 0; i < oprsz; ) {
5010 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
5011 do {
5012 if (pg & 1) {
5013 tlb_fn(env, vd, i, addr, ra);
5014 }
5015 i += esize, pg >>= esize;
5016 addr += msize;
5017 } while (i & 15);
5018 }
5019}
5020
5021static void sve_st2_r(CPUARMState *env, void *vg, target_ulong addr,
5022 uint32_t desc, const uintptr_t ra,
5023 const int esize, const int msize,
5024 sve_ldst1_tlb_fn *tlb_fn)
5025{
5026 const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
5027 intptr_t i, oprsz = simd_oprsz(desc);
5028 void *d1 = &env->vfp.zregs[rd];
5029 void *d2 = &env->vfp.zregs[(rd + 1) & 31];
5030
5031 for (i = 0; i < oprsz; ) {
5032 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
5033 do {
5034 if (pg & 1) {
5035 tlb_fn(env, d1, i, addr, ra);
5036 tlb_fn(env, d2, i, addr + msize, ra);
5037 }
5038 i += esize, pg >>= esize;
5039 addr += 2 * msize;
5040 } while (i & 15);
5041 }
5042}
5043
5044static void sve_st3_r(CPUARMState *env, void *vg, target_ulong addr,
5045 uint32_t desc, const uintptr_t ra,
5046 const int esize, const int msize,
5047 sve_ldst1_tlb_fn *tlb_fn)
5048{
5049 const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
5050 intptr_t i, oprsz = simd_oprsz(desc);
5051 void *d1 = &env->vfp.zregs[rd];
5052 void *d2 = &env->vfp.zregs[(rd + 1) & 31];
5053 void *d3 = &env->vfp.zregs[(rd + 2) & 31];
5054
5055 for (i = 0; i < oprsz; ) {
5056 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
5057 do {
5058 if (pg & 1) {
5059 tlb_fn(env, d1, i, addr, ra);
5060 tlb_fn(env, d2, i, addr + msize, ra);
5061 tlb_fn(env, d3, i, addr + 2 * msize, ra);
5062 }
5063 i += esize, pg >>= esize;
5064 addr += 3 * msize;
5065 } while (i & 15);
5066 }
5067}
5068
5069static void sve_st4_r(CPUARMState *env, void *vg, target_ulong addr,
5070 uint32_t desc, const uintptr_t ra,
5071 const int esize, const int msize,
5072 sve_ldst1_tlb_fn *tlb_fn)
5073{
5074 const unsigned rd = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 5);
5075 intptr_t i, oprsz = simd_oprsz(desc);
5076 void *d1 = &env->vfp.zregs[rd];
5077 void *d2 = &env->vfp.zregs[(rd + 1) & 31];
5078 void *d3 = &env->vfp.zregs[(rd + 2) & 31];
5079 void *d4 = &env->vfp.zregs[(rd + 3) & 31];
5080
5081 for (i = 0; i < oprsz; ) {
5082 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
5083 do {
5084 if (pg & 1) {
5085 tlb_fn(env, d1, i, addr, ra);
5086 tlb_fn(env, d2, i, addr + msize, ra);
5087 tlb_fn(env, d3, i, addr + 2 * msize, ra);
5088 tlb_fn(env, d4, i, addr + 3 * msize, ra);
5089 }
5090 i += esize, pg >>= esize;
5091 addr += 4 * msize;
5092 } while (i & 15);
5093 }
5094}
5095
5096#define DO_STN_1(N, NAME, ESIZE) \
5097void QEMU_FLATTEN HELPER(sve_st##N##NAME##_r) \
5098 (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
5099{ \
5100 sve_st##N##_r(env, vg, addr, desc, GETPC(), ESIZE, 1, \
5101 sve_st1##NAME##_tlb); \
5102}
5103
9fd46c83 5104#define DO_STN_2(N, NAME, ESIZE, MSIZE) \
5105void QEMU_FLATTEN HELPER(sve_st##N##NAME##_le_r) \
5106 (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
5107{ \
5108 sve_st##N##_r(env, vg, addr, desc, GETPC(), ESIZE, MSIZE, \
5109 sve_st1##NAME##_le_tlb); \
5110} \
5111void QEMU_FLATTEN HELPER(sve_st##N##NAME##_be_r) \
5112 (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
5113{ \
5114 sve_st##N##_r(env, vg, addr, desc, GETPC(), ESIZE, MSIZE, \
5115 sve_st1##NAME##_be_tlb); \
5116}
5117
5118DO_STN_1(1, bb, 1)
5119DO_STN_1(1, bh, 2)
5120DO_STN_1(1, bs, 4)
5121DO_STN_1(1, bd, 8)
5122DO_STN_1(2, bb, 1)
5123DO_STN_1(3, bb, 1)
5124DO_STN_1(4, bb, 1)
5125
5126DO_STN_2(1, hh, 2, 2)
5127DO_STN_2(1, hs, 4, 2)
5128DO_STN_2(1, hd, 8, 2)
5129DO_STN_2(2, hh, 2, 2)
5130DO_STN_2(3, hh, 2, 2)
5131DO_STN_2(4, hh, 2, 2)
5132
5133DO_STN_2(1, ss, 4, 4)
5134DO_STN_2(1, sd, 8, 4)
5135DO_STN_2(2, ss, 4, 4)
5136DO_STN_2(3, ss, 4, 4)
5137DO_STN_2(4, ss, 4, 4)
5138
5139DO_STN_2(1, dd, 8, 8)
5140DO_STN_2(2, dd, 8, 8)
5141DO_STN_2(3, dd, 8, 8)
5142DO_STN_2(4, dd, 8, 8)
5143
5144#undef DO_STN_1
5145#undef DO_STN_2
5146
5147/*
5148 * Loads with a vector index.
5149 */
673e9fa6 5150
5151/*
5152 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
5153 */
5154typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);
5155
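/*
 * The off_* variants differ only in how the offset element is read:
 * zsu/zss treat it as an unsigned/signed 32-bit value, zd uses the
 * full 64 bits; the _s/_d suffix is the offset vector's element size.
 */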
5156static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
5157{
5158 return *(uint32_t *)(reg + H1_4(reg_ofs));
5159}
5160
5161static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
5162{
5163 return *(int32_t *)(reg + H1_4(reg_ofs));
5164}
5165
5166static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
5167{
5168 return (uint32_t)*(uint64_t *)(reg + reg_ofs);
5169}
5170
5171static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
5172{
5173 return (int32_t)*(uint64_t *)(reg + reg_ofs);
5174}
5175
5176static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
5177{
5178 return *(uint64_t *)(reg + reg_ofs);
5179}
5180
5181static void sve_ld1_zs(CPUARMState *env, void *vd, void *vg, void *vm,
5182 target_ulong base, uint32_t desc, uintptr_t ra,
5183 zreg_off_fn *off_fn, sve_ldst1_tlb_fn *tlb_fn)
5184{
5185 const int scale = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 2);
5186 intptr_t i, oprsz = simd_oprsz(desc);
5187 ARMVectorReg scratch = { };
5188
5189 for (i = 0; i < oprsz; ) {
5190 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
5191 do {
5192 if (likely(pg & 1)) {
5193 target_ulong off = off_fn(vm, i);
6799ce7b 5194 tlb_fn(env, &scratch, i, base + (off << scale), ra);
5195 }
5196 i += 4, pg >>= 4;
5197 } while (i & 15);
5198 }
5199
5200 /* Wait until all exceptions have been raised to write back. */
5201 memcpy(vd, &scratch, oprsz);
5202}
5203
5204static void sve_ld1_zd(CPUARMState *env, void *vd, void *vg, void *vm,
5205 target_ulong base, uint32_t desc, uintptr_t ra,
6799ce7b 5206 zreg_off_fn *off_fn, sve_ldst1_tlb_fn *tlb_fn)
d4f75f25 5207{
500d0484 5208 const int scale = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 2);
d4f75f25 5209 intptr_t i, oprsz = simd_oprsz(desc) / 8;
5210 ARMVectorReg scratch = { };
5211
5212 for (i = 0; i < oprsz; i++) {
5213 uint8_t pg = *(uint8_t *)(vg + H1(i));
5214 if (likely(pg & 1)) {
5215 target_ulong off = off_fn(vm, i * 8);
6799ce7b 5216 tlb_fn(env, &scratch, i * 8, base + (off << scale), ra);
5217 }
5218 }
5219
5220 /* Wait until all exceptions have been raised to write back. */
5221 memcpy(vd, &scratch, oprsz * 8);
5222}
5223
5224#define DO_LD1_ZPZ_S(MEM, OFS) \
3f2f3b33 5225void QEMU_FLATTEN HELPER(sve_ld##MEM##_##OFS) \
5226 (CPUARMState *env, void *vd, void *vg, void *vm, \
5227 target_ulong base, uint32_t desc) \
5228{ \
5229 sve_ld1_zs(env, vd, vg, vm, base, desc, GETPC(), \
5230 off_##OFS##_s, sve_ld1##MEM##_tlb); \
5231}
5232
5233#define DO_LD1_ZPZ_D(MEM, OFS) \
3f2f3b33 5234void QEMU_FLATTEN HELPER(sve_ld##MEM##_##OFS) \
5235 (CPUARMState *env, void *vd, void *vg, void *vm, \
5236 target_ulong base, uint32_t desc) \
5237{ \
5238 sve_ld1_zd(env, vd, vg, vm, base, desc, GETPC(), \
5239 off_##OFS##_d, sve_ld1##MEM##_tlb); \
5240}
5241
5242DO_LD1_ZPZ_S(bsu, zsu)
5243DO_LD1_ZPZ_S(bsu, zss)
5244DO_LD1_ZPZ_D(bdu, zsu)
5245DO_LD1_ZPZ_D(bdu, zss)
5246DO_LD1_ZPZ_D(bdu, zd)
5247
5248DO_LD1_ZPZ_S(bss, zsu)
5249DO_LD1_ZPZ_S(bss, zss)
5250DO_LD1_ZPZ_D(bds, zsu)
5251DO_LD1_ZPZ_D(bds, zss)
5252DO_LD1_ZPZ_D(bds, zd)
5253
5254DO_LD1_ZPZ_S(hsu_le, zsu)
5255DO_LD1_ZPZ_S(hsu_le, zss)
5256DO_LD1_ZPZ_D(hdu_le, zsu)
5257DO_LD1_ZPZ_D(hdu_le, zss)
5258DO_LD1_ZPZ_D(hdu_le, zd)
5259
5260DO_LD1_ZPZ_S(hsu_be, zsu)
5261DO_LD1_ZPZ_S(hsu_be, zss)
5262DO_LD1_ZPZ_D(hdu_be, zsu)
5263DO_LD1_ZPZ_D(hdu_be, zss)
5264DO_LD1_ZPZ_D(hdu_be, zd)
5265
5266DO_LD1_ZPZ_S(hss_le, zsu)
5267DO_LD1_ZPZ_S(hss_le, zss)
5268DO_LD1_ZPZ_D(hds_le, zsu)
5269DO_LD1_ZPZ_D(hds_le, zss)
5270DO_LD1_ZPZ_D(hds_le, zd)
5271
5272DO_LD1_ZPZ_S(hss_be, zsu)
5273DO_LD1_ZPZ_S(hss_be, zss)
5274DO_LD1_ZPZ_D(hds_be, zsu)
5275DO_LD1_ZPZ_D(hds_be, zss)
5276DO_LD1_ZPZ_D(hds_be, zd)
5277
5278DO_LD1_ZPZ_S(ss_le, zsu)
5279DO_LD1_ZPZ_S(ss_le, zss)
5280DO_LD1_ZPZ_D(sdu_le, zsu)
5281DO_LD1_ZPZ_D(sdu_le, zss)
5282DO_LD1_ZPZ_D(sdu_le, zd)
5283
5284DO_LD1_ZPZ_S(ss_be, zsu)
5285DO_LD1_ZPZ_S(ss_be, zss)
5286DO_LD1_ZPZ_D(sdu_be, zsu)
5287DO_LD1_ZPZ_D(sdu_be, zss)
5288DO_LD1_ZPZ_D(sdu_be, zd)
5289
5290DO_LD1_ZPZ_D(sds_le, zsu)
5291DO_LD1_ZPZ_D(sds_le, zss)
5292DO_LD1_ZPZ_D(sds_le, zd)
5293
5294DO_LD1_ZPZ_D(sds_be, zsu)
5295DO_LD1_ZPZ_D(sds_be, zss)
5296DO_LD1_ZPZ_D(sds_be, zd)
5297
5298DO_LD1_ZPZ_D(dd_le, zsu)
5299DO_LD1_ZPZ_D(dd_le, zss)
5300DO_LD1_ZPZ_D(dd_le, zd)
5301
5302DO_LD1_ZPZ_D(dd_be, zsu)
5303DO_LD1_ZPZ_D(dd_be, zss)
5304DO_LD1_ZPZ_D(dd_be, zd)
5305
5306#undef DO_LD1_ZPZ_S
5307#undef DO_LD1_ZPZ_D
673e9fa6 5308
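/*
 * Illustrative only, not part of the source: DO_LD1_ZPZ_D(dd_le, zd)
 * above produces roughly
 *
 *   void QEMU_FLATTEN HELPER(sve_lddd_le_zd)
 *       (CPUARMState *env, void *vd, void *vg, void *vm,
 *        target_ulong base, uint32_t desc)
 *   {
 *       sve_ld1_zd(env, vd, vg, vm, base, desc, GETPC(),
 *                  off_zd_d, sve_ld1dd_le_tlb);
 *   }
 *
 * i.e. a little-endian 64-bit gather load whose offsets come straight
 * from the 64-bit elements of the index vector.
 */
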
5309/* First fault loads with a vector index. */
5310
5311/* Load one element into VD+REG_OFF from (ENV,VADDR) without faulting.
5312 * The controlling predicate is known to be true. Return true if the
5313 * load was successful.
5314 */
5315typedef bool sve_ld1_nf_fn(CPUARMState *env, void *vd, intptr_t reg_off,
5316 target_ulong vaddr, int mmu_idx);
ed67eb7f 5317
5318#ifdef CONFIG_SOFTMMU
5319#define DO_LD_NF(NAME, H, TYPEE, TYPEM, HOST) \
5320static bool sve_ld##NAME##_nf(CPUARMState *env, void *vd, intptr_t reg_off, \
500d0484 5321 target_ulong addr, int mmu_idx) \
5322{ \
5323 target_ulong next_page = -(addr | TARGET_PAGE_MASK); \
5324 if (likely(next_page - addr >= sizeof(TYPEM))) { \
5325 void *host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD, mmu_idx); \
5326 if (likely(host)) { \
5327 TYPEM val = HOST(host); \
5328 *(TYPEE *)(vd + H(reg_off)) = val; \
5329 return true; \
5330 } \
5331 } \
5332 return false; \
ed67eb7f 5333}
ed67eb7f 5334#else
5335#define DO_LD_NF(NAME, H, TYPEE, TYPEM, HOST) \
5336static bool sve_ld##NAME##_nf(CPUARMState *env, void *vd, intptr_t reg_off, \
5337 target_ulong addr, int mmu_idx) \
5338{ \
5339 if (likely(page_check_range(addr, sizeof(TYPEM), PAGE_READ))) { \
5340 TYPEM val = HOST(g2h(addr)); \
5341 *(TYPEE *)(vd + H(reg_off)) = val; \
5342 return true; \
5343 } \
5344 return false; \
5345}
5346#endif
ed67eb7f 5347
5348DO_LD_NF(bsu, H1_4, uint32_t, uint8_t, ldub_p)
5349DO_LD_NF(bss, H1_4, uint32_t, int8_t, ldsb_p)
5350DO_LD_NF(bdu, , uint64_t, uint8_t, ldub_p)
5351DO_LD_NF(bds, , uint64_t, int8_t, ldsb_p)
5352
5353DO_LD_NF(hsu_le, H1_4, uint32_t, uint16_t, lduw_le_p)
5354DO_LD_NF(hss_le, H1_4, uint32_t, int16_t, ldsw_le_p)
5355DO_LD_NF(hsu_be, H1_4, uint32_t, uint16_t, lduw_be_p)
5356DO_LD_NF(hss_be, H1_4, uint32_t, int16_t, ldsw_be_p)
5357DO_LD_NF(hdu_le, , uint64_t, uint16_t, lduw_le_p)
5358DO_LD_NF(hds_le, , uint64_t, int16_t, ldsw_le_p)
5359DO_LD_NF(hdu_be, , uint64_t, uint16_t, lduw_be_p)
5360DO_LD_NF(hds_be, , uint64_t, int16_t, ldsw_be_p)
5361
5362DO_LD_NF(ss_le, H1_4, uint32_t, uint32_t, ldl_le_p)
5363DO_LD_NF(ss_be, H1_4, uint32_t, uint32_t, ldl_be_p)
5364DO_LD_NF(sdu_le, , uint64_t, uint32_t, ldl_le_p)
5365DO_LD_NF(sds_le, , uint64_t, int32_t, ldl_le_p)
5366DO_LD_NF(sdu_be, , uint64_t, uint32_t, ldl_be_p)
5367DO_LD_NF(sds_be, , uint64_t, int32_t, ldl_be_p)
5368
5369DO_LD_NF(dd_le, , uint64_t, uint64_t, ldq_le_p)
5370DO_LD_NF(dd_be, , uint64_t, uint64_t, ldq_be_p)
5371
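/*
 * Illustrative only: DO_LD_NF(bsu, H1_4, uint32_t, uint8_t, ldub_p) above
 * defines sve_ldbsu_nf(), which in the softmmu build returns false rather
 * than faulting when the element does not fit entirely before the next
 * page boundary or the address has no host mapping in the TLB, and
 * otherwise zero-extends the loaded uint8_t into the uint32_t vector
 * element at H1_4(reg_off).  The user-only variant instead checks
 * page_check_range() and reads through g2h().
 */
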
5372/*
5373 * Common helper for all gather first-faulting loads.
5374 */
5375static inline void sve_ldff1_zs(CPUARMState *env, void *vd, void *vg, void *vm,
5376 target_ulong base, uint32_t desc, uintptr_t ra,
6799ce7b 5377 zreg_off_fn *off_fn, sve_ldst1_tlb_fn *tlb_fn,
5378 sve_ld1_nf_fn *nonfault_fn)
5379{
5380 const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
5381 const int mmu_idx = get_mmuidx(oi);
5382 const int scale = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 2);
116347ce 5383 intptr_t reg_off, reg_max = simd_oprsz(desc);
5384 target_ulong addr;
5385
5386 /* Skip to the first true predicate. */
5387 reg_off = find_next_active(vg, 0, reg_max, MO_32);
5388 if (likely(reg_off < reg_max)) {
5389 /* Perform one normal read, which will fault or not. */
5390 addr = off_fn(vm, reg_off);
5391 addr = base + (addr << scale);
6799ce7b 5392 tlb_fn(env, vd, reg_off, addr, ra);
5393
5394 /* The rest of the reads will be non-faulting. */
5395 }
5396
5397 /* After any fault, zero the leading predicated false elements. */
5398 swap_memzero(vd, reg_off);
5399
5400 while (likely((reg_off += 4) < reg_max)) {
5401 uint64_t pg = *(uint64_t *)(vg + (reg_off >> 6) * 8);
5402 if (likely((pg >> (reg_off & 63)) & 1)) {
5403 addr = off_fn(vm, reg_off);
5404 addr = base + (addr << scale);
5405 if (!nonfault_fn(env, vd, reg_off, addr, mmu_idx)) {
5406 record_fault(env, reg_off, reg_max);
5407 break;
5408 }
5409 } else {
5410 *(uint32_t *)(vd + H1_4(reg_off)) = 0;
5411 }
5412 }
5413}
5414
5415static inline void sve_ldff1_zd(CPUARMState *env, void *vd, void *vg, void *vm,
5416 target_ulong base, uint32_t desc, uintptr_t ra,
6799ce7b 5417 zreg_off_fn *off_fn, sve_ldst1_tlb_fn *tlb_fn,
5418 sve_ld1_nf_fn *nonfault_fn)
5419{
5420 const TCGMemOpIdx oi = extract32(desc, SIMD_DATA_SHIFT, MEMOPIDX_SHIFT);
5421 const int mmu_idx = get_mmuidx(oi);
5422 const int scale = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 2);
116347ce 5423 intptr_t reg_off, reg_max = simd_oprsz(desc);
5424 target_ulong addr;
5425
5426 /* Skip to the first true predicate. */
5427 reg_off = find_next_active(vg, 0, reg_max, MO_64);
5428 if (likely(reg_off < reg_max)) {
5429 /* Perform one normal read, which will fault or not. */
5430 addr = off_fn(vm, reg_off);
5431 addr = base + (addr << scale);
6799ce7b 5432 tlb_fn(env, vd, reg_off, addr, ra);
5433
5434 /* The rest of the reads will be non-faulting. */
116347ce 5435 }
ed67eb7f 5436
5437 /* After any fault, zero the leading predicated false elements. */
5438 swap_memzero(vd, reg_off);
5439
5440 while (likely((reg_off += 8) < reg_max)) {
5441 uint8_t pg = *(uint8_t *)(vg + H1(reg_off >> 3));
5442 if (likely(pg & 1)) {
5443 addr = off_fn(vm, reg_off);
5444 addr = base + (addr << scale);
5445 if (!nonfault_fn(env, vd, reg_off, addr, mmu_idx)) {
5446 record_fault(env, reg_off, reg_max);
5447 break;
5448 }
5449 } else {
5450 *(uint64_t *)(vd + reg_off) = 0;
5451 }
5452 }
5453}
5454
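/*
 * Worked example, illustrative only: consider a 64-bit gather in which
 * only elements 0 and 2 are active and the address computed for element 2
 * is unmapped.  Element 0 is read with tlb_fn() and may fault normally.
 * Element 1 is inactive and simply zeroed.  Element 2 is probed with the
 * non-faulting function, which fails, so record_fault() (defined earlier
 * in this file) notes the failure in the first-fault register instead of
 * raising an exception, and the loop stops.
 */
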
5455#define DO_LDFF1_ZPZ_S(MEM, OFS) \
5456void HELPER(sve_ldff##MEM##_##OFS) \
5457 (CPUARMState *env, void *vd, void *vg, void *vm, \
5458 target_ulong base, uint32_t desc) \
5459{ \
5460 sve_ldff1_zs(env, vd, vg, vm, base, desc, GETPC(), \
5461 off_##OFS##_s, sve_ld1##MEM##_tlb, sve_ld##MEM##_nf); \
5462}
5463
5464#define DO_LDFF1_ZPZ_D(MEM, OFS) \
5465void HELPER(sve_ldff##MEM##_##OFS) \
5466 (CPUARMState *env, void *vd, void *vg, void *vm, \
5467 target_ulong base, uint32_t desc) \
5468{ \
5469 sve_ldff1_zd(env, vd, vg, vm, base, desc, GETPC(), \
5470 off_##OFS##_d, sve_ld1##MEM##_tlb, sve_ld##MEM##_nf); \
5471}
5472
5473DO_LDFF1_ZPZ_S(bsu, zsu)
5474DO_LDFF1_ZPZ_S(bsu, zss)
5475DO_LDFF1_ZPZ_D(bdu, zsu)
5476DO_LDFF1_ZPZ_D(bdu, zss)
5477DO_LDFF1_ZPZ_D(bdu, zd)
5478
5479DO_LDFF1_ZPZ_S(bss, zsu)
5480DO_LDFF1_ZPZ_S(bss, zss)
5481DO_LDFF1_ZPZ_D(bds, zsu)
5482DO_LDFF1_ZPZ_D(bds, zss)
5483DO_LDFF1_ZPZ_D(bds, zd)
5484
5485DO_LDFF1_ZPZ_S(hsu_le, zsu)
5486DO_LDFF1_ZPZ_S(hsu_le, zss)
5487DO_LDFF1_ZPZ_D(hdu_le, zsu)
5488DO_LDFF1_ZPZ_D(hdu_le, zss)
5489DO_LDFF1_ZPZ_D(hdu_le, zd)
5490
5491DO_LDFF1_ZPZ_S(hsu_be, zsu)
5492DO_LDFF1_ZPZ_S(hsu_be, zss)
5493DO_LDFF1_ZPZ_D(hdu_be, zsu)
5494DO_LDFF1_ZPZ_D(hdu_be, zss)
5495DO_LDFF1_ZPZ_D(hdu_be, zd)
5496
5497DO_LDFF1_ZPZ_S(hss_le, zsu)
5498DO_LDFF1_ZPZ_S(hss_le, zss)
5499DO_LDFF1_ZPZ_D(hds_le, zsu)
5500DO_LDFF1_ZPZ_D(hds_le, zss)
5501DO_LDFF1_ZPZ_D(hds_le, zd)
5502
5503DO_LDFF1_ZPZ_S(hss_be, zsu)
5504DO_LDFF1_ZPZ_S(hss_be, zss)
5505DO_LDFF1_ZPZ_D(hds_be, zsu)
5506DO_LDFF1_ZPZ_D(hds_be, zss)
5507DO_LDFF1_ZPZ_D(hds_be, zd)
5508
5509DO_LDFF1_ZPZ_S(ss_le, zsu)
5510DO_LDFF1_ZPZ_S(ss_le, zss)
5511DO_LDFF1_ZPZ_D(sdu_le, zsu)
5512DO_LDFF1_ZPZ_D(sdu_le, zss)
5513DO_LDFF1_ZPZ_D(sdu_le, zd)
5514
5515DO_LDFF1_ZPZ_S(ss_be, zsu)
5516DO_LDFF1_ZPZ_S(ss_be, zss)
5517DO_LDFF1_ZPZ_D(sdu_be, zsu)
5518DO_LDFF1_ZPZ_D(sdu_be, zss)
5519DO_LDFF1_ZPZ_D(sdu_be, zd)
5520
5521DO_LDFF1_ZPZ_D(sds_le, zsu)
5522DO_LDFF1_ZPZ_D(sds_le, zss)
5523DO_LDFF1_ZPZ_D(sds_le, zd)
5524
5525DO_LDFF1_ZPZ_D(sds_be, zsu)
5526DO_LDFF1_ZPZ_D(sds_be, zss)
5527DO_LDFF1_ZPZ_D(sds_be, zd)
5528
5529DO_LDFF1_ZPZ_D(dd_le, zsu)
5530DO_LDFF1_ZPZ_D(dd_le, zss)
5531DO_LDFF1_ZPZ_D(dd_le, zd)
5532
5533DO_LDFF1_ZPZ_D(dd_be, zsu)
5534DO_LDFF1_ZPZ_D(dd_be, zss)
5535DO_LDFF1_ZPZ_D(dd_be, zd)
ed67eb7f 5536
5537/* Stores with a vector index. */
5538
5539static void sve_st1_zs(CPUARMState *env, void *vd, void *vg, void *vm,
5540 target_ulong base, uint32_t desc, uintptr_t ra,
6799ce7b 5541 zreg_off_fn *off_fn, sve_ldst1_tlb_fn *tlb_fn)
78cf1b88 5542{
500d0484 5543 const int scale = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 2);
78cf1b88 5544 intptr_t i, oprsz = simd_oprsz(desc);
f6dbf62a 5545
5546 for (i = 0; i < oprsz; ) {
5547 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
5548 do {
5549 if (likely(pg & 1)) {
5550 target_ulong off = off_fn(vm, i);
6799ce7b 5551 tlb_fn(env, vd, i, base + (off << scale), ra);
5552 }
5553 i += 4, pg >>= 4;
5554 } while (i & 15);
5555 }
5556}
5557
5558static void sve_st1_zd(CPUARMState *env, void *vd, void *vg, void *vm,
5559 target_ulong base, uint32_t desc, uintptr_t ra,
6799ce7b 5560 zreg_off_fn *off_fn, sve_ldst1_tlb_fn *tlb_fn)
78cf1b88 5561{
500d0484 5562 const int scale = extract32(desc, SIMD_DATA_SHIFT + MEMOPIDX_SHIFT, 2);
78cf1b88 5563 intptr_t i, oprsz = simd_oprsz(desc) / 8;
f6dbf62a 5564
5565 for (i = 0; i < oprsz; i++) {
5566 uint8_t pg = *(uint8_t *)(vg + H1(i));
5567 if (likely(pg & 1)) {
5568 target_ulong off = off_fn(vm, i * 8);
6799ce7b 5569 tlb_fn(env, vd, i * 8, base + (off << scale), ra);
5570 }
5571 }
78cf1b88 5572}
f6dbf62a 5573
78cf1b88 5574#define DO_ST1_ZPZ_S(MEM, OFS) \
3f2f3b33 5575void QEMU_FLATTEN HELPER(sve_st##MEM##_##OFS) \
5576 (CPUARMState *env, void *vd, void *vg, void *vm, \
5577 target_ulong base, uint32_t desc) \
5578{ \
5579 sve_st1_zs(env, vd, vg, vm, base, desc, GETPC(), \
5580 off_##OFS##_s, sve_st1##MEM##_tlb); \
5581}
f6dbf62a 5582
78cf1b88 5583#define DO_ST1_ZPZ_D(MEM, OFS) \
3f2f3b33 5584void QEMU_FLATTEN HELPER(sve_st##MEM##_##OFS) \
5585 (CPUARMState *env, void *vd, void *vg, void *vm, \
5586 target_ulong base, uint32_t desc) \
5587{ \
5588 sve_st1_zd(env, vd, vg, vm, base, desc, GETPC(), \
5589 off_##OFS##_d, sve_st1##MEM##_tlb); \
5590}
5591
5592DO_ST1_ZPZ_S(bs, zsu)
5593DO_ST1_ZPZ_S(hs_le, zsu)
5594DO_ST1_ZPZ_S(hs_be, zsu)
5595DO_ST1_ZPZ_S(ss_le, zsu)
5596DO_ST1_ZPZ_S(ss_be, zsu)
5597
5598DO_ST1_ZPZ_S(bs, zss)
5599DO_ST1_ZPZ_S(hs_le, zss)
5600DO_ST1_ZPZ_S(hs_be, zss)
5601DO_ST1_ZPZ_S(ss_le, zss)
5602DO_ST1_ZPZ_S(ss_be, zss)
5603
5604DO_ST1_ZPZ_D(bd, zsu)
5605DO_ST1_ZPZ_D(hd_le, zsu)
5606DO_ST1_ZPZ_D(hd_be, zsu)
5607DO_ST1_ZPZ_D(sd_le, zsu)
5608DO_ST1_ZPZ_D(sd_be, zsu)
5609DO_ST1_ZPZ_D(dd_le, zsu)
5610DO_ST1_ZPZ_D(dd_be, zsu)
5611
5612DO_ST1_ZPZ_D(bd, zss)
5613DO_ST1_ZPZ_D(hd_le, zss)
5614DO_ST1_ZPZ_D(hd_be, zss)
5615DO_ST1_ZPZ_D(sd_le, zss)
5616DO_ST1_ZPZ_D(sd_be, zss)
5617DO_ST1_ZPZ_D(dd_le, zss)
5618DO_ST1_ZPZ_D(dd_be, zss)
5619
5620DO_ST1_ZPZ_D(bd, zd)
5621DO_ST1_ZPZ_D(hd_le, zd)
5622DO_ST1_ZPZ_D(hd_be, zd)
5623DO_ST1_ZPZ_D(sd_le, zd)
5624DO_ST1_ZPZ_D(sd_be, zd)
5625DO_ST1_ZPZ_D(dd_le, zd)
5626DO_ST1_ZPZ_D(dd_be, zd)
5627
5628#undef DO_ST1_ZPZ_S
5629#undef DO_ST1_ZPZ_D
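
/*
 * Illustrative only, not part of the source: DO_ST1_ZPZ_S(bs, zsu) above
 * produces roughly
 *
 *   void QEMU_FLATTEN HELPER(sve_stbs_zsu)
 *       (CPUARMState *env, void *vd, void *vg, void *vm,
 *        target_ulong base, uint32_t desc)
 *   {
 *       sve_st1_zs(env, vd, vg, vm, base, desc, GETPC(),
 *                  off_zsu_s, sve_st1bs_tlb);
 *   }
 *
 * Unlike the gather loads, the scatter stores need no scratch register:
 * they only write memory, so a fault part-way through leaves all vector
 * registers untouched.
 */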