1/*
2 * ARM SVE Operations
3 *
4 * Copyright (c) 2018 Linaro, Ltd.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18 */
19
20#include "qemu/osdep.h"
21#include "cpu.h"
22#include "exec/exec-all.h"
23#include "exec/cpu_ldst.h"
24#include "exec/helper-proto.h"
25#include "tcg/tcg-gvec-desc.h"
 26#include "fpu/softfloat.h"
27
28
29/* Note that vector data is stored in host-endian 64-bit chunks,
 30   so addressing units smaller than that need a host-endian fixup. */
31#ifdef HOST_WORDS_BIGENDIAN
32#define H1(x) ((x) ^ 7)
33#define H1_2(x) ((x) ^ 6)
34#define H1_4(x) ((x) ^ 4)
35#define H2(x) ((x) ^ 3)
36#define H4(x) ((x) ^ 1)
37#else
38#define H1(x) (x)
39#define H1_2(x) (x)
40#define H1_4(x) (x)
41#define H2(x) (x)
42#define H4(x) (x)
43#endif
44
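/* For example, on a big-endian host vector byte 0 is the least significant
 * byte of its 64-bit chunk and so lives at host byte offset 7: H1(0) == 7,
 * H1(1) == 6, and likewise H2(0) == 3 for 16-bit elements and H4(0) == 1
 * for 32-bit elements.  On a little-endian host all of these are the
 * identity.
 */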
45/* Return a value for NZCV as per the ARM PredTest pseudofunction.
46 *
47 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
48 * and bit 0 set if C is set. Compare the definitions of these variables
49 * within CPUARMState.
50 */
51
52/* For no G bits set, NZCV = C. */
53#define PREDTEST_INIT 1
54
55/* This is an iterative function, called for each Pd and Pg word
56 * moving forward.
57 */
58static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
59{
60 if (likely(g)) {
61 /* Compute N from first D & G.
62 Use bit 2 to signal first G bit seen. */
63 if (!(flags & 4)) {
64 flags |= ((d & (g & -g)) != 0) << 31;
65 flags |= 4;
66 }
67
68 /* Accumulate Z from each D & G. */
69 flags |= ((d & g) != 0) << 1;
70
71 /* Compute C from last !(D & G). Replace previous. */
72 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
73 }
74 return flags;
75}
76
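/* Worked example: for d == 0x4 and g == 0x5 (elements 0 and 2 active),
 * the first active bit of D is clear so N stays 0, some active bit of D
 * is set so Z is reported clear (bit 1 set), and the last active bit of D
 * is set so C ends up clear; starting from PREDTEST_INIT the accumulated
 * value is 6, bit 2 being the internal "first G bit seen" marker.
 */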
77/* This is an iterative function, called for each Pd and Pg word
78 * moving backward.
79 */
80static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
81{
82 if (likely(g)) {
83 /* Compute C from first (i.e last) !(D & G).
84 Use bit 2 to signal first G bit seen. */
85 if (!(flags & 4)) {
86 flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
87 flags |= (d & pow2floor(g)) == 0;
88 }
89
90 /* Accumulate Z from each D & G. */
91 flags |= ((d & g) != 0) << 1;
92
93 /* Compute N from last (i.e first) D & G. Replace previous. */
94 flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
95 }
96 return flags;
97}
98
99/* The same for a single word predicate. */
100uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
101{
102 return iter_predtest_fwd(d, g, PREDTEST_INIT);
103}
104
105/* The same for a multi-word predicate. */
106uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
107{
108 uint32_t flags = PREDTEST_INIT;
109 uint64_t *d = vd, *g = vg;
110 uintptr_t i = 0;
111
112 do {
113 flags = iter_predtest_fwd(d[i], g[i], flags);
114 } while (++i < words);
115
116 return flags;
117}
 118
119/* Expand active predicate bits to bytes, for byte elements.
120 * for (i = 0; i < 256; ++i) {
121 * unsigned long m = 0;
122 * for (j = 0; j < 8; j++) {
123 * if ((i >> j) & 1) {
124 * m |= 0xfful << (j << 3);
125 * }
126 * }
127 * printf("0x%016lx,\n", m);
128 * }
129 */
130static inline uint64_t expand_pred_b(uint8_t byte)
131{
132 static const uint64_t word[256] = {
133 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
134 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
135 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
136 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
137 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
138 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
139 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
140 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
141 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
142 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
143 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
144 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
145 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
146 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
147 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
148 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
149 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
150 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
151 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
152 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
153 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
154 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
155 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
156 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
157 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
158 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
159 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
160 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
161 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
162 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
163 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
164 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
165 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
166 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
167 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
168 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
169 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
170 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
171 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
172 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
173 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
174 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
175 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
176 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
177 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
178 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
179 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
180 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
181 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
182 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
183 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
184 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
185 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
186 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
187 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
188 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
189 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
190 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
191 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
192 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
193 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
194 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
195 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
196 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
197 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
198 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
199 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
200 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
201 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
202 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
203 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
204 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
205 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
206 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
207 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
208 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
209 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
210 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
211 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
212 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
213 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
214 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
215 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
216 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
217 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
218 0xffffffffffffffff,
219 };
220 return word[byte];
221}
222
223/* Similarly for half-word elements.
224 * for (i = 0; i < 256; ++i) {
225 * unsigned long m = 0;
226 * if (i & 0xaa) {
227 * continue;
228 * }
229 * for (j = 0; j < 8; j += 2) {
230 * if ((i >> j) & 1) {
231 * m |= 0xfffful << (j << 3);
232 * }
233 * }
234 * printf("[0x%x] = 0x%016lx,\n", i, m);
235 * }
236 */
237static inline uint64_t expand_pred_h(uint8_t byte)
238{
239 static const uint64_t word[] = {
240 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
241 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
242 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
243 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
244 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
245 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
246 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
247 [0x55] = 0xffffffffffffffff,
248 };
249 return word[byte & 0x55];
250}
251
252/* Similarly for single word elements. */
253static inline uint64_t expand_pred_s(uint8_t byte)
254{
255 static const uint64_t word[] = {
256 [0x01] = 0x00000000ffffffffull,
257 [0x10] = 0xffffffff00000000ull,
258 [0x11] = 0xffffffffffffffffull,
259 };
260 return word[byte & 0x11];
261}
262
263/* Swap 16-bit words within a 32-bit word. */
264static inline uint32_t hswap32(uint32_t h)
265{
266 return rol32(h, 16);
267}
268
269/* Swap 16-bit words within a 64-bit word. */
270static inline uint64_t hswap64(uint64_t h)
271{
272 uint64_t m = 0x0000ffff0000ffffull;
273 h = rol64(h, 32);
274 return ((h & m) << 16) | ((h >> 16) & m);
275}
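
/* e.g. hswap64(0x0123456789abcdefull) == 0xcdef89ab45670123ull: the four
   16-bit lanes are reversed, the bytes within each lane are not. */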
276
277/* Swap 32-bit words within a 64-bit word. */
278static inline uint64_t wswap64(uint64_t h)
279{
280 return rol64(h, 32);
281}
282
283#define LOGICAL_PPPP(NAME, FUNC) \
284void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
285{ \
286 uintptr_t opr_sz = simd_oprsz(desc); \
287 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
288 uintptr_t i; \
289 for (i = 0; i < opr_sz / 8; ++i) { \
290 d[i] = FUNC(n[i], m[i], g[i]); \
291 } \
292}
293
294#define DO_AND(N, M, G) (((N) & (M)) & (G))
295#define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
296#define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
297#define DO_ORR(N, M, G) (((N) | (M)) & (G))
298#define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
299#define DO_NOR(N, M, G) (~((N) | (M)) & (G))
300#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
301#define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
302
303LOGICAL_PPPP(sve_and_pppp, DO_AND)
304LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
305LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
306LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
307LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
308LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
309LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
310LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
311
312#undef DO_AND
313#undef DO_BIC
314#undef DO_EOR
315#undef DO_ORR
316#undef DO_ORN
317#undef DO_NOR
318#undef DO_NAND
319#undef DO_SEL
320#undef LOGICAL_PPPP
 321
322/* Fully general three-operand expander, controlled by a predicate.
323 * This is complicated by the host-endian storage of the register file.
324 */
325/* ??? I don't expect the compiler could ever vectorize this itself.
326 * With some tables we can convert bit masks to byte masks, and with
327 * extra care wrt byte/word ordering we could use gcc generic vectors
328 * and do 16 bytes at a time.
329 */
330#define DO_ZPZZ(NAME, TYPE, H, OP) \
331void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
332{ \
333 intptr_t i, opr_sz = simd_oprsz(desc); \
334 for (i = 0; i < opr_sz; ) { \
335 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
336 do { \
337 if (pg & 1) { \
338 TYPE nn = *(TYPE *)(vn + H(i)); \
339 TYPE mm = *(TYPE *)(vm + H(i)); \
340 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
341 } \
342 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
343 } while (i & 15); \
344 } \
345}
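
/* The governing predicate allocates one bit per vector byte; an element of
 * sizeof(TYPE) bytes is controlled by the lowest-numbered of its bits, hence
 * the single-bit test and the shift of PG by sizeof(TYPE) per element above.
 */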
346
347/* Similarly, specialized for 64-bit operands. */
348#define DO_ZPZZ_D(NAME, TYPE, OP) \
349void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
350{ \
351 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
352 TYPE *d = vd, *n = vn, *m = vm; \
353 uint8_t *pg = vg; \
354 for (i = 0; i < opr_sz; i += 1) { \
355 if (pg[H1(i)] & 1) { \
356 TYPE nn = n[i], mm = m[i]; \
357 d[i] = OP(nn, mm); \
358 } \
359 } \
360}
361
362#define DO_AND(N, M) (N & M)
363#define DO_EOR(N, M) (N ^ M)
364#define DO_ORR(N, M) (N | M)
365#define DO_BIC(N, M) (N & ~M)
366#define DO_ADD(N, M) (N + M)
367#define DO_SUB(N, M) (N - M)
368#define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
369#define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
370#define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
371#define DO_MUL(N, M) (N * M)
372
373
374/*
375 * We must avoid the C undefined behaviour cases: division by
376 * zero and signed division of INT_MIN by -1. Both of these
377 * have architecturally defined required results for Arm.
378 * We special case all signed divisions by -1 to avoid having
379 * to deduce the minimum integer for the type involved.
380 */
381#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
382#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
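
/* The architecture requires N / 0 == 0 for both signed and unsigned divides,
 * and the INT_MIN / -1 case to wrap back to INT_MIN, which is the value the
 * two's-complement negation above produces.
 */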
383
384DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
385DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
386DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
387DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
388
389DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
390DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
391DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
392DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
393
394DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
395DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
396DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
397DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
398
399DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
400DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
401DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
402DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
403
404DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
405DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
406DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
407DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
408
409DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
410DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
411DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
412DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
413
414DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
415DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
416DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
417DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
418
419DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
420DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
421DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
422DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
423
424DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
425DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
426DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
427DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
428
429DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
430DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
431DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
432DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
433
434DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
435DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
436DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
437DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
438
439DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
440DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
441DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
442DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
443
444/* Because the computation type is at least twice as large as required,
445 these work for both signed and unsigned source types. */
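/* For example, do_mulh_h() serves both SMULH and UMULH: the caller's element
 * type widens -1 to -1 in the signed case, giving ((-1 * -1) >> 16) == 0,
 * but widens 0xffff to 0xffff in the unsigned case, giving
 * ((0xffff * 0xffff) >> 16) == 0xfffe.
 */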
446static inline uint8_t do_mulh_b(int32_t n, int32_t m)
447{
448 return (n * m) >> 8;
449}
450
451static inline uint16_t do_mulh_h(int32_t n, int32_t m)
452{
453 return (n * m) >> 16;
454}
455
456static inline uint32_t do_mulh_s(int64_t n, int64_t m)
457{
458 return (n * m) >> 32;
459}
460
461static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
462{
463 uint64_t lo, hi;
464 muls64(&lo, &hi, n, m);
465 return hi;
466}
467
468static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
469{
470 uint64_t lo, hi;
471 mulu64(&lo, &hi, n, m);
472 return hi;
473}
474
475DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
476DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
477DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
478DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
479
480DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
481DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
482DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
483DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
484
485DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
486DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
487DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
488DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
489
490DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
491DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)
 492
493DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
494DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
 495
496/* Note that all bits of the shift are significant
497 and not modulo the element size. */
498#define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
499#define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
500#define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
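
/* E.g. for 8-bit elements, DO_LSR(n, 8) and DO_LSL(n, 8) are 0, while
   DO_ASR(n, 8) clamps the count to 7 and so yields 0 or -1 per the sign. */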
501
 502DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
 503DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
 504DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)
 505
 506DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
 507DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
 508DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)
 509
 510DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
 511DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
 512DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
513
514DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
515DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
516DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
517
518#undef DO_ZPZZ
519#undef DO_ZPZZ_D
 520
521/* Three-operand expander, controlled by a predicate, in which the
522 * third operand is "wide". That is, for D = N op M, the same 64-bit
523 * value of M is used with all of the narrower values of N.
524 */
525#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
526void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
527{ \
528 intptr_t i, opr_sz = simd_oprsz(desc); \
529 for (i = 0; i < opr_sz; ) { \
530 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
531 TYPEW mm = *(TYPEW *)(vm + i); \
532 do { \
533 if (pg & 1) { \
534 TYPE nn = *(TYPE *)(vn + H(i)); \
535 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
536 } \
537 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
538 } while (i & 7); \
539 } \
540}
541
542DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
543DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
544DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
545
546DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
547DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
548DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
549
550DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
551DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
552DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
553
554#undef DO_ZPZW
555
556/* Fully general two-operand expander, controlled by a predicate.
557 */
558#define DO_ZPZ(NAME, TYPE, H, OP) \
559void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
560{ \
561 intptr_t i, opr_sz = simd_oprsz(desc); \
562 for (i = 0; i < opr_sz; ) { \
563 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
564 do { \
565 if (pg & 1) { \
566 TYPE nn = *(TYPE *)(vn + H(i)); \
567 *(TYPE *)(vd + H(i)) = OP(nn); \
568 } \
569 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
570 } while (i & 15); \
571 } \
572}
573
574/* Similarly, specialized for 64-bit operands. */
575#define DO_ZPZ_D(NAME, TYPE, OP) \
576void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
577{ \
578 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
579 TYPE *d = vd, *n = vn; \
580 uint8_t *pg = vg; \
581 for (i = 0; i < opr_sz; i += 1) { \
582 if (pg[H1(i)] & 1) { \
583 TYPE nn = n[i]; \
584 d[i] = OP(nn); \
585 } \
586 } \
587}
588
589#define DO_CLS_B(N) (clrsb32(N) - 24)
590#define DO_CLS_H(N) (clrsb32(N) - 16)
591
592DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
593DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
594DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
595DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
596
597#define DO_CLZ_B(N) (clz32(N) - 24)
598#define DO_CLZ_H(N) (clz32(N) - 16)
599
600DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
601DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
602DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
603DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
604
605DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
606DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
607DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
608DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
609
610#define DO_CNOT(N) (N == 0)
611
612DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
613DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
614DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
615DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
616
617#define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
618
619DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
620DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
621DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
622
623#define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
624
625DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
626DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
627DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
628
629#define DO_NOT(N) (~N)
630
631DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
632DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
633DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
634DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
635
636#define DO_SXTB(N) ((int8_t)N)
637#define DO_SXTH(N) ((int16_t)N)
638#define DO_SXTS(N) ((int32_t)N)
639#define DO_UXTB(N) ((uint8_t)N)
640#define DO_UXTH(N) ((uint16_t)N)
641#define DO_UXTS(N) ((uint32_t)N)
642
643DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
644DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
645DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
646DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
647DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
648DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
649
650DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
651DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
652DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
653DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
654DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
655DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
656
657#define DO_ABS(N) (N < 0 ? -N : N)
658
659DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
660DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
661DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
662DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
663
664#define DO_NEG(N) (-N)
665
666DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
667DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
668DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
669DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
670
671DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
672DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
673DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
674
675DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
676DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
677
678DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
679
680DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
681DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
682DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
683DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
684
685/* Three-operand expander, unpredicated, in which the third operand is "wide".
686 */
687#define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
688void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
689{ \
690 intptr_t i, opr_sz = simd_oprsz(desc); \
691 for (i = 0; i < opr_sz; ) { \
692 TYPEW mm = *(TYPEW *)(vm + i); \
693 do { \
694 TYPE nn = *(TYPE *)(vn + H(i)); \
695 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
696 i += sizeof(TYPE); \
697 } while (i & 7); \
698 } \
699}
700
701DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
702DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
703DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
704
705DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
706DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
707DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
708
709DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
710DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
711DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
712
713#undef DO_ZZW
714
715#undef DO_CLS_B
716#undef DO_CLS_H
717#undef DO_CLZ_B
718#undef DO_CLZ_H
719#undef DO_CNOT
720#undef DO_FABS
721#undef DO_FNEG
722#undef DO_ABS
723#undef DO_NEG
724#undef DO_ZPZ
725#undef DO_ZPZ_D
726
727/* Two-operand reduction expander, controlled by a predicate.
728 * The difference between TYPERED and TYPERET has to do with
729 * sign-extension. E.g. for SMAX, TYPERED must be signed,
730 * but TYPERET must be unsigned so that e.g. a 32-bit value
731 * is not sign-extended to the ABI uint64_t return type.
732 */
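/* For example, sve_smaxv_s reduces in int32_t so that the comparison is
 * signed, but returns through uint32_t: a result of -1 reaches the caller
 * as 0x00000000ffffffff rather than being sign-extended across the whole
 * uint64_t return value.
 */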
733/* ??? If we were to vectorize this by hand the reduction ordering
734 * would change. For integer operands, this is perfectly fine.
735 */
736#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
737uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
738{ \
739 intptr_t i, opr_sz = simd_oprsz(desc); \
740 TYPERED ret = INIT; \
741 for (i = 0; i < opr_sz; ) { \
742 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
743 do { \
744 if (pg & 1) { \
745 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
746 ret = OP(ret, nn); \
747 } \
748 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
749 } while (i & 15); \
750 } \
751 return (TYPERET)ret; \
752}
753
754#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
755uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
756{ \
757 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
758 TYPEE *n = vn; \
759 uint8_t *pg = vg; \
760 TYPER ret = INIT; \
761 for (i = 0; i < opr_sz; i += 1) { \
762 if (pg[H1(i)] & 1) { \
763 TYPEE nn = n[i]; \
764 ret = OP(ret, nn); \
765 } \
766 } \
767 return ret; \
768}
769
770DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
771DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
772DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
773DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
774
775DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
776DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
777DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
778DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
779
780DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
781DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
782DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
783DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
784
785DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
786DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
787DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
788
789DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
790DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
791DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
792DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
793
794DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
795DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
796DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
797DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
798
799DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
800DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
801DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
802DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
803
804DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
805DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
806DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
807DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
808
809DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
810DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
811DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
812DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
813
814#undef DO_VPZ
815#undef DO_VPZ_D
816
817/* Two vector operand, one scalar operand, unpredicated. */
818#define DO_ZZI(NAME, TYPE, OP) \
819void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
820{ \
821 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
822 TYPE s = s64, *d = vd, *n = vn; \
823 for (i = 0; i < opr_sz; ++i) { \
824 d[i] = OP(n[i], s); \
825 } \
826}
827
828#define DO_SUBR(X, Y) (Y - X)
829
830DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
831DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
832DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
833DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)
834
835DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
836DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
837DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
838DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)
839
840DO_ZZI(sve_smini_b, int8_t, DO_MIN)
841DO_ZZI(sve_smini_h, int16_t, DO_MIN)
842DO_ZZI(sve_smini_s, int32_t, DO_MIN)
843DO_ZZI(sve_smini_d, int64_t, DO_MIN)
844
845DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
846DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
847DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
848DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)
849
850DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
851DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
852DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
853DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
854
855#undef DO_ZZI
856
857#undef DO_AND
858#undef DO_ORR
859#undef DO_EOR
860#undef DO_BIC
861#undef DO_ADD
862#undef DO_SUB
863#undef DO_MAX
864#undef DO_MIN
865#undef DO_ABD
866#undef DO_MUL
867#undef DO_DIV
868#undef DO_ASR
869#undef DO_LSR
870#undef DO_LSL
 871#undef DO_SUBR
 872
873/* Similar to the ARM LastActiveElement pseudocode function, except the
874 result is multiplied by the element size. This includes the not found
875 indication; e.g. not found for esz=3 is -8. */
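/* For example, with esz == 2 and a single predicate word of 0x10, the last
   active element is element 1 and the return value is its byte offset, 4;
   with no bits set the return value is -4. */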
876static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
877{
878 uint64_t mask = pred_esz_masks[esz];
879 intptr_t i = words;
880
881 do {
882 uint64_t this_g = g[--i] & mask;
883 if (this_g) {
884 return i * 64 + (63 - clz64(this_g));
885 }
886 } while (i > 0);
887 return (intptr_t)-1 << esz;
888}
889
890uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t words)
891{
892 uint32_t flags = PREDTEST_INIT;
893 uint64_t *d = vd, *g = vg;
894 intptr_t i = 0;
895
896 do {
897 uint64_t this_d = d[i];
898 uint64_t this_g = g[i];
899
900 if (this_g) {
901 if (!(flags & 4)) {
902 /* Set in D the first bit of G. */
903 this_d |= this_g & -this_g;
904 d[i] = this_d;
905 }
906 flags = iter_predtest_fwd(this_d, this_g, flags);
907 }
908 } while (++i < words);
909
910 return flags;
911}
912
913uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
914{
915 intptr_t words = extract32(pred_desc, 0, SIMD_OPRSZ_BITS);
916 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
917 uint32_t flags = PREDTEST_INIT;
918 uint64_t *d = vd, *g = vg, esz_mask;
919 intptr_t i, next;
920
921 next = last_active_element(vd, words, esz) + (1 << esz);
922 esz_mask = pred_esz_masks[esz];
923
924 /* Similar to the pseudocode for pnext, but scaled by ESZ
925 so that we find the correct bit. */
926 if (next < words * 64) {
927 uint64_t mask = -1;
928
929 if (next & 63) {
930 mask = ~((1ull << (next & 63)) - 1);
931 next &= -64;
932 }
933 do {
934 uint64_t this_g = g[next / 64] & esz_mask & mask;
935 if (this_g != 0) {
936 next = (next & -64) + ctz64(this_g);
937 break;
938 }
939 next += 64;
940 mask = -1;
941 } while (next < words * 64);
942 }
943
944 i = 0;
945 do {
946 uint64_t this_d = 0;
947 if (i == next / 64) {
948 this_d = 1ull << (next & 63);
949 }
950 d[i] = this_d;
951 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
952 } while (++i < words);
953
954 return flags;
955}
956
957/* Store zero into every active element of Zd. We will use this for two
958 * and three-operand predicated instructions for which logic dictates a
959 * zero result. In particular, logical shift by element size, which is
960 * otherwise undefined on the host.
961 *
962 * For element sizes smaller than uint64_t, we use tables to expand
963 * the N bits of the controlling predicate to a byte mask, and clear
964 * those bytes.
965 */
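/* For example, a predicate byte of 0x05 governing byte elements expands to
 * the mask 0x0000000000ff00ff, so the AND-NOT below clears bytes 0 and 2 of
 * that 64-bit chunk.
 */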
966void HELPER(sve_clr_b)(void *vd, void *vg, uint32_t desc)
967{
968 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
969 uint64_t *d = vd;
970 uint8_t *pg = vg;
971 for (i = 0; i < opr_sz; i += 1) {
972 d[i] &= ~expand_pred_b(pg[H1(i)]);
973 }
974}
975
976void HELPER(sve_clr_h)(void *vd, void *vg, uint32_t desc)
977{
978 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
979 uint64_t *d = vd;
980 uint8_t *pg = vg;
981 for (i = 0; i < opr_sz; i += 1) {
982 d[i] &= ~expand_pred_h(pg[H1(i)]);
983 }
984}
985
986void HELPER(sve_clr_s)(void *vd, void *vg, uint32_t desc)
987{
988 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
989 uint64_t *d = vd;
990 uint8_t *pg = vg;
991 for (i = 0; i < opr_sz; i += 1) {
992 d[i] &= ~expand_pred_s(pg[H1(i)]);
993 }
994}
995
996void HELPER(sve_clr_d)(void *vd, void *vg, uint32_t desc)
997{
998 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
999 uint64_t *d = vd;
1000 uint8_t *pg = vg;
1001 for (i = 0; i < opr_sz; i += 1) {
1002 if (pg[H1(i)] & 1) {
1003 d[i] = 0;
1004 }
1005 }
1006}
1007
1008/* Copy Zn into Zd, and store zero into inactive elements. */
1009void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
1010{
1011 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1012 uint64_t *d = vd, *n = vn;
1013 uint8_t *pg = vg;
1014 for (i = 0; i < opr_sz; i += 1) {
1015 d[i] = n[i] & expand_pred_b(pg[H1(i)]);
1016 }
1017}
1018
1019void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
1020{
1021 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1022 uint64_t *d = vd, *n = vn;
1023 uint8_t *pg = vg;
1024 for (i = 0; i < opr_sz; i += 1) {
1025 d[i] = n[i] & expand_pred_h(pg[H1(i)]);
1026 }
1027}
1028
1029void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
1030{
1031 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1032 uint64_t *d = vd, *n = vn;
1033 uint8_t *pg = vg;
1034 for (i = 0; i < opr_sz; i += 1) {
1035 d[i] = n[i] & expand_pred_s(pg[H1(i)]);
1036 }
1037}
1038
1039void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
1040{
1041 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1042 uint64_t *d = vd, *n = vn;
1043 uint8_t *pg = vg;
1044 for (i = 0; i < opr_sz; i += 1) {
1045        d[i] = n[i] & -(uint64_t)(pg[H1(i)] & 1);
1046 }
1047}
1048
1049/* Three-operand expander, immediate operand, controlled by a predicate.
1050 */
1051#define DO_ZPZI(NAME, TYPE, H, OP) \
1052void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1053{ \
1054 intptr_t i, opr_sz = simd_oprsz(desc); \
1055 TYPE imm = simd_data(desc); \
1056 for (i = 0; i < opr_sz; ) { \
1057 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1058 do { \
1059 if (pg & 1) { \
1060 TYPE nn = *(TYPE *)(vn + H(i)); \
1061 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
1062 } \
1063 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1064 } while (i & 15); \
1065 } \
1066}
1067
1068/* Similarly, specialized for 64-bit operands. */
1069#define DO_ZPZI_D(NAME, TYPE, OP) \
1070void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1071{ \
1072 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1073 TYPE *d = vd, *n = vn; \
1074 TYPE imm = simd_data(desc); \
1075 uint8_t *pg = vg; \
1076 for (i = 0; i < opr_sz; i += 1) { \
1077 if (pg[H1(i)] & 1) { \
1078 TYPE nn = n[i]; \
1079 d[i] = OP(nn, imm); \
1080 } \
1081 } \
1082}
1083
1084#define DO_SHR(N, M) (N >> M)
1085#define DO_SHL(N, M) (N << M)
1086
1087/* Arithmetic shift right for division. This rounds negative numbers
1088 toward zero as per signed division. Therefore before shifting,
1089 when N is negative, add 2**M-1. */
1090#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
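/* E.g. DO_ASRD(-5, 1): -5 + 1 == -4 and -4 >> 1 == -2, matching -5 / 2
   rounded toward zero, where a plain arithmetic shift would give -3. */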
1091
1092DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
1093DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
1094DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
1095DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
1096
1097DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
1098DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
1099DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
1100DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
1101
1102DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
1103DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
1104DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
1105DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
1106
1107DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
1108DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
1109DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
1110DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
1111
1112#undef DO_SHR
1113#undef DO_SHL
1114#undef DO_ASRD
1115#undef DO_ZPZI
1116#undef DO_ZPZI_D
1117
1118/* Fully general four-operand expander, controlled by a predicate.
1119 */
1120#define DO_ZPZZZ(NAME, TYPE, H, OP) \
1121void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1122 void *vg, uint32_t desc) \
1123{ \
1124 intptr_t i, opr_sz = simd_oprsz(desc); \
1125 for (i = 0; i < opr_sz; ) { \
1126 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1127 do { \
1128 if (pg & 1) { \
1129 TYPE nn = *(TYPE *)(vn + H(i)); \
1130 TYPE mm = *(TYPE *)(vm + H(i)); \
1131 TYPE aa = *(TYPE *)(va + H(i)); \
1132 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
1133 } \
1134 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1135 } while (i & 15); \
1136 } \
1137}
1138
1139/* Similarly, specialized for 64-bit operands. */
1140#define DO_ZPZZZ_D(NAME, TYPE, OP) \
1141void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1142 void *vg, uint32_t desc) \
1143{ \
1144 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1145 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
1146 uint8_t *pg = vg; \
1147 for (i = 0; i < opr_sz; i += 1) { \
1148 if (pg[H1(i)] & 1) { \
1149 TYPE aa = a[i], nn = n[i], mm = m[i]; \
1150 d[i] = OP(aa, nn, mm); \
1151 } \
1152 } \
1153}
1154
1155#define DO_MLA(A, N, M) (A + N * M)
1156#define DO_MLS(A, N, M) (A - N * M)
1157
1158DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
1159DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
1160
1161DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
1162DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
1163
1164DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
1165DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
1166
1167DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
1168DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
1169
1170#undef DO_MLA
1171#undef DO_MLS
1172#undef DO_ZPZZZ
1173#undef DO_ZPZZZ_D
1174
1175void HELPER(sve_index_b)(void *vd, uint32_t start,
1176 uint32_t incr, uint32_t desc)
1177{
1178 intptr_t i, opr_sz = simd_oprsz(desc);
1179 uint8_t *d = vd;
1180 for (i = 0; i < opr_sz; i += 1) {
1181 d[H1(i)] = start + i * incr;
1182 }
1183}
1184
1185void HELPER(sve_index_h)(void *vd, uint32_t start,
1186 uint32_t incr, uint32_t desc)
1187{
1188 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1189 uint16_t *d = vd;
1190 for (i = 0; i < opr_sz; i += 1) {
1191 d[H2(i)] = start + i * incr;
1192 }
1193}
1194
1195void HELPER(sve_index_s)(void *vd, uint32_t start,
1196 uint32_t incr, uint32_t desc)
1197{
1198 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1199 uint32_t *d = vd;
1200 for (i = 0; i < opr_sz; i += 1) {
1201 d[H4(i)] = start + i * incr;
1202 }
1203}
1204
1205void HELPER(sve_index_d)(void *vd, uint64_t start,
1206 uint64_t incr, uint32_t desc)
1207{
1208 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1209 uint64_t *d = vd;
1210 for (i = 0; i < opr_sz; i += 1) {
1211 d[i] = start + i * incr;
1212 }
1213}
1214
1215void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
1216{
1217 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1218 uint32_t sh = simd_data(desc);
1219 uint32_t *d = vd, *n = vn, *m = vm;
1220 for (i = 0; i < opr_sz; i += 1) {
1221 d[i] = n[i] + (m[i] << sh);
1222 }
1223}
1224
1225void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
1226{
1227 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1228 uint64_t sh = simd_data(desc);
1229 uint64_t *d = vd, *n = vn, *m = vm;
1230 for (i = 0; i < opr_sz; i += 1) {
1231 d[i] = n[i] + (m[i] << sh);
1232 }
1233}
1234
1235void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
1236{
1237 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1238 uint64_t sh = simd_data(desc);
1239 uint64_t *d = vd, *n = vn, *m = vm;
1240 for (i = 0; i < opr_sz; i += 1) {
1241 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
1242 }
1243}
1244
1245void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
1246{
1247 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1248 uint64_t sh = simd_data(desc);
1249 uint64_t *d = vd, *n = vn, *m = vm;
1250 for (i = 0; i < opr_sz; i += 1) {
1251 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
1252 }
1253}
1254
1255void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
1256{
1257 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1258 static const uint16_t coeff[] = {
1259 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
1260 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
1261 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
1262 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
1263 };
1264 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1265 uint16_t *d = vd, *n = vn;
1266
1267 for (i = 0; i < opr_sz; i++) {
1268 uint16_t nn = n[i];
1269 intptr_t idx = extract32(nn, 0, 5);
1270 uint16_t exp = extract32(nn, 5, 5);
1271 d[i] = coeff[idx] | (exp << 10);
1272 }
1273}
1274
1275void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
1276{
1277 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1278 static const uint32_t coeff[] = {
1279 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
1280 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
1281 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
1282 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
1283 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
1284 0x1ef532, 0x20b051, 0x227043, 0x243516,
1285 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
1286 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
1287 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
1288 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
1289 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
1290 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
1291 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
1292 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
1293 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
1294 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
1295 };
1296 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1297 uint32_t *d = vd, *n = vn;
1298
1299 for (i = 0; i < opr_sz; i++) {
1300 uint32_t nn = n[i];
1301 intptr_t idx = extract32(nn, 0, 6);
1302 uint32_t exp = extract32(nn, 6, 8);
1303 d[i] = coeff[idx] | (exp << 23);
1304 }
1305}
1306
1307void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
1308{
1309 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1310 static const uint64_t coeff[] = {
1311 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
1312 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
1313 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
1314 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
1315 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
1316 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
1317 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
1318 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
1319 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
1320 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
1321 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
1322 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
1323 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
1324 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
1325 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
1326 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
1327 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
1328 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
1329 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
1330 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
1331 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
1332 0xFA7C1819E90D8ull,
1333 };
1334 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1335 uint64_t *d = vd, *n = vn;
1336
1337 for (i = 0; i < opr_sz; i++) {
1338 uint64_t nn = n[i];
1339 intptr_t idx = extract32(nn, 0, 6);
1340 uint64_t exp = extract32(nn, 6, 11);
1341 d[i] = coeff[idx] | (exp << 52);
1342 }
1343}
1344
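/* FTSSEL: bit 0 of the second source selects 1.0 in place of the first
 * source element; bit 1 negates the result by flipping the sign bit.
 */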
1345void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
1346{
1347 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1348 uint16_t *d = vd, *n = vn, *m = vm;
1349 for (i = 0; i < opr_sz; i += 1) {
1350 uint16_t nn = n[i];
1351 uint16_t mm = m[i];
1352 if (mm & 1) {
1353 nn = float16_one;
1354 }
1355 d[i] = nn ^ (mm & 2) << 14;
1356 }
1357}
1358
1359void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
1360{
1361 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1362 uint32_t *d = vd, *n = vn, *m = vm;
1363 for (i = 0; i < opr_sz; i += 1) {
1364 uint32_t nn = n[i];
1365 uint32_t mm = m[i];
1366 if (mm & 1) {
1367 nn = float32_one;
1368 }
1369 d[i] = nn ^ (mm & 2) << 30;
1370 }
1371}
1372
1373void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
1374{
1375 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1376 uint64_t *d = vd, *n = vn, *m = vm;
1377 for (i = 0; i < opr_sz; i += 1) {
1378 uint64_t nn = n[i];
1379 uint64_t mm = m[i];
1380 if (mm & 1) {
1381 nn = float64_one;
1382 }
1383 d[i] = nn ^ (mm & 2) << 62;
1384 }
1385}
1386
1387/*
1388 * Signed saturating addition with scalar operand.
1389 */
1390
1391void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1392{
1393 intptr_t i, oprsz = simd_oprsz(desc);
1394
1395 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
1396 int r = *(int8_t *)(a + i) + b;
1397 if (r > INT8_MAX) {
1398 r = INT8_MAX;
1399 } else if (r < INT8_MIN) {
1400 r = INT8_MIN;
1401 }
1402 *(int8_t *)(d + i) = r;
1403 }
1404}
1405
1406void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1407{
1408 intptr_t i, oprsz = simd_oprsz(desc);
1409
1410 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
1411 int r = *(int16_t *)(a + i) + b;
1412 if (r > INT16_MAX) {
1413 r = INT16_MAX;
1414 } else if (r < INT16_MIN) {
1415 r = INT16_MIN;
1416 }
1417 *(int16_t *)(d + i) = r;
1418 }
1419}
1420
1421void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1422{
1423 intptr_t i, oprsz = simd_oprsz(desc);
1424
1425 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
1426 int64_t r = *(int32_t *)(a + i) + b;
1427 if (r > INT32_MAX) {
1428 r = INT32_MAX;
1429 } else if (r < INT32_MIN) {
1430 r = INT32_MIN;
1431 }
1432 *(int32_t *)(d + i) = r;
1433 }
1434}
1435
1436void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
1437{
1438 intptr_t i, oprsz = simd_oprsz(desc);
1439
1440 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
1441 int64_t ai = *(int64_t *)(a + i);
1442 int64_t r = ai + b;
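        /*
         * Overflow occurred iff the operands had the same sign but the
         * result's sign differs: (r ^ ai) has the sign bit set when the
         * sign changed, and ~(ai ^ b) has it set when ai and b agreed.
         */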
1443 if (((r ^ ai) & ~(ai ^ b)) < 0) {
1444 /* Signed overflow. */
1445 r = (r < 0 ? INT64_MAX : INT64_MIN);
1446 }
1447 *(int64_t *)(d + i) = r;
1448 }
1449}
1450
1451/*
1452 * Unsigned saturating addition with scalar operand.
1453 */
1454
1455void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1456{
1457 intptr_t i, oprsz = simd_oprsz(desc);
1458
1459 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
1460 int r = *(uint8_t *)(a + i) + b;
1461 if (r > UINT8_MAX) {
1462 r = UINT8_MAX;
1463 } else if (r < 0) {
1464 r = 0;
1465 }
1466 *(uint8_t *)(d + i) = r;
1467 }
1468}
1469
1470void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1471{
1472 intptr_t i, oprsz = simd_oprsz(desc);
1473
1474 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
1475 int r = *(uint16_t *)(a + i) + b;
1476 if (r > UINT16_MAX) {
1477 r = UINT16_MAX;
1478 } else if (r < 0) {
1479 r = 0;
1480 }
1481 *(uint16_t *)(d + i) = r;
1482 }
1483}
1484
1485void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1486{
1487 intptr_t i, oprsz = simd_oprsz(desc);
1488
1489 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
1490 int64_t r = *(uint32_t *)(a + i) + b;
1491 if (r > UINT32_MAX) {
1492 r = UINT32_MAX;
1493 } else if (r < 0) {
1494 r = 0;
1495 }
1496 *(uint32_t *)(d + i) = r;
1497 }
1498}
1499
1500void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1501{
1502 intptr_t i, oprsz = simd_oprsz(desc);
1503
1504 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1505 uint64_t r = *(uint64_t *)(a + i) + b;
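        /* Unsigned overflow iff the wrapped sum is smaller than the addend. */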
1506 if (r < b) {
1507 r = UINT64_MAX;
1508 }
1509 *(uint64_t *)(d + i) = r;
1510 }
1511}
1512
1513void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1514{
1515 intptr_t i, oprsz = simd_oprsz(desc);
1516
1517 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1518 uint64_t ai = *(uint64_t *)(a + i);
1519 *(uint64_t *)(d + i) = (ai < b ? 0 : ai - b);
1520 }
1521}
1522
1523/* Two operand predicated copy immediate with merge. All valid immediates
1524 * can fit within 17 signed bits in the simd_data field.
1525 */
1526void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
1527 uint64_t mm, uint32_t desc)
1528{
1529 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1530 uint64_t *d = vd, *n = vn;
1531 uint8_t *pg = vg;
1532
1533 mm = dup_const(MO_8, mm);
1534 for (i = 0; i < opr_sz; i += 1) {
1535 uint64_t nn = n[i];
1536 uint64_t pp = expand_pred_b(pg[H1(i)]);
1537 d[i] = (mm & pp) | (nn & ~pp);
1538 }
1539}
1540
1541void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
1542 uint64_t mm, uint32_t desc)
1543{
1544 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1545 uint64_t *d = vd, *n = vn;
1546 uint8_t *pg = vg;
1547
1548 mm = dup_const(MO_16, mm);
1549 for (i = 0; i < opr_sz; i += 1) {
1550 uint64_t nn = n[i];
1551 uint64_t pp = expand_pred_h(pg[H1(i)]);
1552 d[i] = (mm & pp) | (nn & ~pp);
1553 }
1554}
1555
1556void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
1557 uint64_t mm, uint32_t desc)
1558{
1559 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1560 uint64_t *d = vd, *n = vn;
1561 uint8_t *pg = vg;
1562
1563 mm = dup_const(MO_32, mm);
1564 for (i = 0; i < opr_sz; i += 1) {
1565 uint64_t nn = n[i];
1566 uint64_t pp = expand_pred_s(pg[H1(i)]);
1567 d[i] = (mm & pp) | (nn & ~pp);
1568 }
1569}
1570
1571void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
1572 uint64_t mm, uint32_t desc)
1573{
1574 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1575 uint64_t *d = vd, *n = vn;
1576 uint8_t *pg = vg;
1577
1578 for (i = 0; i < opr_sz; i += 1) {
1579 uint64_t nn = n[i];
1580 d[i] = (pg[H1(i)] & 1 ? mm : nn);
1581 }
1582}
1583
1584void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
1585{
1586 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1587 uint64_t *d = vd;
1588 uint8_t *pg = vg;
1589
1590 val = dup_const(MO_8, val);
1591 for (i = 0; i < opr_sz; i += 1) {
1592 d[i] = val & expand_pred_b(pg[H1(i)]);
1593 }
1594}
1595
1596void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
1597{
1598 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1599 uint64_t *d = vd;
1600 uint8_t *pg = vg;
1601
1602 val = dup_const(MO_16, val);
1603 for (i = 0; i < opr_sz; i += 1) {
1604 d[i] = val & expand_pred_h(pg[H1(i)]);
1605 }
1606}
1607
1608void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
1609{
1610 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1611 uint64_t *d = vd;
1612 uint8_t *pg = vg;
1613
1614 val = dup_const(MO_32, val);
1615 for (i = 0; i < opr_sz; i += 1) {
1616 d[i] = val & expand_pred_s(pg[H1(i)]);
1617 }
1618}
1619
1620void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
1621{
1622 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1623 uint64_t *d = vd;
1624 uint8_t *pg = vg;
1625
1626 for (i = 0; i < opr_sz; i += 1) {
1627 d[i] = (pg[H1(i)] & 1 ? val : 0);
1628 }
1629}
1630
1631/* Big-endian hosts need to frob the byte indices. If the copy
1632 * happens to be 8-byte aligned, then no frobbing necessary.
1633 */
1634static void swap_memmove(void *vd, void *vs, size_t n)
1635{
1636 uintptr_t d = (uintptr_t)vd;
1637 uintptr_t s = (uintptr_t)vs;
1638 uintptr_t o = (d | s | n) & 7;
1639 size_t i;
1640
1641#ifndef HOST_WORDS_BIGENDIAN
1642 o = 0;
1643#endif
1644 switch (o) {
1645 case 0:
1646 memmove(vd, vs, n);
1647 break;
1648
1649 case 4:
1650 if (d < s || d >= s + n) {
1651 for (i = 0; i < n; i += 4) {
1652 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1653 }
1654 } else {
1655 for (i = n; i > 0; ) {
1656 i -= 4;
1657 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1658 }
1659 }
1660 break;
1661
1662 case 2:
1663 case 6:
1664 if (d < s || d >= s + n) {
1665 for (i = 0; i < n; i += 2) {
1666 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1667 }
1668 } else {
1669 for (i = n; i > 0; ) {
1670 i -= 2;
1671 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1672 }
1673 }
1674 break;
1675
1676 default:
1677 if (d < s || d >= s + n) {
1678 for (i = 0; i < n; i++) {
1679 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1680 }
1681 } else {
1682 for (i = n; i > 0; ) {
1683 i -= 1;
1684 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1685 }
1686 }
1687 break;
1688 }
1689}
1690
1691/* Similarly for memset of 0. */
1692static void swap_memzero(void *vd, size_t n)
1693{
1694 uintptr_t d = (uintptr_t)vd;
1695 uintptr_t o = (d | n) & 7;
1696 size_t i;
1697
1698 /* Usually, the first bit of a predicate is set, so N is 0. */
1699 if (likely(n == 0)) {
1700 return;
1701 }
1702
1703#ifndef HOST_WORDS_BIGENDIAN
1704 o = 0;
1705#endif
1706 switch (o) {
1707 case 0:
1708 memset(vd, 0, n);
1709 break;
1710
1711 case 4:
1712 for (i = 0; i < n; i += 4) {
1713 *(uint32_t *)H1_4(d + i) = 0;
1714 }
1715 break;
1716
1717 case 2:
1718 case 6:
1719 for (i = 0; i < n; i += 2) {
1720 *(uint16_t *)H1_2(d + i) = 0;
1721 }
1722 break;
1723
1724 default:
1725 for (i = 0; i < n; i++) {
1726 *(uint8_t *)H1(d + i) = 0;
1727 }
1728 break;
1729 }
1730}
1731
1732void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
1733{
1734 intptr_t opr_sz = simd_oprsz(desc);
1735 size_t n_ofs = simd_data(desc);
1736 size_t n_siz = opr_sz - n_ofs;
1737
1738 if (vd != vm) {
1739 swap_memmove(vd, vn + n_ofs, n_siz);
1740 swap_memmove(vd + n_siz, vm, n_ofs);
1741 } else if (vd != vn) {
1742 swap_memmove(vd + n_siz, vd, n_ofs);
1743 swap_memmove(vd, vn + n_ofs, n_siz);
1744 } else {
1745 /* vd == vn == vm. Need temp space. */
1746 ARMVectorReg tmp;
1747 swap_memmove(&tmp, vm, n_ofs);
1748 swap_memmove(vd, vd + n_ofs, n_siz);
1749 memcpy(vd + n_siz, &tmp, n_ofs);
1750 }
1751}
1752
1753#define DO_INSR(NAME, TYPE, H) \
1754void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
1755{ \
1756 intptr_t opr_sz = simd_oprsz(desc); \
1757 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
1758 *(TYPE *)(vd + H(0)) = val; \
1759}
1760
1761DO_INSR(sve_insr_b, uint8_t, H1)
1762DO_INSR(sve_insr_h, uint16_t, H1_2)
1763DO_INSR(sve_insr_s, uint32_t, H1_4)
1764DO_INSR(sve_insr_d, uint64_t, )
1765
1766#undef DO_INSR
1767
1768void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
1769{
1770 intptr_t i, j, opr_sz = simd_oprsz(desc);
1771 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1772 uint64_t f = *(uint64_t *)(vn + i);
1773 uint64_t b = *(uint64_t *)(vn + j);
1774 *(uint64_t *)(vd + i) = bswap64(b);
1775 *(uint64_t *)(vd + j) = bswap64(f);
1776 }
1777}
1778
1779void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
1780{
1781 intptr_t i, j, opr_sz = simd_oprsz(desc);
1782 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1783 uint64_t f = *(uint64_t *)(vn + i);
1784 uint64_t b = *(uint64_t *)(vn + j);
1785 *(uint64_t *)(vd + i) = hswap64(b);
1786 *(uint64_t *)(vd + j) = hswap64(f);
1787 }
1788}
1789
1790void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
1791{
1792 intptr_t i, j, opr_sz = simd_oprsz(desc);
1793 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1794 uint64_t f = *(uint64_t *)(vn + i);
1795 uint64_t b = *(uint64_t *)(vn + j);
1796 *(uint64_t *)(vd + i) = rol64(b, 32);
1797 *(uint64_t *)(vd + j) = rol64(f, 32);
1798 }
1799}
1800
1801void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
1802{
1803 intptr_t i, j, opr_sz = simd_oprsz(desc);
1804 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1805 uint64_t f = *(uint64_t *)(vn + i);
1806 uint64_t b = *(uint64_t *)(vn + j);
1807 *(uint64_t *)(vd + i) = b;
1808 *(uint64_t *)(vd + j) = f;
1809 }
1810}
1811
1812#define DO_TBL(NAME, TYPE, H) \
1813void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1814{ \
1815 intptr_t i, opr_sz = simd_oprsz(desc); \
1816 uintptr_t elem = opr_sz / sizeof(TYPE); \
1817 TYPE *d = vd, *n = vn, *m = vm; \
1818 ARMVectorReg tmp; \
1819 if (unlikely(vd == vn)) { \
1820 n = memcpy(&tmp, vn, opr_sz); \
1821 } \
1822 for (i = 0; i < elem; i++) { \
1823 TYPE j = m[H(i)]; \
1824 d[H(i)] = j < elem ? n[H(j)] : 0; \
1825 } \
1826}
1827
1828DO_TBL(sve_tbl_b, uint8_t, H1)
1829DO_TBL(sve_tbl_h, uint16_t, H2)
1830DO_TBL(sve_tbl_s, uint32_t, H4)
1831DO_TBL(sve_tbl_d, uint64_t, )
1832
1833#undef DO_TBL
1834
1835#define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
1836void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1837{ \
1838 intptr_t i, opr_sz = simd_oprsz(desc); \
1839 TYPED *d = vd; \
1840 TYPES *n = vn; \
1841 ARMVectorReg tmp; \
1842 if (unlikely(vn - vd < opr_sz)) { \
1843 n = memcpy(&tmp, n, opr_sz / 2); \
1844 } \
1845 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
1846 d[HD(i)] = n[HS(i)]; \
1847 } \
1848}
1849
1850DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
1851DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
1852DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)
1853
1854DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
1855DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
1856DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
1857
1858#undef DO_UNPK
1859
1860/* Mask of bits included in the even numbered predicates of width esz.
1861 * We also use this for expand_bits/compress_bits, and so extend the
1862 * same pattern out to 16-bit units.
1863 */
1864static const uint64_t even_bit_esz_masks[5] = {
1865 0x5555555555555555ull,
1866 0x3333333333333333ull,
1867 0x0f0f0f0f0f0f0f0full,
1868 0x00ff00ff00ff00ffull,
1869 0x0000ffff0000ffffull,
1870};
1871
1872/* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
1873 * For N==0, this corresponds to the operation that in qemu/bitops.h
1874 * we call half_shuffle64; this algorithm is from Hacker's Delight,
1875 * section 7-2 Shuffling Bits.
1876 */
1877static uint64_t expand_bits(uint64_t x, int n)
1878{
1879 int i;
1880
1881 x &= 0xffffffffu;
1882 for (i = 4; i >= n; i--) {
1883 int sh = 1 << i;
1884 x = ((x << sh) | x) & even_bit_esz_masks[i];
1885 }
1886 return x;
1887}
1888
1889/* Compress units of 2**(N+1) bits to units of 2**N bits.
1890 * For N==0, this corresponds to the operation that in qemu/bitops.h
1891 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
1892 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
1893 */
1894static uint64_t compress_bits(uint64_t x, int n)
1895{
1896 int i;
1897
1898 for (i = n; i <= 4; i++) {
1899 int sh = 1 << i;
1900 x &= even_bit_esz_masks[i];
1901 x = (x >> sh) | x;
1902 }
1903 return x & 0xffffffffu;
1904}
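
/* A small worked example of the two shuffles above (hypothetical, for
 * illustration only): with N=0, expand_bits() spreads successive bits to
 * the even bit positions, and compress_bits() undoes it.  The predicate
 * ZIP/UZP helpers below build on exactly this interleaving.
 */
static inline void example_expand_compress_bits(void)
{
    /* 0b1011: bits 0, 1 and 3 move to even positions 0, 2 and 6. */
    assert(expand_bits(0x0b, 0) == 0x45);       /* 0b01000101 */
    assert(compress_bits(0x45, 0) == 0x0b);

    /* With N=1, pairs of bits are spread out to 4-bit units instead. */
    assert(expand_bits(0xff, 1) == 0x3333);
}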
1905
1906void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1907{
1908 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1909 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1910 intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1911 uint64_t *d = vd;
1912 intptr_t i;
1913
1914 if (oprsz <= 8) {
1915 uint64_t nn = *(uint64_t *)vn;
1916 uint64_t mm = *(uint64_t *)vm;
1917 int half = 4 * oprsz;
1918
1919 nn = extract64(nn, high * half, half);
1920 mm = extract64(mm, high * half, half);
1921 nn = expand_bits(nn, esz);
1922 mm = expand_bits(mm, esz);
1923 d[0] = nn + (mm << (1 << esz));
1924 } else {
1925 ARMPredicateReg tmp_n, tmp_m;
1926
1927 /* We produce output faster than we consume input.
1928 Therefore we must be mindful of possible overlap. */
1929 if ((vn - vd) < (uintptr_t)oprsz) {
1930 vn = memcpy(&tmp_n, vn, oprsz);
1931 }
1932 if ((vm - vd) < (uintptr_t)oprsz) {
1933 vm = memcpy(&tmp_m, vm, oprsz);
1934 }
1935 if (high) {
1936 high = oprsz >> 1;
1937 }
1938
1939 if ((high & 3) == 0) {
1940 uint32_t *n = vn, *m = vm;
1941 high >>= 2;
1942
1943 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
1944 uint64_t nn = n[H4(high + i)];
1945 uint64_t mm = m[H4(high + i)];
1946
1947 nn = expand_bits(nn, esz);
1948 mm = expand_bits(mm, esz);
1949 d[i] = nn + (mm << (1 << esz));
1950 }
1951 } else {
1952 uint8_t *n = vn, *m = vm;
1953 uint16_t *d16 = vd;
1954
1955 for (i = 0; i < oprsz / 2; i++) {
1956 uint16_t nn = n[H1(high + i)];
1957 uint16_t mm = m[H1(high + i)];
1958
1959 nn = expand_bits(nn, esz);
1960 mm = expand_bits(mm, esz);
1961 d16[H2(i)] = nn + (mm << (1 << esz));
1962 }
1963 }
1964 }
1965}
1966
1967void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1968{
1969 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1970 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1971 int odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1) << esz;
1972 uint64_t *d = vd, *n = vn, *m = vm;
1973 uint64_t l, h;
1974 intptr_t i;
1975
1976 if (oprsz <= 8) {
1977 l = compress_bits(n[0] >> odd, esz);
1978 h = compress_bits(m[0] >> odd, esz);
1979 d[0] = extract64(l + (h << (4 * oprsz)), 0, 8 * oprsz);
1980 } else {
1981 ARMPredicateReg tmp_m;
1982 intptr_t oprsz_16 = oprsz / 16;
1983
1984 if ((vm - vd) < (uintptr_t)oprsz) {
1985 m = memcpy(&tmp_m, vm, oprsz);
1986 }
1987
1988 for (i = 0; i < oprsz_16; i++) {
1989 l = n[2 * i + 0];
1990 h = n[2 * i + 1];
1991 l = compress_bits(l >> odd, esz);
1992 h = compress_bits(h >> odd, esz);
1993 d[i] = l + (h << 32);
1994 }
1995
1996 /* When OPRSZ is not a multiple of 16 bytes, the results from M do not
1997 align nicely with the uint64_t for D. Put the aligned results
1998 from M into TMP_M and then copy it into place afterward. */
1999 if (oprsz & 15) {
2000 d[i] = compress_bits(n[2 * i] >> odd, esz);
2001
2002 for (i = 0; i < oprsz_16; i++) {
2003 l = m[2 * i + 0];
2004 h = m[2 * i + 1];
2005 l = compress_bits(l >> odd, esz);
2006 h = compress_bits(h >> odd, esz);
2007 tmp_m.p[i] = l + (h << 32);
2008 }
2009 tmp_m.p[i] = compress_bits(m[2 * i] >> odd, esz);
2010
2011 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
2012 } else {
2013 for (i = 0; i < oprsz_16; i++) {
2014 l = m[2 * i + 0];
2015 h = m[2 * i + 1];
2016 l = compress_bits(l >> odd, esz);
2017 h = compress_bits(h >> odd, esz);
2018 d[oprsz_16 + i] = l + (h << 32);
2019 }
2020 }
2021 }
2022}
2023
2024void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
2025{
2026 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2027 uintptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2028 bool odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
2029 uint64_t *d = vd, *n = vn, *m = vm;
2030 uint64_t mask;
2031 int shr, shl;
2032 intptr_t i;
2033
2034 shl = 1 << esz;
2035 shr = 0;
2036 mask = even_bit_esz_masks[esz];
2037 if (odd) {
2038 mask <<= shl;
2039 shr = shl;
2040 shl = 0;
2041 }
2042
2043 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2044 uint64_t nn = (n[i] & mask) >> shr;
2045 uint64_t mm = (m[i] & mask) << shl;
2046 d[i] = nn + mm;
2047 }
2048}
2049
2050/* Reverse units of 2**N bits. */
2051static uint64_t reverse_bits_64(uint64_t x, int n)
2052{
2053 int i, sh;
2054
2055 x = bswap64(x);
2056 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2057 uint64_t mask = even_bit_esz_masks[i];
2058 x = ((x & mask) << sh) | ((x >> sh) & mask);
2059 }
2060 return x;
2061}
2062
2063static uint8_t reverse_bits_8(uint8_t x, int n)
2064{
2065 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
2066 int i, sh;
2067
2068 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
2069 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
2070 }
2071 return x;
2072}
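
/* Worked example for the bit-reversal helpers above (hypothetical, for
 * illustration only): N selects the size of the units whose order is
 * reversed, from single bits (N=0) up to whole bytes (N=3).
 */
static inline void example_reverse_bits(void)
{
    /* N=0 reverses individual bits: 0b11000000 -> 0b00000011. */
    assert(reverse_bits_8(0xc0, 0) == 0x03);

    /* N=2 reverses 4-bit units, i.e. swaps the two nibbles. */
    assert(reverse_bits_8(0x5a, 2) == 0xa5);

    /* N=3 reverses 8-bit units, i.e. a plain byte swap. */
    assert(reverse_bits_64(0x0102030405060708ull, 3)
           == 0x0807060504030201ull);
}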
2073
2074void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
2075{
2076 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2077 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2078 intptr_t i, oprsz_2 = oprsz / 2;
2079
2080 if (oprsz <= 8) {
2081 uint64_t l = *(uint64_t *)vn;
2082 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
2083 *(uint64_t *)vd = l;
2084 } else if ((oprsz & 15) == 0) {
2085 for (i = 0; i < oprsz_2; i += 8) {
2086 intptr_t ih = oprsz - 8 - i;
2087 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
2088 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
2089 *(uint64_t *)(vd + i) = h;
2090 *(uint64_t *)(vd + ih) = l;
2091 }
2092 } else {
2093 for (i = 0; i < oprsz_2; i += 1) {
2094 intptr_t il = H1(i);
2095 intptr_t ih = H1(oprsz - 1 - i);
2096 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
2097 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
2098 *(uint8_t *)(vd + il) = h;
2099 *(uint8_t *)(vd + ih) = l;
2100 }
2101 }
2102}
2103
2104void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
2105{
2106 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2107 intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
2108 uint64_t *d = vd;
2109 intptr_t i;
2110
2111 if (oprsz <= 8) {
2112 uint64_t nn = *(uint64_t *)vn;
2113 int half = 4 * oprsz;
2114
2115 nn = extract64(nn, high * half, half);
2116 nn = expand_bits(nn, 0);
2117 d[0] = nn;
2118 } else {
2119 ARMPredicateReg tmp_n;
2120
2121 /* We produce output faster than we consume input.
2122 Therefore we must be mindful of possible overlap. */
2123 if ((vn - vd) < (uintptr_t)oprsz) {
2124 vn = memcpy(&tmp_n, vn, oprsz);
2125 }
2126 if (high) {
2127 high = oprsz >> 1;
2128 }
2129
2130 if ((high & 3) == 0) {
2131 uint32_t *n = vn;
2132 high >>= 2;
2133
2134 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
2135 uint64_t nn = n[H4(high + i)];
2136 d[i] = expand_bits(nn, 0);
2137 }
2138 } else {
2139 uint16_t *d16 = vd;
2140 uint8_t *n = vn;
2141
2142 for (i = 0; i < oprsz / 2; i++) {
2143 uint16_t nn = n[H1(high + i)];
2144 d16[H2(i)] = expand_bits(nn, 0);
2145 }
2146 }
2147 }
2148}
2149
2150#define DO_ZIP(NAME, TYPE, H) \
2151void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2152{ \
2153 intptr_t oprsz = simd_oprsz(desc); \
2154 intptr_t i, oprsz_2 = oprsz / 2; \
2155 ARMVectorReg tmp_n, tmp_m; \
2156 /* We produce output faster than we consume input. \
2157 Therefore we must be mindful of possible overlap. */ \
2158 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
2159 vn = memcpy(&tmp_n, vn, oprsz_2); \
2160 } \
2161 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2162 vm = memcpy(&tmp_m, vm, oprsz_2); \
2163 } \
2164 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2165 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
2166 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
2167 } \
2168}
2169
2170DO_ZIP(sve_zip_b, uint8_t, H1)
2171DO_ZIP(sve_zip_h, uint16_t, H1_2)
2172DO_ZIP(sve_zip_s, uint32_t, H1_4)
2173DO_ZIP(sve_zip_d, uint64_t, )
2174
2175#define DO_UZP(NAME, TYPE, H) \
2176void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2177{ \
2178 intptr_t oprsz = simd_oprsz(desc); \
2179 intptr_t oprsz_2 = oprsz / 2; \
2180 intptr_t odd_ofs = simd_data(desc); \
2181 intptr_t i; \
2182 ARMVectorReg tmp_m; \
2183 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2184 vm = memcpy(&tmp_m, vm, oprsz); \
2185 } \
2186 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2187 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs)); \
2188 } \
2189 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2190 *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
2191 } \
2192}
2193
2194DO_UZP(sve_uzp_b, uint8_t, H1)
2195DO_UZP(sve_uzp_h, uint16_t, H1_2)
2196DO_UZP(sve_uzp_s, uint32_t, H1_4)
2197DO_UZP(sve_uzp_d, uint64_t, )
2198
2199#define DO_TRN(NAME, TYPE, H) \
2200void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2201{ \
2202 intptr_t oprsz = simd_oprsz(desc); \
2203 intptr_t odd_ofs = simd_data(desc); \
2204 intptr_t i; \
2205 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
2206 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
2207 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
2208 *(TYPE *)(vd + H(i + 0)) = ae; \
2209 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
2210 } \
2211}
2212
2213DO_TRN(sve_trn_b, uint8_t, H1)
2214DO_TRN(sve_trn_h, uint16_t, H1_2)
2215DO_TRN(sve_trn_s, uint32_t, H1_4)
2216DO_TRN(sve_trn_d, uint64_t, )
2217
2218#undef DO_ZIP
2219#undef DO_UZP
2220#undef DO_TRN
2221
2222void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
2223{
2224 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
2225 uint32_t *d = vd, *n = vn;
2226 uint8_t *pg = vg;
2227
2228 for (i = j = 0; i < opr_sz; i++) {
2229 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
2230 d[H4(j)] = n[H4(i)];
2231 j++;
2232 }
2233 }
2234 for (; j < opr_sz; j++) {
2235 d[H4(j)] = 0;
2236 }
2237}
2238
2239void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
2240{
2241 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
2242 uint64_t *d = vd, *n = vn;
2243 uint8_t *pg = vg;
2244
2245 for (i = j = 0; i < opr_sz; i++) {
2246 if (pg[H1(i)] & 1) {
2247 d[j] = n[i];
2248 j++;
2249 }
2250 }
2251 for (; j < opr_sz; j++) {
2252 d[j] = 0;
2253 }
2254}
2255
2256/* Similar to the ARM LastActiveElement pseudocode function, except the
2257 * result is multiplied by the element size. This includes the not found
2258 * indication; e.g. not found for esz=3 is -8.
2259 */
2260int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
2261{
2262 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2263 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2264
2265 return last_active_element(vg, DIV_ROUND_UP(oprsz, 8), esz);
2266}
2267
2268void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
2269{
2270 intptr_t opr_sz = simd_oprsz(desc) / 8;
2271 int esz = simd_data(desc);
2272 uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
2273 intptr_t i, first_i, last_i;
2274 ARMVectorReg tmp;
2275
2276 first_i = last_i = 0;
2277 first_g = last_g = 0;
2278
2279 /* Find the extent of the active elements within VG. */
2280 for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
2281 pg = *(uint64_t *)(vg + i) & mask;
2282 if (pg) {
2283 if (last_g == 0) {
2284 last_g = pg;
2285 last_i = i;
2286 }
2287 first_g = pg;
2288 first_i = i;
2289 }
2290 }
2291
2292 len = 0;
2293 if (first_g != 0) {
2294 first_i = first_i * 8 + ctz64(first_g);
2295 last_i = last_i * 8 + 63 - clz64(last_g);
2296 len = last_i - first_i + (1 << esz);
2297 if (vd == vm) {
2298 vm = memcpy(&tmp, vm, opr_sz * 8);
2299 }
2300 swap_memmove(vd, vn + first_i, len);
2301 }
2302 swap_memmove(vd + len, vm, opr_sz * 8 - len);
2303}
2304
2305void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
2306 void *vg, uint32_t desc)
2307{
2308 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2309 uint64_t *d = vd, *n = vn, *m = vm;
2310 uint8_t *pg = vg;
2311
2312 for (i = 0; i < opr_sz; i += 1) {
2313 uint64_t nn = n[i], mm = m[i];
2314 uint64_t pp = expand_pred_b(pg[H1(i)]);
2315 d[i] = (nn & pp) | (mm & ~pp);
2316 }
2317}
2318
2319void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
2320 void *vg, uint32_t desc)
2321{
2322 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2323 uint64_t *d = vd, *n = vn, *m = vm;
2324 uint8_t *pg = vg;
2325
2326 for (i = 0; i < opr_sz; i += 1) {
2327 uint64_t nn = n[i], mm = m[i];
2328 uint64_t pp = expand_pred_h(pg[H1(i)]);
2329 d[i] = (nn & pp) | (mm & ~pp);
2330 }
2331}
2332
2333void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
2334 void *vg, uint32_t desc)
2335{
2336 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2337 uint64_t *d = vd, *n = vn, *m = vm;
2338 uint8_t *pg = vg;
2339
2340 for (i = 0; i < opr_sz; i += 1) {
2341 uint64_t nn = n[i], mm = m[i];
2342 uint64_t pp = expand_pred_s(pg[H1(i)]);
2343 d[i] = (nn & pp) | (mm & ~pp);
2344 }
2345}
2346
2347void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
2348 void *vg, uint32_t desc)
2349{
2350 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2351 uint64_t *d = vd, *n = vn, *m = vm;
2352 uint8_t *pg = vg;
2353
2354 for (i = 0; i < opr_sz; i += 1) {
2355 uint64_t nn = n[i], mm = m[i];
2356 d[i] = (pg[H1(i)] & 1 ? nn : mm);
2357 }
2358}
2359
2360/* Two operand comparison controlled by a predicate.
2361 * ??? It is very tempting to want to be able to expand this inline
2362 * with x86 instructions, e.g.
2363 *
2364 * vcmpeqw zm, zn, %ymm0
2365 * vpmovmskb %ymm0, %eax
2366 * and $0x5555, %eax
2367 * and pg, %eax
2368 *
2369 * or even aarch64, e.g.
2370 *
2371 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
2372 * cmeq v0.8h, zn, zm
2373 * and v0.8h, v0.8h, mask
2374 * addv h0, v0.8h
2375 * and v0.8b, pg
2376 *
2377 * However, coming up with an abstraction that allows vector inputs and
2378 * a scalar output, and also handles the byte-ordering of sub-uint64_t
2379 * scalar outputs, is tricky.
2380 */
2381#define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
2382uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2383{ \
2384 intptr_t opr_sz = simd_oprsz(desc); \
2385 uint32_t flags = PREDTEST_INIT; \
2386 intptr_t i = opr_sz; \
2387 do { \
2388 uint64_t out = 0, pg; \
2389 do { \
2390 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2391 TYPE nn = *(TYPE *)(vn + H(i)); \
2392 TYPE mm = *(TYPE *)(vm + H(i)); \
2393 out |= nn OP mm; \
2394 } while (i & 63); \
2395 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2396 out &= pg; \
2397 *(uint64_t *)(vd + (i >> 3)) = out; \
2398 flags = iter_predtest_bwd(out, pg, flags); \
2399 } while (i > 0); \
2400 return flags; \
2401}
2402
2403#define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
2404 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2405#define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
2406 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2407#define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
2408 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2409#define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
2410 DO_CMP_PPZZ(NAME, TYPE, OP, , 0x0101010101010101ull)
2411
2412DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t, ==)
2413DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
2414DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
2415DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)
2416
2417DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t, !=)
2418DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
2419DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
2420DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)
2421
2422DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t, >)
2423DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
2424DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
2425DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)
2426
2427DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t, >=)
2428DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
2429DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
2430DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)
2431
2432DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t, >)
2433DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
2434DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
2435DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)
2436
2437DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t, >=)
2438DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
2439DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
2440DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)
2441
2442#undef DO_CMP_PPZZ_B
2443#undef DO_CMP_PPZZ_H
2444#undef DO_CMP_PPZZ_S
2445#undef DO_CMP_PPZZ_D
2446#undef DO_CMP_PPZZ
2447
2448/* Similar, but the second source is "wide". */
2449#define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
2450uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
2451{ \
2452 intptr_t opr_sz = simd_oprsz(desc); \
2453 uint32_t flags = PREDTEST_INIT; \
2454 intptr_t i = opr_sz; \
2455 do { \
2456 uint64_t out = 0, pg; \
2457 do { \
2458 TYPEW mm = *(TYPEW *)(vm + i - 8); \
2459 do { \
2460 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2461 TYPE nn = *(TYPE *)(vn + H(i)); \
2462 out |= nn OP mm; \
2463 } while (i & 7); \
2464 } while (i & 63); \
2465 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2466 out &= pg; \
2467 *(uint64_t *)(vd + (i >> 3)) = out; \
2468 flags = iter_predtest_bwd(out, pg, flags); \
2469 } while (i > 0); \
2470 return flags; \
2471}
2472
2473#define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
2474 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
2475#define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
2476 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
2477#define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
2478 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
2479
2480DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t, uint64_t, ==)
2481DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
2482DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)
2483
2484DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t, uint64_t, !=)
2485DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
2486DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)
2487
2488DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t, int64_t, >)
2489DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
2490DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)
2491
2492DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t, int64_t, >=)
2493DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
2494DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)
2495
2496DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t, uint64_t, >)
2497DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
2498DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)
2499
2500DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t, uint64_t, >=)
2501DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
2502DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)
2503
2504DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t, int64_t, <)
2505DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
2506DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)
2507
2508DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t, int64_t, <=)
2509DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
2510DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)
2511
2512DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t, uint64_t, <)
2513DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
2514DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)
2515
2516DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t, uint64_t, <=)
2517DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
2518DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)
2519
2520#undef DO_CMP_PPZW_B
2521#undef DO_CMP_PPZW_H
2522#undef DO_CMP_PPZW_S
2523#undef DO_CMP_PPZW
2524
2525/* Similar, but the second source is immediate. */
2526#define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
2527uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
2528{ \
2529 intptr_t opr_sz = simd_oprsz(desc); \
2530 uint32_t flags = PREDTEST_INIT; \
2531 TYPE mm = simd_data(desc); \
2532 intptr_t i = opr_sz; \
2533 do { \
2534 uint64_t out = 0, pg; \
2535 do { \
2536 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
2537 TYPE nn = *(TYPE *)(vn + H(i)); \
2538 out |= nn OP mm; \
2539 } while (i & 63); \
2540 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
2541 out &= pg; \
2542 *(uint64_t *)(vd + (i >> 3)) = out; \
2543 flags = iter_predtest_bwd(out, pg, flags); \
2544 } while (i > 0); \
2545 return flags; \
2546}
2547
2548#define DO_CMP_PPZI_B(NAME, TYPE, OP) \
2549 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
2550#define DO_CMP_PPZI_H(NAME, TYPE, OP) \
2551 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
2552#define DO_CMP_PPZI_S(NAME, TYPE, OP) \
2553 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
2554#define DO_CMP_PPZI_D(NAME, TYPE, OP) \
2555 DO_CMP_PPZI(NAME, TYPE, OP, , 0x0101010101010101ull)
2556
2557DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t, ==)
2558DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
2559DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
2560DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)
2561
2562DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t, !=)
2563DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
2564DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
2565DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)
2566
2567DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t, >)
2568DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
2569DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
2570DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)
2571
2572DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t, >=)
2573DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
2574DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
2575DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)
2576
2577DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t, >)
2578DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
2579DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
2580DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)
2581
2582DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t, >=)
2583DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
2584DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
2585DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)
2586
2587DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t, <)
2588DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
2589DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
2590DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)
2591
2592DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t, <=)
2593DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
2594DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
2595DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)
2596
2597DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t, <)
2598DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
2599DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
2600DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)
2601
2602DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t, <=)
2603DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
2604DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
2605DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)
2606
2607#undef DO_CMP_PPZI_B
2608#undef DO_CMP_PPZI_H
2609#undef DO_CMP_PPZI_S
2610#undef DO_CMP_PPZI_D
2611#undef DO_CMP_PPZI
2612
2613/* Similar to the ARM LastActive pseudocode function. */
2614static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
2615{
2616 intptr_t i;
2617
2618 for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
2619 uint64_t pg = *(uint64_t *)(vg + i);
2620 if (pg) {
2621 return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
2622 }
2623 }
2624 return 0;
2625}
2626
2627/* Compute a mask into RETB that is true for all G, up to and including
2628 * (if after) or excluding (if !after) the first G & N.
2629 * Return true if BRK found.
2630 */
2631static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
2632 bool brk, bool after)
2633{
2634 uint64_t b;
2635
2636 if (brk) {
2637 b = 0;
2638 } else if ((g & n) == 0) {
2639 /* For all G, no N are set; break not found. */
2640 b = g;
2641 } else {
2642 /* Break somewhere in N. Locate it. */
2643 b = g & n; /* guard true, pred true */
2644 b = b & -b; /* first such */
2645 if (after) {
2646 b = b | (b - 1); /* break after same */
2647 } else {
2648 b = b - 1; /* break before same */
2649 }
2650 brk = true;
2651 }
2652
2653 *retb = b;
2654 return brk;
2655}
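
/* Worked example of compute_brk() (hypothetical, for illustration only):
 * with guard bits 0-7 active and the first active N bit at bit 4, the
 * break-after form keeps bits up to and including bit 4, while the
 * break-before form stops just short of it.
 */
static inline void example_compute_brk(void)
{
    uint64_t b;
    bool brk;

    brk = compute_brk(&b, 0x10, 0xff, false, true);   /* break after */
    assert(brk && b == 0x1f);                          /* bits 0-4 */

    brk = compute_brk(&b, 0x10, 0xff, false, false);  /* break before */
    assert(brk && b == 0x0f);                          /* bits 0-3 */

    /* No N bit within G: no break found, the mask is the guard itself. */
    brk = compute_brk(&b, 0x00, 0xff, false, true);
    assert(!brk && b == 0xff);
}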
2656
2657/* Compute a zeroing BRK. */
2658static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
2659 intptr_t oprsz, bool after)
2660{
2661 bool brk = false;
2662 intptr_t i;
2663
2664 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2665 uint64_t this_b, this_g = g[i];
2666
2667 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2668 d[i] = this_b & this_g;
2669 }
2670}
2671
2672/* Likewise, but also compute flags. */
2673static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
2674 intptr_t oprsz, bool after)
2675{
2676 uint32_t flags = PREDTEST_INIT;
2677 bool brk = false;
2678 intptr_t i;
2679
2680 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2681 uint64_t this_b, this_d, this_g = g[i];
2682
2683 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2684 d[i] = this_d = this_b & this_g;
2685 flags = iter_predtest_fwd(this_d, this_g, flags);
2686 }
2687 return flags;
2688}
2689
2690/* Compute a merging BRK. */
2691static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
2692 intptr_t oprsz, bool after)
2693{
2694 bool brk = false;
2695 intptr_t i;
2696
2697 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2698 uint64_t this_b, this_g = g[i];
2699
2700 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2701 d[i] = (this_b & this_g) | (d[i] & ~this_g);
2702 }
2703}
2704
2705/* Likewise, but also compute flags. */
2706static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
2707 intptr_t oprsz, bool after)
2708{
2709 uint32_t flags = PREDTEST_INIT;
2710 bool brk = false;
2711 intptr_t i;
2712
2713 for (i = 0; i < oprsz / 8; ++i) {
2714 uint64_t this_b, this_d = d[i], this_g = g[i];
2715
2716 brk = compute_brk(&this_b, n[i], this_g, brk, after);
2717 d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
2718 flags = iter_predtest_fwd(this_d, this_g, flags);
2719 }
2720 return flags;
2721}
2722
2723static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
2724{
2725 /* It is quicker to zero the whole predicate than loop on OPRSZ.
2726 * The compiler should turn this into 4 64-bit integer stores.
2727 */
2728 memset(d, 0, sizeof(ARMPredicateReg));
2729 return PREDTEST_INIT;
2730}
2731
2732void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
2733 uint32_t pred_desc)
2734{
2735 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2736 if (last_active_pred(vn, vg, oprsz)) {
2737 compute_brk_z(vd, vm, vg, oprsz, true);
2738 } else {
2739 do_zero(vd, oprsz);
2740 }
2741}
2742
2743uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
2744 uint32_t pred_desc)
2745{
2746 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2747 if (last_active_pred(vn, vg, oprsz)) {
2748 return compute_brks_z(vd, vm, vg, oprsz, true);
2749 } else {
2750 return do_zero(vd, oprsz);
2751 }
2752}
2753
2754void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
2755 uint32_t pred_desc)
2756{
2757 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2758 if (last_active_pred(vn, vg, oprsz)) {
2759 compute_brk_z(vd, vm, vg, oprsz, false);
2760 } else {
2761 do_zero(vd, oprsz);
2762 }
2763}
2764
2765uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
2766 uint32_t pred_desc)
2767{
2768 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2769 if (last_active_pred(vn, vg, oprsz)) {
2770 return compute_brks_z(vd, vm, vg, oprsz, false);
2771 } else {
2772 return do_zero(vd, oprsz);
2773 }
2774}
2775
2776void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2777{
2778 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2779 compute_brk_z(vd, vn, vg, oprsz, true);
2780}
2781
2782uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2783{
2784 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2785 return compute_brks_z(vd, vn, vg, oprsz, true);
2786}
2787
2788void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2789{
2790 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2791 compute_brk_z(vd, vn, vg, oprsz, false);
2792}
2793
2794uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2795{
2796 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2797 return compute_brks_z(vd, vn, vg, oprsz, false);
2798}
2799
2800void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2801{
2802 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2803 compute_brk_m(vd, vn, vg, oprsz, true);
2804}
2805
2806uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2807{
2808 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2809 return compute_brks_m(vd, vn, vg, oprsz, true);
2810}
2811
2812void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2813{
2814 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2815 compute_brk_m(vd, vn, vg, oprsz, false);
2816}
2817
2818uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2819{
2820 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2821 return compute_brks_m(vd, vn, vg, oprsz, false);
2822}
2823
2824void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2825{
2826 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2827
2828 if (!last_active_pred(vn, vg, oprsz)) {
2829 do_zero(vd, oprsz);
2830 }
2831}
2832
2833/* As if PredTest(Ones(PL), D, esz). */
2834static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
2835 uint64_t esz_mask)
2836{
2837 uint32_t flags = PREDTEST_INIT;
2838 intptr_t i;
2839
2840 for (i = 0; i < oprsz / 8; i++) {
2841 flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
2842 }
2843 if (oprsz & 7) {
2844 uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
2845 flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
2846 }
2847 return flags;
2848}
2849
2850uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
2851{
2852 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2853
2854 if (last_active_pred(vn, vg, oprsz)) {
2855 return predtest_ones(vd, oprsz, -1);
2856 } else {
2857 return do_zero(vd, oprsz);
2858 }
2859}
2860
2861uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
2862{
2863 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2864 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2865 uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
2866 intptr_t i;
2867
2868 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
2869 uint64_t t = n[i] & g[i] & mask;
2870 sum += ctpop64(t);
2871 }
2872 return sum;
2873}
2874
2875uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc)
2876{
2877 uintptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2878 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2879 uint64_t esz_mask = pred_esz_masks[esz];
2880 ARMPredicateReg *d = vd;
2881 uint32_t flags;
2882 intptr_t i;
2883
2884 /* Begin with a zero predicate register. */
2885 flags = do_zero(d, oprsz);
2886 if (count == 0) {
2887 return flags;
2888 }
2889
2890 /* Set all of the requested bits. */
2891 for (i = 0; i < count / 64; ++i) {
2892 d->p[i] = esz_mask;
2893 }
2894 if (count & 63) {
2895 d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
2896 }
2897
2898 return predtest_ones(d, oprsz, esz_mask);
2899}
2900
2901/* Recursive reduction with a binary function;
2902 * cf. the ARM ARM function ReducePredicated.
2903 *
2904 * While it would be possible to write this without the DATA temporary,
2905 * it is much simpler to process the predicate register this way.
2906 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
2907 * little to gain with a more complex non-recursive form.
2908 */
2909#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT) \
2910static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
2911{ \
2912 if (n == 1) { \
2913 return *data; \
2914 } else { \
2915 uintptr_t half = n / 2; \
2916 TYPE lo = NAME##_reduce(data, status, half); \
2917 TYPE hi = NAME##_reduce(data + half, status, half); \
2918 return TYPE##_##FUNC(lo, hi, status); \
2919 } \
2920} \
2921uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc) \
2922{ \
2923 uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_maxsz(desc); \
2924 TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)]; \
2925 for (i = 0; i < oprsz; ) { \
2926 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2927 do { \
2928 TYPE nn = *(TYPE *)(vn + H(i)); \
2929 *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT); \
2930 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2931 } while (i & 15); \
2932 } \
2933 for (; i < maxsz; i += sizeof(TYPE)) { \
2934 *(TYPE *)((void *)data + i) = IDENT; \
2935 } \
2936 return NAME##_reduce(data, vs, maxsz / sizeof(TYPE)); \
2937}
2938
2939DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
2940DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
2941DO_REDUCE(sve_faddv_d, float64, , add, float64_zero)
2942
2943/* Identity is floatN_default_nan, without the function call. */
2944DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
2945DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
2946DO_REDUCE(sve_fminnmv_d, float64, , minnum, 0x7FF8000000000000ULL)
2947
2948DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
2949DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
2950DO_REDUCE(sve_fmaxnmv_d, float64, , maxnum, 0x7FF8000000000000ULL)
2951
2952DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
2953DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
2954DO_REDUCE(sve_fminv_d, float64, , min, float64_infinity)
2955
2956DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
2957DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
2958DO_REDUCE(sve_fmaxv_d, float64, , max, float64_chs(float64_infinity))
2959
2960#undef DO_REDUCE
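
/* The shape of the recursion above, shown on plain integers (hypothetical,
 * for illustration only): split the buffer in half, reduce each half, then
 * combine.  This assumes the element count is a power of two; in the
 * helpers above the tail of DATA is padded with IDENT so that inactive and
 * trailing elements do not disturb the result.
 */
static inline int64_t example_reduce_add(const int64_t *data, uintptr_t n)
{
    if (n == 1) {
        return *data;
    } else {
        uintptr_t half = n / 2;
        return example_reduce_add(data, half)
               + example_reduce_add(data + half, half);
    }
}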
2961
2962uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
2963 void *status, uint32_t desc)
2964{
2965 intptr_t i = 0, opr_sz = simd_oprsz(desc);
2966 float16 result = nn;
2967
2968 do {
2969 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2970 do {
2971 if (pg & 1) {
2972 float16 mm = *(float16 *)(vm + H1_2(i));
2973 result = float16_add(result, mm, status);
2974 }
2975 i += sizeof(float16), pg >>= sizeof(float16);
2976 } while (i & 15);
2977 } while (i < opr_sz);
2978
2979 return result;
2980}
2981
2982uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
2983 void *status, uint32_t desc)
2984{
2985 intptr_t i = 0, opr_sz = simd_oprsz(desc);
2986 float32 result = nn;
2987
2988 do {
2989 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
2990 do {
2991 if (pg & 1) {
2992 float32 mm = *(float32 *)(vm + H1_2(i));
2993 result = float32_add(result, mm, status);
2994 }
2995 i += sizeof(float32), pg >>= sizeof(float32);
2996 } while (i & 15);
2997 } while (i < opr_sz);
2998
2999 return result;
3000}
3001
3002uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
3003 void *status, uint32_t desc)
3004{
3005 intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
3006 uint64_t *m = vm;
3007 uint8_t *pg = vg;
3008
3009 for (i = 0; i < opr_sz; i++) {
3010 if (pg[H1(i)] & 1) {
3011 nn = float64_add(nn, m[i], status);
3012 }
3013 }
3014
3015 return nn;
3016}
3017
3018/* Fully general three-operand expander, controlled by a predicate,
3019 * with the extra float_status parameter.
3020 */
3021#define DO_ZPZZ_FP(NAME, TYPE, H, OP) \
3022void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
3023 void *status, uint32_t desc) \
3024{ \
3025 intptr_t i = simd_oprsz(desc); \
3026 uint64_t *g = vg; \
3027 do { \
3028 uint64_t pg = g[(i - 1) >> 6]; \
3029 do { \
3030 i -= sizeof(TYPE); \
3031 if (likely((pg >> (i & 63)) & 1)) { \
3032 TYPE nn = *(TYPE *)(vn + H(i)); \
3033 TYPE mm = *(TYPE *)(vm + H(i)); \
3034 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
3035 } \
3036 } while (i & 63); \
3037 } while (i != 0); \
3038}
3039
3040DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
3041DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
3042DO_ZPZZ_FP(sve_fadd_d, uint64_t, , float64_add)
3043
3044DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
3045DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
3046DO_ZPZZ_FP(sve_fsub_d, uint64_t, , float64_sub)
3047
3048DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
3049DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
3050DO_ZPZZ_FP(sve_fmul_d, uint64_t, , float64_mul)
3051
3052DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
3053DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
3054DO_ZPZZ_FP(sve_fdiv_d, uint64_t, , float64_div)
3055
3056DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
3057DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
3058DO_ZPZZ_FP(sve_fmin_d, uint64_t, , float64_min)
3059
3060DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
3061DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
3062DO_ZPZZ_FP(sve_fmax_d, uint64_t, , float64_max)
3063
3064DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
3065DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
3066DO_ZPZZ_FP(sve_fminnum_d, uint64_t, , float64_minnum)
3067
3068DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
3069DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
3070DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, , float64_maxnum)
3071
3072static inline float16 abd_h(float16 a, float16 b, float_status *s)
3073{
3074 return float16_abs(float16_sub(a, b, s));
3075}
3076
3077static inline float32 abd_s(float32 a, float32 b, float_status *s)
3078{
3079 return float32_abs(float32_sub(a, b, s));
3080}
3081
3082static inline float64 abd_d(float64 a, float64 b, float_status *s)
3083{
3084 return float64_abs(float64_sub(a, b, s));
3085}
3086
3087DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
3088DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
3089DO_ZPZZ_FP(sve_fabd_d, uint64_t, , abd_d)
3090
3091static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
3092{
3093 int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
3094 return float64_scalbn(a, b_int, s);
3095}
3096
3097DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
3098DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
3099DO_ZPZZ_FP(sve_fscalbn_d, int64_t, , scalbn_d)
3100
3101DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
3102DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
3103DO_ZPZZ_FP(sve_fmulx_d, uint64_t, , helper_vfp_mulxd)
3104
3105#undef DO_ZPZZ_FP
3106
3107/* Three-operand expander, with one scalar operand, controlled by
3108 * a predicate, with the extra float_status parameter.
3109 */
3110#define DO_ZPZS_FP(NAME, TYPE, H, OP) \
3111void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar, \
3112 void *status, uint32_t desc) \
3113{ \
3114 intptr_t i = simd_oprsz(desc); \
3115 uint64_t *g = vg; \
3116 TYPE mm = scalar; \
3117 do { \
3118 uint64_t pg = g[(i - 1) >> 6]; \
3119 do { \
3120 i -= sizeof(TYPE); \
3121 if (likely((pg >> (i & 63)) & 1)) { \
3122 TYPE nn = *(TYPE *)(vn + H(i)); \
3123 *(TYPE *)(vd + H(i)) = OP(nn, mm, status); \
3124 } \
3125 } while (i & 63); \
3126 } while (i != 0); \
3127}
3128
3129DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
3130DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
3131DO_ZPZS_FP(sve_fadds_d, float64, , float64_add)
3132
3133DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
3134DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
3135DO_ZPZS_FP(sve_fsubs_d, float64, , float64_sub)
3136
3137DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
3138DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
3139DO_ZPZS_FP(sve_fmuls_d, float64, , float64_mul)
3140
3141static inline float16 subr_h(float16 a, float16 b, float_status *s)
3142{
3143 return float16_sub(b, a, s);
3144}
3145
3146static inline float32 subr_s(float32 a, float32 b, float_status *s)
3147{
3148 return float32_sub(b, a, s);
3149}
3150
3151static inline float64 subr_d(float64 a, float64 b, float_status *s)
3152{
3153 return float64_sub(b, a, s);
3154}
3155
3156DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
3157DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
3158DO_ZPZS_FP(sve_fsubrs_d, float64, , subr_d)
3159
3160DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
3161DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
3162DO_ZPZS_FP(sve_fmaxnms_d, float64, , float64_maxnum)
3163
3164DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
3165DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
3166DO_ZPZS_FP(sve_fminnms_d, float64, , float64_minnum)
3167
3168DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
3169DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
3170DO_ZPZS_FP(sve_fmaxs_d, float64, , float64_max)
3171
3172DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
3173DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
3174DO_ZPZS_FP(sve_fmins_d, float64, , float64_min)
3175
3176/* Fully general two-operand expander, controlled by a predicate,
3177 * with the extra float_status parameter.
3178 */
3179#define DO_ZPZ_FP(NAME, TYPE, H, OP) \
3180void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
3181{ \
3182 intptr_t i = simd_oprsz(desc); \
3183 uint64_t *g = vg; \
3184 do { \
3185 uint64_t pg = g[(i - 1) >> 6]; \
3186 do { \
3187 i -= sizeof(TYPE); \
3188 if (likely((pg >> (i & 63)) & 1)) { \
3189 TYPE nn = *(TYPE *)(vn + H(i)); \
3190 *(TYPE *)(vd + H(i)) = OP(nn, status); \
3191 } \
3192 } while (i & 63); \
3193 } while (i != 0); \
3194}
3195
3196/* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
3197 * FZ16. When converting from fp16, this affects flushing input denormals;
3198 * when converting to fp16, this affects flushing output denormals.
3199 */
3200static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
3201{
3202 flag save = get_flush_inputs_to_zero(fpst);
3203 float32 ret;
3204
3205 set_flush_inputs_to_zero(false, fpst);
3206 ret = float16_to_float32(f, true, fpst);
3207 set_flush_inputs_to_zero(save, fpst);
3208 return ret;
3209}
3210
3211static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
3212{
3213 flag save = get_flush_inputs_to_zero(fpst);
3214 float64 ret;
3215
3216 set_flush_inputs_to_zero(false, fpst);
3217 ret = float16_to_float64(f, true, fpst);
3218 set_flush_inputs_to_zero(save, fpst);
3219 return ret;
3220}
3221
3222static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
3223{
3224 flag save = get_flush_to_zero(fpst);
3225 float16 ret;
3226
3227 set_flush_to_zero(false, fpst);
3228 ret = float32_to_float16(f, true, fpst);
3229 set_flush_to_zero(save, fpst);
3230 return ret;
3231}
3232
3233static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
3234{
3235 flag save = get_flush_to_zero(fpst);
3236 float16 ret;
3237
3238 set_flush_to_zero(false, fpst);
3239 ret = float64_to_float16(f, true, fpst);
3240 set_flush_to_zero(save, fpst);
3241 return ret;
3242}
3243
3244static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
3245{
3246 if (float16_is_any_nan(f)) {
3247 float_raise(float_flag_invalid, s);
3248 return 0;
3249 }
3250 return float16_to_int16_round_to_zero(f, s);
3251}
3252
3253static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
3254{
3255 if (float16_is_any_nan(f)) {
3256 float_raise(float_flag_invalid, s);
3257 return 0;
3258 }
3259 return float16_to_int64_round_to_zero(f, s);
3260}
3261
3262static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
3263{
3264 if (float32_is_any_nan(f)) {
3265 float_raise(float_flag_invalid, s);
3266 return 0;
3267 }
3268 return float32_to_int64_round_to_zero(f, s);
3269}
3270
3271static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
3272{
3273 if (float64_is_any_nan(f)) {
3274 float_raise(float_flag_invalid, s);
3275 return 0;
3276 }
3277 return float64_to_int64_round_to_zero(f, s);
3278}
3279
3280static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
3281{
3282 if (float16_is_any_nan(f)) {
3283 float_raise(float_flag_invalid, s);
3284 return 0;
3285 }
3286 return float16_to_uint16_round_to_zero(f, s);
3287}
3288
3289static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
3290{
3291 if (float16_is_any_nan(f)) {
3292 float_raise(float_flag_invalid, s);
3293 return 0;
3294 }
3295 return float16_to_uint64_round_to_zero(f, s);
3296}
3297
3298static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
3299{
3300 if (float32_is_any_nan(f)) {
3301 float_raise(float_flag_invalid, s);
3302 return 0;
3303 }
3304 return float32_to_uint64_round_to_zero(f, s);
3305}
3306
3307static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
3308{
3309 if (float64_is_any_nan(f)) {
3310 float_raise(float_flag_invalid, s);
3311 return 0;
3312 }
3313 return float64_to_uint64_round_to_zero(f, s);
3314}
3315
3316DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
3317DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
3318DO_ZPZ_FP(sve_fcvt_dh, uint64_t, , sve_f64_to_f16)
3319DO_ZPZ_FP(sve_fcvt_hd, uint64_t, , sve_f16_to_f64)
3320DO_ZPZ_FP(sve_fcvt_ds, uint64_t, , float64_to_float32)
3321DO_ZPZ_FP(sve_fcvt_sd, uint64_t, , float32_to_float64)
3322
3323DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
3324DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
3325DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
3326DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, , vfp_float16_to_int64_rtz)
3327DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, , vfp_float32_to_int64_rtz)
3328DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, , helper_vfp_tosizd)
3329DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, , vfp_float64_to_int64_rtz)
3330
3331DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
3332DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
3333DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
3334DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, , vfp_float16_to_uint64_rtz)
3335DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, , vfp_float32_to_uint64_rtz)
3336DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, , helper_vfp_touizd)
3337DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, , vfp_float64_to_uint64_rtz)
3338
3339DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
3340DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
3341DO_ZPZ_FP(sve_frint_d, uint64_t, , helper_rintd)
3342
3343DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
3344DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
3345DO_ZPZ_FP(sve_frintx_d, uint64_t, , float64_round_to_int)
3346
3347DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
3348DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
3349DO_ZPZ_FP(sve_frecpx_d, uint64_t, , helper_frecpx_f64)
3350
3351DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
3352DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
3353DO_ZPZ_FP(sve_fsqrt_d, uint64_t, , float64_sqrt)
3354
3355DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
3356DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
3357DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
3358DO_ZPZ_FP(sve_scvt_sd, uint64_t, , int32_to_float64)
3359DO_ZPZ_FP(sve_scvt_dh, uint64_t, , int64_to_float16)
3360DO_ZPZ_FP(sve_scvt_ds, uint64_t, , int64_to_float32)
3361DO_ZPZ_FP(sve_scvt_dd, uint64_t, , int64_to_float64)
3362
3363DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
3364DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
3365DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
3366DO_ZPZ_FP(sve_ucvt_sd, uint64_t, , uint32_to_float64)
3367DO_ZPZ_FP(sve_ucvt_dh, uint64_t, , uint64_to_float16)
3368DO_ZPZ_FP(sve_ucvt_ds, uint64_t, , uint64_to_float32)
3369DO_ZPZ_FP(sve_ucvt_dd, uint64_t, , uint64_to_float64)
3370
3371#undef DO_ZPZ_FP
3372
3373/* 4-operand predicated multiply-add. This requires 7 operands to pass
3374 * "properly", so we need to encode some of the registers into DESC.
3375 */
3376QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT + 20 > 32);
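
/* A sketch of the encoding side (hypothetical; the real packing happens at
 * translation time and is not shown here): the four 5-bit register numbers
 * occupy the 20 data bits checked by the build assert above, mirroring the
 * extract32() calls in do_fmla_zpzzz_* below.
 */
static inline uint32_t example_pack_fmla_desc(uint32_t desc, unsigned rd,
                                              unsigned rn, unsigned rm,
                                              unsigned ra)
{
    desc = deposit32(desc, SIMD_DATA_SHIFT, 5, rd);
    desc = deposit32(desc, SIMD_DATA_SHIFT + 5, 5, rn);
    desc = deposit32(desc, SIMD_DATA_SHIFT + 10, 5, rm);
    desc = deposit32(desc, SIMD_DATA_SHIFT + 15, 5, ra);
    return desc;
}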
3377
3378static void do_fmla_zpzzz_h(CPUARMState *env, void *vg, uint32_t desc,
3379 uint16_t neg1, uint16_t neg3)
3380{
3381 intptr_t i = simd_oprsz(desc);
3382 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3383 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3384 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3385 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3386 void *vd = &env->vfp.zregs[rd];
3387 void *vn = &env->vfp.zregs[rn];
3388 void *vm = &env->vfp.zregs[rm];
3389 void *va = &env->vfp.zregs[ra];
3390 uint64_t *g = vg;
3391
3392 do {
3393 uint64_t pg = g[(i - 1) >> 6];
3394 do {
3395 i -= 2;
3396 if (likely((pg >> (i & 63)) & 1)) {
3397 float16 e1, e2, e3, r;
3398
3399 e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
3400 e2 = *(uint16_t *)(vm + H1_2(i));
3401 e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
3402                r = float16_muladd(e1, e2, e3, 0, &env->vfp.fp_status_f16);
3403 *(uint16_t *)(vd + H1_2(i)) = r;
3404 }
3405 } while (i & 63);
3406 } while (i != 0);
3407}
3408
3409void HELPER(sve_fmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3410{
3411 do_fmla_zpzzz_h(env, vg, desc, 0, 0);
3412}
3413
3414void HELPER(sve_fmls_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3415{
3416 do_fmla_zpzzz_h(env, vg, desc, 0x8000, 0);
3417}
3418
3419void HELPER(sve_fnmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3420{
3421 do_fmla_zpzzz_h(env, vg, desc, 0x8000, 0x8000);
3422}
3423
3424void HELPER(sve_fnmls_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3425{
3426 do_fmla_zpzzz_h(env, vg, desc, 0, 0x8000);
3427}
3428
3429static void do_fmla_zpzzz_s(CPUARMState *env, void *vg, uint32_t desc,
3430 uint32_t neg1, uint32_t neg3)
3431{
3432 intptr_t i = simd_oprsz(desc);
3433 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3434 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3435 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3436 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3437 void *vd = &env->vfp.zregs[rd];
3438 void *vn = &env->vfp.zregs[rn];
3439 void *vm = &env->vfp.zregs[rm];
3440 void *va = &env->vfp.zregs[ra];
3441 uint64_t *g = vg;
3442
3443 do {
3444 uint64_t pg = g[(i - 1) >> 6];
3445 do {
3446 i -= 4;
3447 if (likely((pg >> (i & 63)) & 1)) {
3448 float32 e1, e2, e3, r;
3449
3450 e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
3451 e2 = *(uint32_t *)(vm + H1_4(i));
3452 e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
3453 r = float32_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
3454 *(uint32_t *)(vd + H1_4(i)) = r;
3455 }
3456 } while (i & 63);
3457 } while (i != 0);
3458}
3459
3460void HELPER(sve_fmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3461{
3462 do_fmla_zpzzz_s(env, vg, desc, 0, 0);
3463}
3464
3465void HELPER(sve_fmls_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3466{
3467 do_fmla_zpzzz_s(env, vg, desc, 0x80000000, 0);
3468}
3469
3470void HELPER(sve_fnmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3471{
3472 do_fmla_zpzzz_s(env, vg, desc, 0x80000000, 0x80000000);
3473}
3474
3475void HELPER(sve_fnmls_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3476{
3477 do_fmla_zpzzz_s(env, vg, desc, 0, 0x80000000);
3478}
3479
3480static void do_fmla_zpzzz_d(CPUARMState *env, void *vg, uint32_t desc,
3481 uint64_t neg1, uint64_t neg3)
3482{
3483 intptr_t i = simd_oprsz(desc);
3484 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3485 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3486 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3487 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3488 void *vd = &env->vfp.zregs[rd];
3489 void *vn = &env->vfp.zregs[rn];
3490 void *vm = &env->vfp.zregs[rm];
3491 void *va = &env->vfp.zregs[ra];
3492 uint64_t *g = vg;
3493
3494 do {
3495 uint64_t pg = g[(i - 1) >> 6];
3496 do {
3497 i -= 8;
3498 if (likely((pg >> (i & 63)) & 1)) {
3499 float64 e1, e2, e3, r;
3500
3501 e1 = *(uint64_t *)(vn + i) ^ neg1;
3502 e2 = *(uint64_t *)(vm + i);
3503 e3 = *(uint64_t *)(va + i) ^ neg3;
3504 r = float64_muladd(e1, e2, e3, 0, &env->vfp.fp_status);
3505 *(uint64_t *)(vd + i) = r;
3506 }
3507 } while (i & 63);
3508 } while (i != 0);
3509}
3510
3511void HELPER(sve_fmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3512{
3513 do_fmla_zpzzz_d(env, vg, desc, 0, 0);
3514}
3515
3516void HELPER(sve_fmls_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3517{
3518 do_fmla_zpzzz_d(env, vg, desc, INT64_MIN, 0);
3519}
3520
3521void HELPER(sve_fnmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3522{
3523 do_fmla_zpzzz_d(env, vg, desc, INT64_MIN, INT64_MIN);
3524}
3525
3526void HELPER(sve_fnmls_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3527{
3528 do_fmla_zpzzz_d(env, vg, desc, 0, INT64_MIN);
3529}
3530
abfdefd5
RH
3531/* Two operand floating-point comparison controlled by a predicate.
3532 * Unlike the integer version, we are not allowed to optimistically
3533 * compare operands, since the comparison may have side effects wrt
3534 * the FPSR.
3535 */
3536#define DO_FPCMP_PPZZ(NAME, TYPE, H, OP) \
3537void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
3538 void *status, uint32_t desc) \
3539{ \
3540 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
3541 uint64_t *d = vd, *g = vg; \
3542 do { \
3543 uint64_t out = 0, pg = g[j]; \
3544 do { \
3545 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3546 if (likely((pg >> (i & 63)) & 1)) { \
3547 TYPE nn = *(TYPE *)(vn + H(i)); \
3548 TYPE mm = *(TYPE *)(vm + H(i)); \
3549 out |= OP(TYPE, nn, mm, status); \
3550 } \
3551 } while (i & 63); \
3552 d[j--] = out; \
3553 } while (i > 0); \
3554}
3555
3556#define DO_FPCMP_PPZZ_H(NAME, OP) \
3557 DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
3558#define DO_FPCMP_PPZZ_S(NAME, OP) \
3559 DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
3560#define DO_FPCMP_PPZZ_D(NAME, OP) \
3561 DO_FPCMP_PPZZ(NAME##_d, float64, , OP)
3562
3563#define DO_FPCMP_PPZZ_ALL(NAME, OP) \
3564 DO_FPCMP_PPZZ_H(NAME, OP) \
3565 DO_FPCMP_PPZZ_S(NAME, OP) \
3566 DO_FPCMP_PPZZ_D(NAME, OP)
3567
3568#define DO_FCMGE(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) <= 0
3569#define DO_FCMGT(TYPE, X, Y, ST) TYPE##_compare(Y, X, ST) < 0
4d2e2a03
RH
3570#define DO_FCMLE(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) <= 0
3571#define DO_FCMLT(TYPE, X, Y, ST) TYPE##_compare(X, Y, ST) < 0
abfdefd5
RH
3572#define DO_FCMEQ(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) == 0
3573#define DO_FCMNE(TYPE, X, Y, ST) TYPE##_compare_quiet(X, Y, ST) != 0
3574#define DO_FCMUO(TYPE, X, Y, ST) \
3575 TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
3576#define DO_FACGE(TYPE, X, Y, ST) \
3577 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
3578#define DO_FACGT(TYPE, X, Y, ST) \
3579 TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
3580
3581DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
3582DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
3583DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
3584DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
3585DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
3586DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
3587DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)
3588
3589#undef DO_FPCMP_PPZZ_ALL
3590#undef DO_FPCMP_PPZZ_D
3591#undef DO_FPCMP_PPZZ_S
3592#undef DO_FPCMP_PPZZ_H
3593#undef DO_FPCMP_PPZZ
3594
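/*
 * Illustration only, standalone sketch: how the DO_FPCMP_PPZZ loop above
 * packs one result bit per element into the destination predicate word.
 * SVE predicates carry one bit per vector byte, so a float32 result lands
 * at bit 4*k of the word and a float64 result at bit 8*k.  Example values
 * are assumed; the real loop covers 16 float32 elements per 64-bit word.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    int result[4] = { 1, 0, 1, 1 };      /* pretend float32 compare results */
    uint64_t out = 0;
    for (int k = 3; k >= 0; k--) {       /* walk from the highest element down */
        out <<= sizeof(uint32_t);        /* 4 predicate bits per 4-byte element */
        out |= result[k];
    }
    printf("packed predicate word: 0x%016" PRIx64 "\n", out);   /* 0x1101 */
    return 0;
}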
4d2e2a03
RH
3595/* One operand floating-point comparison against zero, controlled
3596 * by a predicate.
3597 */
3598#define DO_FPCMP_PPZ0(NAME, TYPE, H, OP) \
3599void HELPER(NAME)(void *vd, void *vn, void *vg, \
3600 void *status, uint32_t desc) \
3601{ \
3602 intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6; \
3603 uint64_t *d = vd, *g = vg; \
3604 do { \
3605 uint64_t out = 0, pg = g[j]; \
3606 do { \
3607 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3608 if ((pg >> (i & 63)) & 1) { \
3609 TYPE nn = *(TYPE *)(vn + H(i)); \
3610 out |= OP(TYPE, nn, 0, status); \
3611 } \
3612 } while (i & 63); \
3613 d[j--] = out; \
3614 } while (i > 0); \
3615}
3616
3617#define DO_FPCMP_PPZ0_H(NAME, OP) \
3618 DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
3619#define DO_FPCMP_PPZ0_S(NAME, OP) \
3620 DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
3621#define DO_FPCMP_PPZ0_D(NAME, OP) \
3622 DO_FPCMP_PPZ0(NAME##_d, float64, , OP)
3623
3624#define DO_FPCMP_PPZ0_ALL(NAME, OP) \
3625 DO_FPCMP_PPZ0_H(NAME, OP) \
3626 DO_FPCMP_PPZ0_S(NAME, OP) \
3627 DO_FPCMP_PPZ0_D(NAME, OP)
3628
3629DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
3630DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
3631DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
3632DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
3633DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
3634DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
3635
67fcd9ad
RH
3636/* FP Trig Multiply-Add. */
3637
3638void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3639{
3640 static const float16 coeff[16] = {
3641 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3642 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
3643 };
3644 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
3645 intptr_t x = simd_data(desc);
3646 float16 *d = vd, *n = vn, *m = vm;
3647 for (i = 0; i < opr_sz; i++) {
3648 float16 mm = m[i];
3649 intptr_t xx = x;
3650 if (float16_is_neg(mm)) {
3651 mm = float16_abs(mm);
3652 xx += 8;
3653 }
3654 d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
3655 }
3656}
3657
3658void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3659{
3660 static const float32 coeff[16] = {
3661 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
3662 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
3663 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
3664 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
3665 };
3666 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
3667 intptr_t x = simd_data(desc);
3668 float32 *d = vd, *n = vn, *m = vm;
3669 for (i = 0; i < opr_sz; i++) {
3670 float32 mm = m[i];
3671 intptr_t xx = x;
3672 if (float32_is_neg(mm)) {
3673 mm = float32_abs(mm);
3674 xx += 8;
3675 }
3676 d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
3677 }
3678}
3679
3680void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
3681{
3682 static const float64 coeff[16] = {
3683 0x3ff0000000000000ull, 0xbfc5555555555543ull,
3684 0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
3685 0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
3686 0x3de5d8408868552full, 0x0000000000000000ull,
3687 0x3ff0000000000000ull, 0xbfe0000000000000ull,
3688 0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
3689 0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
3690 0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
3691 };
3692 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
3693 intptr_t x = simd_data(desc);
3694 float64 *d = vd, *n = vn, *m = vm;
3695 for (i = 0; i < opr_sz; i++) {
3696 float64 mm = m[i];
3697 intptr_t xx = x;
3698 if (float64_is_neg(mm)) {
3699 mm = float64_abs(mm);
3700 xx += 8;
3701 }
3702 d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
3703 }
3704}
3705
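/*
 * Illustration only: as used by the SVE sin/cos expansion sequence, a chain
 * of FTMADs with decreasing immediates performs Horner evaluation of the
 * polynomial whose coefficients appear in the tables above (the second half
 * of a table is selected when the multiplicand is negative).  A standalone
 * plain-C sketch of that pattern with hypothetical coefficients:
 */
#include <stdio.h>

int main(void)
{
    /* Evaluate c0 + c1*t + c2*t^2 by three chained multiply-adds. */
    double c[3] = { 1.0, -0.5, 0.25 };   /* assumed example coefficients */
    double t = 0.3, d = 0.0;
    for (int x = 2; x >= 0; x--) {
        d = d * t + c[x];                /* one FTMAD-like step per x */
    }
    printf("poly(0.3) = %f\n", d);
    return 0;
}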
76a9d9cd
RH
3706/*
3707 * FP Complex Add
3708 */
3709
3710void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
3711 void *vs, uint32_t desc)
3712{
3713 intptr_t j, i = simd_oprsz(desc);
3714 uint64_t *g = vg;
3715 float16 neg_imag = float16_set_sign(0, simd_data(desc));
3716 float16 neg_real = float16_chs(neg_imag);
3717
3718 do {
3719 uint64_t pg = g[(i - 1) >> 6];
3720 do {
3721 float16 e0, e1, e2, e3;
3722
3723 /* I holds the real index; J holds the imag index. */
3724 j = i - sizeof(float16);
3725 i -= 2 * sizeof(float16);
3726
3727 e0 = *(float16 *)(vn + H1_2(i));
3728 e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
3729 e2 = *(float16 *)(vn + H1_2(j));
3730 e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;
3731
3732 if (likely((pg >> (i & 63)) & 1)) {
3733 *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
3734 }
3735 if (likely((pg >> (j & 63)) & 1)) {
3736 *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
3737 }
3738 } while (i & 63);
3739 } while (i != 0);
3740}
3741
3742void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
3743 void *vs, uint32_t desc)
3744{
3745 intptr_t j, i = simd_oprsz(desc);
3746 uint64_t *g = vg;
3747 float32 neg_imag = float32_set_sign(0, simd_data(desc));
3748 float32 neg_real = float32_chs(neg_imag);
3749
3750 do {
3751 uint64_t pg = g[(i - 1) >> 6];
3752 do {
3753 float32 e0, e1, e2, e3;
3754
3755 /* I holds the real index; J holds the imag index. */
3756 j = i - sizeof(float32);
3757 i -= 2 * sizeof(float32);
3758
3759 e0 = *(float32 *)(vn + H1_2(i));
3760 e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
3761 e2 = *(float32 *)(vn + H1_2(j));
3762 e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;
3763
3764 if (likely((pg >> (i & 63)) & 1)) {
3765 *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
3766 }
3767 if (likely((pg >> (j & 63)) & 1)) {
3768 *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
3769 }
3770 } while (i & 63);
3771 } while (i != 0);
3772}
3773
3774void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
3775 void *vs, uint32_t desc)
3776{
3777 intptr_t j, i = simd_oprsz(desc);
3778 uint64_t *g = vg;
3779 float64 neg_imag = float64_set_sign(0, simd_data(desc));
3780 float64 neg_real = float64_chs(neg_imag);
3781
3782 do {
3783 uint64_t pg = g[(i - 1) >> 6];
3784 do {
3785 float64 e0, e1, e2, e3;
3786
3787 /* I holds the real index; J holds the imag index. */
3788 j = i - sizeof(float64);
3789 i -= 2 * sizeof(float64);
3790
3791 e0 = *(float64 *)(vn + H1_2(i));
3792 e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
3793 e2 = *(float64 *)(vn + H1_2(j));
3794 e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;
3795
3796 if (likely((pg >> (i & 63)) & 1)) {
3797 *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
3798 }
3799 if (likely((pg >> (j & 63)) & 1)) {
3800 *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
3801 }
3802 } while (i & 63);
3803 } while (i != 0);
3804}
3805
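/*
 * Illustration only, standalone sketch with host doubles: the FCADD helpers
 * above add the second operand rotated by 90 degrees (rot bit clear) or by
 * 270 degrees (rot bit set), which is why only the signs of the real and
 * imaginary additions differ.  Example values are assumed.
 */
#include <stdio.h>

int main(void)
{
    double ar = 1.0, ai = 2.0, br = 3.0, bi = 4.0;
    int rot = 0;                        /* simd_data(desc): 0 => +90, 1 => +270 */
    double dr = rot ? ar + bi : ar - bi;
    double di = rot ? ai - br : ai + br;
    printf("result = %f%+fi\n", dr, di);   /* (1+2i) + i*(3+4i) = -3 + 5i */
    return 0;
}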
05f48bab
RH
3806/*
3807 * FP Complex Multiply
3808 */
3809
3810QEMU_BUILD_BUG_ON(SIMD_DATA_SHIFT + 22 > 32);
3811
3812void HELPER(sve_fcmla_zpzzz_h)(CPUARMState *env, void *vg, uint32_t desc)
3813{
3814 intptr_t j, i = simd_oprsz(desc);
3815 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3816 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3817 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3818 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3819 unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
3820 bool flip = rot & 1;
3821 float16 neg_imag, neg_real;
3822 void *vd = &env->vfp.zregs[rd];
3823 void *vn = &env->vfp.zregs[rn];
3824 void *vm = &env->vfp.zregs[rm];
3825 void *va = &env->vfp.zregs[ra];
3826 uint64_t *g = vg;
3827
3828 neg_imag = float16_set_sign(0, (rot & 2) != 0);
3829 neg_real = float16_set_sign(0, rot == 1 || rot == 2);
3830
3831 do {
3832 uint64_t pg = g[(i - 1) >> 6];
3833 do {
3834 float16 e1, e2, e3, e4, nr, ni, mr, mi, d;
3835
3836 /* I holds the real index; J holds the imag index. */
3837 j = i - sizeof(float16);
3838 i -= 2 * sizeof(float16);
3839
3840 nr = *(float16 *)(vn + H1_2(i));
3841 ni = *(float16 *)(vn + H1_2(j));
3842 mr = *(float16 *)(vm + H1_2(i));
3843 mi = *(float16 *)(vm + H1_2(j));
3844
3845 e2 = (flip ? ni : nr);
3846 e1 = (flip ? mi : mr) ^ neg_real;
3847 e4 = e2;
3848 e3 = (flip ? mr : mi) ^ neg_imag;
3849
3850 if (likely((pg >> (i & 63)) & 1)) {
3851 d = *(float16 *)(va + H1_2(i));
3852 d = float16_muladd(e2, e1, d, 0, &env->vfp.fp_status_f16);
3853 *(float16 *)(vd + H1_2(i)) = d;
3854 }
3855 if (likely((pg >> (j & 63)) & 1)) {
3856 d = *(float16 *)(va + H1_2(j));
3857 d = float16_muladd(e4, e3, d, 0, &env->vfp.fp_status_f16);
3858 *(float16 *)(vd + H1_2(j)) = d;
3859 }
3860 } while (i & 63);
3861 } while (i != 0);
3862}
3863
3864void HELPER(sve_fcmla_zpzzz_s)(CPUARMState *env, void *vg, uint32_t desc)
3865{
3866 intptr_t j, i = simd_oprsz(desc);
3867 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3868 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3869 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3870 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3871 unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
3872 bool flip = rot & 1;
3873 float32 neg_imag, neg_real;
3874 void *vd = &env->vfp.zregs[rd];
3875 void *vn = &env->vfp.zregs[rn];
3876 void *vm = &env->vfp.zregs[rm];
3877 void *va = &env->vfp.zregs[ra];
3878 uint64_t *g = vg;
3879
3880 neg_imag = float32_set_sign(0, (rot & 2) != 0);
3881 neg_real = float32_set_sign(0, rot == 1 || rot == 2);
3882
3883 do {
3884 uint64_t pg = g[(i - 1) >> 6];
3885 do {
3886 float32 e1, e2, e3, e4, nr, ni, mr, mi, d;
3887
3888 /* I holds the real index; J holds the imag index. */
3889 j = i - sizeof(float32);
3890 i -= 2 * sizeof(float32);
3891
3892 nr = *(float32 *)(vn + H1_2(i));
3893 ni = *(float32 *)(vn + H1_2(j));
3894 mr = *(float32 *)(vm + H1_2(i));
3895 mi = *(float32 *)(vm + H1_2(j));
3896
3897 e2 = (flip ? ni : nr);
3898 e1 = (flip ? mi : mr) ^ neg_real;
3899 e4 = e2;
3900 e3 = (flip ? mr : mi) ^ neg_imag;
3901
3902 if (likely((pg >> (i & 63)) & 1)) {
3903 d = *(float32 *)(va + H1_2(i));
3904 d = float32_muladd(e2, e1, d, 0, &env->vfp.fp_status);
3905 *(float32 *)(vd + H1_2(i)) = d;
3906 }
3907 if (likely((pg >> (j & 63)) & 1)) {
3908 d = *(float32 *)(va + H1_2(j));
3909 d = float32_muladd(e4, e3, d, 0, &env->vfp.fp_status);
3910 *(float32 *)(vd + H1_2(j)) = d;
3911 }
3912 } while (i & 63);
3913 } while (i != 0);
3914}
3915
3916void HELPER(sve_fcmla_zpzzz_d)(CPUARMState *env, void *vg, uint32_t desc)
3917{
3918 intptr_t j, i = simd_oprsz(desc);
3919 unsigned rd = extract32(desc, SIMD_DATA_SHIFT, 5);
3920 unsigned rn = extract32(desc, SIMD_DATA_SHIFT + 5, 5);
3921 unsigned rm = extract32(desc, SIMD_DATA_SHIFT + 10, 5);
3922 unsigned ra = extract32(desc, SIMD_DATA_SHIFT + 15, 5);
3923 unsigned rot = extract32(desc, SIMD_DATA_SHIFT + 20, 2);
3924 bool flip = rot & 1;
3925 float64 neg_imag, neg_real;
3926 void *vd = &env->vfp.zregs[rd];
3927 void *vn = &env->vfp.zregs[rn];
3928 void *vm = &env->vfp.zregs[rm];
3929 void *va = &env->vfp.zregs[ra];
3930 uint64_t *g = vg;
3931
3932 neg_imag = float64_set_sign(0, (rot & 2) != 0);
3933 neg_real = float64_set_sign(0, rot == 1 || rot == 2);
3934
3935 do {
3936 uint64_t pg = g[(i - 1) >> 6];
3937 do {
3938 float64 e1, e2, e3, e4, nr, ni, mr, mi, d;
3939
3940 /* I holds the real index; J holds the imag index. */
3941 j = i - sizeof(float64);
3942 i -= 2 * sizeof(float64);
3943
3944 nr = *(float64 *)(vn + H1_2(i));
3945 ni = *(float64 *)(vn + H1_2(j));
3946 mr = *(float64 *)(vm + H1_2(i));
3947 mi = *(float64 *)(vm + H1_2(j));
3948
3949 e2 = (flip ? ni : nr);
3950 e1 = (flip ? mi : mr) ^ neg_real;
3951 e4 = e2;
3952 e3 = (flip ? mr : mi) ^ neg_imag;
3953
3954 if (likely((pg >> (i & 63)) & 1)) {
3955 d = *(float64 *)(va + H1_2(i));
3956 d = float64_muladd(e2, e1, d, 0, &env->vfp.fp_status);
3957 *(float64 *)(vd + H1_2(i)) = d;
3958 }
3959 if (likely((pg >> (j & 63)) & 1)) {
3960 d = *(float64 *)(va + H1_2(j));
3961 d = float64_muladd(e4, e3, d, 0, &env->vfp.fp_status);
3962 *(float64 *)(vd + H1_2(j)) = d;
3963 }
3964 } while (i & 63);
3965 } while (i != 0);
3966}
3967
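/*
 * Illustration only, standalone sketch with host doubles: each FCMLA above
 * performs half of a complex multiply-accumulate, with rot selecting which
 * half and which signs.  Issuing rot=0 then rot=1 accumulates d += n * m;
 * rot=2 then rot=3 accumulates d -= n * m.  Example values are assumed.
 */
#include <stdio.h>

static void fcmla_step(double *dr, double *di,
                       double nr, double ni, double mr, double mi, int rot)
{
    int flip = rot & 1;
    double neg_imag = (rot & 2) ? -1.0 : 1.0;
    double neg_real = (rot == 1 || rot == 2) ? -1.0 : 1.0;
    double e2 = flip ? ni : nr;
    double e1 = (flip ? mi : mr) * neg_real;
    double e3 = (flip ? mr : mi) * neg_imag;
    *dr += e2 * e1;
    *di += e2 * e3;
}

int main(void)
{
    double dr = 0.0, di = 0.0;
    /* (1+2i) * (3+4i) = -5 + 10i, accumulated over rot=0 then rot=1. */
    fcmla_step(&dr, &di, 1.0, 2.0, 3.0, 4.0, 0);
    fcmla_step(&dr, &di, 1.0, 2.0, 3.0, 4.0, 1);
    printf("d = %f%+fi\n", dr, di);
    return 0;
}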
c4e7c493
RH
3968/*
3969 * Load contiguous data, protected by a governing predicate.
3970 */
9123aeb6
RH
3971
3972/*
3973 * Load elements into @vd, controlled by @vg, from @host + @mem_ofs.
3974 * Memory is valid through @host + @mem_max. The register element
3975 * indices are inferred from @mem_ofs, as modified by the types for
3976 * which the helper is built. Return the @mem_ofs of the first element
3977 * not loaded (which is @mem_max if they are all loaded).
3978 *
3979 * For softmmu, we have fully validated the guest page. For user-only,
3980 * we cannot fully validate without taking the mmap lock, but since we
3981 * know the access is within one host page, if any access is valid they
3982 * all must be valid. However, when @vg is all false, it may be that
3983 * no access is valid.
3984 */
3985typedef intptr_t sve_ld1_host_fn(void *vd, void *vg, void *host,
3986 intptr_t mem_ofs, intptr_t mem_max);
3987
3988/*
3989 * Load one element into @vd + @reg_off from (@env, @vaddr, @ra).
3990 * The controlling predicate is known to be true.
3991 */
3992typedef void sve_ld1_tlb_fn(CPUARMState *env, void *vd, intptr_t reg_off,
3993 target_ulong vaddr, int mmu_idx, uintptr_t ra);
9fd46c83 3994typedef sve_ld1_tlb_fn sve_st1_tlb_fn;
9123aeb6
RH
3995
3996/*
3997 * Generate the above primitives.
3998 */
3999
4000#define DO_LD_HOST(NAME, H, TYPEE, TYPEM, HOST) \
4001static intptr_t sve_##NAME##_host(void *vd, void *vg, void *host, \
4002 intptr_t mem_off, const intptr_t mem_max) \
4003{ \
4004 intptr_t reg_off = mem_off * (sizeof(TYPEE) / sizeof(TYPEM)); \
4005 uint64_t *pg = vg; \
4006 while (mem_off + sizeof(TYPEM) <= mem_max) { \
4007 TYPEM val = 0; \
4008 if (likely((pg[reg_off >> 6] >> (reg_off & 63)) & 1)) { \
4009 val = HOST(host + mem_off); \
4010 } \
4011 *(TYPEE *)(vd + H(reg_off)) = val; \
4012 mem_off += sizeof(TYPEM), reg_off += sizeof(TYPEE); \
4013 } \
4014 return mem_off; \
4015}
4016
4017#ifdef CONFIG_SOFTMMU
4018#define DO_LD_TLB(NAME, H, TYPEE, TYPEM, HOST, MOEND, TLB) \
4019static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
4020 target_ulong addr, int mmu_idx, uintptr_t ra) \
4021{ \
4022 TCGMemOpIdx oi = make_memop_idx(ctz32(sizeof(TYPEM)) | MOEND, mmu_idx); \
4023 TYPEM val = TLB(env, addr, oi, ra); \
4024 *(TYPEE *)(vd + H(reg_off)) = val; \
4025}
4026#else
4027#define DO_LD_TLB(NAME, H, TYPEE, TYPEM, HOST, MOEND, TLB) \
4028static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
4029 target_ulong addr, int mmu_idx, uintptr_t ra) \
4030{ \
4031 TYPEM val = HOST(g2h(addr)); \
4032 *(TYPEE *)(vd + H(reg_off)) = val; \
4033}
4034#endif
4035
4036#define DO_LD_PRIM_1(NAME, H, TE, TM) \
4037 DO_LD_HOST(NAME, H, TE, TM, ldub_p) \
4038 DO_LD_TLB(NAME, H, TE, TM, ldub_p, 0, helper_ret_ldub_mmu)
4039
4040DO_LD_PRIM_1(ld1bb, H1, uint8_t, uint8_t)
4041DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t)
4042DO_LD_PRIM_1(ld1bhs, H1_2, uint16_t, int8_t)
4043DO_LD_PRIM_1(ld1bsu, H1_4, uint32_t, uint8_t)
4044DO_LD_PRIM_1(ld1bss, H1_4, uint32_t, int8_t)
4045DO_LD_PRIM_1(ld1bdu, , uint64_t, uint8_t)
4046DO_LD_PRIM_1(ld1bds, , uint64_t, int8_t)
4047
4048#define DO_LD_PRIM_2(NAME, end, MOEND, H, TE, TM, PH, PT) \
4049 DO_LD_HOST(NAME##_##end, H, TE, TM, PH##_##end##_p) \
4050 DO_LD_TLB(NAME##_##end, H, TE, TM, PH##_##end##_p, \
4051 MOEND, helper_##end##_##PT##_mmu)
4052
4053DO_LD_PRIM_2(ld1hh, le, MO_LE, H1_2, uint16_t, uint16_t, lduw, lduw)
4054DO_LD_PRIM_2(ld1hsu, le, MO_LE, H1_4, uint32_t, uint16_t, lduw, lduw)
4055DO_LD_PRIM_2(ld1hss, le, MO_LE, H1_4, uint32_t, int16_t, lduw, lduw)
4056DO_LD_PRIM_2(ld1hdu, le, MO_LE, , uint64_t, uint16_t, lduw, lduw)
4057DO_LD_PRIM_2(ld1hds, le, MO_LE, , uint64_t, int16_t, lduw, lduw)
4058
4059DO_LD_PRIM_2(ld1ss, le, MO_LE, H1_4, uint32_t, uint32_t, ldl, ldul)
4060DO_LD_PRIM_2(ld1sdu, le, MO_LE, , uint64_t, uint32_t, ldl, ldul)
4061DO_LD_PRIM_2(ld1sds, le, MO_LE, , uint64_t, int32_t, ldl, ldul)
4062
4063DO_LD_PRIM_2(ld1dd, le, MO_LE, , uint64_t, uint64_t, ldq, ldq)
4064
4065DO_LD_PRIM_2(ld1hh, be, MO_BE, H1_2, uint16_t, uint16_t, lduw, lduw)
4066DO_LD_PRIM_2(ld1hsu, be, MO_BE, H1_4, uint32_t, uint16_t, lduw, lduw)
4067DO_LD_PRIM_2(ld1hss, be, MO_BE, H1_4, uint32_t, int16_t, lduw, lduw)
4068DO_LD_PRIM_2(ld1hdu, be, MO_BE, , uint64_t, uint16_t, lduw, lduw)
4069DO_LD_PRIM_2(ld1hds, be, MO_BE, , uint64_t, int16_t, lduw, lduw)
4070
4071DO_LD_PRIM_2(ld1ss, be, MO_BE, H1_4, uint32_t, uint32_t, ldl, ldul)
4072DO_LD_PRIM_2(ld1sdu, be, MO_BE, , uint64_t, uint32_t, ldl, ldul)
4073DO_LD_PRIM_2(ld1sds, be, MO_BE, , uint64_t, int32_t, ldl, ldul)
4074
4075DO_LD_PRIM_2(ld1dd, be, MO_BE, , uint64_t, uint64_t, ldq, ldq)
4076
4077#undef DO_LD_TLB
4078#undef DO_LD_HOST
4079#undef DO_LD_PRIM_1
4080#undef DO_LD_PRIM_2
4081
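/*
 * For reference, an illustrative hand expansion (not the generated code) of
 * roughly what DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t) produces: each
 * active byte is zero-extended into a 16-bit register element, and inactive
 * elements are zeroed.  Standalone sketch assuming a little-endian host, so
 * the H1_2 fixup is the identity; all values are assumed.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static intptr_t ld1bhu_host_sketch(uint16_t *zd, const uint64_t *pg,
                                   const uint8_t *host,
                                   intptr_t mem_off, intptr_t mem_max)
{
    intptr_t reg_off = mem_off * 2;          /* 2 register bytes per memory byte */
    while (mem_off + 1 <= mem_max) {
        uint8_t val = 0;
        if ((pg[reg_off >> 6] >> (reg_off & 63)) & 1) {
            val = host[mem_off];             /* ldub_p */
        }
        zd[reg_off / 2] = val;
        mem_off += 1, reg_off += 2;
    }
    return mem_off;
}

int main(void)
{
    uint8_t mem[4] = { 0xaa, 0xbb, 0xcc, 0xdd };
    uint64_t pg[1] = { 0x05 };               /* predicate bits 0 and 2 set */
    uint16_t zd[4];

    memset(zd, 0xff, sizeof(zd));
    ld1bhu_host_sketch(zd, pg, mem, 0, 4);
    for (int i = 0; i < 4; i++) {
        printf("zd[%d] = 0x%04x\n", i, zd[i]);   /* 00aa, 00bb, 0000, 0000 */
    }
    return 0;
}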
4082/*
4083 * Skip through a sequence of inactive elements in the guarding predicate @vg,
4084 * beginning at @reg_off bounded by @reg_max. Return the offset of the first active
4085 * element >= @reg_off, or @reg_max if there were no active elements at all.
4086 */
4087static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
4088 intptr_t reg_max, int esz)
4089{
4090 uint64_t pg_mask = pred_esz_masks[esz];
4091 uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);
4092
4093 /* In normal usage, the first element is active. */
4094 if (likely(pg & 1)) {
4095 return reg_off;
4096 }
4097
4098 if (pg == 0) {
4099 reg_off &= -64;
4100 do {
4101 reg_off += 64;
4102 if (unlikely(reg_off >= reg_max)) {
4103 /* The entire predicate was false. */
4104 return reg_max;
4105 }
4106 pg = vg[reg_off >> 6] & pg_mask;
4107 } while (pg == 0);
4108 }
4109 reg_off += ctz64(pg);
4110
4111 /* We should never see an out of range predicate bit set. */
4112 tcg_debug_assert(reg_off < reg_max);
4113 return reg_off;
4114}
4115
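/*
 * Illustration only, a standalone single-word model of the scan performed
 * by find_next_active above: SVE predicates have one bit per vector byte,
 * so only bits at multiples of the element size are significant (which is
 * what pred_esz_masks[esz] encodes).  Uses the GCC/clang ctz builtin;
 * example values are assumed.
 */
#include <stdint.h>
#include <stdio.h>

static int next_active_sketch(uint64_t pg, int reg_off, int reg_max, int esz)
{
    uint64_t mask = 0;
    for (int i = 0; i < 64; i += 1 << esz) {
        mask |= UINT64_C(1) << i;            /* like pred_esz_masks[esz] */
    }
    pg = (pg & mask) >> reg_off;
    if (pg == 0) {
        return reg_max;                      /* wholly inactive predicate */
    }
    return reg_off + __builtin_ctzll(pg);    /* first active byte offset */
}

int main(void)
{
    /* For 4-byte elements, only element 3 (byte offset 12) is active. */
    uint64_t pg = UINT64_C(1) << 12;
    printf("next active byte offset: %d\n", next_active_sketch(pg, 0, 64, 2));
    return 0;
}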
4116/*
4117 * Return the maximum offset <= @mem_max which is still within the page
4118 * referenced by @base + @mem_off.
4119 */
4120static intptr_t max_for_page(target_ulong base, intptr_t mem_off,
4121 intptr_t mem_max)
4122{
4123 target_ulong addr = base + mem_off;
4124 intptr_t split = -(intptr_t)(addr | TARGET_PAGE_MASK);
4125 return MIN(split, mem_max - mem_off) + mem_off;
4126}
4127
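/*
 * Illustration only, standalone sketch of the arithmetic in max_for_page:
 * with an assumed 4KiB target page, a base address 12 bytes short of the
 * page end caps the contiguous run at 12 bytes even though mem_max is 64.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const int64_t page_mask = -4096;         /* assumed 4KiB TARGET_PAGE_MASK */
    int64_t base = 0x10000ff4, mem_off = 0, mem_max = 64;
    int64_t addr = base + mem_off;
    int64_t split = -(addr | page_mask);     /* bytes left in this page: 12 */
    int64_t take = split < mem_max - mem_off ? split : mem_max - mem_off;
    printf("bytes left in page = %" PRId64 ", capped end offset = %" PRId64 "\n",
           split, take + mem_off);
    return 0;
}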
4128static inline void set_helper_retaddr(uintptr_t ra)
4129{
4130#ifdef CONFIG_USER_ONLY
4131 helper_retaddr = ra;
4132#endif
4133}
4134
4135/*
4136 * The result of tlb_vaddr_to_host for user-only is just g2h(x),
4137 * which is always non-null. Elide the useless test.
4138 */
4139static inline bool test_host_page(void *host)
4140{
4141#ifdef CONFIG_USER_ONLY
4142 return true;
4143#else
4144 return likely(host != NULL);
4145#endif
4146}
4147
4148/*
4149 * Common helper for all contiguous one-register predicated loads.
4150 */
4151static void sve_ld1_r(CPUARMState *env, void *vg, const target_ulong addr,
4152 uint32_t desc, const uintptr_t retaddr,
4153 const int esz, const int msz,
4154 sve_ld1_host_fn *host_fn,
4155 sve_ld1_tlb_fn *tlb_fn)
4156{
4157 void *vd = &env->vfp.zregs[simd_data(desc)];
4158 const int diffsz = esz - msz;
4159 const intptr_t reg_max = simd_oprsz(desc);
4160 const intptr_t mem_max = reg_max >> diffsz;
4161 const int mmu_idx = cpu_mmu_index(env, false);
4162 ARMVectorReg scratch;
4163 void *host;
4164 intptr_t split, reg_off, mem_off;
4165
4166 /* Find the first active element. */
4167 reg_off = find_next_active(vg, 0, reg_max, esz);
4168 if (unlikely(reg_off == reg_max)) {
4169 /* The entire predicate was false; no load occurs. */
4170 memset(vd, 0, reg_max);
4171 return;
4172 }
4173 mem_off = reg_off >> diffsz;
4174 set_helper_retaddr(retaddr);
4175
4176 /*
4177 * If the (remaining) load is entirely within a single page, then:
4178 * For softmmu, and the tlb hits, then no faults will occur;
4179 * For user-only, either the first load will fault or none will.
4180 * We can thus perform the load directly to the destination and
4181 * Vd will be unmodified on any exception path.
4182 */
4183 split = max_for_page(addr, mem_off, mem_max);
4184 if (likely(split == mem_max)) {
4185 host = tlb_vaddr_to_host(env, addr + mem_off, MMU_DATA_LOAD, mmu_idx);
4186 if (test_host_page(host)) {
4187 mem_off = host_fn(vd, vg, host - mem_off, mem_off, mem_max);
4188 tcg_debug_assert(mem_off == mem_max);
4189 set_helper_retaddr(0);
4190 /* After having taken any fault, zero leading inactive elements. */
4191 swap_memzero(vd, reg_off);
4192 return;
4193 }
4194 }
4195
4196 /*
4197 * Perform the predicated read into a temporary, thus ensuring
4198 * if the load of the last element faults, Vd is not modified.
4199 */
4200#ifdef CONFIG_USER_ONLY
4201 swap_memzero(&scratch, reg_off);
4202 host_fn(&scratch, vg, g2h(addr), mem_off, mem_max);
4203#else
4204 memset(&scratch, 0, reg_max);
4205 goto start;
4206 while (1) {
4207 reg_off = find_next_active(vg, reg_off, reg_max, esz);
4208 if (reg_off >= reg_max) {
4209 break;
4210 }
4211 mem_off = reg_off >> diffsz;
4212 split = max_for_page(addr, mem_off, mem_max);
4213
4214 start:
4215 if (split - mem_off >= (1 << msz)) {
4216 /* At least one whole element on this page. */
4217 host = tlb_vaddr_to_host(env, addr + mem_off,
4218 MMU_DATA_LOAD, mmu_idx);
4219 if (host) {
4220 mem_off = host_fn(&scratch, vg, host - mem_off,
4221 mem_off, split);
4222 reg_off = mem_off << diffsz;
4223 continue;
4224 }
4225 }
4226
4227 /*
4228 * Perform one normal read. This may fault, longjmping out to the
4229 * main loop in order to raise an exception. It may succeed, and
4230 * as a side-effect load the TLB entry for the next round. Finally,
4231 * in the extremely unlikely case we're performing this operation
4232 * on I/O memory, it may succeed but not bring in the TLB entry.
4233 * But even then we have still made forward progress.
4234 */
4235 tlb_fn(env, &scratch, reg_off, addr + mem_off, mmu_idx, retaddr);
4236 reg_off += 1 << esz;
4237 }
4238#endif
4239
4240 set_helper_retaddr(0);
4241 memcpy(vd, &scratch, reg_max);
c4e7c493
RH
4242}
4243
9123aeb6
RH
4244#define DO_LD1_1(NAME, ESZ) \
4245void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \
4246 target_ulong addr, uint32_t desc) \
4247{ \
4248 sve_ld1_r(env, vg, addr, desc, GETPC(), ESZ, 0, \
4249 sve_##NAME##_host, sve_##NAME##_tlb); \
4250}
4251
9123aeb6 4252#define DO_LD1_2(NAME, ESZ, MSZ) \
7d0a57a2
RH
4253void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \
4254 target_ulong addr, uint32_t desc) \
4255{ \
4256 sve_ld1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, \
4257 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
4258} \
4259void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \
4260 target_ulong addr, uint32_t desc) \
4261{ \
4262 sve_ld1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, \
4263 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
9123aeb6
RH
4264}
4265
4266DO_LD1_1(ld1bb, 0)
4267DO_LD1_1(ld1bhu, 1)
4268DO_LD1_1(ld1bhs, 1)
4269DO_LD1_1(ld1bsu, 2)
4270DO_LD1_1(ld1bss, 2)
4271DO_LD1_1(ld1bdu, 3)
4272DO_LD1_1(ld1bds, 3)
4273
4274DO_LD1_2(ld1hh, 1, 1)
4275DO_LD1_2(ld1hsu, 2, 1)
4276DO_LD1_2(ld1hss, 2, 1)
4277DO_LD1_2(ld1hdu, 3, 1)
4278DO_LD1_2(ld1hds, 3, 1)
4279
4280DO_LD1_2(ld1ss, 2, 2)
4281DO_LD1_2(ld1sdu, 3, 2)
4282DO_LD1_2(ld1sds, 3, 2)
4283
4284DO_LD1_2(ld1dd, 3, 3)
4285
4286#undef DO_LD1_1
4287#undef DO_LD1_2
4288
f27d4dc2
RH
4289/*
4290 * Common helpers for all contiguous 2,3,4-register predicated loads.
4291 */
4292static void sve_ld2_r(CPUARMState *env, void *vg, target_ulong addr,
4293 uint32_t desc, int size, uintptr_t ra,
4294 sve_ld1_tlb_fn *tlb_fn)
4295{
4296 const int mmu_idx = cpu_mmu_index(env, false);
4297 intptr_t i, oprsz = simd_oprsz(desc);
4298 unsigned rd = simd_data(desc);
4299 ARMVectorReg scratch[2] = { };
4300
4301 set_helper_retaddr(ra);
4302 for (i = 0; i < oprsz; ) {
4303 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4304 do {
4305 if (pg & 1) {
4306 tlb_fn(env, &scratch[0], i, addr, mmu_idx, ra);
4307 tlb_fn(env, &scratch[1], i, addr + size, mmu_idx, ra);
4308 }
4309 i += size, pg >>= size;
4310 addr += 2 * size;
4311 } while (i & 15);
4312 }
4313 set_helper_retaddr(0);
4314
4315 /* Wait until all exceptions have been raised to write back. */
4316 memcpy(&env->vfp.zregs[rd], &scratch[0], oprsz);
4317 memcpy(&env->vfp.zregs[(rd + 1) & 31], &scratch[1], oprsz);
c4e7c493
RH
4318}
4319
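/*
 * Illustration only, standalone sketch of the inner loop shared by the
 * multi-register load/store helpers above: 16 predicate bits (two bytes of
 * the predicate register) are consumed per outer iteration, stepping the
 * element size each time.  Example values are assumed.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint16_t pg = 0x0101;       /* for 4-byte elements: elements 0 and 2 active */
    int esize = 4, i = 0;
    do {
        if (pg & 1) {
            printf("element at byte offset %d is active\n", i);
        }
        i += esize, pg >>= esize;
    } while (i & 15);
    return 0;
}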
f27d4dc2
RH
4320static void sve_ld3_r(CPUARMState *env, void *vg, target_ulong addr,
4321 uint32_t desc, int size, uintptr_t ra,
4322 sve_ld1_tlb_fn *tlb_fn)
4323{
4324 const int mmu_idx = cpu_mmu_index(env, false);
4325 intptr_t i, oprsz = simd_oprsz(desc);
4326 unsigned rd = simd_data(desc);
4327 ARMVectorReg scratch[3] = { };
4328
4329 set_helper_retaddr(ra);
4330 for (i = 0; i < oprsz; ) {
4331 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4332 do {
4333 if (pg & 1) {
4334 tlb_fn(env, &scratch[0], i, addr, mmu_idx, ra);
4335 tlb_fn(env, &scratch[1], i, addr + size, mmu_idx, ra);
4336 tlb_fn(env, &scratch[2], i, addr + 2 * size, mmu_idx, ra);
4337 }
4338 i += size, pg >>= size;
4339 addr += 3 * size;
4340 } while (i & 15);
4341 }
4342 set_helper_retaddr(0);
4343
4344 /* Wait until all exceptions have been raised to write back. */
4345 memcpy(&env->vfp.zregs[rd], &scratch[0], oprsz);
4346 memcpy(&env->vfp.zregs[(rd + 1) & 31], &scratch[1], oprsz);
4347 memcpy(&env->vfp.zregs[(rd + 2) & 31], &scratch[2], oprsz);
c4e7c493
RH
4348}
4349
f27d4dc2
RH
4350static void sve_ld4_r(CPUARMState *env, void *vg, target_ulong addr,
4351 uint32_t desc, int size, uintptr_t ra,
4352 sve_ld1_tlb_fn *tlb_fn)
4353{
4354 const int mmu_idx = cpu_mmu_index(env, false);
4355 intptr_t i, oprsz = simd_oprsz(desc);
4356 unsigned rd = simd_data(desc);
4357 ARMVectorReg scratch[4] = { };
4358
4359 set_helper_retaddr(ra);
4360 for (i = 0; i < oprsz; ) {
4361 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4362 do {
4363 if (pg & 1) {
4364 tlb_fn(env, &scratch[0], i, addr, mmu_idx, ra);
4365 tlb_fn(env, &scratch[1], i, addr + size, mmu_idx, ra);
4366 tlb_fn(env, &scratch[2], i, addr + 2 * size, mmu_idx, ra);
4367 tlb_fn(env, &scratch[3], i, addr + 3 * size, mmu_idx, ra);
4368 }
4369 i += size, pg >>= size;
4370 addr += 4 * size;
4371 } while (i & 15);
4372 }
4373 set_helper_retaddr(0);
4374
4375 /* Wait until all exceptions have been raised to write back. */
4376 memcpy(&env->vfp.zregs[rd], &scratch[0], oprsz);
4377 memcpy(&env->vfp.zregs[(rd + 1) & 31], &scratch[1], oprsz);
4378 memcpy(&env->vfp.zregs[(rd + 2) & 31], &scratch[2], oprsz);
4379 memcpy(&env->vfp.zregs[(rd + 3) & 31], &scratch[3], oprsz);
4380}
4381
4382#define DO_LDN_1(N) \
4383void __attribute__((flatten)) HELPER(sve_ld##N##bb_r) \
4384 (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
4385{ \
4386 sve_ld##N##_r(env, vg, addr, desc, 1, GETPC(), sve_ld1bb_tlb); \
4387}
4388
4389#define DO_LDN_2(N, SUFF, SIZE) \
7d0a57a2
RH
4390void __attribute__((flatten)) HELPER(sve_ld##N##SUFF##_le_r) \
4391 (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
4392{ \
4393 sve_ld##N##_r(env, vg, addr, desc, SIZE, GETPC(), \
4394 sve_ld1##SUFF##_le_tlb); \
4395} \
4396void __attribute__((flatten)) HELPER(sve_ld##N##SUFF##_be_r) \
f27d4dc2
RH
4397 (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
4398{ \
4399 sve_ld##N##_r(env, vg, addr, desc, SIZE, GETPC(), \
7d0a57a2 4400 sve_ld1##SUFF##_be_tlb); \
c4e7c493
RH
4401}
4402
f27d4dc2
RH
4403DO_LDN_1(2)
4404DO_LDN_1(3)
4405DO_LDN_1(4)
c4e7c493 4406
f27d4dc2
RH
4407DO_LDN_2(2, hh, 2)
4408DO_LDN_2(3, hh, 2)
4409DO_LDN_2(4, hh, 2)
c4e7c493 4410
f27d4dc2
RH
4411DO_LDN_2(2, ss, 4)
4412DO_LDN_2(3, ss, 4)
4413DO_LDN_2(4, ss, 4)
c4e7c493 4414
f27d4dc2
RH
4415DO_LDN_2(2, dd, 8)
4416DO_LDN_2(3, dd, 8)
4417DO_LDN_2(4, dd, 8)
c4e7c493 4418
f27d4dc2
RH
4419#undef DO_LDN_1
4420#undef DO_LDN_2
e2654d75
RH
4421
4422/*
4423 * Load contiguous data, first-fault and no-fault.
9123aeb6
RH
4424 *
4425 * For user-only, one could argue that we should hold the mmap_lock during
4426 * the operation so that there is no race between page_check_range and the
4427 * load operation. However, unmapping pages out from under a running thread
4428 * is extraordinarily unlikely. This theoretical race condition also affects
4429 * linux-user/ in its get_user/put_user macros.
4430 *
4431 * TODO: Construct some helpers, written in assembly, that interact with
4432 * handle_cpu_signal to produce memory ops which can properly report errors
4433 * without racing.
e2654d75
RH
4434 */
4435
e2654d75
RH
4436/* Fault on byte I. All bits in FFR from I are cleared. The vector
4437 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
4438 * option, which leaves subsequent data unchanged.
4439 */
4440static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
4441{
4442 uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;
4443
4444 if (i & 63) {
4445 ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
4446 i = ROUND_UP(i, 64);
4447 }
4448 for (; i < oprsz; i += 64) {
4449 ffr[i / 64] = 0;
4450 }
4451}
4452
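/*
 * Illustration only, standalone sketch of the mask arithmetic used by
 * record_fault above: record_fault(env, 24, 128) keeps FFR bits [0,24)
 * and clears bits [24,128).  Values are assumed.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t ffr[2] = { ~UINT64_C(0), ~UINT64_C(0) };   /* 128 predicate bits */
    unsigned i = 24, oprsz = 128;

    if (i & 63) {
        ffr[i / 64] &= (UINT64_C(1) << (i & 63)) - 1;   /* MAKE_64BIT_MASK(0, 24) */
        i = (i + 63) & ~63u;                            /* ROUND_UP(i, 64) */
    }
    for (; i < oprsz; i += 64) {
        ffr[i / 64] = 0;
    }
    printf("ffr[0] = %016llx, ffr[1] = %016llx\n",
           (unsigned long long)ffr[0], (unsigned long long)ffr[1]);
    return 0;
}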
9123aeb6
RH
4453/*
4454 * Common helper for all contiguous first-fault loads.
4455 */
4456static void sve_ldff1_r(CPUARMState *env, void *vg, const target_ulong addr,
4457 uint32_t desc, const uintptr_t retaddr,
4458 const int esz, const int msz,
4459 sve_ld1_host_fn *host_fn,
4460 sve_ld1_tlb_fn *tlb_fn)
4461{
4462 void *vd = &env->vfp.zregs[simd_data(desc)];
4463 const int diffsz = esz - msz;
4464 const intptr_t reg_max = simd_oprsz(desc);
4465 const intptr_t mem_max = reg_max >> diffsz;
4466 const int mmu_idx = cpu_mmu_index(env, false);
4467 intptr_t split, reg_off, mem_off;
4468 void *host;
4469
4470 /* Skip to the first active element. */
4471 reg_off = find_next_active(vg, 0, reg_max, esz);
4472 if (unlikely(reg_off == reg_max)) {
4473 /* The entire predicate was false; no load occurs. */
4474 memset(vd, 0, reg_max);
4475 return;
4476 }
4477 mem_off = reg_off >> diffsz;
4478 set_helper_retaddr(retaddr);
4479
4480 /*
4481 * If the (remaining) load is entirely within a single page, then:
4482 * For softmmu, if the tlb hits, then no faults will occur;
4483 * For user-only, either the first load will fault or none will.
4484 * We can thus perform the load directly to the destination and
4485 * Vd will be unmodified on any exception path.
4486 */
4487 split = max_for_page(addr, mem_off, mem_max);
4488 if (likely(split == mem_max)) {
4489 host = tlb_vaddr_to_host(env, addr + mem_off, MMU_DATA_LOAD, mmu_idx);
4490 if (test_host_page(host)) {
4491 mem_off = host_fn(vd, vg, host - mem_off, mem_off, mem_max);
4492 tcg_debug_assert(mem_off == mem_max);
4493 set_helper_retaddr(0);
4494 /* After any fault, zero any leading inactive elements. */
4495 swap_memzero(vd, reg_off);
4496 return;
4497 }
4498 }
4499
4500#ifdef CONFIG_USER_ONLY
4501 /*
4502 * The page(s) containing this first element at ADDR+MEM_OFF must
4503 * be valid. Considering that this first element may be misaligned
4504 * and cross a page boundary itself, take the rest of the page from
4505 * the last byte of the element.
4506 */
4507 split = max_for_page(addr, mem_off + (1 << msz) - 1, mem_max);
4508 mem_off = host_fn(vd, vg, g2h(addr), mem_off, split);
4509
4510 /* After any fault, zero any leading inactive elements. */
4511 swap_memzero(vd, reg_off);
4512 reg_off = mem_off << diffsz;
4513#else
4514 /*
4515 * Perform one normal read, which will fault or not.
4516 * But it is likely to bring the page into the tlb.
4517 */
4518 tlb_fn(env, vd, reg_off, addr + mem_off, mmu_idx, retaddr);
4519
4520 /* After any fault, zero any leading predicated false elts. */
4521 swap_memzero(vd, reg_off);
4522 mem_off += 1 << msz;
4523 reg_off += 1 << esz;
4524
4525 /* Try again to read the balance of the page. */
4526 split = max_for_page(addr, mem_off - 1, mem_max);
4527 if (split >= (1 << msz)) {
4528 host = tlb_vaddr_to_host(env, addr + mem_off, MMU_DATA_LOAD, mmu_idx);
4529 if (host) {
4530 mem_off = host_fn(vd, vg, host - mem_off, mem_off, split);
4531 reg_off = mem_off << diffsz;
4532 }
4533 }
4534#endif
4535
4536 set_helper_retaddr(0);
4537 record_fault(env, reg_off, reg_max);
4538}
4539
4540/*
4541 * Common helper for all contiguous no-fault loads.
e2654d75 4542 */
9123aeb6
RH
4543static void sve_ldnf1_r(CPUARMState *env, void *vg, const target_ulong addr,
4544 uint32_t desc, const int esz, const int msz,
4545 sve_ld1_host_fn *host_fn)
4546{
4547 void *vd = &env->vfp.zregs[simd_data(desc)];
4548 const int diffsz = esz - msz;
4549 const intptr_t reg_max = simd_oprsz(desc);
4550 const intptr_t mem_max = reg_max >> diffsz;
4551 const int mmu_idx = cpu_mmu_index(env, false);
4552 intptr_t split, reg_off, mem_off;
4553 void *host;
4554
4555#ifdef CONFIG_USER_ONLY
4556 host = tlb_vaddr_to_host(env, addr, MMU_DATA_LOAD, mmu_idx);
4557 if (likely(page_check_range(addr, mem_max, PAGE_READ) == 0)) {
4558 /* The entire operation is valid and will not fault. */
4559 host_fn(vd, vg, host, 0, mem_max);
4560 return;
4561 }
4562#endif
4563
4564 /* There will be no fault, so we may modify in advance. */
4565 memset(vd, 0, reg_max);
4566
4567 /* Skip to the first active element. */
4568 reg_off = find_next_active(vg, 0, reg_max, esz);
4569 if (unlikely(reg_off == reg_max)) {
4570 /* The entire predicate was false; no load occurs. */
4571 return;
4572 }
4573 mem_off = reg_off >> diffsz;
4574
4575#ifdef CONFIG_USER_ONLY
4576 if (page_check_range(addr + mem_off, 1 << msz, PAGE_READ) == 0) {
4577 /* At least one load is valid; take the rest of the page. */
4578 split = max_for_page(addr, mem_off + (1 << msz) - 1, mem_max);
4579 mem_off = host_fn(vd, vg, host, mem_off, split);
4580 reg_off = mem_off << diffsz;
4581 }
4582#else
4583 /*
4584 * If the address is not in the TLB, we have no way to bring the
4585 * entry into the TLB without also risking a fault. Note that
4586 * the corollary is that we never load from an address not in RAM.
4587 *
4588 * This last is out of spec, in a weird corner case.
4589 * Per the MemNF/MemSingleNF pseudocode, a NF load from Device memory
4590 * must not actually hit the bus -- it returns UNKNOWN data instead.
4591 * But if you map non-RAM with Normal memory attributes and do a NF
4592 * load then it should access the bus. (Nobody ought actually to do this
4593 * in the real world, obviously.)
4594 *
4595 * Then there are the annoying special cases with watchpoints...
4596 *
4597 * TODO: Add a form of tlb_fill that does not raise an exception,
4598 * with a form of tlb_vaddr_to_host and a set of loads to match.
4599 * The non_fault_vaddr_to_host would handle everything, usually,
4600 * and the loads would handle the iomem path for watchpoints.
4601 */
4602 host = tlb_vaddr_to_host(env, addr + mem_off, MMU_DATA_LOAD, mmu_idx);
4603 split = max_for_page(addr, mem_off, mem_max);
4604 if (host && split >= (1 << msz)) {
4605 mem_off = host_fn(vd, vg, host - mem_off, mem_off, split);
4606 reg_off = mem_off << diffsz;
4607 }
4608#endif
4609
4610 record_fault(env, reg_off, reg_max);
4611}
4612
4613#define DO_LDFF1_LDNF1_1(PART, ESZ) \
4614void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \
4615 target_ulong addr, uint32_t desc) \
e2654d75 4616{ \
9123aeb6
RH
4617 sve_ldff1_r(env, vg, addr, desc, GETPC(), ESZ, 0, \
4618 sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
e2654d75 4619} \
9123aeb6
RH
4620void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \
4621 target_ulong addr, uint32_t desc) \
e2654d75 4622{ \
9123aeb6 4623 sve_ldnf1_r(env, vg, addr, desc, ESZ, 0, sve_ld1##PART##_host); \
e2654d75
RH
4624}
4625
9123aeb6 4626#define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
7d0a57a2
RH
4627void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \
4628 target_ulong addr, uint32_t desc) \
e2654d75 4629{ \
7d0a57a2
RH
4630 sve_ldff1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, \
4631 sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
9123aeb6 4632} \
7d0a57a2
RH
4633void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \
4634 target_ulong addr, uint32_t desc) \
9123aeb6 4635{ \
7d0a57a2
RH
4636 sve_ldnf1_r(env, vg, addr, desc, ESZ, MSZ, sve_ld1##PART##_le_host); \
4637} \
4638void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \
4639 target_ulong addr, uint32_t desc) \
4640{ \
4641 sve_ldff1_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, \
4642 sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
4643} \
4644void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \
4645 target_ulong addr, uint32_t desc) \
4646{ \
4647 sve_ldnf1_r(env, vg, addr, desc, ESZ, MSZ, sve_ld1##PART##_be_host); \
e2654d75
RH
4648}
4649
9123aeb6
RH
4650DO_LDFF1_LDNF1_1(bb, 0)
4651DO_LDFF1_LDNF1_1(bhu, 1)
4652DO_LDFF1_LDNF1_1(bhs, 1)
4653DO_LDFF1_LDNF1_1(bsu, 2)
4654DO_LDFF1_LDNF1_1(bss, 2)
4655DO_LDFF1_LDNF1_1(bdu, 3)
4656DO_LDFF1_LDNF1_1(bds, 3)
e2654d75 4657
9123aeb6
RH
4658DO_LDFF1_LDNF1_2(hh, 1, 1)
4659DO_LDFF1_LDNF1_2(hsu, 2, 1)
4660DO_LDFF1_LDNF1_2(hss, 2, 1)
4661DO_LDFF1_LDNF1_2(hdu, 3, 1)
4662DO_LDFF1_LDNF1_2(hds, 3, 1)
e2654d75 4663
9123aeb6
RH
4664DO_LDFF1_LDNF1_2(ss, 2, 2)
4665DO_LDFF1_LDNF1_2(sdu, 3, 2)
4666DO_LDFF1_LDNF1_2(sds, 3, 2)
e2654d75 4667
9123aeb6 4668DO_LDFF1_LDNF1_2(dd, 3, 3)
e2654d75 4669
9123aeb6
RH
4670#undef DO_LDFF1_LDNF1_1
4671#undef DO_LDFF1_LDNF1_2
1a039c7e
RH
4672
4673/*
4674 * Store contiguous data, protected by a governing predicate.
4675 */
1a039c7e 4676
9fd46c83
RH
4677#ifdef CONFIG_SOFTMMU
4678#define DO_ST_TLB(NAME, H, TYPEM, HOST, MOEND, TLB) \
4679static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
4680 target_ulong addr, int mmu_idx, uintptr_t ra) \
4681{ \
4682 TCGMemOpIdx oi = make_memop_idx(ctz32(sizeof(TYPEM)) | MOEND, mmu_idx); \
4683 TLB(env, addr, *(TYPEM *)(vd + H(reg_off)), oi, ra); \
1a039c7e 4684}
9fd46c83
RH
4685#else
4686#define DO_ST_TLB(NAME, H, TYPEM, HOST, MOEND, TLB) \
4687static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
4688 target_ulong addr, int mmu_idx, uintptr_t ra) \
4689{ \
4690 HOST(g2h(addr), *(TYPEM *)(vd + H(reg_off))); \
1a039c7e 4691}
9fd46c83 4692#endif
1a039c7e 4693
9fd46c83
RH
4694DO_ST_TLB(st1bb, H1, uint8_t, stb_p, 0, helper_ret_stb_mmu)
4695DO_ST_TLB(st1bh, H1_2, uint16_t, stb_p, 0, helper_ret_stb_mmu)
4696DO_ST_TLB(st1bs, H1_4, uint32_t, stb_p, 0, helper_ret_stb_mmu)
4697DO_ST_TLB(st1bd, , uint64_t, stb_p, 0, helper_ret_stb_mmu)
1a039c7e 4698
9fd46c83
RH
4699DO_ST_TLB(st1hh_le, H1_2, uint16_t, stw_le_p, MO_LE, helper_le_stw_mmu)
4700DO_ST_TLB(st1hs_le, H1_4, uint32_t, stw_le_p, MO_LE, helper_le_stw_mmu)
4701DO_ST_TLB(st1hd_le, , uint64_t, stw_le_p, MO_LE, helper_le_stw_mmu)
4702
4703DO_ST_TLB(st1ss_le, H1_4, uint32_t, stl_le_p, MO_LE, helper_le_stl_mmu)
4704DO_ST_TLB(st1sd_le, , uint64_t, stl_le_p, MO_LE, helper_le_stl_mmu)
1a039c7e 4705
9fd46c83 4706DO_ST_TLB(st1dd_le, , uint64_t, stq_le_p, MO_LE, helper_le_stq_mmu)
1a039c7e 4707
9fd46c83
RH
4708DO_ST_TLB(st1hh_be, H1_2, uint16_t, stw_be_p, MO_BE, helper_be_stw_mmu)
4709DO_ST_TLB(st1hs_be, H1_4, uint32_t, stw_be_p, MO_BE, helper_be_stw_mmu)
4710DO_ST_TLB(st1hd_be, , uint64_t, stw_be_p, MO_BE, helper_be_stw_mmu)
1a039c7e 4711
9fd46c83
RH
4712DO_ST_TLB(st1ss_be, H1_4, uint32_t, stl_be_p, MO_BE, helper_be_stl_mmu)
4713DO_ST_TLB(st1sd_be, , uint64_t, stl_be_p, MO_BE, helper_be_stl_mmu)
1a039c7e 4714
9fd46c83 4715DO_ST_TLB(st1dd_be, , uint64_t, stq_be_p, MO_BE, helper_be_stq_mmu)
1a039c7e 4716
9fd46c83 4717#undef DO_ST_TLB
1a039c7e 4718
9fd46c83
RH
4719/*
4720 * Common helpers for all contiguous 1,2,3,4-register predicated stores.
4721 */
4722static void sve_st1_r(CPUARMState *env, void *vg, target_ulong addr,
4723 uint32_t desc, const uintptr_t ra,
4724 const int esize, const int msize,
4725 sve_st1_tlb_fn *tlb_fn)
4726{
4727 const int mmu_idx = cpu_mmu_index(env, false);
4728 intptr_t i, oprsz = simd_oprsz(desc);
4729 unsigned rd = simd_data(desc);
4730 void *vd = &env->vfp.zregs[rd];
1a039c7e 4731
9fd46c83
RH
4732 set_helper_retaddr(ra);
4733 for (i = 0; i < oprsz; ) {
4734 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4735 do {
4736 if (pg & 1) {
4737 tlb_fn(env, vd, i, addr, mmu_idx, ra);
4738 }
4739 i += esize, pg >>= esize;
4740 addr += msize;
4741 } while (i & 15);
4742 }
4743 set_helper_retaddr(0);
4744}
1a039c7e 4745
9fd46c83
RH
4746static void sve_st2_r(CPUARMState *env, void *vg, target_ulong addr,
4747 uint32_t desc, const uintptr_t ra,
4748 const int esize, const int msize,
4749 sve_st1_tlb_fn *tlb_fn)
1a039c7e 4750{
9fd46c83
RH
4751 const int mmu_idx = cpu_mmu_index(env, false);
4752 intptr_t i, oprsz = simd_oprsz(desc);
1a039c7e 4753 unsigned rd = simd_data(desc);
9fd46c83
RH
4754 void *d1 = &env->vfp.zregs[rd];
4755 void *d2 = &env->vfp.zregs[(rd + 1) & 31];
1a039c7e 4756
9fd46c83
RH
4757 set_helper_retaddr(ra);
4758 for (i = 0; i < oprsz; ) {
4759 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4760 do {
4761 if (pg & 1) {
4762 tlb_fn(env, d1, i, addr, mmu_idx, ra);
4763 tlb_fn(env, d2, i, addr + msize, mmu_idx, ra);
4764 }
4765 i += esize, pg >>= esize;
4766 addr += 2 * msize;
4767 } while (i & 15);
1a039c7e 4768 }
9fd46c83 4769 set_helper_retaddr(0);
1a039c7e
RH
4770}
4771
9fd46c83
RH
4772static void sve_st3_r(CPUARMState *env, void *vg, target_ulong addr,
4773 uint32_t desc, const uintptr_t ra,
4774 const int esize, const int msize,
4775 sve_st1_tlb_fn *tlb_fn)
1a039c7e 4776{
9fd46c83
RH
4777 const int mmu_idx = cpu_mmu_index(env, false);
4778 intptr_t i, oprsz = simd_oprsz(desc);
1a039c7e 4779 unsigned rd = simd_data(desc);
9fd46c83
RH
4780 void *d1 = &env->vfp.zregs[rd];
4781 void *d2 = &env->vfp.zregs[(rd + 1) & 31];
4782 void *d3 = &env->vfp.zregs[(rd + 2) & 31];
1a039c7e 4783
9fd46c83
RH
4784 set_helper_retaddr(ra);
4785 for (i = 0; i < oprsz; ) {
4786 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4787 do {
4788 if (pg & 1) {
4789 tlb_fn(env, d1, i, addr, mmu_idx, ra);
4790 tlb_fn(env, d2, i, addr + msize, mmu_idx, ra);
4791 tlb_fn(env, d3, i, addr + 2 * msize, mmu_idx, ra);
4792 }
4793 i += esize, pg >>= esize;
4794 addr += 3 * msize;
4795 } while (i & 15);
1a039c7e 4796 }
9fd46c83 4797 set_helper_retaddr(0);
1a039c7e
RH
4798}
4799
9fd46c83
RH
4800static void sve_st4_r(CPUARMState *env, void *vg, target_ulong addr,
4801 uint32_t desc, const uintptr_t ra,
4802 const int esize, const int msize,
4803 sve_st1_tlb_fn *tlb_fn)
1a039c7e 4804{
9fd46c83
RH
4805 const int mmu_idx = cpu_mmu_index(env, false);
4806 intptr_t i, oprsz = simd_oprsz(desc);
1a039c7e 4807 unsigned rd = simd_data(desc);
9fd46c83
RH
4808 void *d1 = &env->vfp.zregs[rd];
4809 void *d2 = &env->vfp.zregs[(rd + 1) & 31];
4810 void *d3 = &env->vfp.zregs[(rd + 2) & 31];
4811 void *d4 = &env->vfp.zregs[(rd + 3) & 31];
1a039c7e 4812
9fd46c83
RH
4813 set_helper_retaddr(ra);
4814 for (i = 0; i < oprsz; ) {
4815 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
4816 do {
4817 if (pg & 1) {
4818 tlb_fn(env, d1, i, addr, mmu_idx, ra);
4819 tlb_fn(env, d2, i, addr + msize, mmu_idx, ra);
4820 tlb_fn(env, d3, i, addr + 2 * msize, mmu_idx, ra);
4821 tlb_fn(env, d4, i, addr + 3 * msize, mmu_idx, ra);
4822 }
4823 i += esize, pg >>= esize;
4824 addr += 4 * msize;
4825 } while (i & 15);
1a039c7e 4826 }
9fd46c83
RH
4827 set_helper_retaddr(0);
4828}
4829
4830#define DO_STN_1(N, NAME, ESIZE) \
4831void __attribute__((flatten)) HELPER(sve_st##N##NAME##_r) \
4832 (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
4833{ \
4834 sve_st##N##_r(env, vg, addr, desc, GETPC(), ESIZE, 1, \
4835 sve_st1##NAME##_tlb); \
1a039c7e 4836}
f6dbf62a 4837
9fd46c83 4838 #define DO_STN_2(N, NAME, ESIZE, MSIZE) \
28d57f2d 4839void __attribute__((flatten)) HELPER(sve_st##N##NAME##_le_r) \
9fd46c83
RH
4840 (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
4841{ \
4842 sve_st##N##_r(env, vg, addr, desc, GETPC(), ESIZE, MSIZE, \
28d57f2d
RH
4843 sve_st1##NAME##_le_tlb); \
4844} \
4845void __attribute__((flatten)) HELPER(sve_st##N##NAME##_be_r) \
4846 (CPUARMState *env, void *vg, target_ulong addr, uint32_t desc) \
4847{ \
4848 sve_st##N##_r(env, vg, addr, desc, GETPC(), ESIZE, MSIZE, \
4849 sve_st1##NAME##_be_tlb); \
9fd46c83
RH
4850}
4851
4852DO_STN_1(1, bb, 1)
4853DO_STN_1(1, bh, 2)
4854DO_STN_1(1, bs, 4)
4855DO_STN_1(1, bd, 8)
4856DO_STN_1(2, bb, 1)
4857DO_STN_1(3, bb, 1)
4858DO_STN_1(4, bb, 1)
4859
4860DO_STN_2(1, hh, 2, 2)
4861DO_STN_2(1, hs, 4, 2)
4862DO_STN_2(1, hd, 8, 2)
4863DO_STN_2(2, hh, 2, 2)
4864DO_STN_2(3, hh, 2, 2)
4865DO_STN_2(4, hh, 2, 2)
4866
4867DO_STN_2(1, ss, 4, 4)
4868DO_STN_2(1, sd, 8, 4)
4869DO_STN_2(2, ss, 4, 4)
4870DO_STN_2(3, ss, 4, 4)
4871DO_STN_2(4, ss, 4, 4)
4872
4873DO_STN_2(1, dd, 8, 8)
4874DO_STN_2(2, dd, 8, 8)
4875DO_STN_2(3, dd, 8, 8)
4876DO_STN_2(4, dd, 8, 8)
4877
4878#undef DO_STN_1
4879#undef DO_STN_2
4880
673e9fa6
RH
4881/* Loads with a vector index. */
4882
4883#define DO_LD1_ZPZ_S(NAME, TYPEI, TYPEM, FN) \
4884void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
4885 target_ulong base, uint32_t desc) \
4886{ \
4887 intptr_t i, oprsz = simd_oprsz(desc); \
4888 unsigned scale = simd_data(desc); \
4889 uintptr_t ra = GETPC(); \
628fc75f 4890 for (i = 0; i < oprsz; ) { \
673e9fa6
RH
4891 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4892 do { \
4893 TYPEM m = 0; \
4894 if (pg & 1) { \
4895 target_ulong off = *(TYPEI *)(vm + H1_4(i)); \
4896 m = FN(env, base + (off << scale), ra); \
4897 } \
4898 *(uint32_t *)(vd + H1_4(i)) = m; \
4899 i += 4, pg >>= 4; \
4900 } while (i & 15); \
4901 } \
4902}
4903
4904#define DO_LD1_ZPZ_D(NAME, TYPEI, TYPEM, FN) \
4905void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
4906 target_ulong base, uint32_t desc) \
4907{ \
4908 intptr_t i, oprsz = simd_oprsz(desc) / 8; \
4909 unsigned scale = simd_data(desc); \
4910 uintptr_t ra = GETPC(); \
4911 uint64_t *d = vd, *m = vm; uint8_t *pg = vg; \
4912 for (i = 0; i < oprsz; i++) { \
4913 TYPEM mm = 0; \
4914 if (pg[H1(i)] & 1) { \
4915 target_ulong off = (TYPEI)m[i]; \
4916 mm = FN(env, base + (off << scale), ra); \
4917 } \
4918 d[i] = mm; \
4919 } \
4920}
4921
4922DO_LD1_ZPZ_S(sve_ldbsu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
4923DO_LD1_ZPZ_S(sve_ldhsu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
4924DO_LD1_ZPZ_S(sve_ldssu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
4925DO_LD1_ZPZ_S(sve_ldbss_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
4926DO_LD1_ZPZ_S(sve_ldhss_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
4927
4928DO_LD1_ZPZ_S(sve_ldbsu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
4929DO_LD1_ZPZ_S(sve_ldhsu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
4930DO_LD1_ZPZ_S(sve_ldssu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
4931DO_LD1_ZPZ_S(sve_ldbss_zss, int32_t, int8_t, cpu_ldub_data_ra)
4932DO_LD1_ZPZ_S(sve_ldhss_zss, int32_t, int16_t, cpu_lduw_data_ra)
4933
4934DO_LD1_ZPZ_D(sve_ldbdu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
4935DO_LD1_ZPZ_D(sve_ldhdu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
4936DO_LD1_ZPZ_D(sve_ldsdu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
4937DO_LD1_ZPZ_D(sve_ldddu_zsu, uint32_t, uint64_t, cpu_ldq_data_ra)
4938DO_LD1_ZPZ_D(sve_ldbds_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
4939DO_LD1_ZPZ_D(sve_ldhds_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
4940DO_LD1_ZPZ_D(sve_ldsds_zsu, uint32_t, int32_t, cpu_ldl_data_ra)
4941
4942DO_LD1_ZPZ_D(sve_ldbdu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
4943DO_LD1_ZPZ_D(sve_ldhdu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
4944DO_LD1_ZPZ_D(sve_ldsdu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
4945DO_LD1_ZPZ_D(sve_ldddu_zss, int32_t, uint64_t, cpu_ldq_data_ra)
4946DO_LD1_ZPZ_D(sve_ldbds_zss, int32_t, int8_t, cpu_ldub_data_ra)
4947DO_LD1_ZPZ_D(sve_ldhds_zss, int32_t, int16_t, cpu_lduw_data_ra)
4948DO_LD1_ZPZ_D(sve_ldsds_zss, int32_t, int32_t, cpu_ldl_data_ra)
4949
4950DO_LD1_ZPZ_D(sve_ldbdu_zd, uint64_t, uint8_t, cpu_ldub_data_ra)
4951DO_LD1_ZPZ_D(sve_ldhdu_zd, uint64_t, uint16_t, cpu_lduw_data_ra)
4952DO_LD1_ZPZ_D(sve_ldsdu_zd, uint64_t, uint32_t, cpu_ldl_data_ra)
4953DO_LD1_ZPZ_D(sve_ldddu_zd, uint64_t, uint64_t, cpu_ldq_data_ra)
4954DO_LD1_ZPZ_D(sve_ldbds_zd, uint64_t, int8_t, cpu_ldub_data_ra)
4955DO_LD1_ZPZ_D(sve_ldhds_zd, uint64_t, int16_t, cpu_lduw_data_ra)
4956DO_LD1_ZPZ_D(sve_ldsds_zd, uint64_t, int32_t, cpu_ldl_data_ra)
4957
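/*
 * Illustration only, standalone sketch of the gather pattern generated by
 * DO_LD1_ZPZ_D above, using host memory rather than the guest address
 * space: each active 64-bit element loads from base + (index << scale),
 * and inactive elements are written as zero.  Example values are assumed.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint32_t mem[8] = { 10, 11, 12, 13, 14, 15, 16, 17 };
    uint64_t idx[4] = { 7, 0, 3, 5 };        /* offset (index) vector        */
    uint8_t  pg[4]  = { 1, 0, 1, 1 };        /* one predicate byte / element */
    uint64_t d[4];
    unsigned scale = 2;                       /* 4-byte memory elements       */

    for (int i = 0; i < 4; i++) {
        uint64_t m = 0;
        if (pg[i] & 1) {
            m = *(uint32_t *)((char *)mem + (idx[i] << scale));
        }
        d[i] = m;
    }
    for (int i = 0; i < 4; i++) {
        printf("d[%d] = %llu\n", i, (unsigned long long)d[i]);   /* 17 0 13 15 */
    }
    return 0;
}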
ed67eb7f
RH
4958/* First fault loads with a vector index. */
4959
4960#ifdef CONFIG_USER_ONLY
4961
4962#define DO_LDFF1_ZPZ(NAME, TYPEE, TYPEI, TYPEM, FN, H) \
4963void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
4964 target_ulong base, uint32_t desc) \
4965{ \
4966 intptr_t i, oprsz = simd_oprsz(desc); \
4967 unsigned scale = simd_data(desc); \
4968 uintptr_t ra = GETPC(); \
4969 bool first = true; \
4970 mmap_lock(); \
628fc75f 4971 for (i = 0; i < oprsz; ) { \
ed67eb7f
RH
4972 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
4973 do { \
4974 TYPEM m = 0; \
4975 if (pg & 1) { \
4976 target_ulong off = *(TYPEI *)(vm + H(i)); \
4977 target_ulong addr = base + (off << scale); \
4978 if (!first && \
4979 page_check_range(addr, sizeof(TYPEM), PAGE_READ)) { \
4980 record_fault(env, i, oprsz); \
4981 goto exit; \
4982 } \
4983 m = FN(env, addr, ra); \
4984 first = false; \
4985 } \
4986 *(TYPEE *)(vd + H(i)) = m; \
4987 i += sizeof(TYPEE), pg >>= sizeof(TYPEE); \
4988 } while (i & 15); \
4989 } \
4990 exit: \
4991 mmap_unlock(); \
4992}
4993
4994#else
4995
4996#define DO_LDFF1_ZPZ(NAME, TYPEE, TYPEI, TYPEM, FN, H) \
4997void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
4998 target_ulong base, uint32_t desc) \
4999{ \
5000 g_assert_not_reached(); \
5001}
5002
5003#endif
5004
5005#define DO_LDFF1_ZPZ_S(NAME, TYPEI, TYPEM, FN) \
5006 DO_LDFF1_ZPZ(NAME, uint32_t, TYPEI, TYPEM, FN, H1_4)
5007#define DO_LDFF1_ZPZ_D(NAME, TYPEI, TYPEM, FN) \
5008 DO_LDFF1_ZPZ(NAME, uint64_t, TYPEI, TYPEM, FN, )
5009
5010DO_LDFF1_ZPZ_S(sve_ldffbsu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
5011DO_LDFF1_ZPZ_S(sve_ldffhsu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
5012DO_LDFF1_ZPZ_S(sve_ldffssu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
5013DO_LDFF1_ZPZ_S(sve_ldffbss_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
5014DO_LDFF1_ZPZ_S(sve_ldffhss_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
5015
5016DO_LDFF1_ZPZ_S(sve_ldffbsu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
5017DO_LDFF1_ZPZ_S(sve_ldffhsu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
5018DO_LDFF1_ZPZ_S(sve_ldffssu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
5019DO_LDFF1_ZPZ_S(sve_ldffbss_zss, int32_t, int8_t, cpu_ldub_data_ra)
5020DO_LDFF1_ZPZ_S(sve_ldffhss_zss, int32_t, int16_t, cpu_lduw_data_ra)
5021
5022DO_LDFF1_ZPZ_D(sve_ldffbdu_zsu, uint32_t, uint8_t, cpu_ldub_data_ra)
5023DO_LDFF1_ZPZ_D(sve_ldffhdu_zsu, uint32_t, uint16_t, cpu_lduw_data_ra)
5024DO_LDFF1_ZPZ_D(sve_ldffsdu_zsu, uint32_t, uint32_t, cpu_ldl_data_ra)
5025DO_LDFF1_ZPZ_D(sve_ldffddu_zsu, uint32_t, uint64_t, cpu_ldq_data_ra)
5026DO_LDFF1_ZPZ_D(sve_ldffbds_zsu, uint32_t, int8_t, cpu_ldub_data_ra)
5027DO_LDFF1_ZPZ_D(sve_ldffhds_zsu, uint32_t, int16_t, cpu_lduw_data_ra)
5028DO_LDFF1_ZPZ_D(sve_ldffsds_zsu, uint32_t, int32_t, cpu_ldl_data_ra)
5029
5030DO_LDFF1_ZPZ_D(sve_ldffbdu_zss, int32_t, uint8_t, cpu_ldub_data_ra)
5031DO_LDFF1_ZPZ_D(sve_ldffhdu_zss, int32_t, uint16_t, cpu_lduw_data_ra)
5032DO_LDFF1_ZPZ_D(sve_ldffsdu_zss, int32_t, uint32_t, cpu_ldl_data_ra)
5033DO_LDFF1_ZPZ_D(sve_ldffddu_zss, int32_t, uint64_t, cpu_ldq_data_ra)
5034DO_LDFF1_ZPZ_D(sve_ldffbds_zss, int32_t, int8_t, cpu_ldub_data_ra)
5035DO_LDFF1_ZPZ_D(sve_ldffhds_zss, int32_t, int16_t, cpu_lduw_data_ra)
5036DO_LDFF1_ZPZ_D(sve_ldffsds_zss, int32_t, int32_t, cpu_ldl_data_ra)
5037
5038DO_LDFF1_ZPZ_D(sve_ldffbdu_zd, uint64_t, uint8_t, cpu_ldub_data_ra)
5039DO_LDFF1_ZPZ_D(sve_ldffhdu_zd, uint64_t, uint16_t, cpu_lduw_data_ra)
5040DO_LDFF1_ZPZ_D(sve_ldffsdu_zd, uint64_t, uint32_t, cpu_ldl_data_ra)
5041DO_LDFF1_ZPZ_D(sve_ldffddu_zd, uint64_t, uint64_t, cpu_ldq_data_ra)
5042DO_LDFF1_ZPZ_D(sve_ldffbds_zd, uint64_t, int8_t, cpu_ldub_data_ra)
5043DO_LDFF1_ZPZ_D(sve_ldffhds_zd, uint64_t, int16_t, cpu_lduw_data_ra)
5044DO_LDFF1_ZPZ_D(sve_ldffsds_zd, uint64_t, int32_t, cpu_ldl_data_ra)
5045
f6dbf62a
RH
5046/* Stores with a vector index. */
5047
5048#define DO_ST1_ZPZ_S(NAME, TYPEI, FN) \
5049void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
5050 target_ulong base, uint32_t desc) \
5051{ \
5052 intptr_t i, oprsz = simd_oprsz(desc); \
5053 unsigned scale = simd_data(desc); \
5054 uintptr_t ra = GETPC(); \
5055 for (i = 0; i < oprsz; ) { \
5056 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
5057 do { \
5058 if (likely(pg & 1)) { \
5059 target_ulong off = *(TYPEI *)(vm + H1_4(i)); \
5060 uint32_t d = *(uint32_t *)(vd + H1_4(i)); \
5061 FN(env, base + (off << scale), d, ra); \
5062 } \
5063 i += sizeof(uint32_t), pg >>= sizeof(uint32_t); \
5064 } while (i & 15); \
5065 } \
5066}
5067
5068#define DO_ST1_ZPZ_D(NAME, TYPEI, FN) \
5069void HELPER(NAME)(CPUARMState *env, void *vd, void *vg, void *vm, \
5070 target_ulong base, uint32_t desc) \
5071{ \
5072 intptr_t i, oprsz = simd_oprsz(desc) / 8; \
5073 unsigned scale = simd_data(desc); \
5074 uintptr_t ra = GETPC(); \
5075 uint64_t *d = vd, *m = vm; uint8_t *pg = vg; \
5076 for (i = 0; i < oprsz; i++) { \
5077 if (likely(pg[H1(i)] & 1)) { \
5078 target_ulong off = (target_ulong)(TYPEI)m[i] << scale; \
5079 FN(env, base + off, d[i], ra); \
5080 } \
5081 } \
5082}
5083
5084DO_ST1_ZPZ_S(sve_stbs_zsu, uint32_t, cpu_stb_data_ra)
5085DO_ST1_ZPZ_S(sve_sths_zsu, uint32_t, cpu_stw_data_ra)
5086DO_ST1_ZPZ_S(sve_stss_zsu, uint32_t, cpu_stl_data_ra)
5087
5088DO_ST1_ZPZ_S(sve_stbs_zss, int32_t, cpu_stb_data_ra)
5089DO_ST1_ZPZ_S(sve_sths_zss, int32_t, cpu_stw_data_ra)
5090DO_ST1_ZPZ_S(sve_stss_zss, int32_t, cpu_stl_data_ra)
5091
5092DO_ST1_ZPZ_D(sve_stbd_zsu, uint32_t, cpu_stb_data_ra)
5093DO_ST1_ZPZ_D(sve_sthd_zsu, uint32_t, cpu_stw_data_ra)
5094DO_ST1_ZPZ_D(sve_stsd_zsu, uint32_t, cpu_stl_data_ra)
5095DO_ST1_ZPZ_D(sve_stdd_zsu, uint32_t, cpu_stq_data_ra)
5096
5097DO_ST1_ZPZ_D(sve_stbd_zss, int32_t, cpu_stb_data_ra)
5098DO_ST1_ZPZ_D(sve_sthd_zss, int32_t, cpu_stw_data_ra)
5099DO_ST1_ZPZ_D(sve_stsd_zss, int32_t, cpu_stl_data_ra)
5100DO_ST1_ZPZ_D(sve_stdd_zss, int32_t, cpu_stq_data_ra)
5101
5102DO_ST1_ZPZ_D(sve_stbd_zd, uint64_t, cpu_stb_data_ra)
5103DO_ST1_ZPZ_D(sve_sthd_zd, uint64_t, cpu_stw_data_ra)
5104DO_ST1_ZPZ_D(sve_stsd_zd, uint64_t, cpu_stl_data_ra)
5105DO_ST1_ZPZ_D(sve_stdd_zd, uint64_t, cpu_stq_data_ra)