/*
 * ARM SVE Operations
 *
 * Copyright (c) 2018 Linaro, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "internals.h"
#include "exec/exec-all.h"
#include "exec/cpu_ldst.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "vec_internal.h"

/* Note that vector data is stored in host-endian 64-bit chunks,
   so addressing units smaller than that needs a host-endian fixup.  */
#ifdef HOST_WORDS_BIGENDIAN
#define H1(x)   ((x) ^ 7)
#define H1_2(x) ((x) ^ 6)
#define H1_4(x) ((x) ^ 4)
#define H2(x)   ((x) ^ 3)
#define H4(x)   ((x) ^ 1)
#else
#define H1(x)   (x)
#define H1_2(x) (x)
#define H1_4(x) (x)
#define H2(x)   (x)
#define H4(x)   (x)
#endif

/* Return a value for NZCV as per the ARM PredTest pseudofunction.
 *
 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
 * and bit 0 set if C is set.  Compare the definitions of these variables
 * within CPUARMState.
 */

/* For no G bits set, NZCV = C.  */
#define PREDTEST_INIT  1

/* This is an iterative function, called for each Pd and Pg word
 * moving forward.
 */
static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute N from first D & G.
           Use bit 2 to signal first G bit seen.  */
        if (!(flags & 4)) {
            flags |= ((d & (g & -g)) != 0) << 31;
            flags |= 4;
        }

        /* Accumulate Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute C from last !(D & G).  Replace previous.  */
        flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
    }
    return flags;
}

/* This is an iterative function, called for each Pd and Pg word
 * moving backward.
 */
static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute C from first (i.e. last) !(D & G).
           Use bit 2 to signal first G bit seen.  */
        if (!(flags & 4)) {
            flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
            flags |= (d & pow2floor(g)) == 0;
        }

        /* Accumulate Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute N from last (i.e. first) D & G.  Replace previous.  */
        flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
    }
    return flags;
}

/* The same for a single word predicate.  */
uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
{
    return iter_predtest_fwd(d, g, PREDTEST_INIT);
}

/* The same for a multi-word predicate.  */
uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
{
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg;
    uintptr_t i = 0;

    do {
        flags = iter_predtest_fwd(d[i], g[i], flags);
    } while (++i < words);

    return flags;
}
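
/*
 * Worked example of the PredTest flags, assuming g = 0x05 (elements 0
 * and 2 active) and d = 0x01 (element 0 true, element 2 false):
 *   N: the first active bit (g & -g == 1) is set in d, so bit 31 is set;
 *   Z: d & g != 0, so bit 1 is set, i.e. the Z flag is clear;
 *   C: the last active bit (pow2floor(g) == 4) is clear in d, so bit 0
 *      is set, i.e. the C flag is set.
 * Bit 2 is internal bookkeeping recording that an active bit was seen.
 */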

/* Expand active predicate bits to bytes, for byte elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 */
static inline uint64_t expand_pred_b(uint8_t byte)
{
    static const uint64_t word[256] = {
        0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
        0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
        0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
        0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
        0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
        0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
        0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
        0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
        0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
        0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
        0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
        0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
        0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
        0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
        0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
        0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
        0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
        0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
        0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
        0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
        0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
        0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
        0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
        0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
        0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
        0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
        0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
        0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
        0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
        0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
        0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
        0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
        0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
        0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
        0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
        0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
        0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
        0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
        0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
        0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
        0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
        0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
        0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
        0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
        0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
        0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
        0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
        0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
        0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
        0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
        0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
        0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
        0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
        0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
        0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
        0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
        0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
        0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
        0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
        0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
        0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
        0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
        0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
        0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
        0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
        0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
        0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
        0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
        0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
        0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
        0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
        0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
        0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
        0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
        0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
        0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
        0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
        0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
        0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
        0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
        0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
        0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
        0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
        0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
        0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
        0xffffffffffffffff,
    };
    return word[byte];
}

/* Similarly for half-word elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      if (i & 0xaa) {
 *          continue;
 *      }
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 *  }
 */
static inline uint64_t expand_pred_h(uint8_t byte)
{
    static const uint64_t word[] = {
        [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
        [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
        [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
        [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
        [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
        [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
        [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
        [0x55] = 0xffffffffffffffff,
    };
    return word[byte & 0x55];
}

/* Similarly for single word elements.  */
static inline uint64_t expand_pred_s(uint8_t byte)
{
    static const uint64_t word[] = {
        [0x01] = 0x00000000ffffffffull,
        [0x10] = 0xffffffff00000000ull,
        [0x11] = 0xffffffffffffffffull,
    };
    return word[byte & 0x11];
}
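
/*
 * For example, expand_pred_b(0x0f) yields 0x00000000ffffffff: predicate
 * bits 0..3 become full bytes 0..3.  For the wider elements only every
 * second (or fourth) predicate bit is significant, hence the & 0x55 and
 * & 0x11 masks above.
 */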

/* Swap 16-bit words within a 32-bit word.  */
static inline uint32_t hswap32(uint32_t h)
{
    return rol32(h, 16);
}

/* Swap 16-bit words within a 64-bit word.  */
static inline uint64_t hswap64(uint64_t h)
{
    uint64_t m = 0x0000ffff0000ffffull;
    h = rol64(h, 32);
    return ((h & m) << 16) | ((h >> 16) & m);
}

/* Swap 32-bit words within a 64-bit word.  */
static inline uint64_t wswap64(uint64_t h)
{
    return rol64(h, 32);
}

#define LOGICAL_PPPP(NAME, FUNC)                                          \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
{                                                                         \
    uintptr_t opr_sz = simd_oprsz(desc);                                  \
    uint64_t *d = vd, *n = vn, *m = vm, *g = vg;                          \
    uintptr_t i;                                                          \
    for (i = 0; i < opr_sz / 8; ++i) {                                    \
        d[i] = FUNC(n[i], m[i], g[i]);                                    \
    }                                                                     \
}

#define DO_AND(N, M, G)  (((N) & (M)) & (G))
#define DO_BIC(N, M, G)  (((N) & ~(M)) & (G))
#define DO_EOR(N, M, G)  (((N) ^ (M)) & (G))
#define DO_ORR(N, M, G)  (((N) | (M)) & (G))
#define DO_ORN(N, M, G)  (((N) | ~(M)) & (G))
#define DO_NOR(N, M, G)  (~((N) | (M)) & (G))
#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
#define DO_SEL(N, M, G)  (((N) & (G)) | ((M) & ~(G)))

LOGICAL_PPPP(sve_and_pppp, DO_AND)
LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
LOGICAL_PPPP(sve_nand_pppp, DO_NAND)

#undef DO_AND
#undef DO_BIC
#undef DO_EOR
#undef DO_ORR
#undef DO_ORN
#undef DO_NOR
#undef DO_NAND
#undef DO_SEL
#undef LOGICAL_PPPP

/* Fully general three-operand expander, controlled by a predicate.
 * This is complicated by the host-endian storage of the register file.
 */
/* ??? I don't expect the compiler could ever vectorize this itself.
 * With some tables we can convert bit masks to byte masks, and with
 * extra care wrt byte/word ordering we could use gcc generic vectors
 * and do 16 bytes at a time.
 */
#define DO_ZPZZ(NAME, TYPE, H, OP)                                        \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    for (i = 0; i < opr_sz; ) {                                           \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                   \
        do {                                                              \
            if (pg & 1) {                                                 \
                TYPE nn = *(TYPE *)(vn + H(i));                           \
                TYPE mm = *(TYPE *)(vm + H(i));                           \
                *(TYPE *)(vd + H(i)) = OP(nn, mm);                        \
            }                                                             \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                       \
        } while (i & 15);                                                 \
    }                                                                     \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZZ_D(NAME, TYPE, OP)                                         \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                            \
    TYPE *d = vd, *n = vn, *m = vm;                                       \
    uint8_t *pg = vg;                                                     \
    for (i = 0; i < opr_sz; i += 1) {                                     \
        if (pg[H1(i)] & 1) {                                              \
            TYPE nn = n[i], mm = m[i];                                    \
            d[i] = OP(nn, mm);                                            \
        }                                                                 \
    }                                                                     \
}
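
/*
 * In DO_ZPZZ above, the governing predicate has one bit per byte of the
 * vector; only the least significant bit of each element's field is
 * significant.  Each 16-bit load of pg therefore covers one 16-byte
 * chunk (hence "while (i & 15)"), and "pg >>= sizeof(TYPE)" steps over
 * the predicate bits belonging to the element just processed.
 */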

#define DO_AND(N, M)  (N & M)
#define DO_EOR(N, M)  (N ^ M)
#define DO_ORR(N, M)  (N | M)
#define DO_BIC(N, M)  (N & ~M)
#define DO_ADD(N, M)  (N + M)
#define DO_SUB(N, M)  (N - M)
#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
#define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
#define DO_MUL(N, M)  (N * M)

/*
 * We must avoid the C undefined behaviour cases: division by
 * zero and signed division of INT_MIN by -1.  Both of these
 * have architecturally defined required results for Arm.
 * We special case all signed divisions by -1 to avoid having
 * to deduce the minimum integer for the type involved.
 */
#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)

DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)

DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)

DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)

DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)

DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)

DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)

DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)

DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)

/* Because the computation type is at least twice as large as required,
   these work for both signed and unsigned source types.  */
static inline uint8_t do_mulh_b(int32_t n, int32_t m)
{
    return (n * m) >> 8;
}

static inline uint16_t do_mulh_h(int32_t n, int32_t m)
{
    return (n * m) >> 16;
}

static inline uint32_t do_mulh_s(int64_t n, int64_t m)
{
    return (n * m) >> 32;
}

static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    muls64(&lo, &hi, n, m);
    return hi;
}

static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    mulu64(&lo, &hi, n, m);
    return hi;
}

DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)

DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)

DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)

DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)

DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)

/* Note that all bits of the shift are significant
   and not modulo the element size.  */
#define DO_ASR(N, M)  (N >> MIN(M, sizeof(N) * 8 - 1))
#define DO_LSR(N, M)  (M < sizeof(N) * 8 ? N >> M : 0)
#define DO_LSL(N, M)  (M < sizeof(N) * 8 ? N << M : 0)
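
/*
 * For example, with uint8_t operands DO_LSR(n, 8) is 0, while DO_ASR
 * clamps the shift count to 7 so that DO_ASR(-1, 200) is still -1:
 * shifts beyond the element width produce the architecturally required
 * result rather than C undefined behaviour.
 */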

DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)

DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)

DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)

DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)

static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
{
    int8_t n1 = n, n2 = n >> 8;
    return m + n1 + n2;
}

static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
{
    int16_t n1 = n, n2 = n >> 16;
    return m + n1 + n2;
}

static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
{
    int32_t n1 = n, n2 = n >> 32;
    return m + n1 + n2;
}

DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)

static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
{
    uint8_t n1 = n, n2 = n >> 8;
    return m + n1 + n2;
}

static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
{
    uint16_t n1 = n, n2 = n >> 16;
    return m + n1 + n2;
}

static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
{
    uint32_t n1 = n, n2 = n >> 32;
    return m + n1 + n2;
}

DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)

#define do_srshl_b(n, m) do_sqrshl_bhs(n, m, 8, true, NULL)
#define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL)
#define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL)
#define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL)

DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)

#define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
#define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
#define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL)
#define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL)

DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)

/*
 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
 * We pass in a pointer to a dummy saturation field to trigger
 * the saturating arithmetic but discard the information about
 * whether it has occurred.
 */
#define do_sqshl_b(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
#define do_sqshl_h(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
#define do_sqshl_s(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
#define do_sqshl_d(n, m) \
    ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })

DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1, do_sqshl_b)
DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)

#define do_uqshl_b(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
#define do_uqshl_h(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
#define do_uqshl_s(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
#define do_uqshl_d(n, m) \
    ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })

DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1, do_uqshl_b)
DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)

#define do_sqrshl_b(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
#define do_sqrshl_h(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
#define do_sqrshl_s(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
#define do_sqrshl_d(n, m) \
    ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })

DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1, do_sqrshl_b)
DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)

#undef do_sqrshl_d

#define do_uqrshl_b(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
#define do_uqrshl_h(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
#define do_uqrshl_s(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
#define do_uqrshl_d(n, m) \
    ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })

DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1, do_uqrshl_b)
DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)

#undef do_uqrshl_d

#define DO_HADD_BHS(n, m)  (((int64_t)n + m) >> 1)
#define DO_HADD_D(n, m)    ((n >> 1) + (m >> 1) + (n & m & 1))
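
/*
 * For the byte/half/word forms, widening to int64_t means n + m cannot
 * overflow before the halving.  No 128-bit type is assumed for the
 * 64-bit form; DO_HADD_D instead halves each operand and adds back the
 * carry of the two low bits: e.g. in the unsigned doubleword form,
 * n = m = UINT64_MAX gives (2^63 - 1) + (2^63 - 1) + 1 = UINT64_MAX,
 * the exact average.
 */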

DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)

DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)

#define DO_RHADD_BHS(n, m)  (((int64_t)n + m + 1) >> 1)
#define DO_RHADD_D(n, m)    ((n >> 1) + (m >> 1) + ((n | m) & 1))

DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)

DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)

#define DO_HSUB_BHS(n, m)  (((int64_t)n - m) >> 1)
#define DO_HSUB_D(n, m)    ((n >> 1) - (m >> 1) - (~n & m & 1))
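
/*
 * The borrow term (~n & m & 1) restores the low bit lost by shifting
 * each operand separately: e.g. n = 0, m = 1 gives 0 - 0 - 1 = -1,
 * matching ((int64_t)0 - 1) >> 1, while n = 1, m = 1 gives 0.
 */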

DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)

DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)

static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
{
    return val >= max ? max : val <= min ? min : val;
}

#define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)

static inline int64_t do_sqadd_d(int64_t n, int64_t m)
{
    int64_t r = n + m;
    if (((r ^ n) & ~(n ^ m)) < 0) {
        /* Signed overflow.  */
        return r < 0 ? INT64_MAX : INT64_MIN;
    }
    return r;
}
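
/*
 * Overflow check: (r ^ n) has the sign bit set when the result sign
 * differs from n, and ~(n ^ m) has it set when both inputs share a
 * sign; signed addition can only overflow in that combination.  E.g.
 * n = m = 0x4000000000000000 gives r = INT64_MIN, the test fires, and
 * the positive saturation INT64_MAX is returned.
 */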

DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)

#define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
#define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
#define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)

static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
{
    uint64_t r = n + m;
    return r < n ? UINT64_MAX : r;
}

DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)

#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)

static inline int64_t do_sqsub_d(int64_t n, int64_t m)
{
    int64_t r = n - m;
    if (((r ^ n) & (n ^ m)) < 0) {
        /* Signed overflow.  */
        return r < 0 ? INT64_MAX : INT64_MIN;
    }
    return r;
}

DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)

#define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
#define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
#define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)

static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
{
    return n > m ? n - m : 0;
}

DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)

#define DO_SUQADD_B(n, m) \
    do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SUQADD_H(n, m) \
    do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SUQADD_S(n, m) \
    do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)

static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
{
    uint64_t r = n + m;

    if (n < 0) {
        /* Note that m - abs(n) cannot underflow. */
        if (r > INT64_MAX) {
            /* Result is either very large positive or negative. */
            if (m > -n) {
                /* m > abs(n), so r is a very large positive. */
                return INT64_MAX;
            }
            /* Result is negative. */
        }
    } else {
        /* Both inputs are positive: check for overflow. */
        if (r < m || r > INT64_MAX) {
            return INT64_MAX;
        }
    }
    return r;
}

DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)

#define DO_USQADD_B(n, m) \
    do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
#define DO_USQADD_H(n, m) \
    do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
#define DO_USQADD_S(n, m) \
    do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)

static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
{
    uint64_t r = n + m;

    if (m < 0) {
        return n < -m ? 0 : r;
    }
    return r < n ? UINT64_MAX : r;
}

DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)

#undef DO_ZPZZ
#undef DO_ZPZZ_D

/*
 * Three operand expander, operating on element pairs.
 * If the slot I is even, the elements come from VN {I, I+1}.
 * If the slot I is odd, the elements come from VM {I-1, I}.
 * Load all of the input elements in each pair before overwriting output.
 */
#define DO_ZPZZ_PAIR(NAME, TYPE, H, OP)                                   \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    for (i = 0; i < opr_sz; ) {                                           \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                   \
        do {                                                              \
            TYPE n0 = *(TYPE *)(vn + H(i));                               \
            TYPE m0 = *(TYPE *)(vm + H(i));                               \
            TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));                \
            TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));                \
            if (pg & 1) {                                                 \
                *(TYPE *)(vd + H(i)) = OP(n0, n1);                        \
            }                                                             \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                       \
            if (pg & 1) {                                                 \
                *(TYPE *)(vd + H(i)) = OP(m0, m1);                        \
            }                                                             \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                       \
        } while (i & 15);                                                 \
    }                                                                     \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZZ_PAIR_D(NAME, TYPE, OP)                                    \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                            \
    TYPE *d = vd, *n = vn, *m = vm;                                       \
    uint8_t *pg = vg;                                                     \
    for (i = 0; i < opr_sz; i += 2) {                                     \
        TYPE n0 = n[i], n1 = n[i + 1];                                    \
        TYPE m0 = m[i], m1 = m[i + 1];                                    \
        if (pg[H1(i)] & 1) {                                              \
            d[i] = OP(n0, n1);                                            \
        }                                                                 \
        if (pg[H1(i + 1)] & 1) {                                          \
            d[i + 1] = OP(m0, m1);                                        \
        }                                                                 \
    }                                                                     \
}

DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)

#undef DO_ZPZZ_PAIR
#undef DO_ZPZZ_PAIR_D

/* Three-operand expander, controlled by a predicate, in which the
 * third operand is "wide".  That is, for D = N op M, the same 64-bit
 * value of M is used with all of the narrower values of N.
 */
#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP)                                 \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    for (i = 0; i < opr_sz; ) {                                           \
        uint8_t pg = *(uint8_t *)(vg + H1(i >> 3));                       \
        TYPEW mm = *(TYPEW *)(vm + i);                                    \
        do {                                                              \
            if (pg & 1) {                                                 \
                TYPE nn = *(TYPE *)(vn + H(i));                           \
                *(TYPE *)(vd + H(i)) = OP(nn, mm);                        \
            }                                                             \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                       \
        } while (i & 7);                                                  \
    }                                                                     \
}

DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZPZW

/* Fully general two-operand expander, controlled by a predicate.
 */
#define DO_ZPZ(NAME, TYPE, H, OP)                                         \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)            \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    for (i = 0; i < opr_sz; ) {                                           \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                   \
        do {                                                              \
            if (pg & 1) {                                                 \
                TYPE nn = *(TYPE *)(vn + H(i));                           \
                *(TYPE *)(vd + H(i)) = OP(nn);                            \
            }                                                             \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                       \
        } while (i & 15);                                                 \
    }                                                                     \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZ_D(NAME, TYPE, OP)                                          \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)            \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                            \
    TYPE *d = vd, *n = vn;                                                \
    uint8_t *pg = vg;                                                     \
    for (i = 0; i < opr_sz; i += 1) {                                     \
        if (pg[H1(i)] & 1) {                                              \
            TYPE nn = n[i];                                               \
            d[i] = OP(nn);                                                \
        }                                                                 \
    }                                                                     \
}

#define DO_CLS_B(N)   (clrsb32(N) - 24)
#define DO_CLS_H(N)   (clrsb32(N) - 16)

DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)

#define DO_CLZ_B(N)   (clz32(N) - 24)
#define DO_CLZ_H(N)   (clz32(N) - 16)

DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
DO_ZPZ_D(sve_clz_d, uint64_t, clz64)

DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)

#define DO_CNOT(N)    (N == 0)

DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)

#define DO_FABS(N)    (N & ((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)

#define DO_FNEG(N)    (N ^ ~((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
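
/*
 * In DO_FABS and DO_FNEG above, ((__typeof(N))-1 >> 1) is an all-ones
 * value of the element type with the top bit clear, e.g. 0x7fff for
 * uint16_t.  ANDing with it clears the IEEE sign bit (fabs); XORing
 * with its complement flips that bit (fneg), leaving the exponent and
 * fraction untouched.
 */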

#define DO_NOT(N)    (~N)

DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)

#define DO_SXTB(N)    ((int8_t)N)
#define DO_SXTH(N)    ((int16_t)N)
#define DO_SXTS(N)    ((int32_t)N)
#define DO_UXTB(N)    ((uint8_t)N)
#define DO_UXTH(N)    ((uint16_t)N)
#define DO_UXTS(N)    ((uint32_t)N)

DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)

DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)

#define DO_ABS(N)    (N < 0 ? -N : N)

DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)

#define DO_NEG(N)    (-N)

DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)

DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)

DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)

DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)

DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)

#define DO_SQABS(X) \
    ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
       x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })

DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)

#define DO_SQNEG(X) \
    ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
       x_ == min_ ? -min_ - 1 : -x_; })

DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)

DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)

/* Three-operand expander, unpredicated, in which the third operand is "wide".
 */
#define DO_ZZW(NAME, TYPE, TYPEW, H, OP)                                  \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)            \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    for (i = 0; i < opr_sz; ) {                                           \
        TYPEW mm = *(TYPEW *)(vm + i);                                    \
        do {                                                              \
            TYPE nn = *(TYPE *)(vn + H(i));                               \
            *(TYPE *)(vd + H(i)) = OP(nn, mm);                            \
            i += sizeof(TYPE);                                            \
        } while (i & 7);                                                  \
    }                                                                     \
}

DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZZW

/*
 * Three-operand expander, unpredicated, in which the two inputs are
 * selected from the top or bottom half of the wide column.
 */
#define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP)                         \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)            \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);       \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN);   \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                         \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                         \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                         \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                              \
    }                                                                     \
}
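
/*
 * The two desc bits select, independently for VN and VM, whether the
 * even (bottom) or odd (top) narrow elements feed each wide result:
 * sel1 and sel2 are either 0 or sizeof(TYPEN), i.e. the byte offset of
 * the low or high narrow element within its wide column.
 */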

DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t,     , H1_4, DO_ADD)

DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t,     , H1_4, DO_SUB)

DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t,     , H1_4, DO_ABD)

DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t,     , H1_4, DO_ADD)

DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t,     , H1_4, DO_SUB)

DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t,     , H1_4, DO_ABD)

DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t,     , H1_4, DO_MUL)

DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t,     , H1_4, DO_MUL)

/* Note that the multiply cannot overflow, but the doubling can. */
static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
{
    int16_t val = n * m;
    return DO_SQADD_H(val, val);
}

static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
{
    int32_t val = n * m;
    return DO_SQADD_S(val, val);
}

static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
{
    int64_t val = n * m;
    return do_sqadd_d(val, val);
}

DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t,     , H1_4, do_sqdmull_d)

#undef DO_ZZZ_TB

#define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP)                        \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)            \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);       \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                         \
        TYPEW nn = *(TYPEW *)(vn + HW(i));                                \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                         \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                              \
    }                                                                     \
}

DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t,     , H1_4, DO_ADD)

DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t,     , H1_4, DO_SUB)

DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t,     , H1_4, DO_ADD)

DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t,     , H1_4, DO_SUB)

#undef DO_ZZZ_WTB

/* Two-operand reduction expander, controlled by a predicate.
 * The difference between TYPERED and TYPERET has to do with
 * sign-extension.  E.g. for SMAX, TYPERED must be signed,
 * but TYPERET must be unsigned so that e.g. a 32-bit value
 * is not sign-extended to the ABI uint64_t return type.
 */
/* ??? If we were to vectorize this by hand the reduction ordering
 * would change.  For integer operands, this is perfectly fine.
 */
#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP)              \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)                  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    TYPERED ret = INIT;                                                   \
    for (i = 0; i < opr_sz; ) {                                           \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                   \
        do {                                                              \
            if (pg & 1) {                                                 \
                TYPEELT nn = *(TYPEELT *)(vn + H(i));                     \
                ret = OP(ret, nn);                                        \
            }                                                             \
            i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT);                 \
        } while (i & 15);                                                 \
    }                                                                     \
    return (TYPERET)ret;                                                  \
}

#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP)                            \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)                  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                            \
    TYPEE *n = vn;                                                        \
    uint8_t *pg = vg;                                                     \
    TYPER ret = INIT;                                                     \
    for (i = 0; i < opr_sz; i += 1) {                                     \
        if (pg[H1(i)] & 1) {                                              \
            TYPEE nn = n[i];                                              \
            ret = OP(ret, nn);                                            \
        }                                                                 \
    }                                                                     \
    return ret;                                                           \
}
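
/*
 * For example, sve_smaxv_s reduces in int32_t so that DO_MAX compares
 * signed values, but returns uint32_t: a result of -1 comes back as
 * 0x00000000ffffffff in the uint64_t ABI slot rather than being
 * sign-extended.
 */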

DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)

DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)

DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)

DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)

DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)

DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)

DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)

DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)

DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)

#undef DO_VPZ
#undef DO_VPZ_D

/* Two vector operand, one scalar operand, unpredicated.  */
#define DO_ZZI(NAME, TYPE, OP)                                            \
void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc)        \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);                 \
    TYPE s = s64, *d = vd, *n = vn;                                       \
    for (i = 0; i < opr_sz; ++i) {                                        \
        d[i] = OP(n[i], s);                                               \
    }                                                                     \
}

#define DO_SUBR(X, Y) (Y - X)

DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)

DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)

DO_ZZI(sve_smini_b, int8_t, DO_MIN)
DO_ZZI(sve_smini_h, int16_t, DO_MIN)
DO_ZZI(sve_smini_s, int32_t, DO_MIN)
DO_ZZI(sve_smini_d, int64_t, DO_MIN)

DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)

DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
DO_ZZI(sve_umini_d, uint64_t, DO_MIN)

#undef DO_ZZI

/* Similar to the ARM LastActiveElement pseudocode function, except the
   result is multiplied by the element size.  This includes the not found
   indication; e.g. not found for esz=3 is -8.  */
static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
{
    uint64_t mask = pred_esz_masks[esz];
    intptr_t i = words;

    do {
        uint64_t this_g = g[--i] & mask;
        if (this_g) {
            return i * 64 + (63 - clz64(this_g));
        }
    } while (i > 0);
    return (intptr_t)-1 << esz;
}

uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
{
    intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg;
    intptr_t i = 0;

    do {
        uint64_t this_d = d[i];
        uint64_t this_g = g[i];

        if (this_g) {
            if (!(flags & 4)) {
                /* Set in D the first bit of G.  */
                this_d |= this_g & -this_g;
                d[i] = this_d;
            }
            flags = iter_predtest_fwd(this_d, this_g, flags);
        }
    } while (++i < words);

    return flags;
}

uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
{
    intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
    intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg, esz_mask;
    intptr_t i, next;

    next = last_active_element(vd, words, esz) + (1 << esz);
    esz_mask = pred_esz_masks[esz];

    /* Similar to the pseudocode for pnext, but scaled by ESZ
       so that we find the correct bit.  */
    if (next < words * 64) {
        uint64_t mask = -1;

        if (unlikely(next & 63)) {
            mask = ~((1ull << (next & 63)) - 1);
            next &= -64;
        }
        do {
            uint64_t this_g = g[next / 64] & esz_mask & mask;
            if (this_g != 0) {
                next = (next & -64) + ctz64(this_g);
                break;
            }
            next += 64;
            mask = -1;
        } while (next < words * 64);
    }

    i = 0;
    do {
        uint64_t this_d = 0;
        if (i == next / 64) {
            this_d = 1ull << (next & 63);
        }
        d[i] = this_d;
        flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
    } while (++i < words);

    return flags;
}

/*
 * Copy Zn into Zd, and store zero into inactive elements.
 * If inv, store zeros into the active elements.
 */
void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
    }
}

void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
    }
}

void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
    }
}

void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;
    uint8_t inv = simd_data(desc);

    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
    }
}

/* Three-operand expander, immediate operand, controlled by a predicate.
 */
#define DO_ZPZI(NAME, TYPE, H, OP)                                        \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)            \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    TYPE imm = simd_data(desc);                                           \
    for (i = 0; i < opr_sz; ) {                                           \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                   \
        do {                                                              \
            if (pg & 1) {                                                 \
                TYPE nn = *(TYPE *)(vn + H(i));                           \
                *(TYPE *)(vd + H(i)) = OP(nn, imm);                       \
            }                                                             \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                       \
        } while (i & 15);                                                 \
    }                                                                     \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZI_D(NAME, TYPE, OP)                                         \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)            \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                            \
    TYPE *d = vd, *n = vn;                                                \
    TYPE imm = simd_data(desc);                                           \
    uint8_t *pg = vg;                                                     \
    for (i = 0; i < opr_sz; i += 1) {                                     \
        if (pg[H1(i)] & 1) {                                              \
            TYPE nn = n[i];                                               \
            d[i] = OP(nn, imm);                                           \
        }                                                                 \
    }                                                                     \
}

#define DO_SHR(N, M)  (N >> M)
#define DO_SHL(N, M)  (N << M)

/* Arithmetic shift right for division.  This rounds negative numbers
   toward zero as per signed division.  Therefore before shifting,
   when N is negative, add 2**M-1.  */
#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
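
/*
 * For example, with N = -5 and M = 1: the bias (1 << 1) - 1 = 1 is
 * added first, so (-5 + 1) >> 1 = -2, matching C's truncating -5 / 2.
 * A plain arithmetic shift would give -3, rounding toward minus
 * infinity instead.
 */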

DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)

DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)

DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)

DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)

/* Fully general four-operand expander, controlled by a predicate.
 */
#define DO_ZPZZZ(NAME, TYPE, H, OP)                                       \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,                 \
                  void *vg, uint32_t desc)                                \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    for (i = 0; i < opr_sz; ) {                                           \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                   \
        do {                                                              \
            if (pg & 1) {                                                 \
                TYPE nn = *(TYPE *)(vn + H(i));                           \
                TYPE mm = *(TYPE *)(vm + H(i));                           \
                TYPE aa = *(TYPE *)(va + H(i));                           \
                *(TYPE *)(vd + H(i)) = OP(aa, nn, mm);                    \
            }                                                             \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                       \
        } while (i & 15);                                                 \
    }                                                                     \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZZZ_D(NAME, TYPE, OP)                                        \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,                 \
                  void *vg, uint32_t desc)                                \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                            \
    TYPE *d = vd, *a = va, *n = vn, *m = vm;                              \
    uint8_t *pg = vg;                                                     \
    for (i = 0; i < opr_sz; i += 1) {                                     \
        if (pg[H1(i)] & 1) {                                              \
            TYPE aa = a[i], nn = n[i], mm = m[i];                         \
            d[i] = OP(aa, nn, mm);                                        \
        }                                                                 \
    }                                                                     \
}

#define DO_MLA(A, N, M)  (A + N * M)
#define DO_MLS(A, N, M)  (A - N * M)

DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)

DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)

DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)

DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)

#undef DO_MLA
#undef DO_MLS
#undef DO_ZPZZZ
#undef DO_ZPZZZ_D

void HELPER(sve_index_b)(void *vd, uint32_t start,
                         uint32_t incr, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd;
    for (i = 0; i < opr_sz; i += 1) {
        d[H1(i)] = start + i * incr;
    }
}
)(void *vd
, uint32_t start
,
1649 uint32_t incr
, uint32_t desc
)
1651 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 2;
1653 for (i
= 0; i
< opr_sz
; i
+= 1) {
1654 d
[H2(i
)] = start
+ i
* incr
;

void HELPER(sve_index_s)(void *vd, uint32_t start,
                         uint32_t incr, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    uint32_t *d = vd;
    for (i = 0; i < opr_sz; i += 1) {
        d[H4(i)] = start + i * incr;
    }
}

void HELPER(sve_index_d)(void *vd, uint64_t start,
                         uint64_t incr, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = start + i * incr;
    }
}

void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    uint32_t sh = simd_data(desc);
    uint32_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] + (m[i] << sh);
    }
}

void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t sh = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] + (m[i] << sh);
    }
}

void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t sh = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
    }
}

void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t sh = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
    }
}

void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
{
    /* These constants are cut-and-paste directly from the ARM pseudocode.  */
    static const uint16_t coeff[] = {
        0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
        0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
        0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
        0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / 2;
    uint16_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; i++) {
        uint16_t nn = n[i];
        intptr_t idx = extract32(nn, 0, 5);
        uint16_t exp = extract32(nn, 5, 5);
        d[i] = coeff[idx] | (exp << 10);
    }
}
1738 void HELPER(sve_fexpa_s
)(void *vd
, void *vn
, uint32_t desc
)
1740 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1741 static const uint32_t coeff
[] = {
1742 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
1743 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
1744 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
1745 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
1746 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
1747 0x1ef532, 0x20b051, 0x227043, 0x243516,
1748 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
1749 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
1750 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
1751 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
1752 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
1753 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
1754 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
1755 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
1756 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
1757 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
1759 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 4;
1760 uint32_t *d
= vd
, *n
= vn
;
1762 for (i
= 0; i
< opr_sz
; i
++) {
1764 intptr_t idx
= extract32(nn
, 0, 6);
1765 uint32_t exp
= extract32(nn
, 6, 8);
1766 d
[i
] = coeff
[idx
] | (exp
<< 23);
1770 void HELPER(sve_fexpa_d
)(void *vd
, void *vn
, uint32_t desc
)
1772 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1773 static const uint64_t coeff
[] = {
1774 0x0000000000000ull
, 0x02C9A3E778061ull
, 0x059B0D3158574ull
,
1775 0x0874518759BC8ull
, 0x0B5586CF9890Full
, 0x0E3EC32D3D1A2ull
,
1776 0x11301D0125B51ull
, 0x1429AAEA92DE0ull
, 0x172B83C7D517Bull
,
1777 0x1A35BEB6FCB75ull
, 0x1D4873168B9AAull
, 0x2063B88628CD6ull
,
1778 0x2387A6E756238ull
, 0x26B4565E27CDDull
, 0x29E9DF51FDEE1ull
,
1779 0x2D285A6E4030Bull
, 0x306FE0A31B715ull
, 0x33C08B26416FFull
,
1780 0x371A7373AA9CBull
, 0x3A7DB34E59FF7ull
, 0x3DEA64C123422ull
,
1781 0x4160A21F72E2Aull
, 0x44E086061892Dull
, 0x486A2B5C13CD0ull
,
1782 0x4BFDAD5362A27ull
, 0x4F9B2769D2CA7ull
, 0x5342B569D4F82ull
,
1783 0x56F4736B527DAull
, 0x5AB07DD485429ull
, 0x5E76F15AD2148ull
,
1784 0x6247EB03A5585ull
, 0x6623882552225ull
, 0x6A09E667F3BCDull
,
1785 0x6DFB23C651A2Full
, 0x71F75E8EC5F74ull
, 0x75FEB564267C9ull
,
1786 0x7A11473EB0187ull
, 0x7E2F336CF4E62ull
, 0x82589994CCE13ull
,
1787 0x868D99B4492EDull
, 0x8ACE5422AA0DBull
, 0x8F1AE99157736ull
,
1788 0x93737B0CDC5E5ull
, 0x97D829FDE4E50ull
, 0x9C49182A3F090ull
,
1789 0xA0C667B5DE565ull
, 0xA5503B23E255Dull
, 0xA9E6B5579FDBFull
,
1790 0xAE89F995AD3ADull
, 0xB33A2B84F15FBull
, 0xB7F76F2FB5E47ull
,
1791 0xBCC1E904BC1D2ull
, 0xC199BDD85529Cull
, 0xC67F12E57D14Bull
,
1792 0xCB720DCEF9069ull
, 0xD072D4A07897Cull
, 0xD5818DCFBA487ull
,
1793 0xDA9E603DB3285ull
, 0xDFC97337B9B5Full
, 0xE502EE78B3FF6ull
,
1794 0xEA4AFA2A490DAull
, 0xEFA1BEE615A27ull
, 0xF50765B6E4540ull
,
1797 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
1798 uint64_t *d
= vd
, *n
= vn
;
1800 for (i
= 0; i
< opr_sz
; i
++) {
1802 intptr_t idx
= extract32(nn
, 0, 6);
1803 uint64_t exp
= extract32(nn
, 6, 11);
1804 d
[i
] = coeff
[idx
] | (exp
<< 52);
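
/*
 * For illustration (not in the original source): FEXPA splits each input
 * element into a table index (the low bits) and a biased exponent (the
 * bits above it), then assembles 2^(idx/64) scaled by 2^exp directly as
 * an IEEE bit pattern:
 *     d = coeff[idx] | (exp << fraction_bits);
 * e.g. for doubles, idx=32 selects coeff 0x6A09E667F3BCD, the fraction
 * of sqrt(2), so the result is sqrt(2) * 2^(exp - bias).
 */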

void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 2;
    uint16_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        uint16_t nn = n[i];
        uint16_t mm = m[i];
        if (mm & 1) {
            nn = float16_one;
        }
        d[i] = nn ^ (mm & 2) << 14;
    }
}

void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    uint32_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        uint32_t nn = n[i];
        uint32_t mm = m[i];
        if (mm & 1) {
            nn = float32_one;
        }
        d[i] = nn ^ (mm & 2) << 30;
    }
}

void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i];
        uint64_t mm = m[i];
        if (mm & 1) {
            nn = float64_one;
        }
        d[i] = nn ^ (mm & 2) << 62;
    }
}
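
/*
 * For illustration (not in the original source): FTSSEL selects the trig
 * starting value.  Bit 0 of mm chooses between the input and 1.0, and
 * bit 1, shifted up into the sign-bit position, optionally negates:
 *     mm=0 -> n,   mm=1 -> 1.0,   mm=2 -> -n,   mm=3 -> -1.0
 */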

/*
 * Signed saturating addition with scalar operand.
 */

void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(int8_t)) {
        *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
    }
}

void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(int16_t)) {
        *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
    }
}

void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(int32_t)) {
        *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
    }
}

void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(int64_t)) {
        *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
    }
}

/*
 * Unsigned saturating addition with scalar operand.
 */

void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
        *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
    }
}

void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
        *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
    }
}

void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
        *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
    }
}

void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
        *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
    }
}

void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
        *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
    }
}

/* Two operand predicated copy immediate with merge.  All valid immediates
 * can fit within 17 signed bits in the simd_data field.
 */
void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
                         uint64_t mm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    mm = dup_const(MO_8, mm);
    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i];
        uint64_t pp = expand_pred_b(pg[H1(i)]);
        d[i] = (mm & pp) | (nn & ~pp);
    }
}

void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
                         uint64_t mm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    mm = dup_const(MO_16, mm);
    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i];
        uint64_t pp = expand_pred_h(pg[H1(i)]);
        d[i] = (mm & pp) | (nn & ~pp);
    }
}

void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
                         uint64_t mm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    mm = dup_const(MO_32, mm);
    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i];
        uint64_t pp = expand_pred_s(pg[H1(i)]);
        d[i] = (mm & pp) | (nn & ~pp);
    }
}

void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
                         uint64_t mm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i];
        d[i] = (pg[H1(i)] & 1 ? mm : nn);
    }
}

void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;

    val = dup_const(MO_8, val);
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = val & expand_pred_b(pg[H1(i)]);
    }
}

void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;

    val = dup_const(MO_16, val);
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = val & expand_pred_h(pg[H1(i)]);
    }
}

void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;

    val = dup_const(MO_32, val);
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = val & expand_pred_s(pg[H1(i)]);
    }
}

void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        d[i] = (pg[H1(i)] & 1 ? val : 0);
    }
}

/* Big-endian hosts need to frob the byte indices.  If the copy
 * happens to be 8-byte aligned, then no frobbing necessary.
 */
static void swap_memmove(void *vd, void *vs, size_t n)
{
    uintptr_t d = (uintptr_t)vd;
    uintptr_t s = (uintptr_t)vs;
    uintptr_t o = (d | s | n) & 7;
    size_t i;

#ifndef HOST_WORDS_BIGENDIAN
    o = 0;
#endif
    switch (o) {
    case 0:
        memmove(vd, vs, n);
        break;

    case 4:
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i += 4) {
                *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 4;
                *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
            }
        }
        break;

    case 2:
    case 6:
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i += 2) {
                *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 2;
                *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
            }
        }
        break;

    default:
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i++) {
                *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 1;
                *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
            }
        }
        break;
    }
}
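
/*
 * For illustration (not in the original source): on a big-endian host
 * the H1_4() xor swaps the two 32-bit halves of each aligned 64-bit
 * chunk, so a copy done with 4-byte accesses still lands each word at
 * the offset an 8-byte access would have used, e.g. 0 <-> 4, 8 <-> 12,
 * preserving the host-endian 64-bit chunk layout of vector storage.
 */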

/* Similarly for memset of 0.  */
static void swap_memzero(void *vd, size_t n)
{
    uintptr_t d = (uintptr_t)vd;
    uintptr_t o = (d | n) & 7;
    size_t i;

    /* Usually, the first bit of a predicate is set, so N is 0.  */
    if (likely(n == 0)) {
        return;
    }

#ifndef HOST_WORDS_BIGENDIAN
    o = 0;
#endif
    switch (o) {
    case 0:
        memset(vd, 0, n);
        break;

    case 4:
        for (i = 0; i < n; i += 4) {
            *(uint32_t *)H1_4(d + i) = 0;
        }
        break;

    case 2:
    case 6:
        for (i = 0; i < n; i += 2) {
            *(uint16_t *)H1_2(d + i) = 0;
        }
        break;

    default:
        for (i = 0; i < n; i++) {
            *(uint8_t *)H1(d + i) = 0;
        }
        break;
    }
}

void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t opr_sz = simd_oprsz(desc);
    size_t n_ofs = simd_data(desc);
    size_t n_siz = opr_sz - n_ofs;

    if (vd != vm) {
        swap_memmove(vd, vn + n_ofs, n_siz);
        swap_memmove(vd + n_siz, vm, n_ofs);
    } else if (vd != vn) {
        swap_memmove(vd + n_siz, vd, n_ofs);
        swap_memmove(vd, vn + n_ofs, n_siz);
    } else {
        /* vd == vn == vm.  Need temp space.  */
        ARMVectorReg tmp;
        swap_memmove(&tmp, vm, n_ofs);
        swap_memmove(vd, vd + n_ofs, n_siz);
        memcpy(vd + n_siz, &tmp, n_ofs);
    }
}

#define DO_INSR(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
{                                                                  \
    intptr_t opr_sz = simd_oprsz(desc);                            \
    swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE));    \
    *(TYPE *)(vd + H(0)) = val;                                    \
}

DO_INSR(sve_insr_b, uint8_t, H1)
DO_INSR(sve_insr_h, uint16_t, H1_2)
DO_INSR(sve_insr_s, uint32_t, H1_4)
DO_INSR(sve_insr_d, uint64_t, )

void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
        uint64_t f = *(uint64_t *)(vn + i);
        uint64_t b = *(uint64_t *)(vn + j);
        *(uint64_t *)(vd + i) = bswap64(b);
        *(uint64_t *)(vd + j) = bswap64(f);
    }
}

void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
        uint64_t f = *(uint64_t *)(vn + i);
        uint64_t b = *(uint64_t *)(vn + j);
        *(uint64_t *)(vd + i) = hswap64(b);
        *(uint64_t *)(vd + j) = hswap64(f);
    }
}

void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
        uint64_t f = *(uint64_t *)(vn + i);
        uint64_t b = *(uint64_t *)(vn + j);
        *(uint64_t *)(vd + i) = rol64(b, 32);
        *(uint64_t *)(vd + j) = rol64(f, 32);
    }
}

void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
        uint64_t f = *(uint64_t *)(vn + i);
        uint64_t b = *(uint64_t *)(vn + j);
        *(uint64_t *)(vd + i) = b;
        *(uint64_t *)(vd + j) = f;
    }
}

#define DO_TBL(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                              \
    intptr_t i, opr_sz = simd_oprsz(desc);                     \
    uintptr_t elem = opr_sz / sizeof(TYPE);                    \
    TYPE *d = vd, *n = vn, *m = vm;                            \
    ARMVectorReg tmp;                                          \
    if (unlikely(vd == vn)) {                                  \
        n = memcpy(&tmp, vn, opr_sz);                          \
    }                                                          \
    for (i = 0; i < elem; i++) {                               \
        TYPE j = m[H(i)];                                      \
        d[H(i)] = j < elem ? n[H(j)] : 0;                      \
    }                                                          \
}

DO_TBL(sve_tbl_b, uint8_t, H1)
DO_TBL(sve_tbl_h, uint16_t, H2)
DO_TBL(sve_tbl_s, uint32_t, H4)
DO_TBL(sve_tbl_d, uint64_t, )

#define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
{                                                              \
    intptr_t i, opr_sz = simd_oprsz(desc);                     \
    TYPED *d = vd;                                             \
    TYPES *n = vn;                                             \
    ARMVectorReg tmp;                                          \
    if (unlikely(vn - vd < opr_sz)) {                          \
        n = memcpy(&tmp, n, opr_sz / 2);                       \
    }                                                          \
    for (i = 0; i < opr_sz / sizeof(TYPED); i++) {             \
        d[HD(i)] = n[HS(i)];                                   \
    }                                                          \
}

DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)

DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)

/* Mask of bits included in the even numbered predicates of width esz.
 * We also use this for expand_bits/compress_bits, and so extend the
 * same pattern out to 16-bit units.
 */
static const uint64_t even_bit_esz_masks[5] = {
    0x5555555555555555ull,
    0x3333333333333333ull,
    0x0f0f0f0f0f0f0f0full,
    0x00ff00ff00ff00ffull,
    0x0000ffff0000ffffull,
};

/* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
 * For N==0, this corresponds to the operation that in qemu/bitops.h
 * we call half_shuffle64; this algorithm is from Hacker's Delight,
 * section 7-2 Shuffling Bits.
 */
static uint64_t expand_bits(uint64_t x, int n)
{
    int i;

    x &= 0xffffffffu;
    for (i = 4; i >= n; i--) {
        int sh = 1 << i;
        x = ((x << sh) | x) & even_bit_esz_masks[i];
    }
    return x;
}
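
/*
 * Worked example (not in the original source), for n = 0:
 *     expand_bits(0b1011, 0) == 0b01000101
 * Each input bit k moves to output bit 2k with zeros interleaved;
 * each round halves the shift from 16 down to 1 << n.
 */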

/* Compress units of 2**(N+1) bits to units of 2**N bits.
 * For N==0, this corresponds to the operation that in qemu/bitops.h
 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
 */
static uint64_t compress_bits(uint64_t x, int n)
{
    int i;

    for (i = n; i <= 4; i++) {
        int sh = 1 << i;
        x &= even_bit_esz_masks[i];
        x = (x >> sh) | x;
    }
    return x & 0xffffffffu;
}
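
/*
 * Worked example (not in the original source), for n = 0; this is the
 * inverse of the expand_bits example above:
 *     compress_bits(0b01000101, 0) == 0b1011
 * Each even-numbered input bit 2k moves back down to output bit k.
 */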

void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
    int esize = 1 << esz;
    uint64_t *d = vd;
    intptr_t i;

    if (oprsz <= 8) {
        uint64_t nn = *(uint64_t *)vn;
        uint64_t mm = *(uint64_t *)vm;
        int half = 4 * oprsz;

        nn = extract64(nn, high * half, half);
        mm = extract64(mm, high * half, half);
        nn = expand_bits(nn, esz);
        mm = expand_bits(mm, esz);
        d[0] = nn | (mm << esize);
    } else {
        ARMPredicateReg tmp;

        /* We produce output faster than we consume input.
           Therefore we must be mindful of possible overlap.  */
        if (vd == vn) {
            vn = memcpy(&tmp, vn, oprsz);
            if (vd == vm) {
                vm = vn;
            }
        } else if (vd == vm) {
            vm = memcpy(&tmp, vm, oprsz);
        }
        if (high) {
            high = oprsz >> 1;
        }

        if ((oprsz & 7) == 0) {
            uint32_t *n = vn, *m = vm;
            high >>= 2;

            for (i = 0; i < oprsz / 8; i++) {
                uint64_t nn = n[H4(high + i)];
                uint64_t mm = m[H4(high + i)];

                nn = expand_bits(nn, esz);
                mm = expand_bits(mm, esz);
                d[i] = nn | (mm << esize);
            }
        } else {
            uint8_t *n = vn, *m = vm;
            uint16_t *d16 = vd;

            for (i = 0; i < oprsz / 2; i++) {
                uint16_t nn = n[H1(high + i)];
                uint16_t mm = m[H1(high + i)];

                nn = expand_bits(nn, esz);
                mm = expand_bits(mm, esz);
                d16[H2(i)] = nn | (mm << esize);
            }
        }
    }
}

void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t l, h;
    intptr_t i;

    if (oprsz <= 8) {
        l = compress_bits(n[0] >> odd, esz);
        h = compress_bits(m[0] >> odd, esz);
        d[0] = l | (h << (4 * oprsz));
    } else {
        ARMPredicateReg tmp_m;
        intptr_t oprsz_16 = oprsz / 16;

        if ((vm - vd) < (uintptr_t)oprsz) {
            m = memcpy(&tmp_m, vm, oprsz);
        }

        for (i = 0; i < oprsz_16; i++) {
            l = n[2 * i + 0];
            h = n[2 * i + 1];
            l = compress_bits(l >> odd, esz);
            h = compress_bits(h >> odd, esz);
            d[i] = l | (h << 32);
        }

        /*
         * For VL which is not a multiple of 512, the results from M do not
         * align nicely with the uint64_t for D.  Put the aligned results
         * from M into TMP_M and then copy it into place afterward.
         */
        if (oprsz & 15) {
            int final_shift = (oprsz & 15) * 2;

            l = n[2 * i + 0];
            h = n[2 * i + 1];
            l = compress_bits(l >> odd, esz);
            h = compress_bits(h >> odd, esz);
            d[i] = l | (h << final_shift);

            for (i = 0; i < oprsz_16; i++) {
                l = m[2 * i + 0];
                h = m[2 * i + 1];
                l = compress_bits(l >> odd, esz);
                h = compress_bits(h >> odd, esz);
                tmp_m.p[i] = l | (h << 32);
            }
            l = m[2 * i + 0];
            h = m[2 * i + 1];
            l = compress_bits(l >> odd, esz);
            h = compress_bits(h >> odd, esz);
            tmp_m.p[i] = l | (h << final_shift);

            swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
        } else {
            for (i = 0; i < oprsz_16; i++) {
                l = m[2 * i + 0];
                h = m[2 * i + 1];
                l = compress_bits(l >> odd, esz);
                h = compress_bits(h >> odd, esz);
                d[oprsz_16 + i] = l | (h << 32);
            }
        }
    }
}

void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t mask;
    int shr, shl;
    intptr_t i;

    shl = 1 << esz;
    shr = 0;
    mask = even_bit_esz_masks[esz];
    if (odd) {
        mask <<= shl;
        shr = shl;
        shl = 0;
    }

    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
        uint64_t nn = (n[i] & mask) >> shr;
        uint64_t mm = (m[i] & mask) << shl;
        d[i] = nn | mm;
    }
}

/* Reverse units of 2**N bits.  */
static uint64_t reverse_bits_64(uint64_t x, int n)
{
    int i, sh;

    x = bswap64(x);
    for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
        uint64_t mask = even_bit_esz_masks[i];
        x = ((x & mask) << sh) | ((x >> sh) & mask);
    }
    return x;
}

static uint8_t reverse_bits_8(uint8_t x, int n)
{
    static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
    int i, sh;

    for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
        x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
    }
    return x;
}
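
/*
 * Worked example (not in the original source): with n = 0 the unit is a
 * single bit, so reverse_bits_8 performs a full bit reversal:
 *     reverse_bits_8(0b00000110, 0) == 0b01100000
 * With n = 2 it would instead swap only the two 4-bit nibbles.
 */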

void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    intptr_t i, oprsz_2 = oprsz / 2;

    if (oprsz <= 8) {
        uint64_t l = *(uint64_t *)vn;
        l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
        *(uint64_t *)vd = l;
    } else if ((oprsz & 15) == 0) {
        for (i = 0; i < oprsz_2; i += 8) {
            intptr_t ih = oprsz - 8 - i;
            uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
            uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
            *(uint64_t *)(vd + i) = h;
            *(uint64_t *)(vd + ih) = l;
        }
    } else {
        for (i = 0; i < oprsz_2; i += 1) {
            intptr_t il = H1(i);
            intptr_t ih = H1(oprsz - 1 - i);
            uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
            uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
            *(uint8_t *)(vd + il) = h;
            *(uint8_t *)(vd + ih) = l;
        }
    }
}

void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
    uint64_t *d = vd;
    intptr_t i;

    if (oprsz <= 8) {
        uint64_t nn = *(uint64_t *)vn;
        int half = 4 * oprsz;

        nn = extract64(nn, high * half, half);
        nn = expand_bits(nn, 0);
        d[0] = nn;
    } else {
        ARMPredicateReg tmp_n;

        /* We produce output faster than we consume input.
           Therefore we must be mindful of possible overlap.  */
        if ((vn - vd) < (uintptr_t)oprsz) {
            vn = memcpy(&tmp_n, vn, oprsz);
        }
        if (high) {
            high = oprsz >> 1;
        }

        if ((oprsz & 7) == 0) {
            uint32_t *n = vn;
            high >>= 2;

            for (i = 0; i < oprsz / 8; i++) {
                uint64_t nn = n[H4(high + i)];
                d[i] = expand_bits(nn, 0);
            }
        } else {
            uint16_t *d16 = vd;
            uint8_t *n = vn;

            for (i = 0; i < oprsz / 2; i++) {
                uint16_t nn = n[H1(high + i)];
                d16[H2(i)] = expand_bits(nn, 0);
            }
        }
    }
}

#define DO_ZIP(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)       \
{                                                                    \
    intptr_t oprsz = simd_oprsz(desc);                               \
    intptr_t i, oprsz_2 = oprsz / 2;                                 \
    ARMVectorReg tmp_n, tmp_m;                                       \
    /* We produce output faster than we consume input.               \
       Therefore we must be mindful of possible overlap.  */         \
    if (unlikely((vn - vd) < (uintptr_t)oprsz)) {                    \
        vn = memcpy(&tmp_n, vn, oprsz_2);                            \
    }                                                                \
    if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                    \
        vm = memcpy(&tmp_m, vm, oprsz_2);                            \
    }                                                                \
    for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                    \
        *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i));         \
        *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
    }                                                                \
}

DO_ZIP(sve_zip_b, uint8_t, H1)
DO_ZIP(sve_zip_h, uint16_t, H1_2)
DO_ZIP(sve_zip_s, uint32_t, H1_4)
DO_ZIP(sve_zip_d, uint64_t, )

#define DO_UZP(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
{                                                                      \
    intptr_t oprsz = simd_oprsz(desc);                                 \
    intptr_t oprsz_2 = oprsz / 2;                                      \
    intptr_t odd_ofs = simd_data(desc);                                \
    intptr_t i;                                                        \
    ARMVectorReg tmp_m;                                                \
    if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                      \
        vm = memcpy(&tmp_m, vm, oprsz);                                \
    }                                                                  \
    for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                      \
        *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs));     \
    }                                                                  \
    for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                      \
        *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
    }                                                                  \
}

DO_UZP(sve_uzp_b, uint8_t, H1)
DO_UZP(sve_uzp_h, uint16_t, H1_2)
DO_UZP(sve_uzp_s, uint32_t, H1_4)
DO_UZP(sve_uzp_d, uint64_t, )

#define DO_TRN(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
{                                                                      \
    intptr_t oprsz = simd_oprsz(desc);                                 \
    intptr_t odd_ofs = simd_data(desc);                                \
    intptr_t i;                                                        \
    for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) {                    \
        TYPE ae = *(TYPE *)(vn + H(i + odd_ofs));                      \
        TYPE be = *(TYPE *)(vm + H(i + odd_ofs));                      \
        *(TYPE *)(vd + H(i + 0)) = ae;                                 \
        *(TYPE *)(vd + H(i + sizeof(TYPE))) = be;                      \
    }                                                                  \
}

DO_TRN(sve_trn_b, uint8_t, H1)
DO_TRN(sve_trn_h, uint16_t, H1_2)
DO_TRN(sve_trn_s, uint32_t, H1_4)
DO_TRN(sve_trn_d, uint64_t, )

void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
    uint32_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    for (i = j = 0; i < opr_sz; i++) {
        if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
            d[H4(j)] = n[H4(i)];
            j++;
        }
    }
    for (; j < opr_sz; j++) {
        d[H4(j)] = 0;
    }
}

void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    for (i = j = 0; i < opr_sz; i++) {
        if (pg[H1(i)] & 1) {
            d[j] = n[i];
            j++;
        }
    }
    for (; j < opr_sz; j++) {
        d[j] = 0;
    }
}

/* Similar to the ARM LastActiveElement pseudocode function, except the
 * result is multiplied by the element size.  This includes the not found
 * indication; e.g. not found for esz=3 is -8.
 */
int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
{
    intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
    intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);

    return last_active_element(vg, words, esz);
}

void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
{
    intptr_t opr_sz = simd_oprsz(desc) / 8;
    int esz = simd_data(desc);
    uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
    intptr_t i, first_i, last_i;
    ARMVectorReg tmp;

    first_i = last_i = 0;
    first_g = last_g = 0;

    /* Find the extent of the active elements within VG.  */
    for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
        pg = *(uint64_t *)(vg + i) & mask;
        if (pg) {
            if (last_g == 0) {
                last_g = pg;
                last_i = i;
            }
            first_g = pg;
            first_i = i;
        }
    }

    len = 0;
    if (first_g != 0) {
        first_i = first_i * 8 + ctz64(first_g);
        last_i = last_i * 8 + 63 - clz64(last_g);
        len = last_i - first_i + (1 << esz);
        if (vm == vd) {
            vm = memcpy(&tmp, vm, opr_sz * 8);
        }
        swap_memmove(vd, vn + first_i, len);
    }
    swap_memmove(vd + len, vm, opr_sz * 8 - len);
}

void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
                            void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i], mm = m[i];
        uint64_t pp = expand_pred_b(pg[H1(i)]);
        d[i] = (nn & pp) | (mm & ~pp);
    }
}

void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
                            void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i], mm = m[i];
        uint64_t pp = expand_pred_h(pg[H1(i)]);
        d[i] = (nn & pp) | (mm & ~pp);
    }
}

void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
                            void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i], mm = m[i];
        uint64_t pp = expand_pred_s(pg[H1(i)]);
        d[i] = (nn & pp) | (mm & ~pp);
    }
}

void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
                            void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i], mm = m[i];
        d[i] = (pg[H1(i)] & 1 ? nn : mm);
    }
}

/* Two operand comparison controlled by a predicate.
 * ??? It is very tempting to want to be able to expand this inline
 * with x86 instructions, e.g.
 *
 *    vcmpeqw    zm, zn, %ymm0
 *    vpmovmskb  %ymm0, %eax
 *    and        $0x5555, %eax
 *    and        pg, %eax
 *
 * or even aarch64, e.g.
 *
 *    // mask = 4000 1000 0400 0100 0040 0010 0004 0001
 *    cmeq       v0.8h, zn, zm
 *    and        v0.8h, v0.8h, mask
 *    addv       h0, v0.8h
 *    and        v0.8b, pg
 *
 * However, coming up with an abstraction that allows vector inputs and
 * a scalar output, and also handles the byte-ordering of sub-uint64_t
 * scalar outputs, is tricky.
 */
#define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK)                                 \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                            \
    intptr_t opr_sz = simd_oprsz(desc);                                      \
    uint32_t flags = PREDTEST_INIT;                                          \
    intptr_t i = opr_sz;                                                     \
    do {                                                                     \
        uint64_t out = 0, pg;                                                \
        do {                                                                 \
            i -= sizeof(TYPE), out <<= sizeof(TYPE);                         \
            TYPE nn = *(TYPE *)(vn + H(i));                                  \
            TYPE mm = *(TYPE *)(vm + H(i));                                  \
            out |= nn OP mm;                                                 \
        } while (i & 63);                                                    \
        pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
        out &= pg;                                                           \
        *(uint64_t *)(vd + (i >> 3)) = out;                                  \
        flags = iter_predtest_bwd(out, pg, flags);                           \
    } while (i > 0);                                                         \
    return flags;                                                            \
}

#define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
#define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
#define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP,     , 0x0101010101010101ull)

DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t,  ==)
DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)

DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t,  !=)
DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)

DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t,  >)
DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)

DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t,  >=)
DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)

DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t,  >)
DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)

DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t,  >=)
DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)

#undef DO_CMP_PPZZ_B
#undef DO_CMP_PPZZ_H
#undef DO_CMP_PPZZ_S
#undef DO_CMP_PPZZ_D
#undef DO_CMP_PPZZ

/* Similar, but the second source is "wide".  */
#define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK)                          \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                            \
    intptr_t opr_sz = simd_oprsz(desc);                                      \
    uint32_t flags = PREDTEST_INIT;                                          \
    intptr_t i = opr_sz;                                                     \
    do {                                                                     \
        uint64_t out = 0, pg;                                                \
        do {                                                                 \
            TYPEW mm = *(TYPEW *)(vm + i - 8);                               \
            do {                                                             \
                i -= sizeof(TYPE), out <<= sizeof(TYPE);                     \
                TYPE nn = *(TYPE *)(vn + H(i));                              \
                out |= nn OP mm;                                             \
            } while (i & 7);                                                 \
        } while (i & 63);                                                    \
        pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
        out &= pg;                                                           \
        *(uint64_t *)(vd + (i >> 3)) = out;                                  \
        flags = iter_predtest_bwd(out, pg, flags);                           \
    } while (i > 0);                                                         \
    return flags;                                                            \
}

#define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1,   0xffffffffffffffffull)
#define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)

DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t,  uint64_t, ==)
DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)

DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t,  uint64_t, !=)
DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)

DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t,  int64_t, >)
DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t, int64_t, >)
DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t, int64_t, >)

DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t,  int64_t, >=)
DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t, int64_t, >=)
DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t, int64_t, >=)

DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t,  uint64_t, >)
DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)

DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t,  uint64_t, >=)
DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)

DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t,  int64_t, <)
DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t, int64_t, <)
DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t, int64_t, <)

DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t,  int64_t, <=)
DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t, int64_t, <=)
DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t, int64_t, <=)

DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t,  uint64_t, <)
DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)

DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t,  uint64_t, <=)
DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)

#undef DO_CMP_PPZW_B
#undef DO_CMP_PPZW_H
#undef DO_CMP_PPZW_S
#undef DO_CMP_PPZW

/* Similar, but the second source is immediate.  */
#define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK)                         \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)   \
{                                                                    \
    intptr_t opr_sz = simd_oprsz(desc);                              \
    uint32_t flags = PREDTEST_INIT;                                  \
    TYPE mm = simd_data(desc);                                       \
    intptr_t i = opr_sz;                                             \
    do {                                                             \
        uint64_t out = 0, pg;                                        \
        do {                                                         \
            i -= sizeof(TYPE), out <<= sizeof(TYPE);                 \
            TYPE nn = *(TYPE *)(vn + H(i));                          \
            out |= nn OP mm;                                         \
        } while (i & 63);                                            \
        pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                    \
        out &= pg;                                                   \
        *(uint64_t *)(vd + (i >> 3)) = out;                          \
        flags = iter_predtest_bwd(out, pg, flags);                   \
    } while (i > 0);                                                 \
    return flags;                                                    \
}

#define DO_CMP_PPZI_B(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
#define DO_CMP_PPZI_H(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZI_S(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
#define DO_CMP_PPZI_D(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP,     , 0x0101010101010101ull)

DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t,  ==)
DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)

DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t,  !=)
DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)

DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t,  >)
DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)

DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t,  >=)
DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)

DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t,  >)
DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)

DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t,  >=)
DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)

DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t,  <)
DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)

DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t,  <=)
DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)

DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t,  <)
DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)

DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t,  <=)
DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)

#undef DO_CMP_PPZI_B
#undef DO_CMP_PPZI_H
#undef DO_CMP_PPZI_S
#undef DO_CMP_PPZI_D
#undef DO_CMP_PPZI

/* Similar to the ARM LastActive pseudocode function.  */
static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
{
    intptr_t i;

    for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
        uint64_t pg = *(uint64_t *)(vg + i);
        if (pg) {
            return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
        }
    }
    return 0;
}

/* Compute a mask into RETB that is true for all G, up to and including
 * (if after) or excluding (if !after) the first G & N.
 * Return true if BRK found.
 */
static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
                        bool brk, bool after)
{
    uint64_t b;

    if (brk) {
        b = 0;
    } else if ((g & n) == 0) {
        /* For all G, no N are set; break not found.  */
        b = g;
    } else {
        /* Break somewhere in N.  Locate it.  */
        b = g & n;             /* guard true, pred true */
        b = b & -b;            /* first such */
        if (after) {
            b = b | (b - 1);   /* break after same */
        } else {
            b = b - 1;         /* break before same */
        }
        brk = true;
    }

    *retb = b;
    return brk;
}
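
/*
 * Worked example (not in the original source): g = 0xff, n = 0x10,
 * brk = false.  The first active N is bit 4, so b = g & n = 0x10,
 * then b & -b = 0x10; with after = true, b | (b - 1) = 0x1f, i.e.
 * the mask covers bits 0..4 inclusive and the break is reported found.
 */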

/* Compute a zeroing BRK.  */
static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
                          intptr_t oprsz, bool after)
{
    bool brk = false;
    intptr_t i;

    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
        uint64_t this_b, this_g = g[i];

        brk = compute_brk(&this_b, n[i], this_g, brk, after);
        d[i] = this_b & this_g;
    }
}

/* Likewise, but also compute flags.  */
static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
                               intptr_t oprsz, bool after)
{
    uint32_t flags = PREDTEST_INIT;
    bool brk = false;
    intptr_t i;

    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
        uint64_t this_b, this_d, this_g = g[i];

        brk = compute_brk(&this_b, n[i], this_g, brk, after);
        d[i] = this_d = this_b & this_g;
        flags = iter_predtest_fwd(this_d, this_g, flags);
    }
    return flags;
}

/* Compute a merging BRK.  */
static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
                          intptr_t oprsz, bool after)
{
    bool brk = false;
    intptr_t i;

    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
        uint64_t this_b, this_g = g[i];

        brk = compute_brk(&this_b, n[i], this_g, brk, after);
        d[i] = (this_b & this_g) | (d[i] & ~this_g);
    }
}

/* Likewise, but also compute flags.  */
static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
                               intptr_t oprsz, bool after)
{
    uint32_t flags = PREDTEST_INIT;
    bool brk = false;
    intptr_t i;

    for (i = 0; i < oprsz / 8; ++i) {
        uint64_t this_b, this_d = d[i], this_g = g[i];

        brk = compute_brk(&this_b, n[i], this_g, brk, after);
        d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
        flags = iter_predtest_fwd(this_d, this_g, flags);
    }
    return flags;
}

static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
{
    /* It is quicker to zero the whole predicate than loop on OPRSZ.
     * The compiler should turn this into 4 64-bit integer stores.
     */
    memset(d, 0, sizeof(ARMPredicateReg));
    return PREDTEST_INIT;
}

void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
                       uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    if (last_active_pred(vn, vg, oprsz)) {
        compute_brk_z(vd, vm, vg, oprsz, true);
    } else {
        do_zero(vd, oprsz);
    }
}

uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    if (last_active_pred(vn, vg, oprsz)) {
        return compute_brks_z(vd, vm, vg, oprsz, true);
    } else {
        return do_zero(vd, oprsz);
    }
}

void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
                       uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    if (last_active_pred(vn, vg, oprsz)) {
        compute_brk_z(vd, vm, vg, oprsz, false);
    } else {
        do_zero(vd, oprsz);
    }
}

uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    if (last_active_pred(vn, vg, oprsz)) {
        return compute_brks_z(vd, vm, vg, oprsz, false);
    } else {
        return do_zero(vd, oprsz);
    }
}

void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    compute_brk_z(vd, vn, vg, oprsz, true);
}

uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    return compute_brks_z(vd, vn, vg, oprsz, true);
}

void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    compute_brk_z(vd, vn, vg, oprsz, false);
}

uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    return compute_brks_z(vd, vn, vg, oprsz, false);
}

void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    compute_brk_m(vd, vn, vg, oprsz, true);
}

uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    return compute_brks_m(vd, vn, vg, oprsz, true);
}

void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    compute_brk_m(vd, vn, vg, oprsz, false);
}

uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    return compute_brks_m(vd, vn, vg, oprsz, false);
}

void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    if (!last_active_pred(vn, vg, oprsz)) {
        do_zero(vd, oprsz);
    }
}

/* As if PredTest(Ones(PL), D, esz).  */
static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
                              uint64_t esz_mask)
{
    uint32_t flags = PREDTEST_INIT;
    intptr_t i;

    for (i = 0; i < oprsz / 8; i++) {
        flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
    }
    if (oprsz & 7) {
        uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
        flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
    }
    return flags;
}

uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    if (last_active_pred(vn, vg, oprsz)) {
        return predtest_ones(vd, oprsz, -1);
    } else {
        return do_zero(vd, oprsz);
    }
}

uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
    intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
    intptr_t i;

    for (i = 0; i < words; ++i) {
        uint64_t t = n[i] & g[i] & mask;
        sum += ctpop64(t);
    }
    return sum;
}

uint32_t HELPER(sve_while)(void *vd, uint32_t count, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    uint64_t esz_mask = pred_esz_masks[esz];
    ARMPredicateReg *d = vd;
    uint32_t flags;
    intptr_t i;

    /* Begin with a zero predicate register.  */
    flags = do_zero(d, oprsz);
    if (count == 0) {
        return flags;
    }

    /* Set all of the requested bits.  */
    for (i = 0; i < count / 64; ++i) {
        d->p[i] = esz_mask;
    }
    if (count & 63) {
        d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
    }

    return predtest_ones(d, oprsz, esz_mask);
}

/* Recursive reduction on a function;
 * C.f. the ARM ARM function ReducePredicated.
 *
 * While it would be possible to write this without the DATA temporary,
 * it is much simpler to process the predicate register this way.
 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
 * little to gain with a more complex non-recursive form.
 */
#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT)                         \
static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
{                                                                     \
    if (n == 1) {                                                     \
        return *data;                                                 \
    } else {                                                          \
        uintptr_t half = n / 2;                                       \
        TYPE lo = NAME##_reduce(data, status, half);                  \
        TYPE hi = NAME##_reduce(data + half, status, half);           \
        return TYPE##_##FUNC(lo, hi, status);                         \
    }                                                                 \
}                                                                     \
uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc)    \
{                                                                     \
    uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc);   \
    TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)];                   \
    for (i = 0; i < oprsz; ) {                                        \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));               \
        do {                                                          \
            TYPE nn = *(TYPE *)(vn + H(i));                           \
            *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT);      \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                   \
        } while (i & 15);                                             \
    }                                                                 \
    for (; i < maxsz; i += sizeof(TYPE)) {                            \
        *(TYPE *)((void *)data + i) = IDENT;                          \
    }                                                                 \
    return NAME##_reduce(data, vs, maxsz / sizeof(TYPE));             \
}
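
/*
 * For illustration (not in the original source): with eight elements
 * the recursion pairs them as a balanced tree,
 *     ((d0 op d1) op (d2 op d3)) op ((d4 op d5) op (d6 op d7))
 * so inactive lanes must hold IDENT, the identity value for OP, and
 * the DATA temporary is padded out to MAXSZ for the same reason.
 */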

DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
DO_REDUCE(sve_faddv_d, float64,     , add, float64_zero)

/* Identity is floatN_default_nan, without the function call.  */
DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
DO_REDUCE(sve_fminnmv_d, float64,     , minnum, 0x7FF8000000000000ULL)

DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
DO_REDUCE(sve_fmaxnmv_d, float64,     , maxnum, 0x7FF8000000000000ULL)

DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
DO_REDUCE(sve_fminv_d, float64,     , min, float64_infinity)

DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
DO_REDUCE(sve_fmaxv_d, float64,     , max, float64_chs(float64_infinity))

#undef DO_REDUCE

uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
                             void *status, uint32_t desc)
{
    intptr_t i = 0, opr_sz = simd_oprsz(desc);
    float16 result = nn;

    do {
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
        do {
            if (pg & 1) {
                float16 mm = *(float16 *)(vm + H1_2(i));
                result = float16_add(result, mm, status);
            }
            i += sizeof(float16), pg >>= sizeof(float16);
        } while (i & 15);
    } while (i < opr_sz);

    return result;
}

uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
                             void *status, uint32_t desc)
{
    intptr_t i = 0, opr_sz = simd_oprsz(desc);
    float32 result = nn;

    do {
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
        do {
            if (pg & 1) {
                float32 mm = *(float32 *)(vm + H1_2(i));
                result = float32_add(result, mm, status);
            }
            i += sizeof(float32), pg >>= sizeof(float32);
        } while (i & 15);
    } while (i < opr_sz);

    return result;
}

uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
                             void *status, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *m = vm;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i++) {
        if (pg[H1(i)] & 1) {
            nn = float64_add(nn, m[i], status);
        }
    }

    return nn;
}

/* Fully general three-operand expander, controlled by a predicate,
 * With the extra float_status parameter.
 */
#define DO_ZPZZ_FP(NAME, TYPE, H, OP)                           \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,       \
                  void *status, uint32_t desc)                  \
{                                                               \
    intptr_t i = simd_oprsz(desc);                              \
    uint64_t *g = vg;                                           \
    do {                                                        \
        uint64_t pg = g[(i - 1) >> 6];                          \
        do {                                                    \
            i -= sizeof(TYPE);                                  \
            if (likely((pg >> (i & 63)) & 1)) {                 \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                TYPE mm = *(TYPE *)(vm + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn, mm, status);      \
            }                                                   \
        } while (i & 63);                                       \
    } while (i != 0);                                           \
}

DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
DO_ZPZZ_FP(sve_fadd_d, uint64_t,     , float64_add)

DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
DO_ZPZZ_FP(sve_fsub_d, uint64_t,     , float64_sub)

DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
DO_ZPZZ_FP(sve_fmul_d, uint64_t,     , float64_mul)

DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
DO_ZPZZ_FP(sve_fdiv_d, uint64_t,     , float64_div)

DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
DO_ZPZZ_FP(sve_fmin_d, uint64_t,     , float64_min)

DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
DO_ZPZZ_FP(sve_fmax_d, uint64_t,     , float64_max)

DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
DO_ZPZZ_FP(sve_fminnum_d, uint64_t,     , float64_minnum)

DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t,     , float64_maxnum)

static inline float16 abd_h(float16 a, float16 b, float_status *s)
{
    return float16_abs(float16_sub(a, b, s));
}

static inline float32 abd_s(float32 a, float32 b, float_status *s)
{
    return float32_abs(float32_sub(a, b, s));
}

static inline float64 abd_d(float64 a, float64 b, float_status *s)
{
    return float64_abs(float64_sub(a, b, s));
}

DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
DO_ZPZZ_FP(sve_fabd_d, uint64_t,     , abd_d)

static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
{
    int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
    return float64_scalbn(a, b_int, s);
}

DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
DO_ZPZZ_FP(sve_fscalbn_d, int64_t,     , scalbn_d)

DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
DO_ZPZZ_FP(sve_fmulx_d, uint64_t,     , helper_vfp_mulxd)

#undef DO_ZPZZ_FP

/* Three-operand expander, with one scalar operand, controlled by
 * a predicate, with the extra float_status parameter.
 */
#define DO_ZPZS_FP(NAME, TYPE, H, OP)                             \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar,  \
                  void *status, uint32_t desc)                    \
{                                                                 \
    intptr_t i = simd_oprsz(desc);                                \
    uint64_t *g = vg;                                             \
    TYPE mm = scalar;                                             \
    do {                                                          \
        uint64_t pg = g[(i - 1) >> 6];                            \
        do {                                                      \
            i -= sizeof(TYPE);                                    \
            if (likely((pg >> (i & 63)) & 1)) {                   \
                TYPE nn = *(TYPE *)(vn + H(i));                   \
                *(TYPE *)(vd + H(i)) = OP(nn, mm, status);        \
            }                                                     \
        } while (i & 63);                                         \
    } while (i != 0);                                             \
}

DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
DO_ZPZS_FP(sve_fadds_d, float64,     , float64_add)

DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
DO_ZPZS_FP(sve_fsubs_d, float64,     , float64_sub)

DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
DO_ZPZS_FP(sve_fmuls_d, float64,     , float64_mul)

static inline float16 subr_h(float16 a, float16 b, float_status *s)
{
    return float16_sub(b, a, s);
}

static inline float32 subr_s(float32 a, float32 b, float_status *s)
{
    return float32_sub(b, a, s);
}

static inline float64 subr_d(float64 a, float64 b, float_status *s)
{
    return float64_sub(b, a, s);
}

DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
DO_ZPZS_FP(sve_fsubrs_d, float64,     , subr_d)

DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
DO_ZPZS_FP(sve_fmaxnms_d, float64,     , float64_maxnum)

DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
DO_ZPZS_FP(sve_fminnms_d, float64,     , float64_minnum)

DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
DO_ZPZS_FP(sve_fmaxs_d, float64,     , float64_max)

DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
DO_ZPZS_FP(sve_fmins_d, float64,     , float64_min)

#undef DO_ZPZS_FP

/* Fully general two-operand expander, controlled by a predicate,
 * With the extra float_status parameter.
 */
#define DO_ZPZ_FP(NAME, TYPE, H, OP)                            \
void HELPER(NAME)(void *vd, void *vn, void *vg,                 \
                  void *status, uint32_t desc)                  \
{                                                               \
    intptr_t i = simd_oprsz(desc);                              \
    uint64_t *g = vg;                                           \
    do {                                                        \
        uint64_t pg = g[(i - 1) >> 6];                          \
        do {                                                    \
            i -= sizeof(TYPE);                                  \
            if (likely((pg >> (i & 63)) & 1)) {                 \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn, status);          \
            }                                                   \
        } while (i & 63);                                       \
    } while (i != 0);                                           \
}

/* SVE fp16 conversions always use IEEE mode.  Like AdvSIMD, they ignore
 * FZ16.  When converting from fp16, this affects flushing input denormals;
 * when converting to fp16, this affects flushing output denormals.
 */
static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
{
    bool save = get_flush_inputs_to_zero(fpst);
    float32 ret;

    set_flush_inputs_to_zero(false, fpst);
    ret = float16_to_float32(f, true, fpst);
    set_flush_inputs_to_zero(save, fpst);
    return ret;
}

static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
{
    bool save = get_flush_inputs_to_zero(fpst);
    float64 ret;

    set_flush_inputs_to_zero(false, fpst);
    ret = float16_to_float64(f, true, fpst);
    set_flush_inputs_to_zero(save, fpst);
    return ret;
}

static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
{
    bool save = get_flush_to_zero(fpst);
    float16 ret;

    set_flush_to_zero(false, fpst);
    ret = float32_to_float16(f, true, fpst);
    set_flush_to_zero(save, fpst);
    return ret;
}

static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
{
    bool save = get_flush_to_zero(fpst);
    float16 ret;

    set_flush_to_zero(false, fpst);
    ret = float64_to_float16(f, true, fpst);
    set_flush_to_zero(save, fpst);
    return ret;
}

static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
{
    if (float16_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float16_to_int16_round_to_zero(f, s);
}

static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
{
    if (float16_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float16_to_int64_round_to_zero(f, s);
}

static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
{
    if (float32_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float32_to_int64_round_to_zero(f, s);
}

static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
{
    if (float64_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float64_to_int64_round_to_zero(f, s);
}

static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
{
    if (float16_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float16_to_uint16_round_to_zero(f, s);
}

static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
{
    if (float16_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float16_to_uint64_round_to_zero(f, s);
}

static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
{
    if (float32_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float32_to_uint64_round_to_zero(f, s);
}

static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
{
    if (float64_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float64_to_uint64_round_to_zero(f, s);
}
DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
DO_ZPZ_FP(sve_fcvt_dh, uint64_t,     , sve_f64_to_f16)
DO_ZPZ_FP(sve_fcvt_hd, uint64_t,     , sve_f16_to_f64)
DO_ZPZ_FP(sve_fcvt_ds, uint64_t,     , float64_to_float32)
DO_ZPZ_FP(sve_fcvt_sd, uint64_t,     , float32_to_float64)

DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t,     , vfp_float16_to_int64_rtz)
DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t,     , vfp_float32_to_int64_rtz)
DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t,     , helper_vfp_tosizd)
DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t,     , vfp_float64_to_int64_rtz)

DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t,     , vfp_float16_to_uint64_rtz)
DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t,     , vfp_float32_to_uint64_rtz)
DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t,     , helper_vfp_touizd)
DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t,     , vfp_float64_to_uint64_rtz)

DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
DO_ZPZ_FP(sve_frint_d, uint64_t,     , helper_rintd)

DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
DO_ZPZ_FP(sve_frintx_d, uint64_t,     , float64_round_to_int)

DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
DO_ZPZ_FP(sve_frecpx_d, uint64_t,     , helper_frecpx_f64)

DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
DO_ZPZ_FP(sve_fsqrt_d, uint64_t,     , float64_sqrt)

DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
DO_ZPZ_FP(sve_scvt_sd, uint64_t,     , int32_to_float64)
DO_ZPZ_FP(sve_scvt_dh, uint64_t,     , int64_to_float16)
DO_ZPZ_FP(sve_scvt_ds, uint64_t,     , int64_to_float32)
DO_ZPZ_FP(sve_scvt_dd, uint64_t,     , int64_to_float64)

DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
DO_ZPZ_FP(sve_ucvt_sd, uint64_t,     , uint32_to_float64)
DO_ZPZ_FP(sve_ucvt_dh, uint64_t,     , uint64_to_float16)
DO_ZPZ_FP(sve_ucvt_ds, uint64_t,     , uint64_to_float32)
DO_ZPZ_FP(sve_ucvt_dd, uint64_t,     , uint64_to_float64)

#undef DO_ZPZ_FP
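
/*
 * Added commentary: DO_ZPZ_FP (defined earlier in this file) generates one
 * predicated unary FP helper per line above.  As a rough sketch only, an
 * entry such as DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
 * expands to a loop of approximately this shape:
 *
 *     void HELPER(sve_scvt_ss)(void *vd, void *vn, void *vg,
 *                              void *status, uint32_t desc)
 *     {
 *         intptr_t i = simd_oprsz(desc);
 *         uint64_t *g = vg;
 *         do {
 *             uint64_t pg = g[(i - 1) >> 6];
 *             do {
 *                 i -= sizeof(uint32_t);
 *                 if (likely((pg >> (i & 63)) & 1)) {
 *                     uint32_t nn = *(uint32_t *)(vn + H1_4(i));
 *                     *(uint32_t *)(vd + H1_4(i)) =
 *                         int32_to_float32(nn, status);
 *                 }
 *             } while (i & 63);
 *         } while (i != 0);
 *     }
 *
 * The exact expansion is whatever the earlier macro definition says; this
 * sketch is only to make the table above easier to read.
 */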
static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
                            float_status *status, uint32_t desc,
                            uint16_t neg1, uint16_t neg3)
{
    intptr_t i = simd_oprsz(desc);
    uint64_t *g = vg;

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            i -= 2;
            if (likely((pg >> (i & 63)) & 1)) {
                float16 e1, e2, e3, r;

                e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
                e2 = *(uint16_t *)(vm + H1_2(i));
                e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
                r = float16_muladd(e1, e2, e3, 0, status);
                *(uint16_t *)(vd + H1_2(i)) = r;
            }
        } while (i & 63);
    } while (i != 0);
}

void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
}

void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
}

void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
}

void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
}
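
/*
 * Added commentary: the neg1/neg3 arguments flip the sign bit of the
 * first multiplicand and of the addend respectively, so one fused loop
 * covers all four flavours:
 *     FMLA:   r =  n * m + a      (neg1 = 0,    neg3 = 0)
 *     FMLS:   r = -n * m + a      (neg1 = sign, neg3 = 0)
 *     FNMLA:  r = -n * m - a      (neg1 = sign, neg3 = sign)
 *     FNMLS:  r =  n * m - a      (neg1 = 0,    neg3 = sign)
 * XORing the raw bit pattern is safe here because IEEE negation is
 * exactly a sign-bit flip, even for NaNs and zeros.
 */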
static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
                            float_status *status, uint32_t desc,
                            uint32_t neg1, uint32_t neg3)
{
    intptr_t i = simd_oprsz(desc);
    uint64_t *g = vg;

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            i -= 4;
            if (likely((pg >> (i & 63)) & 1)) {
                float32 e1, e2, e3, r;

                e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
                e2 = *(uint32_t *)(vm + H1_4(i));
                e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
                r = float32_muladd(e1, e2, e3, 0, status);
                *(uint32_t *)(vd + H1_4(i)) = r;
            }
        } while (i & 63);
    } while (i != 0);
}

void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
}

void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
}

void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
}

void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
}
static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
                            float_status *status, uint32_t desc,
                            uint64_t neg1, uint64_t neg3)
{
    intptr_t i = simd_oprsz(desc);
    uint64_t *g = vg;

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            i -= 8;
            if (likely((pg >> (i & 63)) & 1)) {
                float64 e1, e2, e3, r;

                e1 = *(uint64_t *)(vn + i) ^ neg1;
                e2 = *(uint64_t *)(vm + i);
                e3 = *(uint64_t *)(va + i) ^ neg3;
                r = float64_muladd(e1, e2, e3, 0, status);
                *(uint64_t *)(vd + i) = r;
            }
        } while (i & 63);
    } while (i != 0);
}

void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
}

void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
}

void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
}

void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
}
/* Two operand floating-point comparison controlled by a predicate.
 * Unlike the integer version, we are not allowed to optimistically
 * compare operands, since the comparison may have side effects wrt
 * the FPSR.
 */
#define DO_FPCMP_PPZZ(NAME, TYPE, H, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
                  void *status, uint32_t desc)                          \
{                                                                       \
    intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;                    \
    uint64_t *d = vd, *g = vg;                                          \
    do {                                                                \
        uint64_t out = 0, pg = g[j];                                    \
        do {                                                            \
            i -= sizeof(TYPE), out <<= sizeof(TYPE);                    \
            if (likely((pg >> (i & 63)) & 1)) {                         \
                TYPE nn = *(TYPE *)(vn + H(i));                         \
                TYPE mm = *(TYPE *)(vm + H(i));                         \
                out |= OP(TYPE, nn, mm, status);                        \
            }                                                           \
        } while (i & 63);                                               \
        d[j--] = out;                                                   \
    } while (i > 0);                                                    \
}

#define DO_FPCMP_PPZZ_H(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
#define DO_FPCMP_PPZZ_S(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
#define DO_FPCMP_PPZZ_D(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_d, float64,     , OP)

#define DO_FPCMP_PPZZ_ALL(NAME, OP) \
    DO_FPCMP_PPZZ_H(NAME, OP)   \
    DO_FPCMP_PPZZ_S(NAME, OP)   \
    DO_FPCMP_PPZZ_D(NAME, OP)

#define DO_FCMGE(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) <= 0
#define DO_FCMGT(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) < 0
#define DO_FCMLE(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) <= 0
#define DO_FCMLT(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) < 0
#define DO_FCMEQ(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) == 0
#define DO_FCMNE(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) != 0
#define DO_FCMUO(TYPE, X, Y, ST)  \
    TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
#define DO_FACGE(TYPE, X, Y, ST)  \
    TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
#define DO_FACGT(TYPE, X, Y, ST)  \
    TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
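
/*
 * Added commentary: the ordered comparisons (GE/GT/LE/LT and the
 * absolute-value FACGE/FACGT) use the signaling TYPE##_compare, which
 * raises Invalid Operation for any NaN operand, while FCMEQ/FCMNE/FCMUO
 * use the quiet variant, which stays silent for quiet NaNs.  Note also
 * that GE/GT are expressed as reversed LE/LT: X >= Y exactly when
 * compare(Y, X) reports "less or equal", and any unordered result makes
 * the ordered predicates false.
 */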
DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)

#undef DO_FPCMP_PPZZ_ALL
#undef DO_FPCMP_PPZZ_D
#undef DO_FPCMP_PPZZ_S
#undef DO_FPCMP_PPZZ_H
#undef DO_FPCMP_PPZZ
/* One operand floating-point comparison against zero, controlled
 * by a predicate.
 */
#define DO_FPCMP_PPZ0(NAME, TYPE, H, OP)                   \
void HELPER(NAME)(void *vd, void *vn, void *vg,            \
                  void *status, uint32_t desc)             \
{                                                          \
    intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;       \
    uint64_t *d = vd, *g = vg;                             \
    do {                                                   \
        uint64_t out = 0, pg = g[j];                       \
        do {                                               \
            i -= sizeof(TYPE), out <<= sizeof(TYPE);       \
            if ((pg >> (i & 63)) & 1) {                    \
                TYPE nn = *(TYPE *)(vn + H(i));            \
                out |= OP(TYPE, nn, 0, status);            \
            }                                              \
        } while (i & 63);                                  \
        d[j--] = out;                                      \
    } while (i > 0);                                       \
}

#define DO_FPCMP_PPZ0_H(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
#define DO_FPCMP_PPZ0_S(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
#define DO_FPCMP_PPZ0_D(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_d, float64,     , OP)

#define DO_FPCMP_PPZ0_ALL(NAME, OP) \
    DO_FPCMP_PPZ0_H(NAME, OP)   \
    DO_FPCMP_PPZ0_S(NAME, OP)   \
    DO_FPCMP_PPZ0_D(NAME, OP)

DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
/* FP Trig Multiply-Add. */

void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
{
    static const float16 coeff[16] = {
        0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
        0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
    intptr_t x = simd_data(desc);
    float16 *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i++) {
        float16 mm = m[i];
        intptr_t xx = x;
        if (float16_is_neg(mm)) {
            mm = float16_abs(mm);
            xx += 8;
        }
        d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
    }
}

void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
{
    static const float32 coeff[16] = {
        0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
        0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
        0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
        0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
    intptr_t x = simd_data(desc);
    float32 *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i++) {
        float32 mm = m[i];
        intptr_t xx = x;
        if (float32_is_neg(mm)) {
            mm = float32_abs(mm);
            xx += 8;
        }
        d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
    }
}

void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
{
    static const float64 coeff[16] = {
        0x3ff0000000000000ull, 0xbfc5555555555543ull,
        0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
        0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
        0x3de5d8408868552full, 0x0000000000000000ull,
        0x3ff0000000000000ull, 0xbfe0000000000000ull,
        0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
        0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
        0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
    intptr_t x = simd_data(desc);
    float64 *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i++) {
        float64 mm = m[i];
        intptr_t xx = x;
        if (float64_is_neg(mm)) {
            mm = float64_abs(mm);
            xx += 8;
        }
        d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
    }
}
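
/*
 * Added commentary: each coeff[] table holds two 8-entry coefficient sets,
 * which match the Taylor-series terms for sine (first half) and cosine
 * (second half).  The immediate selects the term (xx = x); a negative
 * multiplicand switches to the second set (xx += 8) and the sign is
 * dropped, so successive FTMADs build up the trig polynomial.
 */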
/*
 * FP Complex Add
 */

void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
                         void *vs, uint32_t desc)
{
    intptr_t j, i = simd_oprsz(desc);
    uint64_t *g = vg;
    float16 neg_imag = float16_set_sign(0, simd_data(desc));
    float16 neg_real = float16_chs(neg_imag);

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            float16 e0, e1, e2, e3;

            /* I holds the real index; J holds the imag index. */
            j = i - sizeof(float16);
            i -= 2 * sizeof(float16);

            e0 = *(float16 *)(vn + H1_2(i));
            e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
            e2 = *(float16 *)(vn + H1_2(j));
            e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;

            if (likely((pg >> (i & 63)) & 1)) {
                *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
            }
            if (likely((pg >> (j & 63)) & 1)) {
                *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
            }
        } while (i & 63);
    } while (i != 0);
}

void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
                         void *vs, uint32_t desc)
{
    intptr_t j, i = simd_oprsz(desc);
    uint64_t *g = vg;
    float32 neg_imag = float32_set_sign(0, simd_data(desc));
    float32 neg_real = float32_chs(neg_imag);

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            float32 e0, e1, e2, e3;

            /* I holds the real index; J holds the imag index. */
            j = i - sizeof(float32);
            i -= 2 * sizeof(float32);

            e0 = *(float32 *)(vn + H1_2(i));
            e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
            e2 = *(float32 *)(vn + H1_2(j));
            e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;

            if (likely((pg >> (i & 63)) & 1)) {
                *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
            }
            if (likely((pg >> (j & 63)) & 1)) {
                *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
            }
        } while (i & 63);
    } while (i != 0);
}

void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
                         void *vs, uint32_t desc)
{
    intptr_t j, i = simd_oprsz(desc);
    uint64_t *g = vg;
    float64 neg_imag = float64_set_sign(0, simd_data(desc));
    float64 neg_real = float64_chs(neg_imag);

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            float64 e0, e1, e2, e3;

            /* I holds the real index; J holds the imag index. */
            j = i - sizeof(float64);
            i -= 2 * sizeof(float64);

            e0 = *(float64 *)(vn + H1_2(i));
            e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
            e2 = *(float64 *)(vn + H1_2(j));
            e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;

            if (likely((pg >> (i & 63)) & 1)) {
                *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
            }
            if (likely((pg >> (j & 63)) & 1)) {
                *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
            }
        } while (i & 63);
    } while (i != 0);
}
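
/*
 * Added commentary: simd_data(desc) carries the rotate selector.  Taking
 * the 0 case as the 90-degree rotation, neg_imag is +0 and neg_real is
 * -0, giving
 *     d.real = n.real - m.imag,  d.imag = n.imag + m.real
 * while the other encoding swaps the two signs for the 270-degree case.
 * As with the FMLA helpers, negation is applied by XORing the IEEE sign
 * bit into the raw operand.
 */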
/*
 * FP Complex Multiply
 */

void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    intptr_t j, i = simd_oprsz(desc);
    unsigned rot = simd_data(desc);
    bool flip = rot & 1;
    float16 neg_imag, neg_real;
    uint64_t *g = vg;

    neg_imag = float16_set_sign(0, (rot & 2) != 0);
    neg_real = float16_set_sign(0, rot == 1 || rot == 2);

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            float16 e1, e2, e3, e4, nr, ni, mr, mi, d;

            /* I holds the real index; J holds the imag index. */
            j = i - sizeof(float16);
            i -= 2 * sizeof(float16);

            nr = *(float16 *)(vn + H1_2(i));
            ni = *(float16 *)(vn + H1_2(j));
            mr = *(float16 *)(vm + H1_2(i));
            mi = *(float16 *)(vm + H1_2(j));

            e2 = (flip ? ni : nr);
            e1 = (flip ? mi : mr) ^ neg_real;
            e4 = e2;
            e3 = (flip ? mr : mi) ^ neg_imag;

            if (likely((pg >> (i & 63)) & 1)) {
                d = *(float16 *)(va + H1_2(i));
                d = float16_muladd(e2, e1, d, 0, status);
                *(float16 *)(vd + H1_2(i)) = d;
            }
            if (likely((pg >> (j & 63)) & 1)) {
                d = *(float16 *)(va + H1_2(j));
                d = float16_muladd(e4, e3, d, 0, status);
                *(float16 *)(vd + H1_2(j)) = d;
            }
        } while (i & 63);
    } while (i != 0);
}

void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    intptr_t j, i = simd_oprsz(desc);
    unsigned rot = simd_data(desc);
    bool flip = rot & 1;
    float32 neg_imag, neg_real;
    uint64_t *g = vg;

    neg_imag = float32_set_sign(0, (rot & 2) != 0);
    neg_real = float32_set_sign(0, rot == 1 || rot == 2);

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            float32 e1, e2, e3, e4, nr, ni, mr, mi, d;

            /* I holds the real index; J holds the imag index. */
            j = i - sizeof(float32);
            i -= 2 * sizeof(float32);

            nr = *(float32 *)(vn + H1_2(i));
            ni = *(float32 *)(vn + H1_2(j));
            mr = *(float32 *)(vm + H1_2(i));
            mi = *(float32 *)(vm + H1_2(j));

            e2 = (flip ? ni : nr);
            e1 = (flip ? mi : mr) ^ neg_real;
            e4 = e2;
            e3 = (flip ? mr : mi) ^ neg_imag;

            if (likely((pg >> (i & 63)) & 1)) {
                d = *(float32 *)(va + H1_2(i));
                d = float32_muladd(e2, e1, d, 0, status);
                *(float32 *)(vd + H1_2(i)) = d;
            }
            if (likely((pg >> (j & 63)) & 1)) {
                d = *(float32 *)(va + H1_2(j));
                d = float32_muladd(e4, e3, d, 0, status);
                *(float32 *)(vd + H1_2(j)) = d;
            }
        } while (i & 63);
    } while (i != 0);
}

void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    intptr_t j, i = simd_oprsz(desc);
    unsigned rot = simd_data(desc);
    bool flip = rot & 1;
    float64 neg_imag, neg_real;
    uint64_t *g = vg;

    neg_imag = float64_set_sign(0, (rot & 2) != 0);
    neg_real = float64_set_sign(0, rot == 1 || rot == 2);

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            float64 e1, e2, e3, e4, nr, ni, mr, mi, d;

            /* I holds the real index; J holds the imag index. */
            j = i - sizeof(float64);
            i -= 2 * sizeof(float64);

            nr = *(float64 *)(vn + H1_2(i));
            ni = *(float64 *)(vn + H1_2(j));
            mr = *(float64 *)(vm + H1_2(i));
            mi = *(float64 *)(vm + H1_2(j));

            e2 = (flip ? ni : nr);
            e1 = (flip ? mi : mr) ^ neg_real;
            e4 = e2;
            e3 = (flip ? mr : mi) ^ neg_imag;

            if (likely((pg >> (i & 63)) & 1)) {
                d = *(float64 *)(va + H1_2(i));
                d = float64_muladd(e2, e1, d, 0, status);
                *(float64 *)(vd + H1_2(i)) = d;
            }
            if (likely((pg >> (j & 63)) & 1)) {
                d = *(float64 *)(va + H1_2(j));
                d = float64_muladd(e4, e3, d, 0, status);
                *(float64 *)(vd + H1_2(j)) = d;
            }
        } while (i & 63);
    } while (i != 0);
}
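
/*
 * Added commentary: rot encodes the four FCMLA rotations (0/90/180/270 in
 * units of 90 degrees).  Bit 0 (flip) selects whether the real or the
 * imaginary part of N feeds both multiplies; neg_real and neg_imag then
 * apply the rotation-dependent sign to the two M operands.  Each loop
 * iteration thus computes one complex element as two independently
 * predicated fused multiply-adds into the addend from VA.
 */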
/*
 * Load contiguous data, protected by a governing predicate.
 */

/*
 * Load one element into @vd + @reg_off from @host.
 * The controlling predicate is known to be true.
 */
typedef void sve_ldst1_host_fn(void *vd, intptr_t reg_off, void *host);

/*
 * Load one element into @vd + @reg_off from (@env, @vaddr, @ra).
 * The controlling predicate is known to be true.
 */
typedef void sve_ldst1_tlb_fn(CPUARMState *env, void *vd, intptr_t reg_off,
                              target_ulong vaddr, uintptr_t retaddr);

/*
 * Generate the above primitives.
 */

#define DO_LD_HOST(NAME, H, TYPEE, TYPEM, HOST)                        \
static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host)  \
{                                                                      \
    TYPEM val = HOST(host);                                            \
    *(TYPEE *)(vd + H(reg_off)) = val;                                 \
}

#define DO_ST_HOST(NAME, H, TYPEE, TYPEM, HOST)                        \
static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host)  \
{ HOST(host, (TYPEM)*(TYPEE *)(vd + H(reg_off))); }

#define DO_LD_TLB(NAME, H, TYPEE, TYPEM, TLB)                               \
static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off,  \
                             target_ulong addr, uintptr_t ra)               \
{                                                                           \
    *(TYPEE *)(vd + H(reg_off)) =                                           \
        (TYPEM)TLB(env, useronly_clean_ptr(addr), ra);                      \
}

#define DO_ST_TLB(NAME, H, TYPEE, TYPEM, TLB)                               \
static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off,  \
                             target_ulong addr, uintptr_t ra)               \
{                                                                           \
    TLB(env, useronly_clean_ptr(addr),                                      \
        (TYPEM)*(TYPEE *)(vd + H(reg_off)), ra);                            \
}

#define DO_LD_PRIM_1(NAME, H, TE, TM)                   \
    DO_LD_HOST(NAME, H, TE, TM, ldub_p)                 \
    DO_LD_TLB(NAME, H, TE, TM, cpu_ldub_data_ra)
DO_LD_PRIM_1(ld1bb,  H1,   uint8_t,  uint8_t)
DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t)
DO_LD_PRIM_1(ld1bhs, H1_2, uint16_t,  int8_t)
DO_LD_PRIM_1(ld1bsu, H1_4, uint32_t, uint8_t)
DO_LD_PRIM_1(ld1bss, H1_4, uint32_t,  int8_t)
DO_LD_PRIM_1(ld1bdu,     , uint64_t, uint8_t)
DO_LD_PRIM_1(ld1bds,     , uint64_t,  int8_t)

#define DO_ST_PRIM_1(NAME, H, TE, TM)                   \
    DO_ST_HOST(st1##NAME, H, TE, TM, stb_p)             \
    DO_ST_TLB(st1##NAME, H, TE, TM, cpu_stb_data_ra)

DO_ST_PRIM_1(bb,   H1,  uint8_t, uint8_t)
DO_ST_PRIM_1(bh, H1_2, uint16_t, uint8_t)
DO_ST_PRIM_1(bs, H1_4, uint32_t, uint8_t)
DO_ST_PRIM_1(bd,     , uint64_t, uint8_t)

#define DO_LD_PRIM_2(NAME, H, TE, TM, LD) \
    DO_LD_HOST(ld1##NAME##_be, H, TE, TM, LD##_be_p)    \
    DO_LD_HOST(ld1##NAME##_le, H, TE, TM, LD##_le_p)    \
    DO_LD_TLB(ld1##NAME##_be, H, TE, TM, cpu_##LD##_be_data_ra) \
    DO_LD_TLB(ld1##NAME##_le, H, TE, TM, cpu_##LD##_le_data_ra)

#define DO_ST_PRIM_2(NAME, H, TE, TM, ST) \
    DO_ST_HOST(st1##NAME##_be, H, TE, TM, ST##_be_p)    \
    DO_ST_HOST(st1##NAME##_le, H, TE, TM, ST##_le_p)    \
    DO_ST_TLB(st1##NAME##_be, H, TE, TM, cpu_##ST##_be_data_ra) \
    DO_ST_TLB(st1##NAME##_le, H, TE, TM, cpu_##ST##_le_data_ra)

DO_LD_PRIM_2(hh,  H1_2, uint16_t, uint16_t, lduw)
DO_LD_PRIM_2(hsu, H1_4, uint32_t, uint16_t, lduw)
DO_LD_PRIM_2(hss, H1_4, uint32_t,  int16_t, lduw)
DO_LD_PRIM_2(hdu,     , uint64_t, uint16_t, lduw)
DO_LD_PRIM_2(hds,     , uint64_t,  int16_t, lduw)

DO_ST_PRIM_2(hh, H1_2, uint16_t, uint16_t, stw)
DO_ST_PRIM_2(hs, H1_4, uint32_t, uint16_t, stw)
DO_ST_PRIM_2(hd,     , uint64_t, uint16_t, stw)

DO_LD_PRIM_2(ss,  H1_4, uint32_t, uint32_t, ldl)
DO_LD_PRIM_2(sdu,     , uint64_t, uint32_t, ldl)
DO_LD_PRIM_2(sds,     , uint64_t,  int32_t, ldl)

DO_ST_PRIM_2(ss, H1_4, uint32_t, uint32_t, stl)
DO_ST_PRIM_2(sd,     , uint64_t, uint32_t, stl)

DO_LD_PRIM_2(dd,     , uint64_t, uint64_t, ldq)
DO_ST_PRIM_2(dd,     , uint64_t, uint64_t, stq)

#undef DO_LD_PRIM_1
#undef DO_ST_PRIM_1
#undef DO_LD_PRIM_2
#undef DO_ST_PRIM_2
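
/*
 * Added commentary: each primitive therefore comes in two forms with the
 * same element layout: a _host form that reads or writes through a raw
 * host pointer on the fast path, and a _tlb form that goes through the
 * cpu_ld/st*_data_ra accessors and so can fault, used on the slow paths
 * below.  The name encodes memory size then element size, e.g. ld1bhu is
 * "load 1-byte memory into 2-byte element, unsigned".
 */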
/*
 * Skip through a sequence of inactive elements in the guarding predicate @vg,
 * beginning at @reg_off bounded by @reg_max.  Return the offset of the active
 * element >= @reg_off, or @reg_max if there were no active elements at all.
 */
static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
                                 intptr_t reg_max, int esz)
{
    uint64_t pg_mask = pred_esz_masks[esz];
    uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);

    /* In normal usage, the first element is active. */
    if (likely(pg & 1)) {
        return reg_off;
    }

    if (pg == 0) {
        reg_off &= -64;
        do {
            reg_off += 64;
            if (unlikely(reg_off >= reg_max)) {
                /* The entire predicate was false. */
                return reg_max;
            }
            pg = vg[reg_off >> 6] & pg_mask;
        } while (pg == 0);
    }
    reg_off += ctz64(pg);

    /* We should never see an out of range predicate bit set. */
    tcg_debug_assert(reg_off < reg_max);
    return reg_off;
}
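
/*
 * Added commentary, worked example: with esz == 2 (4-byte elements) and
 * only the element at byte offset 16 active, a call with reg_off == 0
 * finds pg nonzero but with bit 0 clear after masking and shifting, so
 * ctz64(pg) advances reg_off straight to 16, the first active element.
 */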
/*
 * Resolve the guest virtual address to info->host and info->flags.
 * If @nofault, return false if the page is invalid, otherwise
 * exit via page fault exception.
 */

typedef struct {
    void *host;
    int flags;
    MemTxAttrs attrs;
} SVEHostPage;

static bool sve_probe_page(SVEHostPage *info, bool nofault,
                           CPUARMState *env, target_ulong addr,
                           int mem_off, MMUAccessType access_type,
                           int mmu_idx, uintptr_t retaddr)
{
    int flags;

    addr += mem_off;

    /*
     * User-only currently always issues with TBI.  See the comment
     * above useronly_clean_ptr.  Usually we clean this top byte away
     * during translation, but we can't do that for e.g. vector + imm
     * addressing modes.
     *
     * We currently always enable TBI for user-only, and do not provide
     * a way to turn it off.  So clean the pointer unconditionally here,
     * rather than look it up here, or pass it down from above.
     */
    addr = useronly_clean_ptr(addr);

    flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault,
                               &info->host, retaddr);
    info->flags = flags;

    if (flags & TLB_INVALID_MASK) {
        g_assert(nofault);
        return false;
    }

    /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
    info->host -= mem_off;

#ifdef CONFIG_USER_ONLY
    memset(&info->attrs, 0, sizeof(info->attrs));
#else
    /*
     * Find the iotlbentry for addr and return the transaction attributes.
     * This *must* be present in the TLB because we just found the mapping.
     */
    {
        uintptr_t index = tlb_index(env, mmu_idx, addr);

# ifdef CONFIG_DEBUG_TCG
        CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
        target_ulong comparator = (access_type == MMU_DATA_LOAD
                                   ? entry->addr_read
                                   : tlb_addr_write(entry));
        g_assert(tlb_hit(comparator, addr));
# endif

        CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
        info->attrs = iotlbentry->attrs;
    }
#endif

    return true;
}
/*
 * Analyse contiguous data, protected by a governing predicate.
 */
typedef struct {
    /*
     * First and last element wholly contained within the two pages.
     * mem_off_first[0] and reg_off_first[0] are always set >= 0.
     * reg_off_last[0] may be < 0 if the first element crosses pages.
     * All of mem_off_first[1], reg_off_first[1] and reg_off_last[1]
     * are set >= 0 only if there are complete elements on a second page.
     *
     * The reg_off_* offsets are relative to the internal vector register.
     * The mem_off_first offset is relative to the memory address; the
     * two offsets are different when a load operation extends, a store
     * operation truncates, or for multi-register operations.
     */
    int16_t mem_off_first[2];
    int16_t reg_off_first[2];
    int16_t reg_off_last[2];

    /*
     * One element that is misaligned and spans both pages,
     * or -1 if there is no such active element.
     */
    int16_t mem_off_split;
    int16_t reg_off_split;

    /*
     * The byte offset at which the entire operation crosses a page boundary.
     * Set >= 0 if and only if the entire operation spans two pages.
     */
    int16_t page_split;

    /* TLB data for the two pages. */
    SVEHostPage page[2];
} SVEContLdSt;
/*
 * Find first active element on each page, and a loose bound for the
 * final element on each page.  Identify any single element that spans
 * the page boundary.  Return true if there are any active elements.
 */
static bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr,
                                   uint64_t *vg, intptr_t reg_max,
                                   int esz, int msize)
{
    const int esize = 1 << esz;
    const uint64_t pg_mask = pred_esz_masks[esz];
    intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
    intptr_t mem_off_last, mem_off_split;
    intptr_t page_split, elt_split;
    intptr_t i;

    /* Set all of the element indices to -1, and the TLB data to 0. */
    memset(info, -1, offsetof(SVEContLdSt, page));
    memset(info->page, 0, sizeof(info->page));

    /* Gross scan over the entire predicate to find bounds. */
    i = 0;
    do {
        uint64_t pg = vg[i] & pg_mask;
        if (pg) {
            reg_off_last = i * 64 + 63 - clz64(pg);
            if (reg_off_first < 0) {
                reg_off_first = i * 64 + ctz64(pg);
            }
        }
    } while (++i * 64 < reg_max);

    if (unlikely(reg_off_first < 0)) {
        /* No active elements, no pages touched. */
        return false;
    }
    tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);

    info->reg_off_first[0] = reg_off_first;
    info->mem_off_first[0] = (reg_off_first >> esz) * msize;
    mem_off_last = (reg_off_last >> esz) * msize;

    page_split = -(addr | TARGET_PAGE_MASK);
    if (likely(mem_off_last + msize <= page_split)) {
        /* The entire operation fits within a single page. */
        info->reg_off_last[0] = reg_off_last;
        return true;
    }

    info->page_split = page_split;
    elt_split = page_split / msize;
    reg_off_split = elt_split << esz;
    mem_off_split = elt_split * msize;

    /*
     * This is the last full element on the first page, but it is not
     * necessarily active.  If there is no full element, i.e. the first
     * active element is the one that's split, this value remains -1.
     * It is useful as iteration bounds.
     */
    if (elt_split != 0) {
        info->reg_off_last[0] = reg_off_split - esize;
    }

    /* Determine if an unaligned element spans the pages. */
    if (page_split % msize != 0) {
        /* It is helpful to know if the split element is active. */
        if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
            info->reg_off_split = reg_off_split;
            info->mem_off_split = mem_off_split;

            if (reg_off_split == reg_off_last) {
                /* The page crossing element is last. */
                return true;
            }
        }
        reg_off_split += esize;
        mem_off_split += msize;
    }

    /*
     * We do want the first active element on the second page, because
     * this may affect the address reported in an exception.
     */
    reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
    tcg_debug_assert(reg_off_split <= reg_off_last);
    info->reg_off_first[1] = reg_off_split;
    info->mem_off_first[1] = (reg_off_split >> esz) * msize;
    info->reg_off_last[1] = reg_off_last;
    return true;
}
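
/*
 * Added commentary, worked example: take a 16-byte vector of 4-byte
 * elements, all active, based 6 bytes before a page boundary.  Then
 * page_split = 6 and elt_split = 1, so reg_off_last[0] = 0 (one whole
 * element on the first page), the element at offset 4 is recorded as the
 * split element, and reg_off_first[1]/reg_off_last[1] become 8/12 for
 * the two whole elements on the second page.
 */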
/*
 * Resolve the guest virtual addresses to info->page[].
 * Control the generation of page faults with @fault.  Return false if
 * there is no work to do, which can only happen with @fault == FAULT_NO.
 */
static bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
                                CPUARMState *env, target_ulong addr,
                                MMUAccessType access_type, uintptr_t retaddr)
{
    int mmu_idx = cpu_mmu_index(env, false);
    int mem_off = info->mem_off_first[0];
    bool nofault = fault == FAULT_NO;
    bool have_work = true;

    if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
                        access_type, mmu_idx, retaddr)) {
        /* No work to be done. */
        return false;
    }

    if (likely(info->page_split < 0)) {
        /* The entire operation was on the one page. */
        return true;
    }

    /*
     * If the second page is invalid, then we want the fault address to be
     * the first byte on that page which is accessed.
     */
    if (info->mem_off_split >= 0) {
        /*
         * There is an element split across the pages.  The fault address
         * should be the first byte of the second page.
         */
        mem_off = info->page_split;
        /*
         * If the split element is also the first active element
         * of the vector, then:  For first-fault we should continue
         * to generate faults for the second page.  For no-fault,
         * we have work only if the second page is valid.
         */
        if (info->mem_off_first[0] < info->mem_off_split) {
            nofault = FAULT_FIRST;
            have_work = false;
        }
    } else {
        /*
         * There is no element split across the pages.  The fault address
         * should be the first active element on the second page.
         */
        mem_off = info->mem_off_first[1];
        /*
         * There must have been one active element on the first page,
         * so we're out of first-fault territory.
         */
        nofault = fault != FAULT_ALL;
    }

    have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
                                access_type, mmu_idx, retaddr);
    return have_work;
}
static void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
                                      uint64_t *vg, target_ulong addr,
                                      int esize, int msize, int wp_access,
                                      uintptr_t retaddr)
{
#ifndef CONFIG_USER_ONLY
    intptr_t mem_off, reg_off, reg_last;
    int flags0 = info->page[0].flags;
    int flags1 = info->page[1].flags;

    if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
        return;
    }

    /* Indicate that watchpoints are handled. */
    info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
    info->page[1].flags = flags1 & ~TLB_WATCHPOINT;

    if (flags0 & TLB_WATCHPOINT) {
        mem_off = info->mem_off_first[0];
        reg_off = info->reg_off_first[0];
        reg_last = info->reg_off_last[0];

        while (reg_off <= reg_last) {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    cpu_check_watchpoint(env_cpu(env), addr + mem_off,
                                         msize, info->page[0].attrs,
                                         wp_access, retaddr);
                }
                reg_off += esize;
                mem_off += msize;
            } while (reg_off <= reg_last && (reg_off & 63));
        }
    }

    mem_off = info->mem_off_split;
    if (mem_off >= 0) {
        cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
                             info->page[0].attrs, wp_access, retaddr);
    }

    mem_off = info->mem_off_first[1];
    if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
        reg_off = info->reg_off_first[1];
        reg_last = info->reg_off_last[1];

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    cpu_check_watchpoint(env_cpu(env), addr + mem_off,
                                         msize, info->page[1].attrs,
                                         wp_access, retaddr);
                }
                reg_off += esize;
                mem_off += msize;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
#endif
}
static void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
                                    uint64_t *vg, target_ulong addr, int esize,
                                    int msize, uint32_t mtedesc, uintptr_t ra)
{
    intptr_t mem_off, reg_off, reg_last;

    /* Process the page only if MemAttr == Tagged. */
    if (arm_tlb_mte_tagged(&info->page[0].attrs)) {
        mem_off = info->mem_off_first[0];
        reg_off = info->reg_off_first[0];
        reg_last = info->reg_off_split;
        if (reg_last < 0) {
            reg_last = info->reg_off_last[0];
        }

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    mte_check(env, mtedesc, addr, ra);
                }
                reg_off += esize;
                mem_off += msize;
            } while (reg_off <= reg_last && (reg_off & 63));
        } while (reg_off <= reg_last);
    }

    mem_off = info->mem_off_first[1];
    if (mem_off >= 0 && arm_tlb_mte_tagged(&info->page[1].attrs)) {
        reg_off = info->reg_off_first[1];
        reg_last = info->reg_off_last[1];

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    mte_check(env, mtedesc, addr, ra);
                }
                reg_off += esize;
                mem_off += msize;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}
/*
 * Common helper for all contiguous 1,2,3,4-register predicated loads.
 */
static inline QEMU_ALWAYS_INLINE
void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
               uint32_t desc, const uintptr_t retaddr,
               const int esz, const int msz, const int N, uint32_t mtedesc,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const unsigned rd = simd_data(desc);
    const intptr_t reg_max = simd_oprsz(desc);
    intptr_t reg_off, reg_last, mem_off;
    SVEContLdSt info;
    void *host;
    int flags, i;

    /* Find the active elements. */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
        /* The entire predicate was false; no load occurs. */
        for (i = 0; i < N; ++i) {
            memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
        }
        return;
    }

    /* Probe the page(s).  Exit with exception for any invalid page. */
    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);

    /* Handle watchpoints for all active elements. */
    sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
                              BP_MEM_READ, retaddr);

    /*
     * Handle mte checks for all active elements.
     * Since TBI must be set for MTE, !mtedesc => !mte_active.
     */
    if (mtedesc) {
        sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
                                mtedesc, retaddr);
    }

    flags = info.page[0].flags | info.page[1].flags;
    if (unlikely(flags != 0)) {
#ifdef CONFIG_USER_ONLY
        g_assert_not_reached();
#else
        /*
         * At least one page includes MMIO.
         * Any bus operation can fail with cpu_transaction_failed,
         * which for ARM will raise SyncExternal.  Perform the load
         * into scratch memory to preserve register state until the end.
         */
        ARMVectorReg scratch[4] = { };

        mem_off = info.mem_off_first[0];
        reg_off = info.reg_off_first[0];
        reg_last = info.reg_off_last[1];
        if (reg_last < 0) {
            reg_last = info.reg_off_split;
            if (reg_last < 0) {
                reg_last = info.reg_off_last[0];
            }
        }

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        tlb_fn(env, &scratch[i], reg_off,
                               addr + mem_off + (i << msz), retaddr);
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);

        for (i = 0; i < N; ++i) {
            memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
        }
        return;
#endif
    }

    /* The entire operation is in RAM, on valid pages. */

    for (i = 0; i < N; ++i) {
        memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
    }

    mem_off = info.mem_off_first[0];
    reg_off = info.reg_off_first[0];
    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    while (reg_off <= reg_last) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                for (i = 0; i < N; ++i) {
                    host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                            host + mem_off + (i << msz));
                }
            }
            reg_off += 1 << esz;
            mem_off += N << msz;
        } while (reg_off <= reg_last && (reg_off & 63));
    }

    /*
     * Use the slow path to manage the cross-page misalignment.
     * But we know this is RAM and cannot trap.
     */
    mem_off = info.mem_off_split;
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_split;
        for (i = 0; i < N; ++i) {
            tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
                   addr + mem_off + (i << msz), retaddr);
        }
    }

    mem_off = info.mem_off_first[1];
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_first[1];
        reg_last = info.reg_off_last[1];
        host = info.page[1].host;

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                                host + mem_off + (i << msz));
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}
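
/*
 * Added commentary: three paths are taken above.  If any page has TLB
 * flags set (MMIO), every element goes through the faultable tlb_fn into
 * scratch registers so that a bus fault cannot leave the destination
 * half-written.  Otherwise the destinations are zeroed first (predicated
 * loads zero inactive elements), the bulk is copied via host_fn, and only
 * a page-crossing element falls back to tlb_fn, which by then cannot trap.
 */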
static inline QEMU_ALWAYS_INLINE
void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
                   uint32_t desc, const uintptr_t ra,
                   const int esz, const int msz, const int N,
                   sve_ldst1_host_fn *host_fn,
                   sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    int bit55 = extract64(addr, 55, 1);

    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /* Perform gross MTE suppression early. */
    if (!tbi_check(desc, bit55) ||
        tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
        mtedesc = 0;
    }

    sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
}
#define DO_LD1_1(NAME, ESZ)                                             \
void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg,                 \
                            target_ulong addr, uint32_t desc)           \
{                                                                       \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0,            \
              sve_##NAME##_host, sve_##NAME##_tlb);                     \
}                                                                       \
void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg,             \
                                target_ulong addr, uint32_t desc)       \
{                                                                       \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1,           \
                  sve_##NAME##_host, sve_##NAME##_tlb);                 \
}

#define DO_LD1_2(NAME, ESZ, MSZ)                                        \
void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg,              \
                               target_ulong addr, uint32_t desc)        \
{                                                                       \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
              sve_##NAME##_le_host, sve_##NAME##_le_tlb);               \
}                                                                       \
void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg,              \
                               target_ulong addr, uint32_t desc)        \
{                                                                       \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
              sve_##NAME##_be_host, sve_##NAME##_be_tlb);               \
}                                                                       \
void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg,          \
                                   target_ulong addr, uint32_t desc)    \
{                                                                       \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
                  sve_##NAME##_le_host, sve_##NAME##_le_tlb);           \
}                                                                       \
void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg,          \
                                   target_ulong addr, uint32_t desc)    \
{                                                                       \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
                  sve_##NAME##_be_host, sve_##NAME##_be_tlb);           \
}

DO_LD1_1(ld1bb,  MO_8)
DO_LD1_1(ld1bhu, MO_16)
DO_LD1_1(ld1bhs, MO_16)
DO_LD1_1(ld1bsu, MO_32)
DO_LD1_1(ld1bss, MO_32)
DO_LD1_1(ld1bdu, MO_64)
DO_LD1_1(ld1bds, MO_64)

DO_LD1_2(ld1hh,  MO_16, MO_16)
DO_LD1_2(ld1hsu, MO_32, MO_16)
DO_LD1_2(ld1hss, MO_32, MO_16)
DO_LD1_2(ld1hdu, MO_64, MO_16)
DO_LD1_2(ld1hds, MO_64, MO_16)

DO_LD1_2(ld1ss,  MO_32, MO_32)
DO_LD1_2(ld1sdu, MO_64, MO_32)
DO_LD1_2(ld1sds, MO_64, MO_32)

DO_LD1_2(ld1dd,  MO_64, MO_64)

#undef DO_LD1_1
#undef DO_LD1_2

#define DO_LDN_1(N)                                                     \
void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg,                \
                             target_ulong addr, uint32_t desc)          \
{                                                                       \
    sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0,           \
              sve_ld1bb_host, sve_ld1bb_tlb);                           \
}                                                                       \
void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg,            \
                                 target_ulong addr, uint32_t desc)      \
{                                                                       \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N,          \
                  sve_ld1bb_host, sve_ld1bb_tlb);                       \
}

#define DO_LDN_2(N, SUFF, ESZ)                                          \
void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
              sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);         \
}                                                                       \
void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
              sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);         \
}                                                                       \
void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
                  sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);     \
}                                                                       \
void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
                  sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);     \
}

DO_LDN_1(2)
DO_LDN_1(3)
DO_LDN_1(4)

DO_LDN_2(2, hh, MO_16)
DO_LDN_2(3, hh, MO_16)
DO_LDN_2(4, hh, MO_16)

DO_LDN_2(2, ss, MO_32)
DO_LDN_2(3, ss, MO_32)
DO_LDN_2(4, ss, MO_32)

DO_LDN_2(2, dd, MO_64)
DO_LDN_2(3, dd, MO_64)
DO_LDN_2(4, dd, MO_64)

#undef DO_LDN_1
#undef DO_LDN_2
/*
 * Load contiguous data, first-fault and no-fault.
 *
 * For user-only, one could argue that we should hold the mmap_lock during
 * the operation so that there is no race between page_check_range and the
 * load operation.  However, unmapping pages out from under a running thread
 * is extraordinarily unlikely.  This theoretical race condition also affects
 * linux-user/ in its get_user/put_user macros.
 *
 * TODO: Construct some helpers, written in assembly, that interact with
 * handle_cpu_signal to produce memory ops which can properly report errors
 * without racing.
 */

/* Fault on byte I.  All bits in FFR from I are cleared.  The vector
 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
 * option, which leaves subsequent data unchanged.
 */
static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
{
    uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;

    if (i & 63) {
        ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
        i = ROUND_UP(i, 64);
    }
    for (; i < oprsz; i += 64) {
        ffr[i / 64] = 0;
    }
}
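
/*
 * Added commentary, worked example: record_fault(env, 70, 256) keeps FFR
 * bits 64..69 of word 1 (MAKE_64BIT_MASK(0, 6)), rounds i up to 128, and
 * then zeroes FFR words 2 and 3, so every element from byte 70 onward is
 * marked as not loaded.
 */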
/*
 * Common helper for all contiguous no-fault and first-fault loads.
 */
static inline QEMU_ALWAYS_INLINE
void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
                   uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
                   const int esz, const int msz, const SVEContFault fault,
                   sve_ldst1_host_fn *host_fn,
                   sve_ldst1_tlb_fn *tlb_fn)
{
    const unsigned rd = simd_data(desc);
    void *vd = &env->vfp.zregs[rd];
    const intptr_t reg_max = simd_oprsz(desc);
    intptr_t reg_off, mem_off, reg_last;
    SVEContLdSt info;
    int flags;
    void *host;

    /* Find the active elements. */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
        /* The entire predicate was false; no load occurs. */
        memset(vd, 0, reg_max);
        return;
    }
    reg_off = info.reg_off_first[0];

    /* Probe the page(s). */
    if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
        /* Fault on first element. */
        tcg_debug_assert(fault == FAULT_NO);
        memset(vd, 0, reg_max);
        goto do_fault;
    }

    mem_off = info.mem_off_first[0];
    flags = info.page[0].flags;

    /*
     * Disable MTE checking if the Tagged bit is not set.  Since TBI must
     * be set within MTEDESC for MTE, !mtedesc => !mte_active.
     */
    if (!arm_tlb_mte_tagged(&info.page[0].attrs)) {
        mtedesc = 0;
    }

    if (fault == FAULT_FIRST) {
        /* Trapping mte check for the first-fault element. */
        if (mtedesc) {
            mte_check(env, mtedesc, addr + mem_off, retaddr);
        }

        /*
         * Special handling of the first active element,
         * if it crosses a page boundary or is MMIO.
         */
        bool is_split = mem_off == info.mem_off_split;
        if (unlikely(flags != 0) || unlikely(is_split)) {
            /*
             * Use the slow path for cross-page handling.
             * Might trap for MMIO or watchpoints.
             */
            tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);

            /* After any fault, zero the other elements. */
            swap_memzero(vd, reg_off);
            reg_off += 1 << esz;
            mem_off += 1 << msz;
            swap_memzero(vd + reg_off, reg_max - reg_off);

            if (is_split) {
                goto second_page;
            }
        } else {
            memset(vd, 0, reg_max);
        }
    } else {
        memset(vd, 0, reg_max);
        if (unlikely(mem_off == info.mem_off_split)) {
            /* The first active element crosses a page boundary. */
            flags |= info.page[1].flags;
            if (unlikely(flags & TLB_MMIO)) {
                /* Some page is MMIO, see below. */
                goto do_fault;
            }
            if (unlikely(flags & TLB_WATCHPOINT) &&
                (cpu_watchpoint_address_matches
                 (env_cpu(env), addr + mem_off, 1 << msz)
                 & BP_MEM_READ)) {
                /* Watchpoint hit, see below. */
                goto do_fault;
            }
            if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
                goto do_fault;
            }
            /*
             * Use the slow path for cross-page handling.
             * This is RAM, without a watchpoint, and will not trap.
             */
            tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
            goto second_page;
        }
    }

    /*
     * From this point on, all memory operations are MemSingleNF.
     *
     * Per the MemSingleNF pseudocode, a no-fault load from Device memory
     * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
     *
     * Unfortunately we do not have access to the memory attributes from the
     * PTE to tell Device memory from Normal memory.  So we make a mostly
     * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
     * This gives the right answer for the common cases of "Normal memory,
     * backed by host RAM" and "Device memory, backed by MMIO".
     * The architecture allows us to suppress an NF load and return
     * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
     * case of "Normal memory, backed by MMIO" is permitted.  The case we
     * get wrong is "Device memory, backed by host RAM", for which we
     * should return (UNKNOWN, FAULT) but do not.
     *
     * Similarly, CPU_BP breakpoints would raise exceptions, and so
     * return (UNKNOWN, FAULT).  For simplicity, we consider gdb and
     * architectural breakpoints the same.
     */
    if (unlikely(flags & TLB_MMIO)) {
        goto do_fault;
    }

    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    do {
        uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                if (unlikely(flags & TLB_WATCHPOINT) &&
                    (cpu_watchpoint_address_matches
                     (env_cpu(env), addr + mem_off, 1 << msz)
                     & BP_MEM_READ)) {
                    goto do_fault;
                }
                if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
                    goto do_fault;
                }
                host_fn(vd, reg_off, host + mem_off);
            }
            reg_off += 1 << esz;
            mem_off += 1 << msz;
        } while (reg_off <= reg_last && (reg_off & 63));
    } while (reg_off <= reg_last);

    /*
     * MemSingleNF is allowed to fail for any reason.  We have special
     * code above to handle the first element crossing a page boundary.
     * As an implementation choice, decline to handle a cross-page element
     * in any other position.
     */
    reg_off = info.reg_off_split;
    if (reg_off >= 0) {
        goto do_fault;
    }

 second_page:
    reg_off = info.reg_off_first[1];
    if (likely(reg_off < 0)) {
        /* No active elements on the second page.  All done. */
        return;
    }

    /*
     * MemSingleNF is allowed to fail for any reason.  As an implementation
     * choice, decline to handle elements on the second page.  This should
     * be low frequency as the guest walks through memory -- the next
     * iteration of the guest's loop should be aligned on the page boundary,
     * and then all following iterations will stay aligned.
     */

 do_fault:
    record_fault(env, reg_off, reg_max);
}
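
/*
 * Added commentary: fault selects among the three contiguous-load
 * behaviours sharing this helper -- FAULT_ALL (ordinary LD1, every
 * element may trap), FAULT_FIRST (LDFF1, only the first active element
 * may trap; later problems merely clear FFR), and FAULT_NO (LDNF1,
 * nothing traps; any problem is reported through FFR via record_fault).
 */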
static inline QEMU_ALWAYS_INLINE
void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
                       uint32_t desc, const uintptr_t retaddr,
                       const int esz, const int msz, const SVEContFault fault,
                       sve_ldst1_host_fn *host_fn,
                       sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    int bit55 = extract64(addr, 55, 1);

    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /* Perform gross MTE suppression early. */
    if (!tbi_check(desc, bit55) ||
        tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
        mtedesc = 0;
    }

    sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
                  esz, msz, fault, host_fn, tlb_fn);
}
#define DO_LDFF1_LDNF1_1(PART, ESZ)                                     \
void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg,            \
                                 target_ulong addr, uint32_t desc)      \
{                                                                       \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
                  sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
}                                                                       \
void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg,            \
                                 target_ulong addr, uint32_t desc)      \
{                                                                       \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
                  sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
}                                                                       \
void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg,        \
                                     target_ulong addr, uint32_t desc)  \
{                                                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
                      sve_ld1##PART##_host, sve_ld1##PART##_tlb);       \
}                                                                       \
void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg,        \
                                     target_ulong addr, uint32_t desc)  \
{                                                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
                      sve_ld1##PART##_host, sve_ld1##PART##_tlb);       \
}

#define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ)                                \
void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
                  sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
}                                                                       \
void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
                  sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
}                                                                       \
void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
                  sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
}                                                                       \
void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
                  sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
}                                                                       \
void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
                      sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
}                                                                       \
void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
                      sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
}                                                                       \
void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
                      sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
}                                                                       \
void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
                      sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
}

DO_LDFF1_LDNF1_1(bb,  MO_8)
DO_LDFF1_LDNF1_1(bhu, MO_16)
DO_LDFF1_LDNF1_1(bhs, MO_16)
DO_LDFF1_LDNF1_1(bsu, MO_32)
DO_LDFF1_LDNF1_1(bss, MO_32)
DO_LDFF1_LDNF1_1(bdu, MO_64)
DO_LDFF1_LDNF1_1(bds, MO_64)

DO_LDFF1_LDNF1_2(hh,  MO_16, MO_16)
DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)

DO_LDFF1_LDNF1_2(ss,  MO_32, MO_32)
DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)

DO_LDFF1_LDNF1_2(dd,  MO_64, MO_64)

#undef DO_LDFF1_LDNF1_1
#undef DO_LDFF1_LDNF1_2
/*
 * Common helper for all contiguous 1,2,3,4-register predicated stores.
 */
static inline QEMU_ALWAYS_INLINE
void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
               uint32_t desc, const uintptr_t retaddr,
               const int esz, const int msz, const int N, uint32_t mtedesc,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const unsigned rd = simd_data(desc);
    const intptr_t reg_max = simd_oprsz(desc);
    intptr_t reg_off, reg_last, mem_off;
    SVEContLdSt info;
    void *host;
    int i, flags;

    /* Find the active elements. */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
        /* The entire predicate was false; no store occurs. */
        return;
    }

    /* Probe the page(s).  Exit with exception for any invalid page. */
    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);

    /* Handle watchpoints for all active elements. */
    sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
                              BP_MEM_WRITE, retaddr);

    /*
     * Handle mte checks for all active elements.
     * Since TBI must be set for MTE, !mtedesc => !mte_active.
     */
    if (mtedesc) {
        sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
                                mtedesc, retaddr);
    }

    flags = info.page[0].flags | info.page[1].flags;
    if (unlikely(flags != 0)) {
#ifdef CONFIG_USER_ONLY
        g_assert_not_reached();
#else
        /*
         * At least one page includes MMIO.
         * Any bus operation can fail with cpu_transaction_failed,
         * which for ARM will raise SyncExternal.  We cannot avoid
         * this fault and will leave with the store incomplete.
         */
        mem_off = info.mem_off_first[0];
        reg_off = info.reg_off_first[0];
        reg_last = info.reg_off_last[1];
        if (reg_last < 0) {
            reg_last = info.reg_off_split;
            if (reg_last < 0) {
                reg_last = info.reg_off_last[0];
            }
        }

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
                               addr + mem_off + (i << msz), retaddr);
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
        return;
#endif
    }

    mem_off = info.mem_off_first[0];
    reg_off = info.reg_off_first[0];
    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    while (reg_off <= reg_last) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                for (i = 0; i < N; ++i) {
                    host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                            host + mem_off + (i << msz));
                }
            }
            reg_off += 1 << esz;
            mem_off += N << msz;
        } while (reg_off <= reg_last && (reg_off & 63));
    }

    /*
     * Use the slow path to manage the cross-page misalignment.
     * But we know this is RAM and cannot trap.
     */
    mem_off = info.mem_off_split;
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_split;
        for (i = 0; i < N; ++i) {
            tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
                   addr + mem_off + (i << msz), retaddr);
        }
    }

    mem_off = info.mem_off_first[1];
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_first[1];
        reg_last = info.reg_off_last[1];
        host = info.page[1].host;

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                                host + mem_off + (i << msz));
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}
5640 void sve_stN_r_mte(CPUARMState
*env
, uint64_t *vg
, target_ulong addr
,
5641 uint32_t desc
, const uintptr_t ra
,
5642 const int esz
, const int msz
, const int N
,
5643 sve_ldst1_host_fn
*host_fn
,
5644 sve_ldst1_tlb_fn
*tlb_fn
)
5646 uint32_t mtedesc
= desc
>> (SIMD_DATA_SHIFT
+ SVE_MTEDESC_SHIFT
);
5647 int bit55
= extract64(addr
, 55, 1);
5649 /* Remove mtedesc from the normal sve descriptor. */
5650 desc
= extract32(desc
, 0, SIMD_DATA_SHIFT
+ SVE_MTEDESC_SHIFT
);
5652 /* Perform gross MTE suppression early. */
5653 if (!tbi_check(desc
, bit55
) ||
5654 tcma_check(desc
, bit55
, allocation_tag_from_addr(addr
))) {
5658 sve_stN_r(env
, vg
, addr
, desc
, ra
, esz
, msz
, N
, mtedesc
, host_fn
, tlb_fn
);
#define DO_STN_1(N, NAME, ESZ)                                          \
void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg,            \
                                 target_ulong addr, uint32_t desc)      \
{                                                                       \
    sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0,            \
              sve_st1##NAME##_host, sve_st1##NAME##_tlb);               \
}                                                                       \
void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg,        \
                                     target_ulong addr, uint32_t desc)  \
{                                                                       \
    sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N,           \
                  sve_st1##NAME##_host, sve_st1##NAME##_tlb);           \
}

#define DO_STN_2(N, NAME, ESZ, MSZ)                                     \
void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
              sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);         \
}                                                                       \
void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
              sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);         \
}                                                                       \
void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
                  sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);     \
}                                                                       \
void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
                  sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);     \
}

DO_STN_1(1, bb, MO_8)
DO_STN_1(1, bh, MO_16)
DO_STN_1(1, bs, MO_32)
DO_STN_1(1, bd, MO_64)
DO_STN_1(2, bb, MO_8)
DO_STN_1(3, bb, MO_8)
DO_STN_1(4, bb, MO_8)

DO_STN_2(1, hh, MO_16, MO_16)
DO_STN_2(1, hs, MO_32, MO_16)
DO_STN_2(1, hd, MO_64, MO_16)
DO_STN_2(2, hh, MO_16, MO_16)
DO_STN_2(3, hh, MO_16, MO_16)
DO_STN_2(4, hh, MO_16, MO_16)

DO_STN_2(1, ss, MO_32, MO_32)
DO_STN_2(1, sd, MO_64, MO_32)
DO_STN_2(2, ss, MO_32, MO_32)
DO_STN_2(3, ss, MO_32, MO_32)
DO_STN_2(4, ss, MO_32, MO_32)

DO_STN_2(1, dd, MO_64, MO_64)
DO_STN_2(2, dd, MO_64, MO_64)
DO_STN_2(3, dd, MO_64, MO_64)
DO_STN_2(4, dd, MO_64, MO_64)

#undef DO_STN_1
#undef DO_STN_2

/*
 * Loads with a vector index.
 */

/*
 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
 */
typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);

static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
{
    return *(uint32_t *)(reg + H1_4(reg_ofs));
}

static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
{
    return *(int32_t *)(reg + H1_4(reg_ofs));
}

static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
{
    return (uint32_t)*(uint64_t *)(reg + reg_ofs);
}

static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
{
    return (int32_t)*(uint64_t *)(reg + reg_ofs);
}

static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
{
    return *(uint64_t *)(reg + reg_ofs);
}
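
/*
 * Each gather/scatter helper below computes the address of the element
 * at @reg_off as
 *
 *     addr = base + (off_fn(vm, reg_off) << scale);
 *
 * so the off_* functions above differ only in how the index is fetched
 * and extended: e.g. off_zss_s sign-extends a 32-bit index (the SXTW
 * addressing form), while off_zd_d takes a full 64-bit index.
 */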

static inline QEMU_ALWAYS_INLINE
void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
               target_ulong base, uint32_t desc, uintptr_t retaddr,
               uint32_t mtedesc, int esize, int msize,
               zreg_off_fn *off_fn,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const int mmu_idx = cpu_mmu_index(env, false);
    const intptr_t reg_max = simd_oprsz(desc);
    const int scale = simd_data(desc);
    ARMVectorReg scratch;
    intptr_t reg_off;
    SVEHostPage info, info2;

    memset(&scratch, 0, reg_max);
    reg_off = 0;
    do {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if (likely(pg & 1)) {
                target_ulong addr = base + (off_fn(vm, reg_off) << scale);
                target_ulong in_page = -(addr | TARGET_PAGE_MASK);

                sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
                               mmu_idx, retaddr);

                if (likely(in_page >= msize)) {
                    if (unlikely(info.flags & TLB_WATCHPOINT)) {
                        cpu_check_watchpoint(env_cpu(env), addr, msize,
                                             info.attrs, BP_MEM_READ, retaddr);
                    }
                    if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
                        mte_check(env, mtedesc, addr, retaddr);
                    }
                    host_fn(&scratch, reg_off, info.host);
                } else {
                    /* Element crosses the page boundary. */
                    sve_probe_page(&info2, false, env, addr + in_page, 0,
                                   MMU_DATA_LOAD, mmu_idx, retaddr);
                    if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
                        cpu_check_watchpoint(env_cpu(env), addr,
                                             msize, info.attrs,
                                             BP_MEM_READ, retaddr);
                    }
                    if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
                        mte_check(env, mtedesc, addr, retaddr);
                    }
                    tlb_fn(env, &scratch, reg_off, addr, retaddr);
                }
            }
            reg_off += esize;
            pg >>= esize;
        } while (reg_off & 63);
    } while (reg_off < reg_max);

    /* Wait until all exceptions have been raised to write back. */
    memcpy(vd, &scratch, reg_max);
}
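
/*
 * Design note: gathering into the local scratch register and copying to
 * @vd only after the loop means that if any element faults, the
 * architectural destination register is still intact, so the instruction
 * as a whole can simply be restarted once the exception is handled.
 */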

static inline QEMU_ALWAYS_INLINE
void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                   target_ulong base, uint32_t desc, uintptr_t retaddr,
                   int esize, int msize, zreg_off_fn *off_fn,
                   sve_ldst1_host_fn *host_fn,
                   sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /*
     * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
     * offset base entirely over the address space hole to change the
     * pointer tag, or change the bit55 selector.  So we could here
     * examine TBI + TCMA like we do for sve_ldN_r_mte().
     */
    sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
              esize, msize, off_fn, host_fn, tlb_fn);
}

#define DO_LD1_ZPZ_S(MEM, OFS, MSZ)                                     \
void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,     \
              off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);  \
}                                                                       \
void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,    \
                  off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}

#define DO_LD1_ZPZ_D(MEM, OFS, MSZ)                                     \
void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,     \
              off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);  \
}                                                                       \
void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,    \
                  off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}
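
/*
 * In the instantiations below, MEM encodes <memory size><element size>
 * <extension>[_endianness]: e.g. "bsu" is a byte zero-extended into a
 * 32-bit ("s") element, and "hds_be" a big-endian halfword sign-extended
 * into a 64-bit ("d") element.  OFS selects the index form: zsu/zss for
 * 32-bit unsigned/signed indexes, zd for 64-bit indexes.
 */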

DO_LD1_ZPZ_S(bsu, zsu, MO_8)
DO_LD1_ZPZ_S(bsu, zss, MO_8)
DO_LD1_ZPZ_D(bdu, zsu, MO_8)
DO_LD1_ZPZ_D(bdu, zss, MO_8)
DO_LD1_ZPZ_D(bdu, zd, MO_8)

DO_LD1_ZPZ_S(bss, zsu, MO_8)
DO_LD1_ZPZ_S(bss, zss, MO_8)
DO_LD1_ZPZ_D(bds, zsu, MO_8)
DO_LD1_ZPZ_D(bds, zss, MO_8)
DO_LD1_ZPZ_D(bds, zd, MO_8)

DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
DO_LD1_ZPZ_D(hdu_le, zd, MO_16)

DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
DO_LD1_ZPZ_D(hdu_be, zd, MO_16)

DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
DO_LD1_ZPZ_S(hss_le, zss, MO_16)
DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
DO_LD1_ZPZ_D(hds_le, zss, MO_16)
DO_LD1_ZPZ_D(hds_le, zd, MO_16)

DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
DO_LD1_ZPZ_S(hss_be, zss, MO_16)
DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
DO_LD1_ZPZ_D(hds_be, zss, MO_16)
DO_LD1_ZPZ_D(hds_be, zd, MO_16)

DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
DO_LD1_ZPZ_S(ss_le, zss, MO_32)
DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
DO_LD1_ZPZ_D(sdu_le, zd, MO_32)

DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
DO_LD1_ZPZ_S(ss_be, zss, MO_32)
DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
DO_LD1_ZPZ_D(sdu_be, zd, MO_32)

DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
DO_LD1_ZPZ_D(sds_le, zss, MO_32)
DO_LD1_ZPZ_D(sds_le, zd, MO_32)

DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
DO_LD1_ZPZ_D(sds_be, zss, MO_32)
DO_LD1_ZPZ_D(sds_be, zd, MO_32)

DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
DO_LD1_ZPZ_D(dd_le, zss, MO_64)
DO_LD1_ZPZ_D(dd_le, zd, MO_64)

DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
DO_LD1_ZPZ_D(dd_be, zss, MO_64)
DO_LD1_ZPZ_D(dd_be, zd, MO_64)

#undef DO_LD1_ZPZ_S
#undef DO_LD1_ZPZ_D

/* First fault loads with a vector index. */

/*
 * Common helpers for all gather first-faulting loads.
 */

static inline QEMU_ALWAYS_INLINE
void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                 target_ulong base, uint32_t desc, uintptr_t retaddr,
                 uint32_t mtedesc, const int esz, const int msz,
                 zreg_off_fn *off_fn,
                 sve_ldst1_host_fn *host_fn,
                 sve_ldst1_tlb_fn *tlb_fn)
{
    const int mmu_idx = cpu_mmu_index(env, false);
    const intptr_t reg_max = simd_oprsz(desc);
    const int scale = simd_data(desc);
    const int esize = 1 << esz;
    const int msize = 1 << msz;
    intptr_t reg_off;
    SVEHostPage info;
    target_ulong addr, in_page;

    /* Skip to the first true predicate. */
    reg_off = find_next_active(vg, 0, reg_max, esz);
    if (unlikely(reg_off >= reg_max)) {
        /* The entire predicate was false; no load occurs. */
        memset(vd, 0, reg_max);
        return;
    }

    /*
     * Probe the first element, allowing faults.
     */
    addr = base + (off_fn(vm, reg_off) << scale);
    if (mtedesc) {
        mte_check(env, mtedesc, addr, retaddr);
    }
    tlb_fn(env, vd, reg_off, addr, retaddr);

    /* After any fault, zero the other elements. */
    swap_memzero(vd, reg_off);
    reg_off += esize;
    swap_memzero(vd + reg_off, reg_max - reg_off);

    /*
     * Probe the remaining elements, not allowing faults.
     */
    while (reg_off < reg_max) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if (likely((pg >> (reg_off & 63)) & 1)) {
                addr = base + (off_fn(vm, reg_off) << scale);
                in_page = -(addr | TARGET_PAGE_MASK);

                if (unlikely(in_page < msize)) {
                    /* Stop if the element crosses a page boundary. */
                    goto fault;
                }

                sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
                               mmu_idx, retaddr);
                if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
                    goto fault;
                }
                if (unlikely(info.flags & TLB_WATCHPOINT) &&
                    (cpu_watchpoint_address_matches
                     (env_cpu(env), addr, msize) & BP_MEM_READ)) {
                    goto fault;
                }
                if (mtedesc &&
                    arm_tlb_mte_tagged(&info.attrs) &&
                    !mte_probe(env, mtedesc, addr)) {
                    goto fault;
                }

                host_fn(vd, reg_off, info.host);
            }
            reg_off += esize;
        } while (reg_off & 63);
    }
    return;

 fault:
    record_fault(env, reg_off, reg_max);
}
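
/*
 * Design note: architecturally, only the first active element of a
 * first-fault gather may take an exception.  Any problem with a later
 * element (page crossing, unmapped page, MMIO, watchpoint hit, failed
 * MTE probe) instead ends the load early, and record_fault() (defined
 * earlier in this file) updates the FFR so that the faulting element
 * and those after it read as inactive.
 */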

static inline QEMU_ALWAYS_INLINE
void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                     target_ulong base, uint32_t desc, uintptr_t retaddr,
                     const int esz, const int msz,
                     zreg_off_fn *off_fn,
                     sve_ldst1_host_fn *host_fn,
                     sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /*
     * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
     * offset base entirely over the address space hole to change the
     * pointer tag, or change the bit55 selector.  So we could here
     * examine TBI + TCMA like we do for sve_ldN_r_mte().
     */
    sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
                esz, msz, off_fn, host_fn, tlb_fn);
}

#define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ)                                   \
void HELPER(sve_ldff##MEM##_##OFS)                                      \
    (CPUARMState *env, void *vd, void *vg,                              \
     void *vm, target_ulong base, uint32_t desc)                        \
{                                                                       \
    sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ,    \
                off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}                                                                       \
void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
    (CPUARMState *env, void *vd, void *vg,                              \
     void *vm, target_ulong base, uint32_t desc)                        \
{                                                                       \
    sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ,   \
                    off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}

#define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ)                                   \
void HELPER(sve_ldff##MEM##_##OFS)                                      \
    (CPUARMState *env, void *vd, void *vg,                              \
     void *vm, target_ulong base, uint32_t desc)                        \
{                                                                       \
    sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ,    \
                off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}                                                                       \
void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
    (CPUARMState *env, void *vd, void *vg,                              \
     void *vm, target_ulong base, uint32_t desc)                        \
{                                                                       \
    sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ,   \
                    off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}
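
/*
 * Note the parameter convention differs from DO_LD1_ZPZ_*: sve_ldff1_z()
 * takes log2 sizes (hence MO_32/MO_64 and MSZ passed directly), whereas
 * sve_ld1_z() takes byte sizes (hence 4/8 and 1 << MSZ).
 */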

DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
DO_LDFF1_ZPZ_D(bdu, zd, MO_8)

DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
DO_LDFF1_ZPZ_S(bss, zss, MO_8)
DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
DO_LDFF1_ZPZ_D(bds, zss, MO_8)
DO_LDFF1_ZPZ_D(bds, zd, MO_8)

DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)

DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)

DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)

DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)

DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32)
DO_LDFF1_ZPZ_S(ss_le, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)

DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32)
DO_LDFF1_ZPZ_S(ss_be, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)

DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)

DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)

DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)

DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)

/* Stores with a vector index. */

static inline QEMU_ALWAYS_INLINE
void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
               target_ulong base, uint32_t desc, uintptr_t retaddr,
               uint32_t mtedesc, int esize, int msize,
               zreg_off_fn *off_fn,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const int mmu_idx = cpu_mmu_index(env, false);
    const intptr_t reg_max = simd_oprsz(desc);
    const int scale = simd_data(desc);
    void *host[ARM_MAX_VQ * 4];
    intptr_t reg_off, i;
    SVEHostPage info, info2;

    /*
     * Probe all of the elements for host addresses and flags.
     */
    i = reg_off = 0;
    do {
        uint64_t pg = vg[reg_off >> 6];
        do {
            target_ulong addr = base + (off_fn(vm, reg_off) << scale);
            target_ulong in_page = -(addr | TARGET_PAGE_MASK);

            host[i] = NULL;
            if (likely((pg >> (reg_off & 63)) & 1)) {
                if (likely(in_page >= msize)) {
                    sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
                                   mmu_idx, retaddr);
                    host[i] = info.host;
                } else {
                    /*
                     * Element crosses the page boundary.
                     * Probe both pages, but do not record the host address,
                     * so that we use the slow path.
                     */
                    sve_probe_page(&info, false, env, addr, 0,
                                   MMU_DATA_STORE, mmu_idx, retaddr);
                    sve_probe_page(&info2, false, env, addr + in_page, 0,
                                   MMU_DATA_STORE, mmu_idx, retaddr);
                    info.flags |= info2.flags;
                }

                if (unlikely(info.flags & TLB_WATCHPOINT)) {
                    cpu_check_watchpoint(env_cpu(env), addr, msize,
                                         info.attrs, BP_MEM_WRITE, retaddr);
                }

                if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
                    mte_check(env, mtedesc, addr, retaddr);
                }
            }
            i += 1;
            reg_off += esize;
        } while (reg_off & 63);
    } while (reg_off < reg_max);

    /*
     * Now that we have recognized all exceptions except SyncExternal
     * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
     *
     * Note for the common case of an element in RAM, not crossing a page
     * boundary, we have stored the host address in host[].  This doubles
     * as a first-level check against the predicate, since only enabled
     * elements have non-null host addresses.
     */
    i = reg_off = 0;
    do {
        void *h = host[i];
        if (likely(h != NULL)) {
            host_fn(vd, reg_off, h);
        } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
            target_ulong addr = base + (off_fn(vm, reg_off) << scale);
            tlb_fn(env, vd, reg_off, addr, retaddr);
        }
        i += 1;
        reg_off += esize;
    } while (reg_off < reg_max);
}
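
/*
 * Sizing note: host[] above needs one slot per element.  The largest
 * element count is a full vector (ARM_MAX_VQ * 16 bytes) of the smallest
 * scatter element size (4 bytes), hence ARM_MAX_VQ * 4 entries.
 */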

static inline QEMU_ALWAYS_INLINE
void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                   target_ulong base, uint32_t desc, uintptr_t retaddr,
                   int esize, int msize, zreg_off_fn *off_fn,
                   sve_ldst1_host_fn *host_fn,
                   sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /*
     * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
     * offset base entirely over the address space hole to change the
     * pointer tag, or change the bit55 selector.  So we could here
     * examine TBI + TCMA like we do for sve_ldN_r_mte().
     */
    sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
              esize, msize, off_fn, host_fn, tlb_fn);
}

#define DO_ST1_ZPZ_S(MEM, OFS, MSZ)                                     \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,     \
              off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
}                                                                       \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,    \
                  off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
}

#define DO_ST1_ZPZ_D(MEM, OFS, MSZ)                                     \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,     \
              off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
}                                                                       \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,    \
                  off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
}

DO_ST1_ZPZ_S(bs, zsu, MO_8)
DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
DO_ST1_ZPZ_S(ss_be, zsu, MO_32)

DO_ST1_ZPZ_S(bs, zss, MO_8)
DO_ST1_ZPZ_S(hs_le, zss, MO_16)
DO_ST1_ZPZ_S(hs_be, zss, MO_16)
DO_ST1_ZPZ_S(ss_le, zss, MO_32)
DO_ST1_ZPZ_S(ss_be, zss, MO_32)

DO_ST1_ZPZ_D(bd, zsu, MO_8)
DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
DO_ST1_ZPZ_D(dd_be, zsu, MO_64)

DO_ST1_ZPZ_D(bd, zss, MO_8)
DO_ST1_ZPZ_D(hd_le, zss, MO_16)
DO_ST1_ZPZ_D(hd_be, zss, MO_16)
DO_ST1_ZPZ_D(sd_le, zss, MO_32)
DO_ST1_ZPZ_D(sd_be, zss, MO_32)
DO_ST1_ZPZ_D(dd_le, zss, MO_64)
DO_ST1_ZPZ_D(dd_be, zss, MO_64)

DO_ST1_ZPZ_D(bd, zd, MO_8)
DO_ST1_ZPZ_D(hd_le, zd, MO_16)
DO_ST1_ZPZ_D(hd_be, zd, MO_16)
DO_ST1_ZPZ_D(sd_le, zd, MO_32)
DO_ST1_ZPZ_D(sd_be, zd, MO_32)
DO_ST1_ZPZ_D(dd_le, zd, MO_64)
DO_ST1_ZPZ_D(dd_be, zd, MO_64)

#undef DO_ST1_ZPZ_S
#undef DO_ST1_ZPZ_D