/*
 * ARM SVE Operations
 *
 * Copyright (c) 2018 Linaro, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */
#include "qemu/osdep.h"
#include "cpu.h"
#include "internals.h"
#include "exec/exec-all.h"
#include "exec/cpu_ldst.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "tcg/tcg.h"

#include "vec_internal.h"
/* Note that vector data is stored in host-endian 64-bit chunks,
   so addressing units smaller than that need a host-endian fixup.  */
#ifdef HOST_WORDS_BIGENDIAN
#define H1(x)   ((x) ^ 7)
#define H1_2(x) ((x) ^ 6)
#define H1_4(x) ((x) ^ 4)
#define H2(x)   ((x) ^ 3)
#define H4(x)   ((x) ^ 1)
#else
#define H1(x)   (x)
#define H1_2(x) (x)
#define H1_4(x) (x)
#define H2(x)   (x)
#define H4(x)   (x)
#endif

/* Return a value for NZCV as per the ARM PredTest pseudofunction.
 *
 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
 * and bit 0 set if C is set.  Compare the definitions of these variables
 * within CPUARMState.
 */

/* For no G bits set, NZCV = C.  */
#define PREDTEST_INIT  1
/* This is an iterative function, called for each Pd and Pg word
 * moving forward.
 */
static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute N from first D & G.
           Use bit 2 to signal first G bit seen.  */
        if (!(flags & 4)) {
            flags |= ((d & (g & -g)) != 0) << 31;
            flags |= 4;
        }

        /* Accumulate Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute C from last !(D & G).  Replace previous.  */
        flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
    }
    return flags;
}
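
/*
 * For example, a single iteration with g = 0x0f and d = 0x01:
 * the first active bit (g & -g, bit 0) is set in d, so N is set;
 * d & g is nonzero, so Z is clear; the last active bit
 * (pow2floor(g), bit 3) is clear in d, so C is set.  Ignoring the
 * internal bit-2 "first G bit seen" marker, the result is
 * 0x80000003: N, !Z, C.
 */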
/* This is an iterative function, called for each Pd and Pg word
 * moving backward.
 */
static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute C from first (i.e. last) !(D & G).
           Use bit 2 to signal first G bit seen.  */
        if (!(flags & 4)) {
            flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
            flags |= (d & pow2floor(g)) == 0;
        }

        /* Accumulate Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute N from last (i.e. first) D & G.  Replace previous.  */
        flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
    }
    return flags;
}
/* The same for a single word predicate.  */
uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
{
    return iter_predtest_fwd(d, g, PREDTEST_INIT);
}

/* The same for a multi-word predicate.  */
uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
{
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg;
    uintptr_t i = 0;

    do {
        flags = iter_predtest_fwd(d[i], g[i], flags);
    } while (++i < words);

    return flags;
}
/* Expand active predicate bits to bytes, for byte elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 */
static inline uint64_t expand_pred_b(uint8_t byte)
{
    static const uint64_t word[256] = {
        0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
        0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
        0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
        0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
        0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
        0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
        0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
        0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
        0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
        0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
        0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
        0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
        0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
        0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
        0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
        0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
        0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
        0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
        0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
        0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
        0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
        0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
        0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
        0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
        0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
        0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
        0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
        0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
        0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
        0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
        0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
        0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
        0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
        0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
        0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
        0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
        0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
        0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
        0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
        0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
        0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
        0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
        0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
        0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
        0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
        0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
        0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
        0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
        0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
        0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
        0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
        0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
        0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
        0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
        0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
        0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
        0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
        0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
        0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
        0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
        0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
        0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
        0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
        0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
        0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
        0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
        0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
        0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
        0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
        0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
        0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
        0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
        0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
        0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
        0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
        0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
        0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
        0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
        0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
        0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
        0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
        0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
        0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
        0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
        0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
        0xffffffffffffffff,
    };
    return word[byte];
}
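
/*
 * For example, expand_pred_b(0x05) returns 0x0000000000ff00ff:
 * predicate bits 0 and 2 expand to all-ones masks in byte lanes 0 and 2.
 */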
/* Similarly for half-word elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      if (i & 0xaa) {
 *          continue;
 *      }
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 *  }
 */
static inline uint64_t expand_pred_h(uint8_t byte)
{
    static const uint64_t word[] = {
        [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
        [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
        [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
        [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
        [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
        [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
        [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
        [0x55] = 0xffffffffffffffff,
    };
    return word[byte & 0x55];
}
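
/*
 * Only predicate bits 0, 2, 4 and 6 are significant for half-word
 * elements, hence the "byte & 0x55" mask above.  For example,
 * expand_pred_h(0x14) returns 0x0000ffffffff0000: bits 2 and 4
 * expand to all-ones masks in half-word lanes 1 and 2.
 */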
/* Similarly for single word elements.  */
static inline uint64_t expand_pred_s(uint8_t byte)
{
    static const uint64_t word[] = {
        [0x01] = 0x00000000ffffffffull,
        [0x10] = 0xffffffff00000000ull,
        [0x11] = 0xffffffffffffffffull,
    };
    return word[byte & 0x11];
}
/* Swap 16-bit words within a 32-bit word.  */
static inline uint32_t hswap32(uint32_t h)
{
    return rol32(h, 16);
}

/* Swap 16-bit words within a 64-bit word.  */
static inline uint64_t hswap64(uint64_t h)
{
    uint64_t m = 0x0000ffff0000ffffull;
    h = rol64(h, 32);
    return ((h & m) << 16) | ((h >> 16) & m);
}

/* Swap 32-bit words within a 64-bit word.  */
static inline uint64_t wswap64(uint64_t h)
{
    return rol64(h, 32);
}
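
/*
 * For example, hswap64(0x0123456789abcdefull) == 0xcdef89ab45670123ull:
 * the four 16-bit words are fully reversed, achieved by rotating 32 bits
 * and then swapping the 16-bit words within each 32-bit half.
 */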
#define LOGICAL_PPPP(NAME, FUNC)                                          \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
{                                                                         \
    uintptr_t opr_sz = simd_oprsz(desc);                                  \
    uint64_t *d = vd, *n = vn, *m = vm, *g = vg;                          \
    uintptr_t i;                                                          \
    for (i = 0; i < opr_sz / 8; ++i) {                                    \
        d[i] = FUNC(n[i], m[i], g[i]);                                    \
    }                                                                     \
}

#define DO_AND(N, M, G)  (((N) & (M)) & (G))
#define DO_BIC(N, M, G)  (((N) & ~(M)) & (G))
#define DO_EOR(N, M, G)  (((N) ^ (M)) & (G))
#define DO_ORR(N, M, G)  (((N) | (M)) & (G))
#define DO_ORN(N, M, G)  (((N) | ~(M)) & (G))
#define DO_NOR(N, M, G)  (~((N) | (M)) & (G))
#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
#define DO_SEL(N, M, G)  (((N) & (G)) | ((M) & ~(G)))

LOGICAL_PPPP(sve_and_pppp, DO_AND)
LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
LOGICAL_PPPP(sve_nand_pppp, DO_NAND)

#undef DO_AND
#undef DO_BIC
#undef DO_EOR
#undef DO_ORR
#undef DO_ORN
#undef DO_NOR
#undef DO_NAND
#undef DO_SEL
#undef LOGICAL_PPPP
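
/*
 * As an illustration, LOGICAL_PPPP(sve_and_pppp, DO_AND) above produces
 * the equivalent of:
 *
 *   void helper_sve_and_pppp(void *vd, void *vn, void *vm,
 *                            void *vg, uint32_t desc)
 *   {
 *       uintptr_t i, opr_sz = simd_oprsz(desc);
 *       uint64_t *d = vd, *n = vn, *m = vm, *g = vg;
 *       for (i = 0; i < opr_sz / 8; ++i) {
 *           d[i] = (n[i] & m[i]) & g[i];
 *       }
 *   }
 */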
/* Fully general three-operand expander, controlled by a predicate.
 * This is complicated by the host-endian storage of the register file.
 */
/* ??? I don't expect the compiler could ever vectorize this itself.
 * With some tables we can convert bit masks to byte masks, and with
 * extra care wrt byte/word ordering we could use gcc generic vectors
 * and do 16 bytes at a time.
 */
#define DO_ZPZZ(NAME, TYPE, H, OP)                                        \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    for (i = 0; i < opr_sz; ) {                                           \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                   \
        do {                                                              \
            if (pg & 1) {                                                 \
                TYPE nn = *(TYPE *)(vn + H(i));                           \
                TYPE mm = *(TYPE *)(vm + H(i));                           \
                *(TYPE *)(vd + H(i)) = OP(nn, mm);                        \
            }                                                             \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                       \
        } while (i & 15);                                                 \
    }                                                                     \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZZ_D(NAME, TYPE, OP)                                         \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                            \
    TYPE *d = vd, *n = vn, *m = vm;                                       \
    uint8_t *pg = vg;                                                     \
    for (i = 0; i < opr_sz; i += 1) {                                     \
        if (pg[H1(i)] & 1) {                                              \
            TYPE nn = n[i], mm = m[i];                                    \
            d[i] = OP(nn, mm);                                            \
        }                                                                 \
    }                                                                     \
}
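
/*
 * An illustration of the iteration structure: DO_ZPZZ reads the
 * predicate 16 bits at a time, covering one 16-byte chunk of the
 * vector.  After each element, pg is shifted right by sizeof(TYPE),
 * so bit 0 of pg is always the predicate bit that governs the
 * element at offset i.
 */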
#define DO_AND(N, M)  (N & M)
#define DO_EOR(N, M)  (N ^ M)
#define DO_ORR(N, M)  (N | M)
#define DO_BIC(N, M)  (N & ~M)
#define DO_ADD(N, M)  (N + M)
#define DO_SUB(N, M)  (N - M)
#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
#define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
#define DO_MUL(N, M)  (N * M)

/*
 * We must avoid the C undefined behaviour cases: division by
 * zero and signed division of INT_MIN by -1. Both of these
 * have architecturally defined required results for Arm.
 * We special case all signed divisions by -1 to avoid having
 * to deduce the minimum integer for the type involved.
 */
#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
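
/*
 * For example, DO_SDIV(INT32_MIN, -1) takes the M == -1 arm and
 * computes -N; since QEMU builds with -fwrapv, the negation wraps
 * back to INT32_MIN, which is the architected overflow result for
 * SDIV.  Division by zero likewise yields the architected 0.
 */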
DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)

DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)

DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)

DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)

DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)

DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)

DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)

DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
/* Because the computation type is at least twice as large as required,
   these work for both signed and unsigned source types.  */
static inline uint8_t do_mulh_b(int32_t n, int32_t m)
{
    return (n * m) >> 8;
}

static inline uint16_t do_mulh_h(int32_t n, int32_t m)
{
    return (n * m) >> 16;
}

static inline uint32_t do_mulh_s(int64_t n, int64_t m)
{
    return (n * m) >> 32;
}

static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    muls64(&lo, &hi, n, m);
    return hi;
}

static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    mulu64(&lo, &hi, n, m);
    return hi;
}
DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)

DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)

DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)

DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)

DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
/* Note that all bits of the shift are significant
   and not modulo the element size.  */
#define DO_ASR(N, M)   (N >> MIN(M, sizeof(N) * 8 - 1))
#define DO_LSR(N, M)   (M < sizeof(N) * 8 ? N >> M : 0)
#define DO_LSL(N, M)   (M < sizeof(N) * 8 ? N << M : 0)
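
/*
 * For example, with uint8_t elements, DO_LSR(n, 8) and DO_LSL(n, 8)
 * yield 0 rather than invoking C undefined behaviour, and
 * DO_ASR(n, 200) clamps the shift to 7, leaving only the sign fill.
 */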
DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)

DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)

DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)

DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
{
    int8_t n1 = n, n2 = n >> 8;
    return m + n1 + n2;
}

static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
{
    int16_t n1 = n, n2 = n >> 16;
    return m + n1 + n2;
}

static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
{
    int32_t n1 = n, n2 = n >> 32;
    return m + n1 + n2;
}

DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)

static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
{
    uint8_t n1 = n, n2 = n >> 8;
    return m + n1 + n2;
}

static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
{
    uint16_t n1 = n, n2 = n >> 16;
    return m + n1 + n2;
}

static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
{
    uint32_t n1 = n, n2 = n >> 32;
    return m + n1 + n2;
}

DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)
#define do_srshl_b(n, m)  do_sqrshl_bhs(n, m, 8, true, NULL)
#define do_srshl_h(n, m)  do_sqrshl_bhs(n, m, 16, true, NULL)
#define do_srshl_s(n, m)  do_sqrshl_bhs(n, m, 32, true, NULL)
#define do_srshl_d(n, m)  do_sqrshl_d(n, m, true, NULL)

DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)

#define do_urshl_b(n, m)  do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
#define do_urshl_h(n, m)  do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
#define do_urshl_s(n, m)  do_uqrshl_bhs(n, m, 32, true, NULL)
#define do_urshl_d(n, m)  do_uqrshl_d(n, m, true, NULL)

DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)

/*
 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
 * We pass in a pointer to a dummy saturation field to trigger
 * the saturating arithmetic but discard the information about
 * whether it has occurred.
 */
#define do_sqshl_b(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
#define do_sqshl_h(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
#define do_sqshl_s(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
#define do_sqshl_d(n, m) \
    ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1, do_sqshl_b)
DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)
#define do_uqshl_b(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
#define do_uqshl_h(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
#define do_uqshl_s(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
#define do_uqshl_d(n, m) \
    ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1, do_uqshl_b)
DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)
#define do_sqrshl_b(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
#define do_sqrshl_h(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
#define do_sqrshl_s(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
#define do_sqrshl_d(n, m) \
    ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1, do_sqrshl_b)
DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)

#undef do_sqrshl_d
#define do_uqrshl_b(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
#define do_uqrshl_h(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
#define do_uqrshl_s(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
#define do_uqrshl_d(n, m) \
    ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1, do_uqrshl_b)
DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)

#undef do_uqrshl_d
#define DO_HADD_BHS(n, m)  (((int64_t)n + m) >> 1)
#define DO_HADD_D(n, m)    ((n >> 1) + (m >> 1) + (n & m & 1))
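
/*
 * DO_HADD_D cannot widen to a 128-bit intermediate, so it halves each
 * operand separately and re-adds the carry that the two discarded low
 * bits would have produced: e.g. for n = 3, m = 5,
 * (3 >> 1) + (5 >> 1) + (3 & 5 & 1) = 1 + 2 + 1 = 4 = (3 + 5) >> 1.
 */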
DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)

DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)

#define DO_RHADD_BHS(n, m)  (((int64_t)n + m + 1) >> 1)
#define DO_RHADD_D(n, m)    ((n >> 1) + (m >> 1) + ((n | m) & 1))

DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)

DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)

#define DO_HSUB_BHS(n, m)  (((int64_t)n - m) >> 1)
#define DO_HSUB_D(n, m)    ((n >> 1) - (m >> 1) - (~n & m & 1))

DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)

DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
{
    return val >= max ? max : val <= min ? min : val;
}
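
/*
 * For example, do_sat_bhs(100 + 100, INT8_MIN, INT8_MAX) clamps the
 * intermediate 200 to 127.  The arithmetic is done at 64 bits, so the
 * byte/half/word additions below can never themselves overflow.
 */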
#define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)
static inline int64_t do_sqadd_d(int64_t n, int64_t m)
{
    int64_t r = n + m;
    if (((r ^ n) & ~(n ^ m)) < 0) {
        /* Signed overflow.  */
        return r < 0 ? INT64_MAX : INT64_MIN;
    }
    return r;
}
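
/*
 * The test above is the usual two's-complement overflow rule: signed
 * overflow can occur only when both addends have the same sign
 * (~(n ^ m) has the top bit set) and the result's sign differs from
 * them (r ^ n has the top bit set).
 */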
DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)
#define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
#define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
#define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)

static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
{
    uint64_t r = n + m;
    return r < n ? UINT64_MAX : r;
}

DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)

#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)

static inline int64_t do_sqsub_d(int64_t n, int64_t m)
{
    int64_t r = n - m;
    if (((r ^ n) & (n ^ m)) < 0) {
        /* Signed overflow.  */
        return r < 0 ? INT64_MAX : INT64_MIN;
    }
    return r;
}

DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)

#define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
#define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
#define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)

static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
{
    return n > m ? n - m : 0;
}

DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)
#define DO_SUQADD_B(n, m) \
    do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SUQADD_H(n, m) \
    do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SUQADD_S(n, m) \
    do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)

static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
{
    uint64_t r = n + m;

    if (n < 0) {
        /* Note that m - abs(n) cannot underflow. */
        if (r > INT64_MAX) {
            /* Result is either very large positive or negative. */
            if (m > -n) {
                /* m > abs(n), so r is a very large positive. */
                return INT64_MAX;
            }
            /* Result is negative. */
        }
    } else {
        /* Both inputs are positive: check for overflow. */
        if (r < m || r > INT64_MAX) {
            return INT64_MAX;
        }
    }
    return r;
}

DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)
#define DO_USQADD_B(n, m) \
    do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
#define DO_USQADD_H(n, m) \
    do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
#define DO_USQADD_S(n, m) \
    do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)

static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
{
    uint64_t r = n + m;

    if (m < 0) {
        return n < -m ? 0 : r;
    }
    return r < n ? UINT64_MAX : r;
}

DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)
#undef DO_ZPZZ
#undef DO_ZPZZ_D

/*
 * Three operand expander, operating on element pairs.
 * If the slot I is even, the elements come from VN {I, I+1}.
 * If the slot I is odd, the elements come from VM {I-1, I}.
 * Load all of the input elements in each pair before overwriting output.
 */
#define DO_ZPZZ_PAIR(NAME, TYPE, H, OP)                                   \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    for (i = 0; i < opr_sz; ) {                                           \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                   \
        do {                                                              \
            TYPE n0 = *(TYPE *)(vn + H(i));                               \
            TYPE m0 = *(TYPE *)(vm + H(i));                               \
            TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));                \
            TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));                \
            if (pg & 1) {                                                 \
                *(TYPE *)(vd + H(i)) = OP(n0, n1);                        \
            }                                                             \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                       \
            if (pg & 1) {                                                 \
                *(TYPE *)(vd + H(i)) = OP(m0, m1);                        \
            }                                                             \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                       \
        } while (i & 15);                                                 \
    }                                                                     \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZZ_PAIR_D(NAME, TYPE, OP)                                    \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                            \
    TYPE *d = vd, *n = vn, *m = vm;                                       \
    uint8_t *pg = vg;                                                     \
    for (i = 0; i < opr_sz; i += 2) {                                     \
        TYPE n0 = n[i], n1 = n[i + 1];                                    \
        TYPE m0 = m[i], m1 = m[i + 1];                                    \
        if (pg[H1(i)] & 1) {                                              \
            d[i] = OP(n0, n1);                                            \
        }                                                                 \
        if (pg[H1(i + 1)] & 1) {                                          \
            d[i + 1] = OP(m0, m1);                                        \
        }                                                                 \
    }                                                                     \
}
DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)

#undef DO_ZPZZ_PAIR
#undef DO_ZPZZ_PAIR_D
#define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,                 \
                  void *status, uint32_t desc)                            \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    for (i = 0; i < opr_sz; ) {                                           \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                   \
        do {                                                              \
            TYPE n0 = *(TYPE *)(vn + H(i));                               \
            TYPE m0 = *(TYPE *)(vm + H(i));                               \
            TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));                \
            TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));                \
            if (pg & 1) {                                                 \
                *(TYPE *)(vd + H(i)) = OP(n0, n1, status);                \
            }                                                             \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                       \
            if (pg & 1) {                                                 \
                *(TYPE *)(vd + H(i)) = OP(m0, m1, status);                \
            }                                                             \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                       \
        } while (i & 15);                                                 \
    }                                                                     \
}
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, , float64_add)

DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, , float64_maxnum)

DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, , float64_minnum)

DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, , float64_max)

DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, , float64_min)

#undef DO_ZPZZ_PAIR_FP
/* Three-operand expander, controlled by a predicate, in which the
 * third operand is "wide".  That is, for D = N op M, the same 64-bit
 * value of M is used with all of the narrower values of N.
 */
#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP)                                 \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    for (i = 0; i < opr_sz; ) {                                           \
        uint8_t pg = *(uint8_t *)(vg + H1(i >> 3));                       \
        TYPEW mm = *(TYPEW *)(vm + i);                                    \
        do {                                                              \
            if (pg & 1) {                                                 \
                TYPE nn = *(TYPE *)(vn + H(i));                           \
                *(TYPE *)(vd + H(i)) = OP(nn, mm);                        \
            }                                                             \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                       \
        } while (i & 7);                                                  \
    }                                                                     \
}

DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZPZW
/* Fully general two-operand expander, controlled by a predicate.
 */
#define DO_ZPZ(NAME, TYPE, H, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            if (pg & 1) {                                       \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn);                  \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 15);                                       \
    }                                                           \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZ_D(NAME, TYPE, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *n = vn;                                      \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 1) {                           \
        if (pg[H1(i)] & 1) {                                    \
            TYPE nn = n[i];                                     \
            d[i] = OP(nn);                                      \
        }                                                       \
    }                                                           \
}
#define DO_CLS_B(N)   (clrsb32(N) - 24)
#define DO_CLS_H(N)   (clrsb32(N) - 16)

DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)

#define DO_CLZ_B(N)   (clz32(N) - 24)
#define DO_CLZ_H(N)   (clz32(N) - 16)

DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
DO_ZPZ_D(sve_clz_d, uint64_t, clz64)

DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)

#define DO_CNOT(N)    (N == 0)

DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
#define DO_FABS(N)    (N & ((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)

#define DO_FNEG(N)    (N ^ ~((__typeof(N))-1 >> 1))
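
/*
 * In both macros, (__typeof(N))-1 >> 1 is the all-ones value with the
 * sign bit clear, e.g. 0x7fff for uint16_t: DO_FABS masks the sign bit
 * off and DO_FNEG flips it, without interpreting the rest of the
 * floating-point encoding.
 */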
DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)

#define DO_NOT(N)    (~N)

DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)

#define DO_SXTB(N)    ((int8_t)N)
#define DO_SXTH(N)    ((int16_t)N)
#define DO_SXTS(N)    ((int32_t)N)
#define DO_UXTB(N)    ((uint8_t)N)
#define DO_UXTH(N)    ((uint16_t)N)
#define DO_UXTS(N)    ((uint32_t)N)

DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)

DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)

#define DO_ABS(N)    (N < 0 ? -N : N)

DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)

#define DO_NEG(N)    (-N)

DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)

DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)

DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)

DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)

DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
#define DO_SQABS(X) \
    ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
       x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })

DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)

#define DO_SQNEG(X) \
    ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
       x_ == min_ ? -min_ - 1 : -x_; })

DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)

DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
/* Three-operand expander, unpredicated, in which the third operand is "wide".
 */
#define DO_ZZW(NAME, TYPE, TYPEW, H, OP)                        \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    for (i = 0; i < opr_sz; ) {                                 \
        TYPEW mm = *(TYPEW *)(vm + i);                          \
        do {                                                    \
            TYPE nn = *(TYPE *)(vn + H(i));                     \
            *(TYPE *)(vd + H(i)) = OP(nn, mm);                  \
            i += sizeof(TYPE);                                  \
        } while (i & 7);                                        \
    }                                                           \
}
DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
/*
 * Three-operand expander, unpredicated, in which the two inputs are
 * selected from the top or bottom half of the wide column.
 */
#define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP)                       \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                       \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                       \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                            \
    }                                                                   \
}
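
/*
 * The two low desc data bits select the bottom (offset 0) or top
 * (offset sizeof(TYPEN)) narrow element within each wide column, so a
 * single helper serves both the "B" and "T" instruction forms, e.g.
 * SADDLB with sel1 = sel2 = 0 and SADDLT with sel1 = sel2 = 1.
 */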
DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, , H1_4, DO_ADD)

DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, , H1_4, DO_SUB)

DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, , H1_4, DO_ABD)

DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, , H1_4, DO_ADD)

DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, , H1_4, DO_SUB)

DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, , H1_4, DO_ABD)

DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, , H1_4, DO_MUL)

DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, , H1_4, DO_MUL)
/* Note that the multiply cannot overflow, but the doubling can.  */
static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
{
    int16_t val = n * m;
    return DO_SQADD_H(val, val);
}
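
/*
 * For example, the most negative inputs for sve2_sqdmull_zzz_h are
 * nn = mm = -128, giving val = 16384; doubling via the saturating add
 * clamps 32768 to INT16_MAX rather than wrapping negative.
 */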
static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
{
    int32_t val = n * m;
    return DO_SQADD_S(val, val);
}

static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
{
    int64_t val = n * m;
    return do_sqadd_d(val, val);
}

DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, , H1_4, do_sqdmull_d)
#define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP)                      \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
        TYPEW nn = *(TYPEW *)(vn + HW(i));                              \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                       \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                            \
    }                                                                   \
}
DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, , H1_4, DO_ADD)

DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, , H1_4, DO_SUB)

DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, , H1_4, DO_ADD)

DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, , H1_4, DO_SUB)
#define DO_ZZZ_NTB(NAME, TYPE, H, OP)                                       \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)              \
{                                                                           \
    intptr_t i, opr_sz = simd_oprsz(desc);                                  \
    intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE);     \
    intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
    for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {                        \
        TYPE nn = *(TYPE *)(vn + H(i + sel1));                              \
        TYPE mm = *(TYPE *)(vm + H(i + sel2));                              \
        *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm);                           \
    }                                                                       \
}
DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_d, uint64_t, , DO_EOR)
#define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP)                       \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    intptr_t sel1 = simd_data(desc) * sizeof(TYPEN);                      \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                         \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                         \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel1));                         \
        TYPEW aa = *(TYPEW *)(va + HW(i));                                \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa;                         \
    }                                                                     \
}
DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, , H1_4, DO_ABD)

DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, , H1_4, DO_ABD)
#define DO_XTNB(NAME, TYPE, OP)                              \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
{                                                            \
    intptr_t i, opr_sz = simd_oprsz(desc);                   \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {             \
        TYPE nn = *(TYPE *)(vn + i);                         \
        nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4);  \
        *(TYPE *)(vd + i) = nn;                              \
    }                                                        \
}

#define DO_XTNT(NAME, TYPE, TYPEN, H, OP)                          \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)               \
{                                                                  \
    intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                   \
        TYPE nn = *(TYPE *)(vn + i);                               \
        *(TYPEN *)(vd + i + odd) = OP(nn);                         \
    }                                                              \
}
#define DO_SQXTN_H(n)  do_sat_bhs(n, INT8_MIN, INT8_MAX)
#define DO_SQXTN_S(n)  do_sat_bhs(n, INT16_MIN, INT16_MAX)
#define DO_SQXTN_D(n)  do_sat_bhs(n, INT32_MIN, INT32_MAX)

DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)

DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)

#define DO_UQXTN_H(n)  do_sat_bhs(n, 0, UINT8_MAX)
#define DO_UQXTN_S(n)  do_sat_bhs(n, 0, UINT16_MAX)
#define DO_UQXTN_D(n)  do_sat_bhs(n, 0, UINT32_MAX)

DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)

DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)

DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)

DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)
void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
    uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t *a = va, *n = vn;
    uint64_t *d = vd, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        uint32_t e1 = a[2 * i + H4(0)];
        uint32_t e2 = n[2 * i + sel] ^ inv;
        uint64_t c = extract64(m[i], 32, 1);
        /* Compute and store the entire 33-bit result at once. */
        d[i] = c + e1 + e2;
    }
}
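
/*
 * An illustration of the lane layout assumed above: each 64-bit lane
 * of VM supplies the incoming carry at bit 32, and the two 32-bit
 * addends plus carry produce at most a 33-bit value, which fits in
 * the 64-bit store; bit 32 of the result is then the carry-out
 * consumed by a following ADCLB/ADCLT.
 */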
void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t *d = vd, *a = va, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; i += 2) {
        Int128 e1 = int128_make64(a[i]);
        Int128 e2 = int128_make64(n[i + sel] ^ inv);
        Int128 c = int128_make64(m[i + 1] & 1);
        Int128 r = int128_add(int128_add(e1, e2), c);
        d[i + 0] = int128_getlo(r);
        d[i + 1] = int128_gethi(r);
    }
}
#define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP)           \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);       \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN);   \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                         \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                         \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                         \
        TYPEW aa = *(TYPEW *)(va + HW(i));                                \
        *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm));             \
    }                                                                     \
}
DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
           do_sqdmull_h, DO_SQADD_H)
DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
           do_sqdmull_s, DO_SQADD_S)
DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, , H1_4,
           do_sqdmull_d, do_sqadd_d)

DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
           do_sqdmull_h, DO_SQSUB_H)
DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
           do_sqdmull_s, DO_SQSUB_S)
DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, , H1_4,
           do_sqdmull_d, do_sqsub_d)
#define DO_BITPERM(NAME, TYPE, OP)                             \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                              \
    intptr_t i, opr_sz = simd_oprsz(desc);                     \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {               \
        TYPE nn = *(TYPE *)(vn + i);                           \
        TYPE mm = *(TYPE *)(vm + i);                           \
        *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8);      \
    }                                                          \
}

static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
{
    uint64_t res = 0;
    int db, rb = 0;

    for (db = 0; db < n; ++db) {
        if ((mask >> db) & 1) {
            res |= ((data >> db) & 1) << rb;
            ++rb;
        }
    }
    return res;
}

DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
{
    uint64_t res = 0;
    int rb, db = 0;

    for (rb = 0; rb < n; ++rb) {
        if ((mask >> rb) & 1) {
            res |= ((data >> db) & 1) << rb;
            ++db;
        }
    }
    return res;
}

DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
{
    uint64_t resm = 0, resu = 0;
    int db, rbm = 0, rbu = 0;

    for (db = 0; db < n; ++db) {
        uint64_t val = (data >> db) & 1;
        if ((mask >> db) & 1) {
            resm |= val << rbm++;
        } else {
            resu |= val << rbu++;
        }
    }

    return resm | (resu << rbm);
}
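
/*
 * Worked example for bitgroup: data = 0b1101, mask = 0b1010, n = 4.
 * The masked bits (positions 1 and 3: values 0, 1) gather at the
 * bottom as resm = 0b10; the unmasked bits (positions 0 and 2:
 * values 1, 1) gather above them as resu = 0b11; the result is
 * 0b10 | (0b11 << 2) = 0b1110.
 */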
DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)
#define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP)                  \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    int sub_r = simd_data(desc);                                \
    if (sub_r) {                                                \
        for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
            TYPE acc_r = *(TYPE *)(vn + H(i));                  \
            TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
            TYPE el2_r = *(TYPE *)(vm + H(i));                  \
            TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
            acc_r = ADD_OP(acc_r, el2_i);                       \
            acc_i = SUB_OP(acc_i, el2_r);                       \
            *(TYPE *)(vd + H(i)) = acc_r;                       \
            *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
        }                                                       \
    } else {                                                    \
        for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
            TYPE acc_r = *(TYPE *)(vn + H(i));                  \
            TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
            TYPE el2_r = *(TYPE *)(vm + H(i));                  \
            TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
            acc_r = SUB_OP(acc_r, el2_i);                       \
            acc_i = ADD_OP(acc_i, el2_r);                       \
            *(TYPE *)(vd + H(i)) = acc_r;                       \
            *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
        }                                                       \
    }                                                           \
}
DO_CADD(sve2_cadd_b, int8_t, H1, DO_ADD, DO_SUB)
DO_CADD(sve2_cadd_h, int16_t, H1_2, DO_ADD, DO_SUB)
DO_CADD(sve2_cadd_s, int32_t, H1_4, DO_ADD, DO_SUB)
DO_CADD(sve2_cadd_d, int64_t, , DO_ADD, DO_SUB)

DO_CADD(sve2_sqcadd_b, int8_t, H1, DO_SQADD_B, DO_SQSUB_B)
DO_CADD(sve2_sqcadd_h, int16_t, H1_2, DO_SQADD_H, DO_SQSUB_H)
DO_CADD(sve2_sqcadd_s, int32_t, H1_4, DO_SQADD_S, DO_SQSUB_S)
DO_CADD(sve2_sqcadd_d, int64_t, , do_sqadd_d, do_sqsub_d)
#define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN)            \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)       \
{                                                          \
    intptr_t i, opr_sz = simd_oprsz(desc);                 \
    intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN);  \
    int shift = simd_data(desc) >> 1;                      \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {          \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel));           \
        *(TYPEW *)(vd + HW(i)) = nn << shift;              \
    }                                                      \
}

DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t, , H1_4)

DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t, , H1_4)
/* Two-operand reduction expander, controlled by a predicate.
 * The difference between TYPERED and TYPERET has to do with
 * sign-extension.  E.g. for SMAX, TYPERED must be signed,
 * but TYPERET must be unsigned so that e.g. a 32-bit value
 * is not sign-extended to the ABI uint64_t return type.
 */
/* ??? If we were to vectorize this by hand the reduction ordering
 * would change.  For integer operands, this is perfectly fine.
 */
#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)     \
{                                                            \
    intptr_t i, opr_sz = simd_oprsz(desc);                   \
    TYPERED ret = INIT;                                      \
    for (i = 0; i < opr_sz; ) {                              \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));      \
        do {                                                 \
            if (pg & 1) {                                    \
                TYPEELT nn = *(TYPEELT *)(vn + H(i));        \
                ret = OP(ret, nn);                           \
            }                                                \
            i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT);    \
        } while (i & 15);                                    \
    }                                                        \
    return (TYPERET)ret;                                     \
}

#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP)               \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)     \
{                                                            \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;               \
    TYPEE *n = vn;                                           \
    uint8_t *pg = vg;                                        \
    TYPER ret = INIT;                                        \
    for (i = 0; i < opr_sz; i += 1) {                        \
        if (pg[H1(i)] & 1) {                                 \
            TYPEE nn = n[i];                                 \
            ret = OP(ret, nn);                               \
        }                                                    \
    }                                                        \
    return ret;                                              \
}
DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)

DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)

DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)

DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)

DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)

DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)

DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)

DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)

DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)

#undef DO_VPZ
#undef DO_VPZ_D
1665 /* Two vector operand, one scalar operand, unpredicated. */
1666 #define DO_ZZI(NAME, TYPE, OP) \
1667 void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc) \
1669 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
1670 TYPE s = s64, *d = vd, *n = vn; \
1671 for (i = 0; i < opr_sz; ++i) { \
1672 d[i] = OP(n[i], s); \
1676 #define DO_SUBR(X, Y) (Y - X)
1678 DO_ZZI(sve_subri_b
, uint8_t, DO_SUBR
)
1679 DO_ZZI(sve_subri_h
, uint16_t, DO_SUBR
)
1680 DO_ZZI(sve_subri_s
, uint32_t, DO_SUBR
)
1681 DO_ZZI(sve_subri_d
, uint64_t, DO_SUBR
)
1683 DO_ZZI(sve_smaxi_b
, int8_t, DO_MAX
)
1684 DO_ZZI(sve_smaxi_h
, int16_t, DO_MAX
)
1685 DO_ZZI(sve_smaxi_s
, int32_t, DO_MAX
)
1686 DO_ZZI(sve_smaxi_d
, int64_t, DO_MAX
)
1688 DO_ZZI(sve_smini_b
, int8_t, DO_MIN
)
1689 DO_ZZI(sve_smini_h
, int16_t, DO_MIN
)
1690 DO_ZZI(sve_smini_s
, int32_t, DO_MIN
)
1691 DO_ZZI(sve_smini_d
, int64_t, DO_MIN
)
1693 DO_ZZI(sve_umaxi_b
, uint8_t, DO_MAX
)
1694 DO_ZZI(sve_umaxi_h
, uint16_t, DO_MAX
)
1695 DO_ZZI(sve_umaxi_s
, uint32_t, DO_MAX
)
1696 DO_ZZI(sve_umaxi_d
, uint64_t, DO_MAX
)
1698 DO_ZZI(sve_umini_b
, uint8_t, DO_MIN
)
1699 DO_ZZI(sve_umini_h
, uint16_t, DO_MIN
)
1700 DO_ZZI(sve_umini_s
, uint32_t, DO_MIN
)
1701 DO_ZZI(sve_umini_d
, uint64_t, DO_MIN
)
1721 /* Similar to the ARM LastActiveElement pseudocode function, except the
1722 result is multiplied by the element size. This includes the not found
1723 indication; e.g. not found for esz=3 is -8. */
1724 static intptr_t last_active_element(uint64_t *g
, intptr_t words
, intptr_t esz
)
1726 uint64_t mask
= pred_esz_masks
[esz
];
1730 uint64_t this_g
= g
[--i
] & mask
;
1732 return i
* 64 + (63 - clz64(this_g
));
1735 return (intptr_t)-1 << esz
;
1738 uint32_t HELPER(sve_pfirst
)(void *vd
, void *vg
, uint32_t pred_desc
)
1740 intptr_t words
= DIV_ROUND_UP(FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
), 8);
1741 uint32_t flags
= PREDTEST_INIT
;
1742 uint64_t *d
= vd
, *g
= vg
;
1746 uint64_t this_d
= d
[i
];
1747 uint64_t this_g
= g
[i
];
1751 /* Set in D the first bit of G. */
1752 this_d
|= this_g
& -this_g
;
1755 flags
= iter_predtest_fwd(this_d
, this_g
, flags
);
1757 } while (++i
< words
);
1762 uint32_t HELPER(sve_pnext
)(void *vd
, void *vg
, uint32_t pred_desc
)
1764 intptr_t words
= DIV_ROUND_UP(FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
), 8);
1765 intptr_t esz
= FIELD_EX32(pred_desc
, PREDDESC
, ESZ
);
1766 uint32_t flags
= PREDTEST_INIT
;
1767 uint64_t *d
= vd
, *g
= vg
, esz_mask
;
1770 next
= last_active_element(vd
, words
, esz
) + (1 << esz
);
1771 esz_mask
= pred_esz_masks
[esz
];
1773 /* Similar to the pseudocode for pnext, but scaled by ESZ
1774 so that we find the correct bit. */
1775 if (next
< words
* 64) {
1779 mask
= ~((1ull << (next
& 63)) - 1);
1783 uint64_t this_g
= g
[next
/ 64] & esz_mask
& mask
;
1785 next
= (next
& -64) + ctz64(this_g
);
1790 } while (next
< words
* 64);
1795 uint64_t this_d
= 0;
1796 if (i
== next
/ 64) {
1797 this_d
= 1ull << (next
& 63);
1800 flags
= iter_predtest_fwd(this_d
, g
[i
] & esz_mask
, flags
);
1801 } while (++i
< words
);
1807 * Copy Zn into Zd, and store zero into inactive elements.
1808 * If inv, store zeros into the active elements.
1810 void HELPER(sve_movz_b
)(void *vd
, void *vn
, void *vg
, uint32_t desc
)
1812 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
1813 uint64_t inv
= -(uint64_t)(simd_data(desc
) & 1);
1814 uint64_t *d
= vd
, *n
= vn
;
1817 for (i
= 0; i
< opr_sz
; i
+= 1) {
1818 d
[i
] = n
[i
] & (expand_pred_b(pg
[H1(i
)]) ^ inv
);
1822 void HELPER(sve_movz_h
)(void *vd
, void *vn
, void *vg
, uint32_t desc
)
1824 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
1825 uint64_t inv
= -(uint64_t)(simd_data(desc
) & 1);
1826 uint64_t *d
= vd
, *n
= vn
;
1829 for (i
= 0; i
< opr_sz
; i
+= 1) {
1830 d
[i
] = n
[i
] & (expand_pred_h(pg
[H1(i
)]) ^ inv
);
1834 void HELPER(sve_movz_s
)(void *vd
, void *vn
, void *vg
, uint32_t desc
)
1836 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
1837 uint64_t inv
= -(uint64_t)(simd_data(desc
) & 1);
1838 uint64_t *d
= vd
, *n
= vn
;
1841 for (i
= 0; i
< opr_sz
; i
+= 1) {
1842 d
[i
] = n
[i
] & (expand_pred_s(pg
[H1(i
)]) ^ inv
);
1846 void HELPER(sve_movz_d
)(void *vd
, void *vn
, void *vg
, uint32_t desc
)
1848 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
1849 uint64_t *d
= vd
, *n
= vn
;
1851 uint8_t inv
= simd_data(desc
);
1853 for (i
= 0; i
< opr_sz
; i
+= 1) {
1854 d
[i
] = n
[i
] & -(uint64_t)((pg
[H1(i
)] ^ inv
) & 1);
1858 /* Three-operand expander, immediate operand, controlled by a predicate.
1860 #define DO_ZPZI(NAME, TYPE, H, OP) \
1861 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1863 intptr_t i, opr_sz = simd_oprsz(desc); \
1864 TYPE imm = simd_data(desc); \
1865 for (i = 0; i < opr_sz; ) { \
1866 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1869 TYPE nn = *(TYPE *)(vn + H(i)); \
1870 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
1872 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1877 /* Similarly, specialized for 64-bit operands. */
1878 #define DO_ZPZI_D(NAME, TYPE, OP) \
1879 void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
1881 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1882 TYPE *d = vd, *n = vn; \
1883 TYPE imm = simd_data(desc); \
1885 for (i = 0; i < opr_sz; i += 1) { \
1886 if (pg[H1(i)] & 1) { \
1888 d[i] = OP(nn, imm); \
1893 #define DO_SHR(N, M) (N >> M)
1894 #define DO_SHL(N, M) (N << M)
1896 /* Arithmetic shift right for division. This rounds negative numbers
1897 toward zero as per signed division. Therefore before shifting,
1898 when N is negative, add 2**M-1. */
1899 #define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
1901 static inline uint64_t do_urshr(uint64_t x
, unsigned sh
)
1903 if (likely(sh
< 64)) {
1904 return (x
>> sh
) + ((x
>> (sh
- 1)) & 1);
1905 } else if (sh
== 64) {
1912 static inline int64_t do_srshr(int64_t x
, unsigned sh
)
1914 if (likely(sh
< 64)) {
1915 return (x
>> sh
) + ((x
>> (sh
- 1)) & 1);
1917 /* Rounding the sign bit always produces 0. */
1922 DO_ZPZI(sve_asr_zpzi_b
, int8_t, H1
, DO_SHR
)
1923 DO_ZPZI(sve_asr_zpzi_h
, int16_t, H1_2
, DO_SHR
)
1924 DO_ZPZI(sve_asr_zpzi_s
, int32_t, H1_4
, DO_SHR
)
1925 DO_ZPZI_D(sve_asr_zpzi_d
, int64_t, DO_SHR
)
1927 DO_ZPZI(sve_lsr_zpzi_b
, uint8_t, H1
, DO_SHR
)
1928 DO_ZPZI(sve_lsr_zpzi_h
, uint16_t, H1_2
, DO_SHR
)
1929 DO_ZPZI(sve_lsr_zpzi_s
, uint32_t, H1_4
, DO_SHR
)
1930 DO_ZPZI_D(sve_lsr_zpzi_d
, uint64_t, DO_SHR
)
1932 DO_ZPZI(sve_lsl_zpzi_b
, uint8_t, H1
, DO_SHL
)
1933 DO_ZPZI(sve_lsl_zpzi_h
, uint16_t, H1_2
, DO_SHL
)
1934 DO_ZPZI(sve_lsl_zpzi_s
, uint32_t, H1_4
, DO_SHL
)
1935 DO_ZPZI_D(sve_lsl_zpzi_d
, uint64_t, DO_SHL
)
1937 DO_ZPZI(sve_asrd_b
, int8_t, H1
, DO_ASRD
)
1938 DO_ZPZI(sve_asrd_h
, int16_t, H1_2
, DO_ASRD
)
1939 DO_ZPZI(sve_asrd_s
, int32_t, H1_4
, DO_ASRD
)
1940 DO_ZPZI_D(sve_asrd_d
, int64_t, DO_ASRD
)
1946 #define DO_SHRNB(NAME, TYPEW, TYPEN, OP) \
1947 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1949 intptr_t i, opr_sz = simd_oprsz(desc); \
1950 int shift = simd_data(desc); \
1951 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1952 TYPEW nn = *(TYPEW *)(vn + i); \
1953 *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift); \
1957 #define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP) \
1958 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1960 intptr_t i, opr_sz = simd_oprsz(desc); \
1961 int shift = simd_data(desc); \
1962 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1963 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
1964 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift); \
1968 DO_SHRNB(sve2_shrnb_h
, uint16_t, uint8_t, DO_SHR
)
1969 DO_SHRNB(sve2_shrnb_s
, uint32_t, uint16_t, DO_SHR
)
1970 DO_SHRNB(sve2_shrnb_d
, uint64_t, uint32_t, DO_SHR
)
1972 DO_SHRNT(sve2_shrnt_h
, uint16_t, uint8_t, H1_2
, H1
, DO_SHR
)
1973 DO_SHRNT(sve2_shrnt_s
, uint32_t, uint16_t, H1_4
, H1_2
, DO_SHR
)
1974 DO_SHRNT(sve2_shrnt_d
, uint64_t, uint32_t, , H1_4
, DO_SHR
)
1976 DO_SHRNB(sve2_rshrnb_h
, uint16_t, uint8_t, do_urshr
)
1977 DO_SHRNB(sve2_rshrnb_s
, uint32_t, uint16_t, do_urshr
)
1978 DO_SHRNB(sve2_rshrnb_d
, uint64_t, uint32_t, do_urshr
)
1980 DO_SHRNT(sve2_rshrnt_h
, uint16_t, uint8_t, H1_2
, H1
, do_urshr
)
1981 DO_SHRNT(sve2_rshrnt_s
, uint32_t, uint16_t, H1_4
, H1_2
, do_urshr
)
1982 DO_SHRNT(sve2_rshrnt_d
, uint64_t, uint32_t, , H1_4
, do_urshr
)
1984 #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
1985 #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
1986 #define DO_SQSHRUN_D(x, sh) \
1987 do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)
1989 DO_SHRNB(sve2_sqshrunb_h
, int16_t, uint8_t, DO_SQSHRUN_H
)
1990 DO_SHRNB(sve2_sqshrunb_s
, int32_t, uint16_t, DO_SQSHRUN_S
)
1991 DO_SHRNB(sve2_sqshrunb_d
, int64_t, uint32_t, DO_SQSHRUN_D
)
1993 DO_SHRNT(sve2_sqshrunt_h
, int16_t, uint8_t, H1_2
, H1
, DO_SQSHRUN_H
)
1994 DO_SHRNT(sve2_sqshrunt_s
, int32_t, uint16_t, H1_4
, H1_2
, DO_SQSHRUN_S
)
1995 DO_SHRNT(sve2_sqshrunt_d
, int64_t, uint32_t, , H1_4
, DO_SQSHRUN_D
)
1997 #define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
1998 #define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
1999 #define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)
2001 DO_SHRNB(sve2_sqrshrunb_h
, int16_t, uint8_t, DO_SQRSHRUN_H
)
2002 DO_SHRNB(sve2_sqrshrunb_s
, int32_t, uint16_t, DO_SQRSHRUN_S
)
2003 DO_SHRNB(sve2_sqrshrunb_d
, int64_t, uint32_t, DO_SQRSHRUN_D
)
2005 DO_SHRNT(sve2_sqrshrunt_h
, int16_t, uint8_t, H1_2
, H1
, DO_SQRSHRUN_H
)
2006 DO_SHRNT(sve2_sqrshrunt_s
, int32_t, uint16_t, H1_4
, H1_2
, DO_SQRSHRUN_S
)
2007 DO_SHRNT(sve2_sqrshrunt_d
, int64_t, uint32_t, , H1_4
, DO_SQRSHRUN_D
)
2009 #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
2010 #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
2011 #define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)
2013 DO_SHRNB(sve2_sqshrnb_h
, int16_t, uint8_t, DO_SQSHRN_H
)
2014 DO_SHRNB(sve2_sqshrnb_s
, int32_t, uint16_t, DO_SQSHRN_S
)
2015 DO_SHRNB(sve2_sqshrnb_d
, int64_t, uint32_t, DO_SQSHRN_D
)
2017 DO_SHRNT(sve2_sqshrnt_h
, int16_t, uint8_t, H1_2
, H1
, DO_SQSHRN_H
)
2018 DO_SHRNT(sve2_sqshrnt_s
, int32_t, uint16_t, H1_4
, H1_2
, DO_SQSHRN_S
)
2019 DO_SHRNT(sve2_sqshrnt_d
, int64_t, uint32_t, , H1_4
, DO_SQSHRN_D
)
2021 #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
2022 #define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
2023 #define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)
2025 DO_SHRNB(sve2_sqrshrnb_h
, int16_t, uint8_t, DO_SQRSHRN_H
)
2026 DO_SHRNB(sve2_sqrshrnb_s
, int32_t, uint16_t, DO_SQRSHRN_S
)
2027 DO_SHRNB(sve2_sqrshrnb_d
, int64_t, uint32_t, DO_SQRSHRN_D
)
2029 DO_SHRNT(sve2_sqrshrnt_h
, int16_t, uint8_t, H1_2
, H1
, DO_SQRSHRN_H
)
2030 DO_SHRNT(sve2_sqrshrnt_s
, int32_t, uint16_t, H1_4
, H1_2
, DO_SQRSHRN_S
)
2031 DO_SHRNT(sve2_sqrshrnt_d
, int64_t, uint32_t, , H1_4
, DO_SQRSHRN_D
)
2033 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
2034 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
2035 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)
2037 DO_SHRNB(sve2_uqshrnb_h
, uint16_t, uint8_t, DO_UQSHRN_H
)
2038 DO_SHRNB(sve2_uqshrnb_s
, uint32_t, uint16_t, DO_UQSHRN_S
)
2039 DO_SHRNB(sve2_uqshrnb_d
, uint64_t, uint32_t, DO_UQSHRN_D
)
2041 DO_SHRNT(sve2_uqshrnt_h
, uint16_t, uint8_t, H1_2
, H1
, DO_UQSHRN_H
)
2042 DO_SHRNT(sve2_uqshrnt_s
, uint32_t, uint16_t, H1_4
, H1_2
, DO_UQSHRN_S
)
2043 DO_SHRNT(sve2_uqshrnt_d
, uint64_t, uint32_t, , H1_4
, DO_UQSHRN_D
)
2045 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
2046 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
2047 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)
2049 DO_SHRNB(sve2_uqrshrnb_h
, uint16_t, uint8_t, DO_UQRSHRN_H
)
2050 DO_SHRNB(sve2_uqrshrnb_s
, uint32_t, uint16_t, DO_UQRSHRN_S
)
2051 DO_SHRNB(sve2_uqrshrnb_d
, uint64_t, uint32_t, DO_UQRSHRN_D
)
2053 DO_SHRNT(sve2_uqrshrnt_h
, uint16_t, uint8_t, H1_2
, H1
, DO_UQRSHRN_H
)
2054 DO_SHRNT(sve2_uqrshrnt_s
, uint32_t, uint16_t, H1_4
, H1_2
, DO_UQRSHRN_S
)
2055 DO_SHRNT(sve2_uqrshrnt_d
, uint64_t, uint32_t, , H1_4
, DO_UQRSHRN_D
)
2060 /* Fully general four-operand expander, controlled by a predicate.
2062 #define DO_ZPZZZ(NAME, TYPE, H, OP) \
2063 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
2064 void *vg, uint32_t desc) \
2066 intptr_t i, opr_sz = simd_oprsz(desc); \
2067 for (i = 0; i < opr_sz; ) { \
2068 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
2071 TYPE nn = *(TYPE *)(vn + H(i)); \
2072 TYPE mm = *(TYPE *)(vm + H(i)); \
2073 TYPE aa = *(TYPE *)(va + H(i)); \
2074 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
2076 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
2081 /* Similarly, specialized for 64-bit operands. */
2082 #define DO_ZPZZZ_D(NAME, TYPE, OP) \
2083 void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
2084 void *vg, uint32_t desc) \
2086 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
2087 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
2089 for (i = 0; i < opr_sz; i += 1) { \
2090 if (pg[H1(i)] & 1) { \
2091 TYPE aa = a[i], nn = n[i], mm = m[i]; \
2092 d[i] = OP(aa, nn, mm); \
2097 #define DO_MLA(A, N, M) (A + N * M)
2098 #define DO_MLS(A, N, M) (A - N * M)
2100 DO_ZPZZZ(sve_mla_b
, uint8_t, H1
, DO_MLA
)
2101 DO_ZPZZZ(sve_mls_b
, uint8_t, H1
, DO_MLS
)
2103 DO_ZPZZZ(sve_mla_h
, uint16_t, H1_2
, DO_MLA
)
2104 DO_ZPZZZ(sve_mls_h
, uint16_t, H1_2
, DO_MLS
)
2106 DO_ZPZZZ(sve_mla_s
, uint32_t, H1_4
, DO_MLA
)
2107 DO_ZPZZZ(sve_mls_s
, uint32_t, H1_4
, DO_MLS
)
2109 DO_ZPZZZ_D(sve_mla_d
, uint64_t, DO_MLA
)
2110 DO_ZPZZZ_D(sve_mls_d
, uint64_t, DO_MLS
)
2117 void HELPER(sve_index_b
)(void *vd
, uint32_t start
,
2118 uint32_t incr
, uint32_t desc
)
2120 intptr_t i
, opr_sz
= simd_oprsz(desc
);
2122 for (i
= 0; i
< opr_sz
; i
+= 1) {
2123 d
[H1(i
)] = start
+ i
* incr
;
2127 void HELPER(sve_index_h
)(void *vd
, uint32_t start
,
2128 uint32_t incr
, uint32_t desc
)
2130 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 2;
2132 for (i
= 0; i
< opr_sz
; i
+= 1) {
2133 d
[H2(i
)] = start
+ i
* incr
;
2137 void HELPER(sve_index_s
)(void *vd
, uint32_t start
,
2138 uint32_t incr
, uint32_t desc
)
2140 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 4;
2142 for (i
= 0; i
< opr_sz
; i
+= 1) {
2143 d
[H4(i
)] = start
+ i
* incr
;
2147 void HELPER(sve_index_d
)(void *vd
, uint64_t start
,
2148 uint64_t incr
, uint32_t desc
)
2150 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2152 for (i
= 0; i
< opr_sz
; i
+= 1) {
2153 d
[i
] = start
+ i
* incr
;
2157 void HELPER(sve_adr_p32
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
2159 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 4;
2160 uint32_t sh
= simd_data(desc
);
2161 uint32_t *d
= vd
, *n
= vn
, *m
= vm
;
2162 for (i
= 0; i
< opr_sz
; i
+= 1) {
2163 d
[i
] = n
[i
] + (m
[i
] << sh
);
2167 void HELPER(sve_adr_p64
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
2169 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2170 uint64_t sh
= simd_data(desc
);
2171 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
2172 for (i
= 0; i
< opr_sz
; i
+= 1) {
2173 d
[i
] = n
[i
] + (m
[i
] << sh
);
2177 void HELPER(sve_adr_s32
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
2179 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2180 uint64_t sh
= simd_data(desc
);
2181 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
2182 for (i
= 0; i
< opr_sz
; i
+= 1) {
2183 d
[i
] = n
[i
] + ((uint64_t)(int32_t)m
[i
] << sh
);
2187 void HELPER(sve_adr_u32
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
2189 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2190 uint64_t sh
= simd_data(desc
);
2191 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
2192 for (i
= 0; i
< opr_sz
; i
+= 1) {
2193 d
[i
] = n
[i
] + ((uint64_t)(uint32_t)m
[i
] << sh
);
2197 void HELPER(sve_fexpa_h
)(void *vd
, void *vn
, uint32_t desc
)
2199 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2200 static const uint16_t coeff
[] = {
2201 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
2202 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
2203 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
2204 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
2206 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 2;
2207 uint16_t *d
= vd
, *n
= vn
;
2209 for (i
= 0; i
< opr_sz
; i
++) {
2211 intptr_t idx
= extract32(nn
, 0, 5);
2212 uint16_t exp
= extract32(nn
, 5, 5);
2213 d
[i
] = coeff
[idx
] | (exp
<< 10);
2217 void HELPER(sve_fexpa_s
)(void *vd
, void *vn
, uint32_t desc
)
2219 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2220 static const uint32_t coeff
[] = {
2221 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
2222 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
2223 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
2224 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
2225 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
2226 0x1ef532, 0x20b051, 0x227043, 0x243516,
2227 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
2228 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
2229 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
2230 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
2231 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
2232 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
2233 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
2234 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
2235 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
2236 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
2238 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 4;
2239 uint32_t *d
= vd
, *n
= vn
;
2241 for (i
= 0; i
< opr_sz
; i
++) {
2243 intptr_t idx
= extract32(nn
, 0, 6);
2244 uint32_t exp
= extract32(nn
, 6, 8);
2245 d
[i
] = coeff
[idx
] | (exp
<< 23);
2249 void HELPER(sve_fexpa_d
)(void *vd
, void *vn
, uint32_t desc
)
2251 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2252 static const uint64_t coeff
[] = {
2253 0x0000000000000ull
, 0x02C9A3E778061ull
, 0x059B0D3158574ull
,
2254 0x0874518759BC8ull
, 0x0B5586CF9890Full
, 0x0E3EC32D3D1A2ull
,
2255 0x11301D0125B51ull
, 0x1429AAEA92DE0ull
, 0x172B83C7D517Bull
,
2256 0x1A35BEB6FCB75ull
, 0x1D4873168B9AAull
, 0x2063B88628CD6ull
,
2257 0x2387A6E756238ull
, 0x26B4565E27CDDull
, 0x29E9DF51FDEE1ull
,
2258 0x2D285A6E4030Bull
, 0x306FE0A31B715ull
, 0x33C08B26416FFull
,
2259 0x371A7373AA9CBull
, 0x3A7DB34E59FF7ull
, 0x3DEA64C123422ull
,
2260 0x4160A21F72E2Aull
, 0x44E086061892Dull
, 0x486A2B5C13CD0ull
,
2261 0x4BFDAD5362A27ull
, 0x4F9B2769D2CA7ull
, 0x5342B569D4F82ull
,
2262 0x56F4736B527DAull
, 0x5AB07DD485429ull
, 0x5E76F15AD2148ull
,
2263 0x6247EB03A5585ull
, 0x6623882552225ull
, 0x6A09E667F3BCDull
,
2264 0x6DFB23C651A2Full
, 0x71F75E8EC5F74ull
, 0x75FEB564267C9ull
,
2265 0x7A11473EB0187ull
, 0x7E2F336CF4E62ull
, 0x82589994CCE13ull
,
2266 0x868D99B4492EDull
, 0x8ACE5422AA0DBull
, 0x8F1AE99157736ull
,
2267 0x93737B0CDC5E5ull
, 0x97D829FDE4E50ull
, 0x9C49182A3F090ull
,
2268 0xA0C667B5DE565ull
, 0xA5503B23E255Dull
, 0xA9E6B5579FDBFull
,
2269 0xAE89F995AD3ADull
, 0xB33A2B84F15FBull
, 0xB7F76F2FB5E47ull
,
2270 0xBCC1E904BC1D2ull
, 0xC199BDD85529Cull
, 0xC67F12E57D14Bull
,
2271 0xCB720DCEF9069ull
, 0xD072D4A07897Cull
, 0xD5818DCFBA487ull
,
2272 0xDA9E603DB3285ull
, 0xDFC97337B9B5Full
, 0xE502EE78B3FF6ull
,
2273 0xEA4AFA2A490DAull
, 0xEFA1BEE615A27ull
, 0xF50765B6E4540ull
,
2276 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2277 uint64_t *d
= vd
, *n
= vn
;
2279 for (i
= 0; i
< opr_sz
; i
++) {
2281 intptr_t idx
= extract32(nn
, 0, 6);
2282 uint64_t exp
= extract32(nn
, 6, 11);
2283 d
[i
] = coeff
[idx
] | (exp
<< 52);
2287 void HELPER(sve_ftssel_h
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
2289 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 2;
2290 uint16_t *d
= vd
, *n
= vn
, *m
= vm
;
2291 for (i
= 0; i
< opr_sz
; i
+= 1) {
2297 d
[i
] = nn
^ (mm
& 2) << 14;
2301 void HELPER(sve_ftssel_s
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
2303 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 4;
2304 uint32_t *d
= vd
, *n
= vn
, *m
= vm
;
2305 for (i
= 0; i
< opr_sz
; i
+= 1) {
2311 d
[i
] = nn
^ (mm
& 2) << 30;
2315 void HELPER(sve_ftssel_d
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
2317 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2318 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
2319 for (i
= 0; i
< opr_sz
; i
+= 1) {
2325 d
[i
] = nn
^ (mm
& 2) << 62;
2330 * Signed saturating addition with scalar operand.
2333 void HELPER(sve_sqaddi_b
)(void *d
, void *a
, int32_t b
, uint32_t desc
)
2335 intptr_t i
, oprsz
= simd_oprsz(desc
);
2337 for (i
= 0; i
< oprsz
; i
+= sizeof(int8_t)) {
2338 *(int8_t *)(d
+ i
) = DO_SQADD_B(b
, *(int8_t *)(a
+ i
));
2342 void HELPER(sve_sqaddi_h
)(void *d
, void *a
, int32_t b
, uint32_t desc
)
2344 intptr_t i
, oprsz
= simd_oprsz(desc
);
2346 for (i
= 0; i
< oprsz
; i
+= sizeof(int16_t)) {
2347 *(int16_t *)(d
+ i
) = DO_SQADD_H(b
, *(int16_t *)(a
+ i
));
2351 void HELPER(sve_sqaddi_s
)(void *d
, void *a
, int64_t b
, uint32_t desc
)
2353 intptr_t i
, oprsz
= simd_oprsz(desc
);
2355 for (i
= 0; i
< oprsz
; i
+= sizeof(int32_t)) {
2356 *(int32_t *)(d
+ i
) = DO_SQADD_S(b
, *(int32_t *)(a
+ i
));
2360 void HELPER(sve_sqaddi_d
)(void *d
, void *a
, int64_t b
, uint32_t desc
)
2362 intptr_t i
, oprsz
= simd_oprsz(desc
);
2364 for (i
= 0; i
< oprsz
; i
+= sizeof(int64_t)) {
2365 *(int64_t *)(d
+ i
) = do_sqadd_d(b
, *(int64_t *)(a
+ i
));
2370 * Unsigned saturating addition with scalar operand.
2373 void HELPER(sve_uqaddi_b
)(void *d
, void *a
, int32_t b
, uint32_t desc
)
2375 intptr_t i
, oprsz
= simd_oprsz(desc
);
2377 for (i
= 0; i
< oprsz
; i
+= sizeof(uint8_t)) {
2378 *(uint8_t *)(d
+ i
) = DO_UQADD_B(b
, *(uint8_t *)(a
+ i
));
2382 void HELPER(sve_uqaddi_h
)(void *d
, void *a
, int32_t b
, uint32_t desc
)
2384 intptr_t i
, oprsz
= simd_oprsz(desc
);
2386 for (i
= 0; i
< oprsz
; i
+= sizeof(uint16_t)) {
2387 *(uint16_t *)(d
+ i
) = DO_UQADD_H(b
, *(uint16_t *)(a
+ i
));
2391 void HELPER(sve_uqaddi_s
)(void *d
, void *a
, int64_t b
, uint32_t desc
)
2393 intptr_t i
, oprsz
= simd_oprsz(desc
);
2395 for (i
= 0; i
< oprsz
; i
+= sizeof(uint32_t)) {
2396 *(uint32_t *)(d
+ i
) = DO_UQADD_S(b
, *(uint32_t *)(a
+ i
));
2400 void HELPER(sve_uqaddi_d
)(void *d
, void *a
, uint64_t b
, uint32_t desc
)
2402 intptr_t i
, oprsz
= simd_oprsz(desc
);
2404 for (i
= 0; i
< oprsz
; i
+= sizeof(uint64_t)) {
2405 *(uint64_t *)(d
+ i
) = do_uqadd_d(b
, *(uint64_t *)(a
+ i
));
2409 void HELPER(sve_uqsubi_d
)(void *d
, void *a
, uint64_t b
, uint32_t desc
)
2411 intptr_t i
, oprsz
= simd_oprsz(desc
);
2413 for (i
= 0; i
< oprsz
; i
+= sizeof(uint64_t)) {
2414 *(uint64_t *)(d
+ i
) = do_uqsub_d(*(uint64_t *)(a
+ i
), b
);
2418 /* Two operand predicated copy immediate with merge. All valid immediates
2419 * can fit within 17 signed bits in the simd_data field.
2421 void HELPER(sve_cpy_m_b
)(void *vd
, void *vn
, void *vg
,
2422 uint64_t mm
, uint32_t desc
)
2424 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2425 uint64_t *d
= vd
, *n
= vn
;
2428 mm
= dup_const(MO_8
, mm
);
2429 for (i
= 0; i
< opr_sz
; i
+= 1) {
2431 uint64_t pp
= expand_pred_b(pg
[H1(i
)]);
2432 d
[i
] = (mm
& pp
) | (nn
& ~pp
);
2436 void HELPER(sve_cpy_m_h
)(void *vd
, void *vn
, void *vg
,
2437 uint64_t mm
, uint32_t desc
)
2439 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2440 uint64_t *d
= vd
, *n
= vn
;
2443 mm
= dup_const(MO_16
, mm
);
2444 for (i
= 0; i
< opr_sz
; i
+= 1) {
2446 uint64_t pp
= expand_pred_h(pg
[H1(i
)]);
2447 d
[i
] = (mm
& pp
) | (nn
& ~pp
);
2451 void HELPER(sve_cpy_m_s
)(void *vd
, void *vn
, void *vg
,
2452 uint64_t mm
, uint32_t desc
)
2454 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2455 uint64_t *d
= vd
, *n
= vn
;
2458 mm
= dup_const(MO_32
, mm
);
2459 for (i
= 0; i
< opr_sz
; i
+= 1) {
2461 uint64_t pp
= expand_pred_s(pg
[H1(i
)]);
2462 d
[i
] = (mm
& pp
) | (nn
& ~pp
);
2466 void HELPER(sve_cpy_m_d
)(void *vd
, void *vn
, void *vg
,
2467 uint64_t mm
, uint32_t desc
)
2469 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2470 uint64_t *d
= vd
, *n
= vn
;
2473 for (i
= 0; i
< opr_sz
; i
+= 1) {
2475 d
[i
] = (pg
[H1(i
)] & 1 ? mm
: nn
);
2479 void HELPER(sve_cpy_z_b
)(void *vd
, void *vg
, uint64_t val
, uint32_t desc
)
2481 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2485 val
= dup_const(MO_8
, val
);
2486 for (i
= 0; i
< opr_sz
; i
+= 1) {
2487 d
[i
] = val
& expand_pred_b(pg
[H1(i
)]);
2491 void HELPER(sve_cpy_z_h
)(void *vd
, void *vg
, uint64_t val
, uint32_t desc
)
2493 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2497 val
= dup_const(MO_16
, val
);
2498 for (i
= 0; i
< opr_sz
; i
+= 1) {
2499 d
[i
] = val
& expand_pred_h(pg
[H1(i
)]);
2503 void HELPER(sve_cpy_z_s
)(void *vd
, void *vg
, uint64_t val
, uint32_t desc
)
2505 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2509 val
= dup_const(MO_32
, val
);
2510 for (i
= 0; i
< opr_sz
; i
+= 1) {
2511 d
[i
] = val
& expand_pred_s(pg
[H1(i
)]);
2515 void HELPER(sve_cpy_z_d
)(void *vd
, void *vg
, uint64_t val
, uint32_t desc
)
2517 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2521 for (i
= 0; i
< opr_sz
; i
+= 1) {
2522 d
[i
] = (pg
[H1(i
)] & 1 ? val
: 0);
2526 /* Big-endian hosts need to frob the byte indices. If the copy
2527 * happens to be 8-byte aligned, then no frobbing necessary.
2529 static void swap_memmove(void *vd
, void *vs
, size_t n
)
2531 uintptr_t d
= (uintptr_t)vd
;
2532 uintptr_t s
= (uintptr_t)vs
;
2533 uintptr_t o
= (d
| s
| n
) & 7;
2536 #ifndef HOST_WORDS_BIGENDIAN
2545 if (d
< s
|| d
>= s
+ n
) {
2546 for (i
= 0; i
< n
; i
+= 4) {
2547 *(uint32_t *)H1_4(d
+ i
) = *(uint32_t *)H1_4(s
+ i
);
2550 for (i
= n
; i
> 0; ) {
2552 *(uint32_t *)H1_4(d
+ i
) = *(uint32_t *)H1_4(s
+ i
);
2559 if (d
< s
|| d
>= s
+ n
) {
2560 for (i
= 0; i
< n
; i
+= 2) {
2561 *(uint16_t *)H1_2(d
+ i
) = *(uint16_t *)H1_2(s
+ i
);
2564 for (i
= n
; i
> 0; ) {
2566 *(uint16_t *)H1_2(d
+ i
) = *(uint16_t *)H1_2(s
+ i
);
2572 if (d
< s
|| d
>= s
+ n
) {
2573 for (i
= 0; i
< n
; i
++) {
2574 *(uint8_t *)H1(d
+ i
) = *(uint8_t *)H1(s
+ i
);
2577 for (i
= n
; i
> 0; ) {
2579 *(uint8_t *)H1(d
+ i
) = *(uint8_t *)H1(s
+ i
);
2586 /* Similarly for memset of 0. */
2587 static void swap_memzero(void *vd
, size_t n
)
2589 uintptr_t d
= (uintptr_t)vd
;
2590 uintptr_t o
= (d
| n
) & 7;
2593 /* Usually, the first bit of a predicate is set, so N is 0. */
2594 if (likely(n
== 0)) {
2598 #ifndef HOST_WORDS_BIGENDIAN
2607 for (i
= 0; i
< n
; i
+= 4) {
2608 *(uint32_t *)H1_4(d
+ i
) = 0;
2614 for (i
= 0; i
< n
; i
+= 2) {
2615 *(uint16_t *)H1_2(d
+ i
) = 0;
2620 for (i
= 0; i
< n
; i
++) {
2621 *(uint8_t *)H1(d
+ i
) = 0;
2627 void HELPER(sve_ext
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
2629 intptr_t opr_sz
= simd_oprsz(desc
);
2630 size_t n_ofs
= simd_data(desc
);
2631 size_t n_siz
= opr_sz
- n_ofs
;
2634 swap_memmove(vd
, vn
+ n_ofs
, n_siz
);
2635 swap_memmove(vd
+ n_siz
, vm
, n_ofs
);
2636 } else if (vd
!= vn
) {
2637 swap_memmove(vd
+ n_siz
, vd
, n_ofs
);
2638 swap_memmove(vd
, vn
+ n_ofs
, n_siz
);
2640 /* vd == vn == vm. Need temp space. */
2642 swap_memmove(&tmp
, vm
, n_ofs
);
2643 swap_memmove(vd
, vd
+ n_ofs
, n_siz
);
2644 memcpy(vd
+ n_siz
, &tmp
, n_ofs
);
2648 #define DO_INSR(NAME, TYPE, H) \
2649 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
2651 intptr_t opr_sz = simd_oprsz(desc); \
2652 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
2653 *(TYPE *)(vd + H(0)) = val; \
2656 DO_INSR(sve_insr_b
, uint8_t, H1
)
2657 DO_INSR(sve_insr_h
, uint16_t, H1_2
)
2658 DO_INSR(sve_insr_s
, uint32_t, H1_4
)
2659 DO_INSR(sve_insr_d
, uint64_t, )
2663 void HELPER(sve_rev_b
)(void *vd
, void *vn
, uint32_t desc
)
2665 intptr_t i
, j
, opr_sz
= simd_oprsz(desc
);
2666 for (i
= 0, j
= opr_sz
- 8; i
< opr_sz
/ 2; i
+= 8, j
-= 8) {
2667 uint64_t f
= *(uint64_t *)(vn
+ i
);
2668 uint64_t b
= *(uint64_t *)(vn
+ j
);
2669 *(uint64_t *)(vd
+ i
) = bswap64(b
);
2670 *(uint64_t *)(vd
+ j
) = bswap64(f
);
2674 void HELPER(sve_rev_h
)(void *vd
, void *vn
, uint32_t desc
)
2676 intptr_t i
, j
, opr_sz
= simd_oprsz(desc
);
2677 for (i
= 0, j
= opr_sz
- 8; i
< opr_sz
/ 2; i
+= 8, j
-= 8) {
2678 uint64_t f
= *(uint64_t *)(vn
+ i
);
2679 uint64_t b
= *(uint64_t *)(vn
+ j
);
2680 *(uint64_t *)(vd
+ i
) = hswap64(b
);
2681 *(uint64_t *)(vd
+ j
) = hswap64(f
);
2685 void HELPER(sve_rev_s
)(void *vd
, void *vn
, uint32_t desc
)
2687 intptr_t i
, j
, opr_sz
= simd_oprsz(desc
);
2688 for (i
= 0, j
= opr_sz
- 8; i
< opr_sz
/ 2; i
+= 8, j
-= 8) {
2689 uint64_t f
= *(uint64_t *)(vn
+ i
);
2690 uint64_t b
= *(uint64_t *)(vn
+ j
);
2691 *(uint64_t *)(vd
+ i
) = rol64(b
, 32);
2692 *(uint64_t *)(vd
+ j
) = rol64(f
, 32);
2696 void HELPER(sve_rev_d
)(void *vd
, void *vn
, uint32_t desc
)
2698 intptr_t i
, j
, opr_sz
= simd_oprsz(desc
);
2699 for (i
= 0, j
= opr_sz
- 8; i
< opr_sz
/ 2; i
+= 8, j
-= 8) {
2700 uint64_t f
= *(uint64_t *)(vn
+ i
);
2701 uint64_t b
= *(uint64_t *)(vn
+ j
);
2702 *(uint64_t *)(vd
+ i
) = b
;
2703 *(uint64_t *)(vd
+ j
) = f
;
2707 #define DO_TBL(NAME, TYPE, H) \
2708 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2710 intptr_t i, opr_sz = simd_oprsz(desc); \
2711 uintptr_t elem = opr_sz / sizeof(TYPE); \
2712 TYPE *d = vd, *n = vn, *m = vm; \
2714 if (unlikely(vd == vn)) { \
2715 n = memcpy(&tmp, vn, opr_sz); \
2717 for (i = 0; i < elem; i++) { \
2719 d[H(i)] = j < elem ? n[H(j)] : 0; \
2723 DO_TBL(sve_tbl_b
, uint8_t, H1
)
2724 DO_TBL(sve_tbl_h
, uint16_t, H2
)
2725 DO_TBL(sve_tbl_s
, uint32_t, H4
)
2726 DO_TBL(sve_tbl_d
, uint64_t, )
2730 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
2731 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2733 intptr_t i, opr_sz = simd_oprsz(desc); \
2737 if (unlikely(vn - vd < opr_sz)) { \
2738 n = memcpy(&tmp, n, opr_sz / 2); \
2740 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
2741 d[HD(i)] = n[HS(i)]; \
2745 DO_UNPK(sve_sunpk_h
, int16_t, int8_t, H2
, H1
)
2746 DO_UNPK(sve_sunpk_s
, int32_t, int16_t, H4
, H2
)
2747 DO_UNPK(sve_sunpk_d
, int64_t, int32_t, , H4
)
2749 DO_UNPK(sve_uunpk_h
, uint16_t, uint8_t, H2
, H1
)
2750 DO_UNPK(sve_uunpk_s
, uint32_t, uint16_t, H4
, H2
)
2751 DO_UNPK(sve_uunpk_d
, uint64_t, uint32_t, , H4
)
2755 /* Mask of bits included in the even numbered predicates of width esz.
2756 * We also use this for expand_bits/compress_bits, and so extend the
2757 * same pattern out to 16-bit units.
2759 static const uint64_t even_bit_esz_masks
[5] = {
2760 0x5555555555555555ull
,
2761 0x3333333333333333ull
,
2762 0x0f0f0f0f0f0f0f0full
,
2763 0x00ff00ff00ff00ffull
,
2764 0x0000ffff0000ffffull
,
2767 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
2768 * For N==0, this corresponds to the operation that in qemu/bitops.h
2769 * we call half_shuffle64; this algorithm is from Hacker's Delight,
2770 * section 7-2 Shuffling Bits.
2772 static uint64_t expand_bits(uint64_t x
, int n
)
2777 for (i
= 4; i
>= n
; i
--) {
2779 x
= ((x
<< sh
) | x
) & even_bit_esz_masks
[i
];
2784 /* Compress units of 2**(N+1) bits to units of 2**N bits.
2785 * For N==0, this corresponds to the operation that in qemu/bitops.h
2786 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
2787 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
2789 static uint64_t compress_bits(uint64_t x
, int n
)
2793 for (i
= n
; i
<= 4; i
++) {
2795 x
&= even_bit_esz_masks
[i
];
2798 return x
& 0xffffffffu
;
2801 void HELPER(sve_zip_p
)(void *vd
, void *vn
, void *vm
, uint32_t pred_desc
)
2803 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
2804 int esz
= FIELD_EX32(pred_desc
, PREDDESC
, ESZ
);
2805 intptr_t high
= FIELD_EX32(pred_desc
, PREDDESC
, DATA
);
2806 int esize
= 1 << esz
;
2811 uint64_t nn
= *(uint64_t *)vn
;
2812 uint64_t mm
= *(uint64_t *)vm
;
2813 int half
= 4 * oprsz
;
2815 nn
= extract64(nn
, high
* half
, half
);
2816 mm
= extract64(mm
, high
* half
, half
);
2817 nn
= expand_bits(nn
, esz
);
2818 mm
= expand_bits(mm
, esz
);
2819 d
[0] = nn
| (mm
<< esize
);
2821 ARMPredicateReg tmp
;
2823 /* We produce output faster than we consume input.
2824 Therefore we must be mindful of possible overlap. */
2826 vn
= memcpy(&tmp
, vn
, oprsz
);
2830 } else if (vd
== vm
) {
2831 vm
= memcpy(&tmp
, vm
, oprsz
);
2837 if ((oprsz
& 7) == 0) {
2838 uint32_t *n
= vn
, *m
= vm
;
2841 for (i
= 0; i
< oprsz
/ 8; i
++) {
2842 uint64_t nn
= n
[H4(high
+ i
)];
2843 uint64_t mm
= m
[H4(high
+ i
)];
2845 nn
= expand_bits(nn
, esz
);
2846 mm
= expand_bits(mm
, esz
);
2847 d
[i
] = nn
| (mm
<< esize
);
2850 uint8_t *n
= vn
, *m
= vm
;
2853 for (i
= 0; i
< oprsz
/ 2; i
++) {
2854 uint16_t nn
= n
[H1(high
+ i
)];
2855 uint16_t mm
= m
[H1(high
+ i
)];
2857 nn
= expand_bits(nn
, esz
);
2858 mm
= expand_bits(mm
, esz
);
2859 d16
[H2(i
)] = nn
| (mm
<< esize
);
2865 void HELPER(sve_uzp_p
)(void *vd
, void *vn
, void *vm
, uint32_t pred_desc
)
2867 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
2868 int esz
= FIELD_EX32(pred_desc
, PREDDESC
, ESZ
);
2869 int odd
= FIELD_EX32(pred_desc
, PREDDESC
, DATA
) << esz
;
2870 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
2875 l
= compress_bits(n
[0] >> odd
, esz
);
2876 h
= compress_bits(m
[0] >> odd
, esz
);
2877 d
[0] = l
| (h
<< (4 * oprsz
));
2879 ARMPredicateReg tmp_m
;
2880 intptr_t oprsz_16
= oprsz
/ 16;
2882 if ((vm
- vd
) < (uintptr_t)oprsz
) {
2883 m
= memcpy(&tmp_m
, vm
, oprsz
);
2886 for (i
= 0; i
< oprsz_16
; i
++) {
2889 l
= compress_bits(l
>> odd
, esz
);
2890 h
= compress_bits(h
>> odd
, esz
);
2891 d
[i
] = l
| (h
<< 32);
2895 * For VL which is not a multiple of 512, the results from M do not
2896 * align nicely with the uint64_t for D. Put the aligned results
2897 * from M into TMP_M and then copy it into place afterward.
2900 int final_shift
= (oprsz
& 15) * 2;
2904 l
= compress_bits(l
>> odd
, esz
);
2905 h
= compress_bits(h
>> odd
, esz
);
2906 d
[i
] = l
| (h
<< final_shift
);
2908 for (i
= 0; i
< oprsz_16
; i
++) {
2911 l
= compress_bits(l
>> odd
, esz
);
2912 h
= compress_bits(h
>> odd
, esz
);
2913 tmp_m
.p
[i
] = l
| (h
<< 32);
2917 l
= compress_bits(l
>> odd
, esz
);
2918 h
= compress_bits(h
>> odd
, esz
);
2919 tmp_m
.p
[i
] = l
| (h
<< final_shift
);
2921 swap_memmove(vd
+ oprsz
/ 2, &tmp_m
, oprsz
/ 2);
2923 for (i
= 0; i
< oprsz_16
; i
++) {
2926 l
= compress_bits(l
>> odd
, esz
);
2927 h
= compress_bits(h
>> odd
, esz
);
2928 d
[oprsz_16
+ i
] = l
| (h
<< 32);
2934 void HELPER(sve_trn_p
)(void *vd
, void *vn
, void *vm
, uint32_t pred_desc
)
2936 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
2937 int esz
= FIELD_EX32(pred_desc
, PREDDESC
, ESZ
);
2938 int odd
= FIELD_EX32(pred_desc
, PREDDESC
, DATA
);
2939 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
2946 mask
= even_bit_esz_masks
[esz
];
2953 for (i
= 0; i
< DIV_ROUND_UP(oprsz
, 8); i
++) {
2954 uint64_t nn
= (n
[i
] & mask
) >> shr
;
2955 uint64_t mm
= (m
[i
] & mask
) << shl
;
2960 /* Reverse units of 2**N bits. */
2961 static uint64_t reverse_bits_64(uint64_t x
, int n
)
2966 for (i
= 2, sh
= 4; i
>= n
; i
--, sh
>>= 1) {
2967 uint64_t mask
= even_bit_esz_masks
[i
];
2968 x
= ((x
& mask
) << sh
) | ((x
>> sh
) & mask
);
2973 static uint8_t reverse_bits_8(uint8_t x
, int n
)
2975 static const uint8_t mask
[3] = { 0x55, 0x33, 0x0f };
2978 for (i
= 2, sh
= 4; i
>= n
; i
--, sh
>>= 1) {
2979 x
= ((x
& mask
[i
]) << sh
) | ((x
>> sh
) & mask
[i
]);
2984 void HELPER(sve_rev_p
)(void *vd
, void *vn
, uint32_t pred_desc
)
2986 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
2987 int esz
= FIELD_EX32(pred_desc
, PREDDESC
, ESZ
);
2988 intptr_t i
, oprsz_2
= oprsz
/ 2;
2991 uint64_t l
= *(uint64_t *)vn
;
2992 l
= reverse_bits_64(l
<< (64 - 8 * oprsz
), esz
);
2993 *(uint64_t *)vd
= l
;
2994 } else if ((oprsz
& 15) == 0) {
2995 for (i
= 0; i
< oprsz_2
; i
+= 8) {
2996 intptr_t ih
= oprsz
- 8 - i
;
2997 uint64_t l
= reverse_bits_64(*(uint64_t *)(vn
+ i
), esz
);
2998 uint64_t h
= reverse_bits_64(*(uint64_t *)(vn
+ ih
), esz
);
2999 *(uint64_t *)(vd
+ i
) = h
;
3000 *(uint64_t *)(vd
+ ih
) = l
;
3003 for (i
= 0; i
< oprsz_2
; i
+= 1) {
3004 intptr_t il
= H1(i
);
3005 intptr_t ih
= H1(oprsz
- 1 - i
);
3006 uint8_t l
= reverse_bits_8(*(uint8_t *)(vn
+ il
), esz
);
3007 uint8_t h
= reverse_bits_8(*(uint8_t *)(vn
+ ih
), esz
);
3008 *(uint8_t *)(vd
+ il
) = h
;
3009 *(uint8_t *)(vd
+ ih
) = l
;
3014 void HELPER(sve_punpk_p
)(void *vd
, void *vn
, uint32_t pred_desc
)
3016 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
3017 intptr_t high
= FIELD_EX32(pred_desc
, PREDDESC
, DATA
);
3022 uint64_t nn
= *(uint64_t *)vn
;
3023 int half
= 4 * oprsz
;
3025 nn
= extract64(nn
, high
* half
, half
);
3026 nn
= expand_bits(nn
, 0);
3029 ARMPredicateReg tmp_n
;
3031 /* We produce output faster than we consume input.
3032 Therefore we must be mindful of possible overlap. */
3033 if ((vn
- vd
) < (uintptr_t)oprsz
) {
3034 vn
= memcpy(&tmp_n
, vn
, oprsz
);
3040 if ((oprsz
& 7) == 0) {
3044 for (i
= 0; i
< oprsz
/ 8; i
++) {
3045 uint64_t nn
= n
[H4(high
+ i
)];
3046 d
[i
] = expand_bits(nn
, 0);
3052 for (i
= 0; i
< oprsz
/ 2; i
++) {
3053 uint16_t nn
= n
[H1(high
+ i
)];
3054 d16
[H2(i
)] = expand_bits(nn
, 0);
3060 #define DO_ZIP(NAME, TYPE, H) \
3061 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3063 intptr_t oprsz = simd_oprsz(desc); \
3064 intptr_t i, oprsz_2 = oprsz / 2; \
3065 ARMVectorReg tmp_n, tmp_m; \
3066 /* We produce output faster than we consume input. \
3067 Therefore we must be mindful of possible overlap. */ \
3068 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
3069 vn = memcpy(&tmp_n, vn, oprsz_2); \
3071 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3072 vm = memcpy(&tmp_m, vm, oprsz_2); \
3074 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
3075 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
3076 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
3080 DO_ZIP(sve_zip_b
, uint8_t, H1
)
3081 DO_ZIP(sve_zip_h
, uint16_t, H1_2
)
3082 DO_ZIP(sve_zip_s
, uint32_t, H1_4
)
3083 DO_ZIP(sve_zip_d
, uint64_t, )
3085 #define DO_UZP(NAME, TYPE, H) \
3086 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3088 intptr_t oprsz = simd_oprsz(desc); \
3089 intptr_t oprsz_2 = oprsz / 2; \
3090 intptr_t odd_ofs = simd_data(desc); \
3092 ARMVectorReg tmp_m; \
3093 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3094 vm = memcpy(&tmp_m, vm, oprsz); \
3096 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
3097 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs)); \
3099 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
3100 *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
3104 DO_UZP(sve_uzp_b
, uint8_t, H1
)
3105 DO_UZP(sve_uzp_h
, uint16_t, H1_2
)
3106 DO_UZP(sve_uzp_s
, uint32_t, H1_4
)
3107 DO_UZP(sve_uzp_d
, uint64_t, )
3109 #define DO_TRN(NAME, TYPE, H) \
3110 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3112 intptr_t oprsz = simd_oprsz(desc); \
3113 intptr_t odd_ofs = simd_data(desc); \
3115 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
3116 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
3117 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
3118 *(TYPE *)(vd + H(i + 0)) = ae; \
3119 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
3123 DO_TRN(sve_trn_b
, uint8_t, H1
)
3124 DO_TRN(sve_trn_h
, uint16_t, H1_2
)
3125 DO_TRN(sve_trn_s
, uint32_t, H1_4
)
3126 DO_TRN(sve_trn_d
, uint64_t, )
3132 void HELPER(sve_compact_s
)(void *vd
, void *vn
, void *vg
, uint32_t desc
)
3134 intptr_t i
, j
, opr_sz
= simd_oprsz(desc
) / 4;
3135 uint32_t *d
= vd
, *n
= vn
;
3138 for (i
= j
= 0; i
< opr_sz
; i
++) {
3139 if (pg
[H1(i
/ 2)] & (i
& 1 ? 0x10 : 0x01)) {
3140 d
[H4(j
)] = n
[H4(i
)];
3144 for (; j
< opr_sz
; j
++) {
3149 void HELPER(sve_compact_d
)(void *vd
, void *vn
, void *vg
, uint32_t desc
)
3151 intptr_t i
, j
, opr_sz
= simd_oprsz(desc
) / 8;
3152 uint64_t *d
= vd
, *n
= vn
;
3155 for (i
= j
= 0; i
< opr_sz
; i
++) {
3156 if (pg
[H1(i
)] & 1) {
3161 for (; j
< opr_sz
; j
++) {
3166 /* Similar to the ARM LastActiveElement pseudocode function, except the
3167 * result is multiplied by the element size. This includes the not found
3168 * indication; e.g. not found for esz=3 is -8.
3170 int32_t HELPER(sve_last_active_element
)(void *vg
, uint32_t pred_desc
)
3172 intptr_t words
= DIV_ROUND_UP(FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
), 8);
3173 intptr_t esz
= FIELD_EX32(pred_desc
, PREDDESC
, ESZ
);
3175 return last_active_element(vg
, words
, esz
);
3178 void HELPER(sve_splice
)(void *vd
, void *vn
, void *vm
, void *vg
, uint32_t desc
)
3180 intptr_t opr_sz
= simd_oprsz(desc
) / 8;
3181 int esz
= simd_data(desc
);
3182 uint64_t pg
, first_g
, last_g
, len
, mask
= pred_esz_masks
[esz
];
3183 intptr_t i
, first_i
, last_i
;
3186 first_i
= last_i
= 0;
3187 first_g
= last_g
= 0;
3189 /* Find the extent of the active elements within VG. */
3190 for (i
= QEMU_ALIGN_UP(opr_sz
, 8) - 8; i
>= 0; i
-= 8) {
3191 pg
= *(uint64_t *)(vg
+ i
) & mask
;
3204 first_i
= first_i
* 8 + ctz64(first_g
);
3205 last_i
= last_i
* 8 + 63 - clz64(last_g
);
3206 len
= last_i
- first_i
+ (1 << esz
);
3208 vm
= memcpy(&tmp
, vm
, opr_sz
* 8);
3210 swap_memmove(vd
, vn
+ first_i
, len
);
3212 swap_memmove(vd
+ len
, vm
, opr_sz
* 8 - len
);
3215 void HELPER(sve_sel_zpzz_b
)(void *vd
, void *vn
, void *vm
,
3216 void *vg
, uint32_t desc
)
3218 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
3219 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
3222 for (i
= 0; i
< opr_sz
; i
+= 1) {
3223 uint64_t nn
= n
[i
], mm
= m
[i
];
3224 uint64_t pp
= expand_pred_b(pg
[H1(i
)]);
3225 d
[i
] = (nn
& pp
) | (mm
& ~pp
);
3229 void HELPER(sve_sel_zpzz_h
)(void *vd
, void *vn
, void *vm
,
3230 void *vg
, uint32_t desc
)
3232 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
3233 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
3236 for (i
= 0; i
< opr_sz
; i
+= 1) {
3237 uint64_t nn
= n
[i
], mm
= m
[i
];
3238 uint64_t pp
= expand_pred_h(pg
[H1(i
)]);
3239 d
[i
] = (nn
& pp
) | (mm
& ~pp
);
3243 void HELPER(sve_sel_zpzz_s
)(void *vd
, void *vn
, void *vm
,
3244 void *vg
, uint32_t desc
)
3246 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
3247 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
3250 for (i
= 0; i
< opr_sz
; i
+= 1) {
3251 uint64_t nn
= n
[i
], mm
= m
[i
];
3252 uint64_t pp
= expand_pred_s(pg
[H1(i
)]);
3253 d
[i
] = (nn
& pp
) | (mm
& ~pp
);
3257 void HELPER(sve_sel_zpzz_d
)(void *vd
, void *vn
, void *vm
,
3258 void *vg
, uint32_t desc
)
3260 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
3261 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
3264 for (i
= 0; i
< opr_sz
; i
+= 1) {
3265 uint64_t nn
= n
[i
], mm
= m
[i
];
3266 d
[i
] = (pg
[H1(i
)] & 1 ? nn
: mm
);
3270 /* Two operand comparison controlled by a predicate.
3271 * ??? It is very tempting to want to be able to expand this inline
3272 * with x86 instructions, e.g.
3274 * vcmpeqw zm, zn, %ymm0
3275 * vpmovmskb %ymm0, %eax
3279 * or even aarch64, e.g.
3281 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
3282 * cmeq v0.8h, zn, zm
3283 * and v0.8h, v0.8h, mask
3287 * However, coming up with an abstraction that allows vector inputs and
3288 * a scalar output, and also handles the byte-ordering of sub-uint64_t
3289 * scalar outputs, is tricky.
3291 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
3292 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3294 intptr_t opr_sz = simd_oprsz(desc); \
3295 uint32_t flags = PREDTEST_INIT; \
3296 intptr_t i = opr_sz; \
3298 uint64_t out = 0, pg; \
3300 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3301 TYPE nn = *(TYPE *)(vn + H(i)); \
3302 TYPE mm = *(TYPE *)(vm + H(i)); \
3305 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3307 *(uint64_t *)(vd + (i >> 3)) = out; \
3308 flags = iter_predtest_bwd(out, pg, flags); \
3313 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
3314 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3315 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
3316 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3317 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
3318 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3319 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
3320 DO_CMP_PPZZ(NAME, TYPE, OP, , 0x0101010101010101ull)
3322 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b
, uint8_t, ==)
3323 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h
, uint16_t, ==)
3324 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s
, uint32_t, ==)
3325 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d
, uint64_t, ==)
3327 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b
, uint8_t, !=)
3328 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h
, uint16_t, !=)
3329 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s
, uint32_t, !=)
3330 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d
, uint64_t, !=)
3332 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b
, int8_t, >)
3333 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h
, int16_t, >)
3334 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s
, int32_t, >)
3335 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d
, int64_t, >)
3337 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b
, int8_t, >=)
3338 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h
, int16_t, >=)
3339 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s
, int32_t, >=)
3340 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d
, int64_t, >=)
3342 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b
, uint8_t, >)
3343 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h
, uint16_t, >)
3344 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s
, uint32_t, >)
3345 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d
, uint64_t, >)
3347 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b
, uint8_t, >=)
3348 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h
, uint16_t, >=)
3349 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s
, uint32_t, >=)
3350 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d
, uint64_t, >=)
3352 #undef DO_CMP_PPZZ_B
3353 #undef DO_CMP_PPZZ_H
3354 #undef DO_CMP_PPZZ_S
3355 #undef DO_CMP_PPZZ_D
3358 /* Similar, but the second source is "wide". */
3359 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
3360 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3362 intptr_t opr_sz = simd_oprsz(desc); \
3363 uint32_t flags = PREDTEST_INIT; \
3364 intptr_t i = opr_sz; \
3366 uint64_t out = 0, pg; \
3368 TYPEW mm = *(TYPEW *)(vm + i - 8); \
3370 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3371 TYPE nn = *(TYPE *)(vn + H(i)); \
3375 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3377 *(uint64_t *)(vd + (i >> 3)) = out; \
3378 flags = iter_predtest_bwd(out, pg, flags); \
3383 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
3384 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
3385 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
3386 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
3387 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
3388 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
3390 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b
, int8_t, uint64_t, ==)
3391 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h
, int16_t, uint64_t, ==)
3392 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s
, int32_t, uint64_t, ==)
3394 DO_CMP_PPZW_B(sve_cmpne_ppzw_b
, int8_t, uint64_t, !=)
3395 DO_CMP_PPZW_H(sve_cmpne_ppzw_h
, int16_t, uint64_t, !=)
3396 DO_CMP_PPZW_S(sve_cmpne_ppzw_s
, int32_t, uint64_t, !=)
3398 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b
, int8_t, int64_t, >)
3399 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h
, int16_t, int64_t, >)
3400 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s
, int32_t, int64_t, >)
3402 DO_CMP_PPZW_B(sve_cmpge_ppzw_b
, int8_t, int64_t, >=)
3403 DO_CMP_PPZW_H(sve_cmpge_ppzw_h
, int16_t, int64_t, >=)
3404 DO_CMP_PPZW_S(sve_cmpge_ppzw_s
, int32_t, int64_t, >=)
3406 DO_CMP_PPZW_B(sve_cmphi_ppzw_b
, uint8_t, uint64_t, >)
3407 DO_CMP_PPZW_H(sve_cmphi_ppzw_h
, uint16_t, uint64_t, >)
3408 DO_CMP_PPZW_S(sve_cmphi_ppzw_s
, uint32_t, uint64_t, >)
3410 DO_CMP_PPZW_B(sve_cmphs_ppzw_b
, uint8_t, uint64_t, >=)
3411 DO_CMP_PPZW_H(sve_cmphs_ppzw_h
, uint16_t, uint64_t, >=)
3412 DO_CMP_PPZW_S(sve_cmphs_ppzw_s
, uint32_t, uint64_t, >=)
3414 DO_CMP_PPZW_B(sve_cmplt_ppzw_b
, int8_t, int64_t, <)
3415 DO_CMP_PPZW_H(sve_cmplt_ppzw_h
, int16_t, int64_t, <)
3416 DO_CMP_PPZW_S(sve_cmplt_ppzw_s
, int32_t, int64_t, <)
3418 DO_CMP_PPZW_B(sve_cmple_ppzw_b
, int8_t, int64_t, <=)
3419 DO_CMP_PPZW_H(sve_cmple_ppzw_h
, int16_t, int64_t, <=)
3420 DO_CMP_PPZW_S(sve_cmple_ppzw_s
, int32_t, int64_t, <=)
3422 DO_CMP_PPZW_B(sve_cmplo_ppzw_b
, uint8_t, uint64_t, <)
3423 DO_CMP_PPZW_H(sve_cmplo_ppzw_h
, uint16_t, uint64_t, <)
3424 DO_CMP_PPZW_S(sve_cmplo_ppzw_s
, uint32_t, uint64_t, <)
3426 DO_CMP_PPZW_B(sve_cmpls_ppzw_b
, uint8_t, uint64_t, <=)
3427 DO_CMP_PPZW_H(sve_cmpls_ppzw_h
, uint16_t, uint64_t, <=)
3428 DO_CMP_PPZW_S(sve_cmpls_ppzw_s
, uint32_t, uint64_t, <=)
3430 #undef DO_CMP_PPZW_B
3431 #undef DO_CMP_PPZW_H
3432 #undef DO_CMP_PPZW_S
3435 /* Similar, but the second source is immediate. */
3436 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
3437 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
3439 intptr_t opr_sz = simd_oprsz(desc); \
3440 uint32_t flags = PREDTEST_INIT; \
3441 TYPE mm = simd_data(desc); \
3442 intptr_t i = opr_sz; \
3444 uint64_t out = 0, pg; \
3446 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3447 TYPE nn = *(TYPE *)(vn + H(i)); \
3450 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3452 *(uint64_t *)(vd + (i >> 3)) = out; \
3453 flags = iter_predtest_bwd(out, pg, flags); \
3458 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
3459 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3460 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
3461 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3462 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
3463 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3464 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
3465 DO_CMP_PPZI(NAME, TYPE, OP, , 0x0101010101010101ull)
3467 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b
, uint8_t, ==)
3468 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h
, uint16_t, ==)
3469 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s
, uint32_t, ==)
3470 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d
, uint64_t, ==)
3472 DO_CMP_PPZI_B(sve_cmpne_ppzi_b
, uint8_t, !=)
3473 DO_CMP_PPZI_H(sve_cmpne_ppzi_h
, uint16_t, !=)
3474 DO_CMP_PPZI_S(sve_cmpne_ppzi_s
, uint32_t, !=)
3475 DO_CMP_PPZI_D(sve_cmpne_ppzi_d
, uint64_t, !=)
3477 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b
, int8_t, >)
3478 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h
, int16_t, >)
3479 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s
, int32_t, >)
3480 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d
, int64_t, >)
3482 DO_CMP_PPZI_B(sve_cmpge_ppzi_b
, int8_t, >=)
3483 DO_CMP_PPZI_H(sve_cmpge_ppzi_h
, int16_t, >=)
3484 DO_CMP_PPZI_S(sve_cmpge_ppzi_s
, int32_t, >=)
3485 DO_CMP_PPZI_D(sve_cmpge_ppzi_d
, int64_t, >=)
3487 DO_CMP_PPZI_B(sve_cmphi_ppzi_b
, uint8_t, >)
3488 DO_CMP_PPZI_H(sve_cmphi_ppzi_h
, uint16_t, >)
3489 DO_CMP_PPZI_S(sve_cmphi_ppzi_s
, uint32_t, >)
3490 DO_CMP_PPZI_D(sve_cmphi_ppzi_d
, uint64_t, >)
3492 DO_CMP_PPZI_B(sve_cmphs_ppzi_b
, uint8_t, >=)
3493 DO_CMP_PPZI_H(sve_cmphs_ppzi_h
, uint16_t, >=)
3494 DO_CMP_PPZI_S(sve_cmphs_ppzi_s
, uint32_t, >=)
3495 DO_CMP_PPZI_D(sve_cmphs_ppzi_d
, uint64_t, >=)
3497 DO_CMP_PPZI_B(sve_cmplt_ppzi_b
, int8_t, <)
3498 DO_CMP_PPZI_H(sve_cmplt_ppzi_h
, int16_t, <)
3499 DO_CMP_PPZI_S(sve_cmplt_ppzi_s
, int32_t, <)
3500 DO_CMP_PPZI_D(sve_cmplt_ppzi_d
, int64_t, <)
3502 DO_CMP_PPZI_B(sve_cmple_ppzi_b
, int8_t, <=)
3503 DO_CMP_PPZI_H(sve_cmple_ppzi_h
, int16_t, <=)
3504 DO_CMP_PPZI_S(sve_cmple_ppzi_s
, int32_t, <=)
3505 DO_CMP_PPZI_D(sve_cmple_ppzi_d
, int64_t, <=)
3507 DO_CMP_PPZI_B(sve_cmplo_ppzi_b
, uint8_t, <)
3508 DO_CMP_PPZI_H(sve_cmplo_ppzi_h
, uint16_t, <)
3509 DO_CMP_PPZI_S(sve_cmplo_ppzi_s
, uint32_t, <)
3510 DO_CMP_PPZI_D(sve_cmplo_ppzi_d
, uint64_t, <)
3512 DO_CMP_PPZI_B(sve_cmpls_ppzi_b
, uint8_t, <=)
3513 DO_CMP_PPZI_H(sve_cmpls_ppzi_h
, uint16_t, <=)
3514 DO_CMP_PPZI_S(sve_cmpls_ppzi_s
, uint32_t, <=)
3515 DO_CMP_PPZI_D(sve_cmpls_ppzi_d
, uint64_t, <=)
3517 #undef DO_CMP_PPZI_B
3518 #undef DO_CMP_PPZI_H
3519 #undef DO_CMP_PPZI_S
3520 #undef DO_CMP_PPZI_D
3523 /* Similar to the ARM LastActive pseudocode function. */
3524 static bool last_active_pred(void *vd
, void *vg
, intptr_t oprsz
)
3528 for (i
= QEMU_ALIGN_UP(oprsz
, 8) - 8; i
>= 0; i
-= 8) {
3529 uint64_t pg
= *(uint64_t *)(vg
+ i
);
3531 return (pow2floor(pg
) & *(uint64_t *)(vd
+ i
)) != 0;
3537 /* Compute a mask into RETB that is true for all G, up to and including
3538 * (if after) or excluding (if !after) the first G & N.
3539 * Return true if BRK found.
3541 static bool compute_brk(uint64_t *retb
, uint64_t n
, uint64_t g
,
3542 bool brk
, bool after
)
3548 } else if ((g
& n
) == 0) {
3549 /* For all G, no N are set; break not found. */
3552 /* Break somewhere in N. Locate it. */
3553 b
= g
& n
; /* guard true, pred true */
3554 b
= b
& -b
; /* first such */
3556 b
= b
| (b
- 1); /* break after same */
3558 b
= b
- 1; /* break before same */
3567 /* Compute a zeroing BRK. */
3568 static void compute_brk_z(uint64_t *d
, uint64_t *n
, uint64_t *g
,
3569 intptr_t oprsz
, bool after
)
3574 for (i
= 0; i
< DIV_ROUND_UP(oprsz
, 8); ++i
) {
3575 uint64_t this_b
, this_g
= g
[i
];
3577 brk
= compute_brk(&this_b
, n
[i
], this_g
, brk
, after
);
3578 d
[i
] = this_b
& this_g
;
3582 /* Likewise, but also compute flags. */
3583 static uint32_t compute_brks_z(uint64_t *d
, uint64_t *n
, uint64_t *g
,
3584 intptr_t oprsz
, bool after
)
3586 uint32_t flags
= PREDTEST_INIT
;
3590 for (i
= 0; i
< DIV_ROUND_UP(oprsz
, 8); ++i
) {
3591 uint64_t this_b
, this_d
, this_g
= g
[i
];
3593 brk
= compute_brk(&this_b
, n
[i
], this_g
, brk
, after
);
3594 d
[i
] = this_d
= this_b
& this_g
;
3595 flags
= iter_predtest_fwd(this_d
, this_g
, flags
);
3600 /* Compute a merging BRK. */
3601 static void compute_brk_m(uint64_t *d
, uint64_t *n
, uint64_t *g
,
3602 intptr_t oprsz
, bool after
)
3607 for (i
= 0; i
< DIV_ROUND_UP(oprsz
, 8); ++i
) {
3608 uint64_t this_b
, this_g
= g
[i
];
3610 brk
= compute_brk(&this_b
, n
[i
], this_g
, brk
, after
);
3611 d
[i
] = (this_b
& this_g
) | (d
[i
] & ~this_g
);
3615 /* Likewise, but also compute flags. */
3616 static uint32_t compute_brks_m(uint64_t *d
, uint64_t *n
, uint64_t *g
,
3617 intptr_t oprsz
, bool after
)
3619 uint32_t flags
= PREDTEST_INIT
;
3623 for (i
= 0; i
< oprsz
/ 8; ++i
) {
3624 uint64_t this_b
, this_d
= d
[i
], this_g
= g
[i
];
3626 brk
= compute_brk(&this_b
, n
[i
], this_g
, brk
, after
);
3627 d
[i
] = this_d
= (this_b
& this_g
) | (this_d
& ~this_g
);
3628 flags
= iter_predtest_fwd(this_d
, this_g
, flags
);
3633 static uint32_t do_zero(ARMPredicateReg
*d
, intptr_t oprsz
)
3635 /* It is quicker to zero the whole predicate than loop on OPRSZ.
3636 * The compiler should turn this into 4 64-bit integer stores.
3638 memset(d
, 0, sizeof(ARMPredicateReg
));
3639 return PREDTEST_INIT
;
3642 void HELPER(sve_brkpa
)(void *vd
, void *vn
, void *vm
, void *vg
,
3645 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
3646 if (last_active_pred(vn
, vg
, oprsz
)) {
3647 compute_brk_z(vd
, vm
, vg
, oprsz
, true);
3653 uint32_t HELPER(sve_brkpas
)(void *vd
, void *vn
, void *vm
, void *vg
,
3656 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
3657 if (last_active_pred(vn
, vg
, oprsz
)) {
3658 return compute_brks_z(vd
, vm
, vg
, oprsz
, true);
3660 return do_zero(vd
, oprsz
);
3664 void HELPER(sve_brkpb
)(void *vd
, void *vn
, void *vm
, void *vg
,
3667 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
3668 if (last_active_pred(vn
, vg
, oprsz
)) {
3669 compute_brk_z(vd
, vm
, vg
, oprsz
, false);
3675 uint32_t HELPER(sve_brkpbs
)(void *vd
, void *vn
, void *vm
, void *vg
,
3678 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
3679 if (last_active_pred(vn
, vg
, oprsz
)) {
3680 return compute_brks_z(vd
, vm
, vg
, oprsz
, false);
3682 return do_zero(vd
, oprsz
);
void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    compute_brk_z(vd, vn, vg, oprsz, true);
}

uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    return compute_brks_z(vd, vn, vg, oprsz, true);
}

void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    compute_brk_z(vd, vn, vg, oprsz, false);
}

uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    return compute_brks_z(vd, vn, vg, oprsz, false);
}

void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    compute_brk_m(vd, vn, vg, oprsz, true);
}

uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    return compute_brks_m(vd, vn, vg, oprsz, true);
}

void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    compute_brk_m(vd, vn, vg, oprsz, false);
}

uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    return compute_brks_m(vd, vn, vg, oprsz, false);
}
void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    if (!last_active_pred(vn, vg, oprsz)) {
        do_zero(vd, oprsz);
    }
}
/* As if PredTest(Ones(PL), D, esz).  */
static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
                              uint64_t esz_mask)
{
    uint32_t flags = PREDTEST_INIT;
    intptr_t i;

    for (i = 0; i < oprsz / 8; i++) {
        flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
    }
    if (oprsz & 7) {
        uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
        flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
    }
    return flags;
}
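/*
 * E.g. for an 18-byte predicate (oprsz = 18), the loop above consumes two
 * full 64-bit words and the tail step tests the remaining 2 bytes with
 * mask = ~(-1ULL << 16) = 0xffff, so stray bits beyond the vector length
 * cannot influence the NZCV result.
 */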
uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    if (last_active_pred(vn, vg, oprsz)) {
        return predtest_ones(vd, oprsz, -1);
    }
    return do_zero(vd, oprsz);
}
uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
    intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
    intptr_t i;

    for (i = 0; i < words; ++i) {
        uint64_t t = n[i] & g[i] & mask;
        sum += ctpop64(t);
    }
    return sum;
}
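/*
 * Since pred_esz_masks[esz] keeps only the lowest predicate bit of each
 * element (e.g. 0x1111111111111111 for esz == 2), each active element
 * contributes exactly one bit to t, and the popcount directly yields the
 * element count.
 */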
uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    uint64_t esz_mask = pred_esz_masks[esz];
    ARMPredicateReg *d = vd;
    uint32_t flags;
    intptr_t i;

    /* Begin with a zero predicate register.  */
    flags = do_zero(d, oprsz);
    if (count == 0) {
        return flags;
    }

    /* Set all of the requested bits.  */
    for (i = 0; i < count / 64; ++i) {
        d->p[i] = esz_mask;
    }
    if (count & 63) {
        d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
    }

    return predtest_ones(d, oprsz, esz_mask);
}
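/*
 * Note that count, as passed from the translator, is measured in
 * predicate bits, i.e. elements scaled by the element size: for example
 * four active 32-bit elements give count = 16, so
 * p[0] = MAKE_64BIT_MASK(0, 16) & 0x1111111111111111 = 0x1111,
 * one bit per active element.
 */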
uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    uint64_t esz_mask = pred_esz_masks[esz];
    ARMPredicateReg *d = vd;
    intptr_t i, invcount, oprbits;
    uint64_t bits;

    if (count == 0) {
        return do_zero(d, oprsz);
    }

    oprbits = oprsz * 8;
    tcg_debug_assert(count <= oprbits);

    bits = esz_mask;
    if (oprbits & 63) {
        bits &= MAKE_64BIT_MASK(0, oprbits & 63);
    }

    invcount = oprbits - count;
    for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
        d->p[i] = bits;
        bits = esz_mask;
    }
    d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);

    while (--i >= 0) {
        d->p[i] = 0;
    }

    return predtest_ones(d, oprsz, esz_mask);
}
/* Recursive reduction on a function;
 * Cf. the ARM ARM function ReducePredicated.
 *
 * While it would be possible to write this without the DATA temporary,
 * it is much simpler to process the predicate register this way.
 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
 * little to gain with a more complex non-recursive form.
 */
#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT)                             \
static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
{                                                                         \
    if (n == 1) {                                                         \
        return *data;                                                     \
    } else {                                                              \
        uintptr_t half = n / 2;                                           \
        TYPE lo = NAME##_reduce(data, status, half);                      \
        TYPE hi = NAME##_reduce(data + half, status, half);               \
        return TYPE##_##FUNC(lo, hi, status);                             \
    }                                                                     \
}                                                                         \
uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc)        \
{                                                                         \
    uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc);       \
    TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)];                       \
    for (i = 0; i < oprsz; ) {                                            \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                   \
        do {                                                              \
            TYPE nn = *(TYPE *)(vn + H(i));                               \
            *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT);          \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                       \
        } while (i & 15);                                                 \
    }                                                                     \
    for (; i < maxsz; i += sizeof(TYPE)) {                                \
        *(TYPE *)((void *)data + i) = IDENT;                              \
    }                                                                     \
    return NAME##_reduce(data, vs, maxsz / sizeof(TYPE));                 \
}
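/*
 * The expansion copies active elements into DATA, substitutes IDENT for
 * inactive ones, and pads [oprsz, maxsz) with IDENT as well.  Assuming
 * the caller passes a power-of-two maxsz (the translator rounds the
 * vector length up with pow2ceil), the recursion then halves cleanly
 * down to single elements, giving the pairwise tree reduction that
 * ReducePredicated describes.
 */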
DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
DO_REDUCE(sve_faddv_d, float64, , add, float64_zero)

/* Identity is floatN_default_nan, without the function call.  */
DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
DO_REDUCE(sve_fminnmv_d, float64, , minnum, 0x7FF8000000000000ULL)

DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
DO_REDUCE(sve_fmaxnmv_d, float64, , maxnum, 0x7FF8000000000000ULL)

DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
DO_REDUCE(sve_fminv_d, float64, , min, float64_infinity)

DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
DO_REDUCE(sve_fmaxv_d, float64, , max, float64_chs(float64_infinity))

#undef DO_REDUCE
uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
                             void *status, uint32_t desc)
{
    intptr_t i = 0, opr_sz = simd_oprsz(desc);
    float16 result = nn;

    do {
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
        do {
            if (pg & 1) {
                float16 mm = *(float16 *)(vm + H1_2(i));
                result = float16_add(result, mm, status);
            }
            i += sizeof(float16), pg >>= sizeof(float16);
        } while (i & 15);
    } while (i < opr_sz);

    return result;
}
uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
                             void *status, uint32_t desc)
{
    intptr_t i = 0, opr_sz = simd_oprsz(desc);
    float32 result = nn;

    do {
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
        do {
            if (pg & 1) {
                float32 mm = *(float32 *)(vm + H1_2(i));
                result = float32_add(result, mm, status);
            }
            i += sizeof(float32), pg >>= sizeof(float32);
        } while (i & 15);
    } while (i < opr_sz);

    return result;
}
uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
                             void *status, uint32_t desc)
{
    intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *m = vm;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i++) {
        if (pg[H1(i)] & 1) {
            nn = float64_add(nn, m[i], status);
        }
    }

    return nn;
}
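/*
 * Unlike the tree reductions above, FADDA is architecturally a strictly
 * ordered accumulation: the result is (((nn + m[0]) + m[1]) + ...), and
 * because FP addition is not associative, the elements cannot be summed
 * in any other order.
 */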
/* Fully general three-operand expander, controlled by a predicate,
 * with the extra float_status parameter.
 */
#define DO_ZPZZ_FP(NAME, TYPE, H, OP)                           \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,       \
                  void *status, uint32_t desc)                  \
{                                                               \
    intptr_t i = simd_oprsz(desc);                              \
    uint64_t *g = vg;                                           \
    do {                                                        \
        uint64_t pg = g[(i - 1) >> 6];                          \
        do {                                                    \
            i -= sizeof(TYPE);                                  \
            if (likely((pg >> (i & 63)) & 1)) {                 \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                TYPE mm = *(TYPE *)(vm + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn, mm, status);      \
            }                                                   \
        } while (i & 63);                                       \
    } while (i != 0);                                           \
}
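/*
 * For example, DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add) below
 * expands to a helper that walks the vector from the top down, one
 * 64-bit predicate word at a time, applying float16_add only to elements
 * whose controlling predicate bit is set and leaving the other lanes of
 * vd untouched (merging predication).
 */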
DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
DO_ZPZZ_FP(sve_fadd_d, uint64_t, , float64_add)

DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
DO_ZPZZ_FP(sve_fsub_d, uint64_t, , float64_sub)

DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
DO_ZPZZ_FP(sve_fmul_d, uint64_t, , float64_mul)

DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
DO_ZPZZ_FP(sve_fdiv_d, uint64_t, , float64_div)

DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
DO_ZPZZ_FP(sve_fmin_d, uint64_t, , float64_min)

DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
DO_ZPZZ_FP(sve_fmax_d, uint64_t, , float64_max)

DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
DO_ZPZZ_FP(sve_fminnum_d, uint64_t, , float64_minnum)

DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t, , float64_maxnum)

static inline float16 abd_h(float16 a, float16 b, float_status *s)
{
    return float16_abs(float16_sub(a, b, s));
}

static inline float32 abd_s(float32 a, float32 b, float_status *s)
{
    return float32_abs(float32_sub(a, b, s));
}

static inline float64 abd_d(float64 a, float64 b, float_status *s)
{
    return float64_abs(float64_sub(a, b, s));
}

DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
DO_ZPZZ_FP(sve_fabd_d, uint64_t, , abd_d)
static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
{
    int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
    return float64_scalbn(a, b_int, s);
}

DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
DO_ZPZZ_FP(sve_fscalbn_d, int64_t, , scalbn_d)
DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
DO_ZPZZ_FP(sve_fmulx_d, uint64_t, , helper_vfp_mulxd)

#undef DO_ZPZZ_FP
/* Three-operand expander, with one scalar operand, controlled by
 * a predicate, with the extra float_status parameter.
 */
#define DO_ZPZS_FP(NAME, TYPE, H, OP)                             \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar,  \
                  void *status, uint32_t desc)                    \
{                                                                 \
    intptr_t i = simd_oprsz(desc);                                \
    uint64_t *g = vg;                                             \
    TYPE mm = scalar;                                             \
    do {                                                          \
        uint64_t pg = g[(i - 1) >> 6];                            \
        do {                                                      \
            i -= sizeof(TYPE);                                    \
            if (likely((pg >> (i & 63)) & 1)) {                   \
                TYPE nn = *(TYPE *)(vn + H(i));                   \
                *(TYPE *)(vd + H(i)) = OP(nn, mm, status);        \
            }                                                     \
        } while (i & 63);                                         \
    } while (i != 0);                                             \
}
DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
DO_ZPZS_FP(sve_fadds_d, float64, , float64_add)

DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
DO_ZPZS_FP(sve_fsubs_d, float64, , float64_sub)

DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
DO_ZPZS_FP(sve_fmuls_d, float64, , float64_mul)

static inline float16 subr_h(float16 a, float16 b, float_status *s)
{
    return float16_sub(b, a, s);
}

static inline float32 subr_s(float32 a, float32 b, float_status *s)
{
    return float32_sub(b, a, s);
}

static inline float64 subr_d(float64 a, float64 b, float_status *s)
{
    return float64_sub(b, a, s);
}

DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
DO_ZPZS_FP(sve_fsubrs_d, float64, , subr_d)

DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
DO_ZPZS_FP(sve_fmaxnms_d, float64, , float64_maxnum)

DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
DO_ZPZS_FP(sve_fminnms_d, float64, , float64_minnum)

DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
DO_ZPZS_FP(sve_fmaxs_d, float64, , float64_max)

DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
DO_ZPZS_FP(sve_fmins_d, float64, , float64_min)
/* Fully general two-operand expander, controlled by a predicate,
 * with the extra float_status parameter.
 */
#define DO_ZPZ_FP(NAME, TYPE, H, OP)                                          \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
{                                                                             \
    intptr_t i = simd_oprsz(desc);                                            \
    uint64_t *g = vg;                                                         \
    do {                                                                      \
        uint64_t pg = g[(i - 1) >> 6];                                        \
        do {                                                                  \
            i -= sizeof(TYPE);                                                \
            if (likely((pg >> (i & 63)) & 1)) {                               \
                TYPE nn = *(TYPE *)(vn + H(i));                               \
                *(TYPE *)(vd + H(i)) = OP(nn, status);                        \
            }                                                                 \
        } while (i & 63);                                                     \
    } while (i != 0);                                                         \
}
/* SVE fp16 conversions always use IEEE mode.  Like AdvSIMD, they ignore
 * FZ16.  When converting from fp16, this affects flushing input denormals;
 * when converting to fp16, this affects flushing output denormals.
 */
static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
{
    bool save = get_flush_inputs_to_zero(fpst);
    float32 ret;

    set_flush_inputs_to_zero(false, fpst);
    ret = float16_to_float32(f, true, fpst);
    set_flush_inputs_to_zero(save, fpst);
    return ret;
}

static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
{
    bool save = get_flush_inputs_to_zero(fpst);
    float64 ret;

    set_flush_inputs_to_zero(false, fpst);
    ret = float16_to_float64(f, true, fpst);
    set_flush_inputs_to_zero(save, fpst);
    return ret;
}

static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
{
    bool save = get_flush_to_zero(fpst);
    float16 ret;

    set_flush_to_zero(false, fpst);
    ret = float32_to_float16(f, true, fpst);
    set_flush_to_zero(save, fpst);
    return ret;
}

static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
{
    bool save = get_flush_to_zero(fpst);
    float16 ret;

    set_flush_to_zero(false, fpst);
    ret = float64_to_float16(f, true, fpst);
    set_flush_to_zero(save, fpst);
    return ret;
}
static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
{
    if (float16_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float16_to_int16_round_to_zero(f, s);
}

static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
{
    if (float16_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float16_to_int64_round_to_zero(f, s);
}

static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
{
    if (float32_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float32_to_int64_round_to_zero(f, s);
}

static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
{
    if (float64_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float64_to_int64_round_to_zero(f, s);
}

static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
{
    if (float16_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float16_to_uint16_round_to_zero(f, s);
}

static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
{
    if (float16_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float16_to_uint64_round_to_zero(f, s);
}

static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
{
    if (float32_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float32_to_uint64_round_to_zero(f, s);
}

static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
{
    if (float64_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float64_to_uint64_round_to_zero(f, s);
}
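/*
 * These wrappers exist because the generic round-to-zero conversions
 * return a saturated integer bound for a NaN input, while the ARM
 * pseudocode requires NaN to convert to zero with Invalid Operation
 * raised; the same-width vfp helpers (e.g. helper_vfp_tosizs) used
 * below already implement that behaviour.
 */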
DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
DO_ZPZ_FP(sve_fcvt_dh, uint64_t, , sve_f64_to_f16)
DO_ZPZ_FP(sve_fcvt_hd, uint64_t, , sve_f16_to_f64)
DO_ZPZ_FP(sve_fcvt_ds, uint64_t, , float64_to_float32)
DO_ZPZ_FP(sve_fcvt_sd, uint64_t, , float32_to_float64)

DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t, , vfp_float16_to_int64_rtz)
DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t, , vfp_float32_to_int64_rtz)
DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t, , helper_vfp_tosizd)
DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t, , vfp_float64_to_int64_rtz)

DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t, , vfp_float16_to_uint64_rtz)
DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t, , vfp_float32_to_uint64_rtz)
DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t, , helper_vfp_touizd)
DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t, , vfp_float64_to_uint64_rtz)

DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
DO_ZPZ_FP(sve_frint_d, uint64_t, , helper_rintd)

DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
DO_ZPZ_FP(sve_frintx_d, uint64_t, , float64_round_to_int)

DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
DO_ZPZ_FP(sve_frecpx_d, uint64_t, , helper_frecpx_f64)

DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
DO_ZPZ_FP(sve_fsqrt_d, uint64_t, , float64_sqrt)

DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
DO_ZPZ_FP(sve_scvt_sd, uint64_t, , int32_to_float64)
DO_ZPZ_FP(sve_scvt_dh, uint64_t, , int64_to_float16)
DO_ZPZ_FP(sve_scvt_ds, uint64_t, , int64_to_float32)
DO_ZPZ_FP(sve_scvt_dd, uint64_t, , int64_to_float64)

DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
DO_ZPZ_FP(sve_ucvt_sd, uint64_t, , uint32_to_float64)
DO_ZPZ_FP(sve_ucvt_dh, uint64_t, , uint64_to_float16)
DO_ZPZ_FP(sve_ucvt_ds, uint64_t, , uint64_to_float32)
DO_ZPZ_FP(sve_ucvt_dd, uint64_t, , uint64_to_float64)
static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
                            float_status *status, uint32_t desc,
                            uint16_t neg1, uint16_t neg3)
{
    intptr_t i = simd_oprsz(desc);
    uint64_t *g = vg;

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            i -= 2;
            if (likely((pg >> (i & 63)) & 1)) {
                float16 e1, e2, e3, r;

                e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
                e2 = *(uint16_t *)(vm + H1_2(i));
                e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
                r = float16_muladd(e1, e2, e3, 0, status);
                *(uint16_t *)(vd + H1_2(i)) = r;
            }
        } while (i & 63);
    } while (i != 0);
}
void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
}

void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
}

void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
}

void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
}
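/*
 * The (neg1, neg3) constants flip the sign bit (0x8000 for fp16) of the
 * first multiplicand and/or the addend before the fused multiply-add:
 *
 *   fmla:  d =  n * m + a    (0,      0)
 *   fmls:  d = -n * m + a    (0x8000, 0)
 *   fnmla: d = -n * m - a    (0x8000, 0x8000)
 *   fnmls: d =  n * m - a    (0,      0x8000)
 *
 * XOR on the raw bits is a pure sign flip, valid even for NaN inputs,
 * matching the pseudocode's FPNeg before FPMulAdd.
 */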
static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
                            float_status *status, uint32_t desc,
                            uint32_t neg1, uint32_t neg3)
{
    intptr_t i = simd_oprsz(desc);
    uint64_t *g = vg;

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            i -= 4;
            if (likely((pg >> (i & 63)) & 1)) {
                float32 e1, e2, e3, r;

                e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
                e2 = *(uint32_t *)(vm + H1_4(i));
                e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
                r = float32_muladd(e1, e2, e3, 0, status);
                *(uint32_t *)(vd + H1_4(i)) = r;
            }
        } while (i & 63);
    } while (i != 0);
}
void HELPER(sve_fmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0);
}

void HELPER(sve_fmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0);
}

void HELPER(sve_fnmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0x80000000, 0x80000000);
}

void HELPER(sve_fnmls_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_s(vd, vn, vm, va, vg, status, desc, 0, 0x80000000);
}
static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
                            float_status *status, uint32_t desc,
                            uint64_t neg1, uint64_t neg3)
{
    intptr_t i = simd_oprsz(desc);
    uint64_t *g = vg;

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            i -= 8;
            if (likely((pg >> (i & 63)) & 1)) {
                float64 e1, e2, e3, r;

                e1 = *(uint64_t *)(vn + i) ^ neg1;
                e2 = *(uint64_t *)(vm + i);
                e3 = *(uint64_t *)(va + i) ^ neg3;
                r = float64_muladd(e1, e2, e3, 0, status);
                *(uint64_t *)(vd + i) = r;
            }
        } while (i & 63);
    } while (i != 0);
}
void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
}

void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
}

void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
}

void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
}
/* Two operand floating-point comparison controlled by a predicate.
 * Unlike the integer version, we are not allowed to optimistically
 * compare operands, since the comparison may have side effects wrt
 * the FPSR.
 */
#define DO_FPCMP_PPZZ(NAME, TYPE, H, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
                  void *status, uint32_t desc)                          \
{                                                                       \
    intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;                    \
    uint64_t *d = vd, *g = vg;                                          \
    do {                                                                \
        uint64_t out = 0, pg = g[j];                                    \
        do {                                                            \
            i -= sizeof(TYPE), out <<= sizeof(TYPE);                    \
            if (likely((pg >> (i & 63)) & 1)) {                         \
                TYPE nn = *(TYPE *)(vn + H(i));                         \
                TYPE mm = *(TYPE *)(vm + H(i));                         \
                out |= OP(TYPE, nn, mm, status);                        \
            }                                                           \
        } while (i & 63);                                               \
        d[j--] = out;                                                   \
    } while (i > 0);                                                    \
}
#define DO_FPCMP_PPZZ_H(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
#define DO_FPCMP_PPZZ_S(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
#define DO_FPCMP_PPZZ_D(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_d, float64, , OP)

#define DO_FPCMP_PPZZ_ALL(NAME, OP) \
    DO_FPCMP_PPZZ_H(NAME, OP)   \
    DO_FPCMP_PPZZ_S(NAME, OP)   \
    DO_FPCMP_PPZZ_D(NAME, OP)

#define DO_FCMGE(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) <= 0
#define DO_FCMGT(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) < 0
#define DO_FCMLE(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) <= 0
#define DO_FCMLT(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) < 0
#define DO_FCMEQ(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) == 0
#define DO_FCMNE(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) != 0
#define DO_FCMUO(TYPE, X, Y, ST)  \
    TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
#define DO_FACGE(TYPE, X, Y, ST)  \
    TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
#define DO_FACGT(TYPE, X, Y, ST)  \
    TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)

#undef DO_FPCMP_PPZZ_ALL
#undef DO_FPCMP_PPZZ_D
#undef DO_FPCMP_PPZZ_S
#undef DO_FPCMP_PPZZ_H
#undef DO_FPCMP_PPZZ
/* One operand floating-point comparison against zero, controlled
 * by a predicate.
 */
#define DO_FPCMP_PPZ0(NAME, TYPE, H, OP)                   \
void HELPER(NAME)(void *vd, void *vn, void *vg,            \
                  void *status, uint32_t desc)             \
{                                                          \
    intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;       \
    uint64_t *d = vd, *g = vg;                             \
    do {                                                   \
        uint64_t out = 0, pg = g[j];                       \
        do {                                               \
            i -= sizeof(TYPE), out <<= sizeof(TYPE);       \
            if ((pg >> (i & 63)) & 1) {                    \
                TYPE nn = *(TYPE *)(vn + H(i));            \
                out |= OP(TYPE, nn, 0, status);            \
            }                                              \
        } while (i & 63);                                  \
        d[j--] = out;                                      \
    } while (i > 0);                                       \
}

#define DO_FPCMP_PPZ0_H(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
#define DO_FPCMP_PPZ0_S(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
#define DO_FPCMP_PPZ0_D(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_d, float64, , OP)

#define DO_FPCMP_PPZ0_ALL(NAME, OP) \
    DO_FPCMP_PPZ0_H(NAME, OP)   \
    DO_FPCMP_PPZ0_S(NAME, OP)   \
    DO_FPCMP_PPZ0_D(NAME, OP)
DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
/* FP Trig Multiply-Add. */

void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
{
    static const float16 coeff[16] = {
        0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
        0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
    intptr_t x = simd_data(desc);
    float16 *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i++) {
        float16 mm = m[i];
        intptr_t xx = x;
        if (float16_is_neg(mm)) {
            mm = float16_abs(mm);
            xx += 8;
        }
        d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
    }
}
void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
{
    static const float32 coeff[16] = {
        0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
        0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
        0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
        0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
    intptr_t x = simd_data(desc);
    float32 *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i++) {
        float32 mm = m[i];
        intptr_t xx = x;
        if (float32_is_neg(mm)) {
            mm = float32_abs(mm);
            xx += 8;
        }
        d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
    }
}
void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
{
    static const float64 coeff[16] = {
        0x3ff0000000000000ull, 0xbfc5555555555543ull,
        0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
        0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
        0x3de5d8408868552full, 0x0000000000000000ull,
        0x3ff0000000000000ull, 0xbfe0000000000000ull,
        0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
        0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
        0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
    intptr_t x = simd_data(desc);
    float64 *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i++) {
        float64 mm = m[i];
        intptr_t xx = x;
        if (float64_is_neg(mm)) {
            mm = float64_abs(mm);
            xx += 8;
        }
        d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
    }
}
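/*
 * The coefficient tables above match the FPTrigMAdd coefficients in the
 * ARM pseudocode: the first eight entries of each table are the
 * Taylor-series terms for sine (1, -1/6, 1/120, ...) and the second
 * eight those for cosine (1, -1/2, 1/24, ...); a negative multiplicand
 * selects the second half via xx += 8.
 */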
void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
                         void *vs, uint32_t desc)
{
    intptr_t j, i = simd_oprsz(desc);
    uint64_t *g = vg;
    float16 neg_imag = float16_set_sign(0, simd_data(desc));
    float16 neg_real = float16_chs(neg_imag);

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            float16 e0, e1, e2, e3;

            /* I holds the real index; J holds the imag index.  */
            j = i - sizeof(float16);
            i -= 2 * sizeof(float16);

            e0 = *(float16 *)(vn + H1_2(i));
            e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
            e2 = *(float16 *)(vn + H1_2(j));
            e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;

            if (likely((pg >> (i & 63)) & 1)) {
                *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
            }
            if (likely((pg >> (j & 63)) & 1)) {
                *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
            }
        } while (i & 63);
    } while (i != 0);
}
void HELPER(sve_fcadd_s)(void *vd, void *vn, void *vm, void *vg,
                         void *vs, uint32_t desc)
{
    intptr_t j, i = simd_oprsz(desc);
    uint64_t *g = vg;
    float32 neg_imag = float32_set_sign(0, simd_data(desc));
    float32 neg_real = float32_chs(neg_imag);

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            float32 e0, e1, e2, e3;

            /* I holds the real index; J holds the imag index.  */
            j = i - sizeof(float32);
            i -= 2 * sizeof(float32);

            e0 = *(float32 *)(vn + H1_2(i));
            e1 = *(float32 *)(vm + H1_2(j)) ^ neg_real;
            e2 = *(float32 *)(vn + H1_2(j));
            e3 = *(float32 *)(vm + H1_2(i)) ^ neg_imag;

            if (likely((pg >> (i & 63)) & 1)) {
                *(float32 *)(vd + H1_2(i)) = float32_add(e0, e1, vs);
            }
            if (likely((pg >> (j & 63)) & 1)) {
                *(float32 *)(vd + H1_2(j)) = float32_add(e2, e3, vs);
            }
        } while (i & 63);
    } while (i != 0);
}
void HELPER(sve_fcadd_d)(void *vd, void *vn, void *vm, void *vg,
                         void *vs, uint32_t desc)
{
    intptr_t j, i = simd_oprsz(desc);
    uint64_t *g = vg;
    float64 neg_imag = float64_set_sign(0, simd_data(desc));
    float64 neg_real = float64_chs(neg_imag);

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            float64 e0, e1, e2, e3;

            /* I holds the real index; J holds the imag index.  */
            j = i - sizeof(float64);
            i -= 2 * sizeof(float64);

            e0 = *(float64 *)(vn + H1_2(i));
            e1 = *(float64 *)(vm + H1_2(j)) ^ neg_real;
            e2 = *(float64 *)(vn + H1_2(j));
            e3 = *(float64 *)(vm + H1_2(i)) ^ neg_imag;

            if (likely((pg >> (i & 63)) & 1)) {
                *(float64 *)(vd + H1_2(i)) = float64_add(e0, e1, vs);
            }
            if (likely((pg >> (j & 63)) & 1)) {
                *(float64 *)(vd + H1_2(j)) = float64_add(e2, e3, vs);
            }
        } while (i & 63);
    } while (i != 0);
}
/*
 * FP Complex Multiply
 */

void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    intptr_t j, i = simd_oprsz(desc);
    unsigned rot = simd_data(desc);
    bool flip = rot & 1;
    float16 neg_imag, neg_real;
    uint64_t *g = vg;

    neg_imag = float16_set_sign(0, (rot & 2) != 0);
    neg_real = float16_set_sign(0, rot == 1 || rot == 2);

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            float16 e1, e2, e3, e4, nr, ni, mr, mi, d;

            /* I holds the real index; J holds the imag index.  */
            j = i - sizeof(float16);
            i -= 2 * sizeof(float16);

            nr = *(float16 *)(vn + H1_2(i));
            ni = *(float16 *)(vn + H1_2(j));
            mr = *(float16 *)(vm + H1_2(i));
            mi = *(float16 *)(vm + H1_2(j));

            e2 = (flip ? ni : nr);
            e1 = (flip ? mi : mr) ^ neg_real;
            e4 = e2;
            e3 = (flip ? mr : mi) ^ neg_imag;

            if (likely((pg >> (i & 63)) & 1)) {
                d = *(float16 *)(va + H1_2(i));
                d = float16_muladd(e2, e1, d, 0, status);
                *(float16 *)(vd + H1_2(i)) = d;
            }
            if (likely((pg >> (j & 63)) & 1)) {
                d = *(float16 *)(va + H1_2(j));
                d = float16_muladd(e4, e3, d, 0, status);
                *(float16 *)(vd + H1_2(j)) = d;
            }
        } while (i & 63);
    } while (i != 0);
}
void HELPER(sve_fcmla_zpzzz_s)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    intptr_t j, i = simd_oprsz(desc);
    unsigned rot = simd_data(desc);
    bool flip = rot & 1;
    float32 neg_imag, neg_real;
    uint64_t *g = vg;

    neg_imag = float32_set_sign(0, (rot & 2) != 0);
    neg_real = float32_set_sign(0, rot == 1 || rot == 2);

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            float32 e1, e2, e3, e4, nr, ni, mr, mi, d;

            /* I holds the real index; J holds the imag index.  */
            j = i - sizeof(float32);
            i -= 2 * sizeof(float32);

            nr = *(float32 *)(vn + H1_2(i));
            ni = *(float32 *)(vn + H1_2(j));
            mr = *(float32 *)(vm + H1_2(i));
            mi = *(float32 *)(vm + H1_2(j));

            e2 = (flip ? ni : nr);
            e1 = (flip ? mi : mr) ^ neg_real;
            e4 = e2;
            e3 = (flip ? mr : mi) ^ neg_imag;

            if (likely((pg >> (i & 63)) & 1)) {
                d = *(float32 *)(va + H1_2(i));
                d = float32_muladd(e2, e1, d, 0, status);
                *(float32 *)(vd + H1_2(i)) = d;
            }
            if (likely((pg >> (j & 63)) & 1)) {
                d = *(float32 *)(va + H1_2(j));
                d = float32_muladd(e4, e3, d, 0, status);
                *(float32 *)(vd + H1_2(j)) = d;
            }
        } while (i & 63);
    } while (i != 0);
}
void HELPER(sve_fcmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    intptr_t j, i = simd_oprsz(desc);
    unsigned rot = simd_data(desc);
    bool flip = rot & 1;
    float64 neg_imag, neg_real;
    uint64_t *g = vg;

    neg_imag = float64_set_sign(0, (rot & 2) != 0);
    neg_real = float64_set_sign(0, rot == 1 || rot == 2);

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            float64 e1, e2, e3, e4, nr, ni, mr, mi, d;

            /* I holds the real index; J holds the imag index.  */
            j = i - sizeof(float64);
            i -= 2 * sizeof(float64);

            nr = *(float64 *)(vn + H1_2(i));
            ni = *(float64 *)(vn + H1_2(j));
            mr = *(float64 *)(vm + H1_2(i));
            mi = *(float64 *)(vm + H1_2(j));

            e2 = (flip ? ni : nr);
            e1 = (flip ? mi : mr) ^ neg_real;
            e4 = e2;
            e3 = (flip ? mr : mi) ^ neg_imag;

            if (likely((pg >> (i & 63)) & 1)) {
                d = *(float64 *)(va + H1_2(i));
                d = float64_muladd(e2, e1, d, 0, status);
                *(float64 *)(vd + H1_2(i)) = d;
            }
            if (likely((pg >> (j & 63)) & 1)) {
                d = *(float64 *)(va + H1_2(j));
                d = float64_muladd(e4, e3, d, 0, status);
                *(float64 *)(vd + H1_2(j)) = d;
            }
        } while (i & 63);
    } while (i != 0);
}
/*
 * Load contiguous data, protected by a governing predicate.
 */

/*
 * Load one element into @vd + @reg_off from @host.
 * The controlling predicate is known to be true.
 */
typedef void sve_ldst1_host_fn(void *vd, intptr_t reg_off, void *host);

/*
 * Load one element into @vd + @reg_off from (@env, @vaddr, @ra).
 * The controlling predicate is known to be true.
 */
typedef void sve_ldst1_tlb_fn(CPUARMState *env, void *vd, intptr_t reg_off,
                              target_ulong vaddr, uintptr_t retaddr);
/*
 * Generate the above primitives.
 */

#define DO_LD_HOST(NAME, H, TYPEE, TYPEM, HOST)                        \
static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
{                                                                      \
    TYPEM val = HOST(host);                                            \
    *(TYPEE *)(vd + H(reg_off)) = val;                                 \
}

#define DO_ST_HOST(NAME, H, TYPEE, TYPEM, HOST)                        \
static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
{ HOST(host, (TYPEM)*(TYPEE *)(vd + H(reg_off))); }

#define DO_LD_TLB(NAME, H, TYPEE, TYPEM, TLB)                               \
static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
                             target_ulong addr, uintptr_t ra)               \
{                                                                           \
    *(TYPEE *)(vd + H(reg_off)) =                                           \
        (TYPEM)TLB(env, useronly_clean_ptr(addr), ra);                      \
}

#define DO_ST_TLB(NAME, H, TYPEE, TYPEM, TLB)                               \
static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
                             target_ulong addr, uintptr_t ra)               \
{                                                                           \
    TLB(env, useronly_clean_ptr(addr),                                      \
        (TYPEM)*(TYPEE *)(vd + H(reg_off)), ra);                            \
}

#define DO_LD_PRIM_1(NAME, H, TE, TM)                   \
    DO_LD_HOST(NAME, H, TE, TM, ldub_p)                 \
    DO_LD_TLB(NAME, H, TE, TM, cpu_ldub_data_ra)
DO_LD_PRIM_1(ld1bb,  H1,   uint8_t,  uint8_t)
DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t)
DO_LD_PRIM_1(ld1bhs, H1_2, uint16_t,  int8_t)
DO_LD_PRIM_1(ld1bsu, H1_4, uint32_t, uint8_t)
DO_LD_PRIM_1(ld1bss, H1_4, uint32_t,  int8_t)
DO_LD_PRIM_1(ld1bdu,     , uint64_t, uint8_t)
DO_LD_PRIM_1(ld1bds,     , uint64_t,  int8_t)

#define DO_ST_PRIM_1(NAME, H, TE, TM)                   \
    DO_ST_HOST(st1##NAME, H, TE, TM, stb_p)             \
    DO_ST_TLB(st1##NAME, H, TE, TM, cpu_stb_data_ra)

DO_ST_PRIM_1(bb,   H1,  uint8_t, uint8_t)
DO_ST_PRIM_1(bh, H1_2, uint16_t, uint8_t)
DO_ST_PRIM_1(bs, H1_4, uint32_t, uint8_t)
DO_ST_PRIM_1(bd,     , uint64_t, uint8_t)

#define DO_LD_PRIM_2(NAME, H, TE, TM, LD)                       \
    DO_LD_HOST(ld1##NAME##_be, H, TE, TM, LD##_be_p)            \
    DO_LD_HOST(ld1##NAME##_le, H, TE, TM, LD##_le_p)            \
    DO_LD_TLB(ld1##NAME##_be, H, TE, TM, cpu_##LD##_be_data_ra) \
    DO_LD_TLB(ld1##NAME##_le, H, TE, TM, cpu_##LD##_le_data_ra)

#define DO_ST_PRIM_2(NAME, H, TE, TM, ST)                       \
    DO_ST_HOST(st1##NAME##_be, H, TE, TM, ST##_be_p)            \
    DO_ST_HOST(st1##NAME##_le, H, TE, TM, ST##_le_p)            \
    DO_ST_TLB(st1##NAME##_be, H, TE, TM, cpu_##ST##_be_data_ra) \
    DO_ST_TLB(st1##NAME##_le, H, TE, TM, cpu_##ST##_le_data_ra)

DO_LD_PRIM_2(hh,  H1_2, uint16_t, uint16_t, lduw)
DO_LD_PRIM_2(hsu, H1_4, uint32_t, uint16_t, lduw)
DO_LD_PRIM_2(hss, H1_4, uint32_t,  int16_t, lduw)
DO_LD_PRIM_2(hdu,     , uint64_t, uint16_t, lduw)
DO_LD_PRIM_2(hds,     , uint64_t,  int16_t, lduw)

DO_ST_PRIM_2(hh, H1_2, uint16_t, uint16_t, stw)
DO_ST_PRIM_2(hs, H1_4, uint32_t, uint16_t, stw)
DO_ST_PRIM_2(hd,     , uint64_t, uint16_t, stw)

DO_LD_PRIM_2(ss,  H1_4, uint32_t, uint32_t, ldl)
DO_LD_PRIM_2(sdu,     , uint64_t, uint32_t, ldl)
DO_LD_PRIM_2(sds,     , uint64_t,  int32_t, ldl)

DO_ST_PRIM_2(ss, H1_4, uint32_t, uint32_t, stl)
DO_ST_PRIM_2(sd,     , uint64_t, uint32_t, stl)

DO_LD_PRIM_2(dd,     , uint64_t, uint64_t, ldq)
DO_ST_PRIM_2(dd,     , uint64_t, uint64_t, stq)

#undef DO_LD_TLB
#undef DO_ST_TLB
#undef DO_LD_HOST
#undef DO_LD_PRIM_1
#undef DO_ST_PRIM_1
#undef DO_LD_PRIM_2
#undef DO_ST_PRIM_2
/*
 * Skip through a sequence of inactive elements in the guarding predicate @vg,
 * beginning at @reg_off bounded by @reg_max.  Return the offset of the active
 * element >= @reg_off, or @reg_max if there were no active elements at all.
 */
static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
                                 intptr_t reg_max, int esz)
{
    uint64_t pg_mask = pred_esz_masks[esz];
    uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);

    /* In normal usage, the first element is active. */
    if (likely(pg & 1)) {
        return reg_off;
    }

    if (pg == 0) {
        reg_off &= -64;
        do {
            reg_off += 64;
            if (unlikely(reg_off >= reg_max)) {
                /* The entire predicate was false. */
                return reg_max;
            }
            pg = vg[reg_off >> 6] & pg_mask;
        } while (pg == 0);
    }
    reg_off += ctz64(pg);

    /* We should never see an out of range predicate bit set. */
    tcg_debug_assert(reg_off < reg_max);
    return reg_off;
}
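/*
 * The predicate bit for an element lives at the element's byte offset,
 * so with e.g. esz == MO_64 the mask 0x0101010101010101 leaves one bit
 * per element, and ctz64 of the masked word lands directly on the byte
 * offset of the next active element.
 */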
/*
 * Resolve the guest virtual address to info->host and info->flags.
 * If @nofault, return false if the page is invalid, otherwise
 * exit via page fault exception.
 */
typedef struct {
    void *host;
    int flags;
    MemTxAttrs attrs;
} SVEHostPage;

static bool sve_probe_page(SVEHostPage *info, bool nofault,
                           CPUARMState *env, target_ulong addr,
                           int mem_off, MMUAccessType access_type,
                           int mmu_idx, uintptr_t retaddr)
{
    int flags;

    addr += mem_off;

    /*
     * User-only currently always issues with TBI.  See the comment
     * above useronly_clean_ptr.  Usually we clean this top byte away
     * during translation, but we can't do that for e.g. vector + imm
     * addressing modes.
     *
     * We currently always enable TBI for user-only, and do not provide
     * a way to turn it off.  So clean the pointer unconditionally here,
     * rather than look it up here, or pass it down from above.
     */
    addr = useronly_clean_ptr(addr);

    flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault,
                               &info->host, retaddr);
    info->flags = flags;

    if (flags & TLB_INVALID_MASK) {
        g_assert(nofault);
        return false;
    }

    /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
    info->host -= mem_off;

#ifdef CONFIG_USER_ONLY
    memset(&info->attrs, 0, sizeof(info->attrs));
#else
    /*
     * Find the iotlbentry for addr and return the transaction attributes.
     * This *must* be present in the TLB because we just found the mapping.
     */
    {
        uintptr_t index = tlb_index(env, mmu_idx, addr);

# ifdef CONFIG_DEBUG_TCG
        CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
        target_ulong comparator = (access_type == MMU_DATA_LOAD
                                   ? entry->addr_read
                                   : tlb_addr_write(entry));
        g_assert(tlb_hit(comparator, addr));
# endif

        CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
        info->attrs = iotlbentry->attrs;
    }
#endif

    return true;
}
/*
 * Analyse contiguous data, protected by a governing predicate.
 */
typedef enum {
    FAULT_NO,
    FAULT_FIRST,
    FAULT_ALL,
} SVEContFault;

typedef struct {
    /*
     * First and last element wholly contained within the two pages.
     * mem_off_first[0] and reg_off_first[0] are always set >= 0.
     * reg_off_last[0] may be < 0 if the first element crosses pages.
     * All of mem_off_first[1], reg_off_first[1] and reg_off_last[1]
     * are set >= 0 only if there are complete elements on a second page.
     *
     * The reg_off_* offsets are relative to the internal vector register.
     * The mem_off_first offset is relative to the memory address; the
     * two offsets are different when a load operation extends, a store
     * operation truncates, or for multi-register operations.
     */
    int16_t mem_off_first[2];
    int16_t reg_off_first[2];
    int16_t reg_off_last[2];

    /*
     * One element that is misaligned and spans both pages,
     * or -1 if there is no such active element.
     */
    int16_t mem_off_split;
    int16_t reg_off_split;

    /*
     * The byte offset at which the entire operation crosses a page boundary.
     * Set >= 0 if and only if the entire operation spans two pages.
     */
    int16_t page_split;

    /* TLB data for the two pages. */
    SVEHostPage page[2];
} SVEContLdSt;
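/*
 * For illustration, a fully-active 64-byte load of 8-byte elements whose
 * page boundary falls at addr + 40 is described as:
 *
 *   mem_off_first[0] = 0    reg_off_first[0] = 0    reg_off_last[0] = 32
 *   page_split = 40         mem/reg_off_split = -1  (no misaligned element)
 *   mem_off_first[1] = 40   reg_off_first[1] = 40   reg_off_last[1] = 56
 */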
/*
 * Find first active element on each page, and a loose bound for the
 * final element on each page.  Identify any single element that spans
 * the page boundary.  Return true if there are any active elements.
 */
static bool sve_cont_ldst_elements(SVEContLdSt *info, target_ulong addr,
                                   uint64_t *vg, intptr_t reg_max,
                                   int esz, int msize)
{
    const int esize = 1 << esz;
    const uint64_t pg_mask = pred_esz_masks[esz];
    intptr_t reg_off_first = -1, reg_off_last = -1, reg_off_split;
    intptr_t mem_off_last, mem_off_split;
    intptr_t page_split, elt_split;
    intptr_t i;

    /* Set all of the element indices to -1, and the TLB data to 0. */
    memset(info, -1, offsetof(SVEContLdSt, page));
    memset(info->page, 0, sizeof(info->page));

    /* Gross scan over the entire predicate to find bounds. */
    i = 0;
    do {
        uint64_t pg = vg[i] & pg_mask;
        if (pg) {
            reg_off_last = i * 64 + 63 - clz64(pg);
            if (reg_off_first < 0) {
                reg_off_first = i * 64 + ctz64(pg);
            }
        }
    } while (++i * 64 < reg_max);

    if (unlikely(reg_off_first < 0)) {
        /* No active elements, no pages touched. */
        return false;
    }
    tcg_debug_assert(reg_off_last >= 0 && reg_off_last < reg_max);

    info->reg_off_first[0] = reg_off_first;
    info->mem_off_first[0] = (reg_off_first >> esz) * msize;
    mem_off_last = (reg_off_last >> esz) * msize;

    page_split = -(addr | TARGET_PAGE_MASK);
    if (likely(mem_off_last + msize <= page_split)) {
        /* The entire operation fits within a single page. */
        info->reg_off_last[0] = reg_off_last;
        return true;
    }

    info->page_split = page_split;
    elt_split = page_split / msize;
    reg_off_split = elt_split << esz;
    mem_off_split = elt_split * msize;

    /*
     * This is the last full element on the first page, but it is not
     * necessarily active.  If there is no full element, i.e. the first
     * active element is the one that's split, this value remains -1.
     * It is useful as iteration bounds.
     */
    if (elt_split != 0) {
        info->reg_off_last[0] = reg_off_split - esize;
    }

    /* Determine if an unaligned element spans the pages. */
    if (page_split % msize != 0) {
        /* It is helpful to know if the split element is active. */
        if ((vg[reg_off_split >> 6] >> (reg_off_split & 63)) & 1) {
            info->reg_off_split = reg_off_split;
            info->mem_off_split = mem_off_split;

            if (reg_off_split == reg_off_last) {
                /* The page crossing element is last. */
                return true;
            }
        }
        reg_off_split += esize;
        mem_off_split += msize;
    }

    /*
     * We do want the first active element on the second page, because
     * this may affect the address reported in an exception.
     */
    reg_off_split = find_next_active(vg, reg_off_split, reg_max, esz);
    tcg_debug_assert(reg_off_split <= reg_off_last);
    info->reg_off_first[1] = reg_off_split;
    info->mem_off_first[1] = (reg_off_split >> esz) * msize;
    info->reg_off_last[1] = reg_off_last;
    return true;
}
/*
 * Resolve the guest virtual addresses to info->page[].
 * Control the generation of page faults with @fault.  Return false if
 * there is no work to do, which can only happen with @fault == FAULT_NO.
 */
static bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
                                CPUARMState *env, target_ulong addr,
                                MMUAccessType access_type, uintptr_t retaddr)
{
    int mmu_idx = cpu_mmu_index(env, false);
    int mem_off = info->mem_off_first[0];
    bool nofault = fault == FAULT_NO;
    bool have_work = true;

    if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
                        access_type, mmu_idx, retaddr)) {
        /* No work to be done. */
        return false;
    }

    if (likely(info->page_split < 0)) {
        /* The entire operation was on the one page. */
        return true;
    }

    /*
     * If the second page is invalid, then we want the fault address to be
     * the first byte on that page which is accessed.
     */
    if (info->mem_off_split >= 0) {
        /*
         * There is an element split across the pages.  The fault address
         * should be the first byte of the second page.
         */
        mem_off = info->page_split;
        /*
         * If the split element is also the first active element
         * of the vector, then:  For first-fault we should continue
         * to generate faults for the second page.  For no-fault,
         * we have work only if the second page is valid.
         */
        if (info->mem_off_first[0] < info->mem_off_split) {
            nofault = FAULT_FIRST;
            have_work = false;
        }
    } else {
        /*
         * There is no element split across the pages.  The fault address
         * should be the first active element on the second page.
         */
        mem_off = info->mem_off_first[1];
        /*
         * There must have been one active element on the first page,
         * so we're out of first-fault territory.
         */
        nofault = fault != FAULT_ALL;
    }

    have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
                                access_type, mmu_idx, retaddr);
    return have_work;
}
static void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
                                      uint64_t *vg, target_ulong addr,
                                      int esize, int msize, int wp_access,
                                      uintptr_t retaddr)
{
#ifndef CONFIG_USER_ONLY
    intptr_t mem_off, reg_off, reg_last;
    int flags0 = info->page[0].flags;
    int flags1 = info->page[1].flags;

    if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
        return;
    }

    /* Indicate that watchpoints are handled. */
    info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
    info->page[1].flags = flags1 & ~TLB_WATCHPOINT;

    if (flags0 & TLB_WATCHPOINT) {
        mem_off = info->mem_off_first[0];
        reg_off = info->reg_off_first[0];
        reg_last = info->reg_off_last[0];

        while (reg_off <= reg_last) {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    cpu_check_watchpoint(env_cpu(env), addr + mem_off,
                                         msize, info->page[0].attrs,
                                         wp_access, retaddr);
                }
                reg_off += esize;
                mem_off += msize;
            } while (reg_off <= reg_last && (reg_off & 63));
        }
    }

    mem_off = info->mem_off_split;
    if (mem_off >= 0) {
        cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
                             info->page[0].attrs, wp_access, retaddr);
    }

    mem_off = info->mem_off_first[1];
    if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
        reg_off = info->reg_off_first[1];
        reg_last = info->reg_off_last[1];

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    cpu_check_watchpoint(env_cpu(env), addr + mem_off,
                                         msize, info->page[1].attrs,
                                         wp_access, retaddr);
                }
                reg_off += esize;
                mem_off += msize;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
#endif
}
static void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
                                    uint64_t *vg, target_ulong addr, int esize,
                                    int msize, uint32_t mtedesc, uintptr_t ra)
{
    intptr_t mem_off, reg_off, reg_last;

    /* Process the page only if MemAttr == Tagged. */
    if (arm_tlb_mte_tagged(&info->page[0].attrs)) {
        mem_off = info->mem_off_first[0];
        reg_off = info->reg_off_first[0];
        reg_last = info->reg_off_split;
        if (reg_last < 0) {
            reg_last = info->reg_off_last[0];
        }

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    mte_check(env, mtedesc, addr + mem_off, ra);
                }
                reg_off += esize;
                mem_off += msize;
            } while (reg_off <= reg_last && (reg_off & 63));
        } while (reg_off <= reg_last);
    }

    mem_off = info->mem_off_first[1];
    if (mem_off >= 0 && arm_tlb_mte_tagged(&info->page[1].attrs)) {
        reg_off = info->reg_off_first[1];
        reg_last = info->reg_off_last[1];

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    mte_check(env, mtedesc, addr + mem_off, ra);
                }
                reg_off += esize;
                mem_off += msize;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}
/*
 * Common helper for all contiguous 1,2,3,4-register predicated loads.
 */
static inline QEMU_ALWAYS_INLINE
void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
               uint32_t desc, const uintptr_t retaddr,
               const int esz, const int msz, const int N, uint32_t mtedesc,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const unsigned rd = simd_data(desc);
    const intptr_t reg_max = simd_oprsz(desc);
    intptr_t reg_off, reg_last, mem_off;
    SVEContLdSt info;
    void *host;
    int flags, i;

    /* Find the active elements.  */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
        /* The entire predicate was false; no load occurs.  */
        for (i = 0; i < N; ++i) {
            memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
        }
        return;
    }

    /* Probe the page(s).  Exit with exception for any invalid page. */
    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);

    /* Handle watchpoints for all active elements. */
    sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
                              BP_MEM_READ, retaddr);

    /*
     * Handle mte checks for all active elements.
     * Since TBI must be set for MTE, !mtedesc => !mte_active.
     */
    if (mtedesc) {
        sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
                                mtedesc, retaddr);
    }

    flags = info.page[0].flags | info.page[1].flags;
    if (unlikely(flags != 0)) {
#ifdef CONFIG_USER_ONLY
        g_assert_not_reached();
#else
        /*
         * At least one page includes MMIO.
         * Any bus operation can fail with cpu_transaction_failed,
         * which for ARM will raise SyncExternal.  Perform the load
         * into scratch memory to preserve register state until the end.
         */
        ARMVectorReg scratch[4] = { };

        mem_off = info.mem_off_first[0];
        reg_off = info.reg_off_first[0];
        reg_last = info.reg_off_last[1];
        if (reg_last < 0) {
            reg_last = info.reg_off_split;
            if (reg_last < 0) {
                reg_last = info.reg_off_last[0];
            }
        }

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        tlb_fn(env, &scratch[i], reg_off,
                               addr + mem_off + (i << msz), retaddr);
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);

        for (i = 0; i < N; ++i) {
            memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
        }
        return;
#endif
    }

    /* The entire operation is in RAM, on valid pages. */

    for (i = 0; i < N; ++i) {
        memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
    }

    mem_off = info.mem_off_first[0];
    reg_off = info.reg_off_first[0];
    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    while (reg_off <= reg_last) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                for (i = 0; i < N; ++i) {
                    host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                            host + mem_off + (i << msz));
                }
            }
            reg_off += 1 << esz;
            mem_off += N << msz;
        } while (reg_off <= reg_last && (reg_off & 63));
    }

    /*
     * Use the slow path to manage the cross-page misalignment.
     * But we know this is RAM and cannot trap.
     */
    mem_off = info.mem_off_split;
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_split;
        for (i = 0; i < N; ++i) {
            tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
                   addr + mem_off + (i << msz), retaddr);
        }
    }

    mem_off = info.mem_off_first[1];
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_first[1];
        reg_last = info.reg_off_last[1];
        host = info.page[1].host;

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                                host + mem_off + (i << msz));
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}
static inline QEMU_ALWAYS_INLINE
void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
                   uint32_t desc, const uintptr_t ra,
                   const int esz, const int msz, const int N,
                   sve_ldst1_host_fn *host_fn,
                   sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    int bit55 = extract64(addr, 55, 1);

    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /* Perform gross MTE suppression early. */
    if (!tbi_check(mtedesc, bit55) ||
        tcma_check(mtedesc, bit55, allocation_tag_from_addr(addr))) {
        mtedesc = 0;
    }

    sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
}
#define DO_LD1_1(NAME, ESZ)                                             \
void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg,                 \
                            target_ulong addr, uint32_t desc)           \
{                                                                       \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0,            \
              sve_##NAME##_host, sve_##NAME##_tlb);                     \
}                                                                       \
void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg,             \
                                target_ulong addr, uint32_t desc)       \
{                                                                       \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1,           \
                  sve_##NAME##_host, sve_##NAME##_tlb);                 \
}

#define DO_LD1_2(NAME, ESZ, MSZ)                                        \
void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg,              \
                               target_ulong addr, uint32_t desc)        \
{                                                                       \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
              sve_##NAME##_le_host, sve_##NAME##_le_tlb);               \
}                                                                       \
void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg,              \
                               target_ulong addr, uint32_t desc)        \
{                                                                       \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
              sve_##NAME##_be_host, sve_##NAME##_be_tlb);               \
}                                                                       \
void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg,          \
                                   target_ulong addr, uint32_t desc)    \
{                                                                       \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
                  sve_##NAME##_le_host, sve_##NAME##_le_tlb);           \
}                                                                       \
void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg,          \
                                   target_ulong addr, uint32_t desc)    \
{                                                                       \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
                  sve_##NAME##_be_host, sve_##NAME##_be_tlb);           \
}
DO_LD1_1(ld1bb,  MO_8)
DO_LD1_1(ld1bhu, MO_16)
DO_LD1_1(ld1bhs, MO_16)
DO_LD1_1(ld1bsu, MO_32)
DO_LD1_1(ld1bss, MO_32)
DO_LD1_1(ld1bdu, MO_64)
DO_LD1_1(ld1bds, MO_64)

DO_LD1_2(ld1hh,  MO_16, MO_16)
DO_LD1_2(ld1hsu, MO_32, MO_16)
DO_LD1_2(ld1hss, MO_32, MO_16)
DO_LD1_2(ld1hdu, MO_64, MO_16)
DO_LD1_2(ld1hds, MO_64, MO_16)

DO_LD1_2(ld1ss,  MO_32, MO_32)
DO_LD1_2(ld1sdu, MO_64, MO_32)
DO_LD1_2(ld1sds, MO_64, MO_32)

DO_LD1_2(ld1dd,  MO_64, MO_64)

#undef DO_LD1_1
#undef DO_LD1_2
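/*
 * The load suffixes decode as: one register ("1"), memory element size
 * first and register element size second ("bh" = byte in memory widened
 * to a halfword lane), a trailing "u"/"s" for zero- vs sign-extension,
 * and _le/_be for the memory endianness of multi-byte elements.
 */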
#define DO_LDN_1(N)                                                     \
void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg,                \
                             target_ulong addr, uint32_t desc)          \
{                                                                       \
    sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0,           \
              sve_ld1bb_host, sve_ld1bb_tlb);                           \
}                                                                       \
void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg,            \
                                 target_ulong addr, uint32_t desc)      \
{                                                                       \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N,          \
                  sve_ld1bb_host, sve_ld1bb_tlb);                       \
}

#define DO_LDN_2(N, SUFF, ESZ)                                            \
void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg,           \
                                    target_ulong addr, uint32_t desc)     \
{                                                                         \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,               \
              sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);           \
}                                                                         \
void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg,           \
                                    target_ulong addr, uint32_t desc)     \
{                                                                         \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,               \
              sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);           \
}                                                                         \
void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg,       \
                                        target_ulong addr, uint32_t desc) \
{                                                                         \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,              \
                  sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);       \
}                                                                         \
void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg,       \
                                        target_ulong addr, uint32_t desc) \
{                                                                         \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,              \
                  sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);       \
}
DO_LDN_1(2)
DO_LDN_1(3)
DO_LDN_1(4)

DO_LDN_2(2, hh, MO_16)
DO_LDN_2(3, hh, MO_16)
DO_LDN_2(4, hh, MO_16)

DO_LDN_2(2, ss, MO_32)
DO_LDN_2(3, ss, MO_32)
DO_LDN_2(4, ss, MO_32)

DO_LDN_2(2, dd, MO_64)
DO_LDN_2(3, dd, MO_64)
DO_LDN_2(4, dd, MO_64)
/*
 * Load contiguous data, first-fault and no-fault.
 *
 * For user-only, one could argue that we should hold the mmap_lock during
 * the operation so that there is no race between page_check_range and the
 * load operation.  However, unmapping pages out from under a running thread
 * is extraordinarily unlikely.  This theoretical race condition also affects
 * linux-user/ in its get_user/put_user macros.
 *
 * TODO: Construct some helpers, written in assembly, that interact with
 * handle_cpu_signal to produce memory ops which can properly report errors
 * without racing.
 */
/*
 * Fault on byte I.  All bits in FFR from I are cleared.  The vector
 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
 * option, which leaves subsequent data unchanged.
 */
static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
{
    uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;

    if (i & 63) {
        ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
        i = ROUND_UP(i, 64);
    }
    for (; i < oprsz; i += 64) {
        ffr[i / 64] = 0;
    }
}
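
/*
 * Illustrative example: with oprsz = 256 (a 2048-bit vector) and a
 * fault at i = 10, the "if" above keeps bits [9:0] of ffr[0], clears
 * bits [63:10], and the loop then zeroes ffr[1..3].  Elements before
 * the faulting one thus keep both their data and their FFR bits.
 */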
/*
 * Common helper for all contiguous no-fault and first-fault loads.
 */
static inline QEMU_ALWAYS_INLINE
void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
                   uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
                   const int esz, const int msz, const SVEContFault fault,
                   sve_ldst1_host_fn *host_fn,
                   sve_ldst1_tlb_fn *tlb_fn)
{
    const unsigned rd = simd_data(desc);
    void *vd = &env->vfp.zregs[rd];
    const intptr_t reg_max = simd_oprsz(desc);
    intptr_t reg_off, mem_off, reg_last;
    SVEContLdSt info;
    int flags;
    void *host;

    /* Find the active elements.  */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
        /* The entire predicate was false; no load occurs.  */
        memset(vd, 0, reg_max);
        return;
    }
    reg_off = info.reg_off_first[0];

    /* Probe the page(s). */
    if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
        /* Fault on first element. */
        tcg_debug_assert(fault == FAULT_NO);
        memset(vd, 0, reg_max);
        goto do_fault;
    }

    mem_off = info.mem_off_first[0];
    flags = info.page[0].flags;

    /*
     * Disable MTE checking if the Tagged bit is not set.  Since TBI must
     * be set within MTEDESC for MTE, !mtedesc => !mte_active.
     */
    if (!arm_tlb_mte_tagged(&info.page[0].attrs)) {
        mtedesc = 0;
    }

    if (fault == FAULT_FIRST) {
        /* Trapping mte check for the first-fault element.  */
        if (mtedesc) {
            mte_check(env, mtedesc, addr + mem_off, retaddr);
        }

        /*
         * Special handling of the first active element,
         * if it crosses a page boundary or is MMIO.
         */
        bool is_split = mem_off == info.mem_off_split;
        if (unlikely(flags != 0) || unlikely(is_split)) {
            /*
             * Use the slow path for cross-page handling.
             * Might trap for MMIO or watchpoints.
             */
            tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);

            /* After any fault, zero the other elements. */
            swap_memzero(vd, reg_off);
            reg_off += 1 << esz;
            mem_off += 1 << msz;
            swap_memzero(vd + reg_off, reg_max - reg_off);

            if (is_split) {
                goto second_page;
            }
        } else {
            memset(vd, 0, reg_max);
        }
    } else {
        memset(vd, 0, reg_max);
        if (unlikely(mem_off == info.mem_off_split)) {
            /* The first active element crosses a page boundary. */
            flags |= info.page[1].flags;
            if (unlikely(flags & TLB_MMIO)) {
                /* Some page is MMIO, see below. */
                goto do_fault;
            }
            if (unlikely(flags & TLB_WATCHPOINT) &&
                (cpu_watchpoint_address_matches
                 (env_cpu(env), addr + mem_off, 1 << msz)
                 & BP_MEM_READ)) {
                /* Watchpoint hit, see below. */
                goto do_fault;
            }
            if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
                goto do_fault;
            }
            /*
             * Use the slow path for cross-page handling.
             * This is RAM, without a watchpoint, and will not trap.
             */
            tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
            goto second_page;
        }
    }

    /*
     * From this point on, all memory operations are MemSingleNF.
     *
     * Per the MemSingleNF pseudocode, a no-fault load from Device memory
     * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
     *
     * Unfortunately we do not have access to the memory attributes from the
     * PTE to tell Device memory from Normal memory.  So we make a mostly
     * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
     * This gives the right answer for the common cases of "Normal memory,
     * backed by host RAM" and "Device memory, backed by MMIO".
     * The architecture allows us to suppress an NF load and return
     * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
     * case of "Normal memory, backed by MMIO" is permitted.  The case we
     * get wrong is "Device memory, backed by host RAM", for which we
     * should return (UNKNOWN, FAULT) but do not.
     *
     * Similarly, CPU_BP breakpoints would raise exceptions, and so
     * return (UNKNOWN, FAULT).  For simplicity, we consider gdb and
     * architectural breakpoints the same.
     */
    if (unlikely(flags & TLB_MMIO)) {
        goto do_fault;
    }

    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    do {
        uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                if (unlikely(flags & TLB_WATCHPOINT) &&
                    (cpu_watchpoint_address_matches
                     (env_cpu(env), addr + mem_off, 1 << msz)
                     & BP_MEM_READ)) {
                    goto do_fault;
                }
                if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
                    goto do_fault;
                }
                host_fn(vd, reg_off, host + mem_off);
            }
            reg_off += 1 << esz;
            mem_off += 1 << msz;
        } while (reg_off <= reg_last && (reg_off & 63));
    } while (reg_off <= reg_last);

    /*
     * MemSingleNF is allowed to fail for any reason.  We have special
     * code above to handle the first element crossing a page boundary.
     * As an implementation choice, decline to handle a cross-page element
     * in any other position.
     */
    reg_off = info.reg_off_split;
    if (reg_off >= 0) {
        goto do_fault;
    }

 second_page:
    reg_off = info.reg_off_first[1];
    if (likely(reg_off < 0)) {
        /* No active elements on the second page.  All done. */
        return;
    }

    /*
     * MemSingleNF is allowed to fail for any reason.  As an implementation
     * choice, decline to handle elements on the second page.  This should
     * be low frequency as the guest walks through memory -- the next
     * iteration of the guest's loop should be aligned on the page boundary,
     * and then all following iterations will stay aligned.
     */

 do_fault:
    record_fault(env, reg_off, reg_max);
}
static inline QEMU_ALWAYS_INLINE
void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
                       uint32_t desc, const uintptr_t retaddr,
                       const int esz, const int msz, const SVEContFault fault,
                       sve_ldst1_host_fn *host_fn,
                       sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    int bit55 = extract64(addr, 55, 1);

    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /* Perform gross MTE suppression early. */
    if (!tbi_check(desc, bit55) ||
        tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
        mtedesc = 0;
    }

    sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
                  esz, msz, fault, host_fn, tlb_fn);
}
#define DO_LDFF1_LDNF1_1(PART, ESZ)                                     \
void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg,            \
                                 target_ulong addr, uint32_t desc)      \
{                                                                       \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
                  sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
}                                                                       \
                                                                        \
void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg,            \
                                 target_ulong addr, uint32_t desc)      \
{                                                                       \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
                  sve_ld1##PART##_host, sve_ld1##PART##_tlb);           \
}                                                                       \
                                                                        \
void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg,        \
                                     target_ulong addr, uint32_t desc)  \
{                                                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
                      sve_ld1##PART##_host, sve_ld1##PART##_tlb);       \
}                                                                       \
                                                                        \
void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg,        \
                                     target_ulong addr, uint32_t desc)  \
{                                                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
                      sve_ld1##PART##_host, sve_ld1##PART##_tlb);       \
}
#define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ)                                \
void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
                  sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
}                                                                       \
                                                                        \
void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
                  sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);     \
}                                                                       \
                                                                        \
void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
                  sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
}                                                                       \
                                                                        \
void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,  \
                  sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);     \
}                                                                       \
                                                                        \
void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
                      sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
}                                                                       \
                                                                        \
void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
                      sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
}                                                                       \
                                                                        \
void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
                      sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
}                                                                       \
                                                                        \
void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
                      sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
}
DO_LDFF1_LDNF1_1(bb, MO_8)
DO_LDFF1_LDNF1_1(bhu, MO_16)
DO_LDFF1_LDNF1_1(bhs, MO_16)
DO_LDFF1_LDNF1_1(bsu, MO_32)
DO_LDFF1_LDNF1_1(bss, MO_32)
DO_LDFF1_LDNF1_1(bdu, MO_64)
DO_LDFF1_LDNF1_1(bds, MO_64)

DO_LDFF1_LDNF1_2(hh, MO_16, MO_16)
DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)

DO_LDFF1_LDNF1_2(ss, MO_32, MO_32)
DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)

DO_LDFF1_LDNF1_2(dd, MO_64, MO_64)

#undef DO_LDFF1_LDNF1_1
#undef DO_LDFF1_LDNF1_2
/*
 * Common helper for all contiguous 1,2,3,4-register predicated stores.
 */
static inline QEMU_ALWAYS_INLINE
void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
               uint32_t desc, const uintptr_t retaddr,
               const int esz, const int msz, const int N, uint32_t mtedesc,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const unsigned rd = simd_data(desc);
    const intptr_t reg_max = simd_oprsz(desc);
    intptr_t reg_off, reg_last, mem_off;
    SVEContLdSt info;
    void *host;
    int i, flags;

    /* Find the active elements.  */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
        /* The entire predicate was false; no store occurs.  */
        return;
    }

    /* Probe the page(s).  Exit with exception for any invalid page. */
    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);

    /* Handle watchpoints for all active elements. */
    sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
                              BP_MEM_WRITE, retaddr);

    /*
     * Handle mte checks for all active elements.
     * Since TBI must be set for MTE, !mtedesc => !mte_active.
     */
    if (mtedesc) {
        sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
                                mtedesc, retaddr);
    }

    flags = info.page[0].flags | info.page[1].flags;
    if (unlikely(flags != 0)) {
#ifdef CONFIG_USER_ONLY
        g_assert_not_reached();
#else
        /*
         * At least one page includes MMIO.
         * Any bus operation can fail with cpu_transaction_failed,
         * which for ARM will raise SyncExternal.  We cannot avoid
         * this fault and will leave with the store incomplete.
         */
        mem_off = info.mem_off_first[0];
        reg_off = info.reg_off_first[0];
        reg_last = info.reg_off_last[1];
        if (reg_last < 0) {
            reg_last = info.reg_off_split;
            if (reg_last < 0) {
                reg_last = info.reg_off_last[0];
            }
        }

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
                               addr + mem_off + (i << msz), retaddr);
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
        return;
#endif
    }

    mem_off = info.mem_off_first[0];
    reg_off = info.reg_off_first[0];
    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    while (reg_off <= reg_last) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                for (i = 0; i < N; ++i) {
                    host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                            host + mem_off + (i << msz));
                }
            }
            reg_off += 1 << esz;
            mem_off += N << msz;
        } while (reg_off <= reg_last && (reg_off & 63));
    }

    /*
     * Use the slow path to manage the cross-page misalignment.
     * But we know this is RAM and cannot trap.
     */
    mem_off = info.mem_off_split;
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_split;
        for (i = 0; i < N; ++i) {
            tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
                   addr + mem_off + (i << msz), retaddr);
        }
    }

    mem_off = info.mem_off_first[1];
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_first[1];
        reg_last = info.reg_off_last[1];
        host = info.page[1].host;

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                                host + mem_off + (i << msz));
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}
static inline QEMU_ALWAYS_INLINE
void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
                   uint32_t desc, const uintptr_t ra,
                   const int esz, const int msz, const int N,
                   sve_ldst1_host_fn *host_fn,
                   sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    int bit55 = extract64(addr, 55, 1);

    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /* Perform gross MTE suppression early. */
    if (!tbi_check(desc, bit55) ||
        tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
        mtedesc = 0;
    }

    sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
}
#define DO_STN_1(N, NAME, ESZ)                                          \
void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg,            \
                                 target_ulong addr, uint32_t desc)      \
{                                                                       \
    sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0,            \
              sve_st1##NAME##_host, sve_st1##NAME##_tlb);               \
}                                                                       \
                                                                        \
void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg,        \
                                     target_ulong addr, uint32_t desc)  \
{                                                                       \
    sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N,           \
                  sve_st1##NAME##_host, sve_st1##NAME##_tlb);           \
}
#define DO_STN_2(N, NAME, ESZ, MSZ)                                     \
void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
              sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);         \
}                                                                       \
                                                                        \
void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
              sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);         \
}                                                                       \
                                                                        \
void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
                  sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);     \
}                                                                       \
                                                                        \
void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
                  sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);     \
}
DO_STN_1(1, bb, MO_8)
DO_STN_1(1, bh, MO_16)
DO_STN_1(1, bs, MO_32)
DO_STN_1(1, bd, MO_64)
DO_STN_1(2, bb, MO_8)
DO_STN_1(3, bb, MO_8)
DO_STN_1(4, bb, MO_8)

DO_STN_2(1, hh, MO_16, MO_16)
DO_STN_2(1, hs, MO_32, MO_16)
DO_STN_2(1, hd, MO_64, MO_16)
DO_STN_2(2, hh, MO_16, MO_16)
DO_STN_2(3, hh, MO_16, MO_16)
DO_STN_2(4, hh, MO_16, MO_16)

DO_STN_2(1, ss, MO_32, MO_32)
DO_STN_2(1, sd, MO_64, MO_32)
DO_STN_2(2, ss, MO_32, MO_32)
DO_STN_2(3, ss, MO_32, MO_32)
DO_STN_2(4, ss, MO_32, MO_32)

DO_STN_2(1, dd, MO_64, MO_64)
DO_STN_2(2, dd, MO_64, MO_64)
DO_STN_2(3, dd, MO_64, MO_64)
DO_STN_2(4, dd, MO_64, MO_64)
/*
 * Loads with a vector index.
 */

/*
 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
 */
typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);

static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
{
    return *(uint32_t *)(reg + H1_4(reg_ofs));
}

static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
{
    return *(int32_t *)(reg + H1_4(reg_ofs));
}

static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
{
    return (uint32_t)*(uint64_t *)(reg + reg_ofs);
}

static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
{
    return (int32_t)*(uint64_t *)(reg + reg_ofs);
}

static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
{
    return *(uint64_t *)(reg + reg_ofs);
}
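
/*
 * A note on naming (editorial): the "zsu"/"zss"/"zd" part of these
 * helpers describes the offset vector -- 32-bit unsigned, 32-bit
 * signed, or 64-bit elements respectively -- while the trailing _s/_d
 * gives the size of the vector elements being indexed, and hence the
 * stride at which reg_ofs advances.
 */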
static inline QEMU_ALWAYS_INLINE
void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
               target_ulong base, uint32_t desc, uintptr_t retaddr,
               uint32_t mtedesc, int esize, int msize,
               zreg_off_fn *off_fn,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const int mmu_idx = cpu_mmu_index(env, false);
    const intptr_t reg_max = simd_oprsz(desc);
    const int scale = simd_data(desc);
    ARMVectorReg scratch;
    intptr_t reg_off;
    SVEHostPage info, info2;

    memset(&scratch, 0, reg_max);
    reg_off = 0;
    do {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if (likely(pg & 1)) {
                target_ulong addr = base + (off_fn(vm, reg_off) << scale);
                target_ulong in_page = -(addr | TARGET_PAGE_MASK);
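                /*
                 * Note that -(addr | TARGET_PAGE_MASK) is the number of
                 * bytes left on addr's page: e.g. with 4 KiB pages and
                 * addr ending in 0xff8, in_page == 8, so any element
                 * wider than 8 bytes would cross into the next page.
                 */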
                sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
                               mmu_idx, retaddr);

                if (likely(in_page >= msize)) {
                    if (unlikely(info.flags & TLB_WATCHPOINT)) {
                        cpu_check_watchpoint(env_cpu(env), addr, msize,
                                             info.attrs, BP_MEM_READ, retaddr);
                    }
                    if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
                        mte_check(env, mtedesc, addr, retaddr);
                    }
                    host_fn(&scratch, reg_off, info.host);
                } else {
                    /* Element crosses the page boundary. */
                    sve_probe_page(&info2, false, env, addr + in_page, 0,
                                   MMU_DATA_LOAD, mmu_idx, retaddr);
                    if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
                        cpu_check_watchpoint(env_cpu(env), addr,
                                             msize, info.attrs,
                                             BP_MEM_READ, retaddr);
                    }
                    if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
                        mte_check(env, mtedesc, addr, retaddr);
                    }
                    tlb_fn(env, &scratch, reg_off, addr, retaddr);
                }
            }
            reg_off += esize;
            pg >>= esize;
        } while (reg_off & 63);
    } while (reg_off < reg_max);

    /* Wait until all exceptions have been raised to write back. */
    memcpy(vd, &scratch, reg_max);
}
static inline QEMU_ALWAYS_INLINE
void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                   target_ulong base, uint32_t desc, uintptr_t retaddr,
                   int esize, int msize, zreg_off_fn *off_fn,
                   sve_ldst1_host_fn *host_fn,
                   sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /*
     * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
     * offset base entirely over the address space hole to change the
     * pointer tag, or change the bit55 selector.  So we could here
     * examine TBI + TCMA like we do for sve_ldN_r_mte().
     */
    sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
              esize, msize, off_fn, host_fn, tlb_fn);
}
#define DO_LD1_ZPZ_S(MEM, OFS, MSZ)                                     \
void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,     \
              off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);  \
}                                                                       \
void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,    \
                  off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}

#define DO_LD1_ZPZ_D(MEM, OFS, MSZ)                                     \
void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,     \
              off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);  \
}                                                                       \
void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,    \
                  off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}
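
/*
 * Decoding the instantiations below (illustrative): DO_LD1_ZPZ_S(bsu,
 * zsu, MO_8) emits sve_ldbsu_zsu, which gathers bytes ("b") and
 * zero-extends each into a 32-bit destination element ("su"), using
 * unsigned 32-bit vector offsets ("zsu").  The _le/_be infixes select
 * the guest memory endianness, and the _mte variants carry the extra
 * MTE descriptor.
 */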
DO_LD1_ZPZ_S(bsu, zsu, MO_8)
DO_LD1_ZPZ_S(bsu, zss, MO_8)
DO_LD1_ZPZ_D(bdu, zsu, MO_8)
DO_LD1_ZPZ_D(bdu, zss, MO_8)
DO_LD1_ZPZ_D(bdu, zd, MO_8)

DO_LD1_ZPZ_S(bss, zsu, MO_8)
DO_LD1_ZPZ_S(bss, zss, MO_8)
DO_LD1_ZPZ_D(bds, zsu, MO_8)
DO_LD1_ZPZ_D(bds, zss, MO_8)
DO_LD1_ZPZ_D(bds, zd, MO_8)

DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
DO_LD1_ZPZ_D(hdu_le, zd, MO_16)

DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
DO_LD1_ZPZ_D(hdu_be, zd, MO_16)

DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
DO_LD1_ZPZ_S(hss_le, zss, MO_16)
DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
DO_LD1_ZPZ_D(hds_le, zss, MO_16)
DO_LD1_ZPZ_D(hds_le, zd, MO_16)

DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
DO_LD1_ZPZ_S(hss_be, zss, MO_16)
DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
DO_LD1_ZPZ_D(hds_be, zss, MO_16)
DO_LD1_ZPZ_D(hds_be, zd, MO_16)

DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
DO_LD1_ZPZ_S(ss_le, zss, MO_32)
DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
DO_LD1_ZPZ_D(sdu_le, zd, MO_32)

DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
DO_LD1_ZPZ_S(ss_be, zss, MO_32)
DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
DO_LD1_ZPZ_D(sdu_be, zd, MO_32)

DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
DO_LD1_ZPZ_D(sds_le, zss, MO_32)
DO_LD1_ZPZ_D(sds_le, zd, MO_32)

DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
DO_LD1_ZPZ_D(sds_be, zss, MO_32)
DO_LD1_ZPZ_D(sds_be, zd, MO_32)

DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
DO_LD1_ZPZ_D(dd_le, zss, MO_64)
DO_LD1_ZPZ_D(dd_le, zd, MO_64)

DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
DO_LD1_ZPZ_D(dd_be, zss, MO_64)
DO_LD1_ZPZ_D(dd_be, zd, MO_64)
/* First fault loads with a vector index. */

/*
 * Common helpers for all gather first-faulting loads.
 */

static inline QEMU_ALWAYS_INLINE
void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                 target_ulong base, uint32_t desc, uintptr_t retaddr,
                 uint32_t mtedesc, const int esz, const int msz,
                 zreg_off_fn *off_fn,
                 sve_ldst1_host_fn *host_fn,
                 sve_ldst1_tlb_fn *tlb_fn)
{
    const int mmu_idx = cpu_mmu_index(env, false);
    const intptr_t reg_max = simd_oprsz(desc);
    const int scale = simd_data(desc);
    const int esize = 1 << esz;
    const int msize = 1 << msz;
    intptr_t reg_off;
    SVEHostPage info;
    target_ulong addr, in_page;

    /* Skip to the first true predicate.  */
    reg_off = find_next_active(vg, 0, reg_max, esz);
    if (unlikely(reg_off >= reg_max)) {
        /* The entire predicate was false; no load occurs.  */
        memset(vd, 0, reg_max);
        return;
    }

    /*
     * Probe the first element, allowing faults.
     */
    addr = base + (off_fn(vm, reg_off) << scale);
    if (mtedesc) {
        mte_check(env, mtedesc, addr, retaddr);
    }
    tlb_fn(env, vd, reg_off, addr, retaddr);

    /* After any fault, zero the other elements. */
    swap_memzero(vd, reg_off);
    reg_off += esize;
    swap_memzero(vd + reg_off, reg_max - reg_off);

    /*
     * Probe the remaining elements, not allowing faults.
     */
    while (reg_off < reg_max) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if (likely((pg >> (reg_off & 63)) & 1)) {
                addr = base + (off_fn(vm, reg_off) << scale);
                in_page = -(addr | TARGET_PAGE_MASK);

                if (unlikely(in_page < msize)) {
                    /* Stop if the element crosses a page boundary. */
                    goto fault;
                }

                sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
                               mmu_idx, retaddr);
                if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
                    goto fault;
                }
                if (unlikely(info.flags & TLB_WATCHPOINT) &&
                    (cpu_watchpoint_address_matches
                     (env_cpu(env), addr, msize) & BP_MEM_READ)) {
                    goto fault;
                }
                if (mtedesc &&
                    arm_tlb_mte_tagged(&info.attrs) &&
                    !mte_probe(env, mtedesc, addr)) {
                    goto fault;
                }

                host_fn(vd, reg_off, info.host);
            }
            reg_off += esize;
        } while (reg_off & 63);
    }
    return;

 fault:
    record_fault(env, reg_off, reg_max);
}
static inline QEMU_ALWAYS_INLINE
void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                     target_ulong base, uint32_t desc, uintptr_t retaddr,
                     const int esz, const int msz,
                     zreg_off_fn *off_fn,
                     sve_ldst1_host_fn *host_fn,
                     sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /*
     * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
     * offset base entirely over the address space hole to change the
     * pointer tag, or change the bit55 selector.  So we could here
     * examine TBI + TCMA like we do for sve_ldN_r_mte().
     */
    sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
                esz, msz, off_fn, host_fn, tlb_fn);
}
#define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ)                                   \
void HELPER(sve_ldff##MEM##_##OFS)                                      \
    (CPUARMState *env, void *vd, void *vg,                              \
     void *vm, target_ulong base, uint32_t desc)                        \
{                                                                       \
    sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ,    \
                off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}                                                                       \
void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
    (CPUARMState *env, void *vd, void *vg,                              \
     void *vm, target_ulong base, uint32_t desc)                        \
{                                                                       \
    sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ,   \
                    off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}

#define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ)                                   \
void HELPER(sve_ldff##MEM##_##OFS)                                      \
    (CPUARMState *env, void *vd, void *vg,                              \
     void *vm, target_ulong base, uint32_t desc)                        \
{                                                                       \
    sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ,    \
                off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}                                                                       \
void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
    (CPUARMState *env, void *vd, void *vg,                              \
     void *vm, target_ulong base, uint32_t desc)                        \
{                                                                       \
    sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ,   \
                    off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}
DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
DO_LDFF1_ZPZ_D(bdu, zd, MO_8)

DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
DO_LDFF1_ZPZ_S(bss, zss, MO_8)
DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
DO_LDFF1_ZPZ_D(bds, zss, MO_8)
DO_LDFF1_ZPZ_D(bds, zd, MO_8)

DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)

DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)

DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)

DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)

DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32)
DO_LDFF1_ZPZ_S(ss_le, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)

DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32)
DO_LDFF1_ZPZ_S(ss_be, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)

DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)

DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)

DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)

DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
/* Stores with a vector index. */

static inline QEMU_ALWAYS_INLINE
void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
               target_ulong base, uint32_t desc, uintptr_t retaddr,
               uint32_t mtedesc, int esize, int msize,
               zreg_off_fn *off_fn,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const int mmu_idx = cpu_mmu_index(env, false);
    const intptr_t reg_max = simd_oprsz(desc);
    const int scale = simd_data(desc);
    void *host[ARM_MAX_VQ * 4];
    intptr_t reg_off, i;
    SVEHostPage info, info2;

    /*
     * Probe all of the elements for host addresses and flags.
     */
    i = reg_off = 0;
    do {
        uint64_t pg = vg[reg_off >> 6];
        do {
            target_ulong addr = base + (off_fn(vm, reg_off) << scale);
            target_ulong in_page = -(addr | TARGET_PAGE_MASK);

            host[i] = NULL;
            if (likely((pg >> (reg_off & 63)) & 1)) {
                if (likely(in_page >= msize)) {
                    sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
                                   mmu_idx, retaddr);
                    host[i] = info.host;
                } else {
                    /*
                     * Element crosses the page boundary.
                     * Probe both pages, but do not record the host address,
                     * so that we use the slow path.
                     */
                    sve_probe_page(&info, false, env, addr, 0,
                                   MMU_DATA_STORE, mmu_idx, retaddr);
                    sve_probe_page(&info2, false, env, addr + in_page, 0,
                                   MMU_DATA_STORE, mmu_idx, retaddr);
                    info.flags |= info2.flags;
                }

                if (unlikely(info.flags & TLB_WATCHPOINT)) {
                    cpu_check_watchpoint(env_cpu(env), addr, msize,
                                         info.attrs, BP_MEM_WRITE, retaddr);
                }

                if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
                    mte_check(env, mtedesc, addr, retaddr);
                }
            }
            i += 1;
            reg_off += esize;
        } while (reg_off & 63);
    } while (reg_off < reg_max);

    /*
     * Now that we have recognized all exceptions except SyncExternal
     * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
     *
     * Note for the common case of an element in RAM, not crossing a page
     * boundary, we have stored the host address in host[].  This doubles
     * as a first-level check against the predicate, since only enabled
     * elements have non-null host addresses.
     */
    i = reg_off = 0;
    do {
        void *h = host[i];
        if (likely(h != NULL)) {
            host_fn(vd, reg_off, h);
        } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
            target_ulong addr = base + (off_fn(vm, reg_off) << scale);
            tlb_fn(env, vd, reg_off, addr, retaddr);
        }
        i += 1;
        reg_off += esize;
    } while (reg_off < reg_max);
}
static inline QEMU_ALWAYS_INLINE
void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                   target_ulong base, uint32_t desc, uintptr_t retaddr,
                   int esize, int msize, zreg_off_fn *off_fn,
                   sve_ldst1_host_fn *host_fn,
                   sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /*
     * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
     * offset base entirely over the address space hole to change the
     * pointer tag, or change the bit55 selector.  So we could here
     * examine TBI + TCMA like we do for sve_ldN_r_mte().
     */
    sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
              esize, msize, off_fn, host_fn, tlb_fn);
}
#define DO_ST1_ZPZ_S(MEM, OFS, MSZ)                                     \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,     \
              off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
}                                                                       \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,    \
                  off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
}

#define DO_ST1_ZPZ_D(MEM, OFS, MSZ)                                     \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,     \
              off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
}                                                                       \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,    \
                  off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
}
DO_ST1_ZPZ_S(bs, zsu, MO_8)
DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
DO_ST1_ZPZ_S(ss_be, zsu, MO_32)

DO_ST1_ZPZ_S(bs, zss, MO_8)
DO_ST1_ZPZ_S(hs_le, zss, MO_16)
DO_ST1_ZPZ_S(hs_be, zss, MO_16)
DO_ST1_ZPZ_S(ss_le, zss, MO_32)
DO_ST1_ZPZ_S(ss_be, zss, MO_32)

DO_ST1_ZPZ_D(bd, zsu, MO_8)
DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
DO_ST1_ZPZ_D(dd_be, zsu, MO_64)

DO_ST1_ZPZ_D(bd, zss, MO_8)
DO_ST1_ZPZ_D(hd_le, zss, MO_16)
DO_ST1_ZPZ_D(hd_be, zss, MO_16)
DO_ST1_ZPZ_D(sd_le, zss, MO_32)
DO_ST1_ZPZ_D(sd_be, zss, MO_32)
DO_ST1_ZPZ_D(dd_le, zss, MO_64)
DO_ST1_ZPZ_D(dd_be, zss, MO_64)

DO_ST1_ZPZ_D(bd, zd, MO_8)
DO_ST1_ZPZ_D(hd_le, zd, MO_16)
DO_ST1_ZPZ_D(hd_be, zd, MO_16)
DO_ST1_ZPZ_D(sd_le, zd, MO_32)
DO_ST1_ZPZ_D(sd_be, zd, MO_32)
DO_ST1_ZPZ_D(dd_le, zd, MO_64)
DO_ST1_ZPZ_D(dd_be, zd, MO_64)
void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = n[i] ^ m[i] ^ k[i];
    }
}

void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = n[i] ^ (m[i] & ~k[i]);
    }
}

void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
    }
}

void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
    }
}

void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
    }
}
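
/*
 * Summary (editorial): with k as the select mask, BSL computes
 * (n & k) | (m & ~k); BSL1N is BSL with n inverted, BSL2N with m
 * inverted, and NBSL with the result inverted.  EOR3 is a three-way
 * XOR and BCAX is n ^ (m & ~k), mirroring the Advanced SIMD SHA-3
 * operations of the same names.
 */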
/*
 * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n.
 * See hasless(v,1) from
 *   https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
 */
static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
{
    int bits = 8 << esz;
    uint64_t ones = dup_const(esz, 1);
    uint64_t signs = ones << (bits - 1);
    uint64_t cmp0, cmp1;

    cmp1 = dup_const(esz, n);
    cmp0 = cmp1 ^ m0;
    cmp1 = cmp1 ^ m1;
    cmp0 = (cmp0 - ones) & ~cmp0;
    cmp1 = (cmp1 - ones) & ~cmp1;
    return (cmp0 | cmp1) & signs;
}
static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
                                uint32_t desc, int esz, bool nmatch)
{
    uint16_t esz_mask = pred_esz_masks[esz];
    intptr_t opr_sz = simd_oprsz(desc);
    uint32_t flags = PREDTEST_INIT;
    intptr_t i, j, k;

    for (i = 0; i < opr_sz; i += 16) {
        uint64_t m0 = *(uint64_t *)(vm + i);
        uint64_t m1 = *(uint64_t *)(vm + i + 8);
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
        uint16_t out = 0;

        for (j = 0; j < 16; j += 8) {
            uint64_t n = *(uint64_t *)(vn + i + j);

            for (k = 0; k < 8; k += 1 << esz) {
                if (pg & (1 << (j + k))) {
                    bool o = do_match2(n >> (k * 8), m0, m1, esz);
                    out |= (o ^ nmatch) << (j + k);
                }
            }
        }
        *(uint16_t *)(vd + H1_2(i >> 3)) = out;
        flags = iter_predtest_fwd(out, pg, flags);
    }
    return flags;
}
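
/*
 * The MATCH/NMATCH helpers below write one predicate bit per active
 * element -- inverted for NMATCH -- and return NZCV flags accumulated
 * with iter_predtest_fwd, as these instructions set the condition codes.
 */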
#define DO_PPZZ_MATCH(NAME, ESZ, INV)                                        \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                            \
    return do_match(vd, vn, vm, vg, desc, ESZ, INV);                         \
}

DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)

DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)

#undef DO_PPZZ_MATCH