/*
 * ARM SVE Operations
 *
 * Copyright (c) 2018 Linaro, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */
#include "qemu/osdep.h"
#include "cpu.h"
#include "internals.h"
#include "exec/exec-all.h"
#include "exec/cpu_ldst.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "tcg/tcg.h"
#include "vec_internal.h"
/* Note that vector data is stored in host-endian 64-bit chunks,
   so addressing units smaller than that needs a host-endian fixup.  */
#ifdef HOST_WORDS_BIGENDIAN
#define H1(x)   ((x) ^ 7)
#define H1_2(x) ((x) ^ 6)
#define H1_4(x) ((x) ^ 4)
#define H2(x)   ((x) ^ 3)
#define H4(x)   ((x) ^ 1)
#else
#define H1(x)   (x)
#define H1_2(x) (x)
#define H1_4(x) (x)
#define H2(x)   (x)
#define H4(x)   (x)
#endif

/* Return a value for NZCV as per the ARM PredTest pseudofunction.
 *
 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
 * and bit 0 set if C is set.  Compare the definitions of these variables
 * within CPUARMState.
 */

/* For no G bits set, NZCV = C.  */
#define PREDTEST_INIT  1
/* This is an iterative function, called for each Pd and Pg word
 * moving forward.
 */
static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute N from first D & G.
           Use bit 2 to signal first G bit seen.  */
        if (!(flags & 4)) {
            flags |= ((d & (g & -g)) != 0) << 31;
            flags |= 4;
        }

        /* Accumulate Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute C from last !(D & G).  Replace previous.  */
        flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
    }
    return flags;
}
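
/* Worked example: for a single predicate word with d = g = 1, starting
 * from PREDTEST_INIT: g & -g == 1 and d has that bit set, so N (bit 31)
 * is set; d & g != 0, so "Z clear" (bit 1) is set; pow2floor(g) == 1 and
 * d has that bit set, so C (bit 0) is cleared.  The result 0x80000006
 * encodes N=1, Z=0, C=0: the first and last active elements are true.  */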
/* This is an iterative function, called for each Pd and Pg word
 * moving backward.
 */
static uint32_t iter_predtest_bwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute C from first (i.e last) !(D & G).
           Use bit 2 to signal first G bit seen.  */
        if (!(flags & 4)) {
            flags += 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
            flags |= (d & pow2floor(g)) == 0;
        }

        /* Accumulate Z from each D & G.  */
        flags |= ((d & g) != 0) << 1;

        /* Compute N from last (i.e first) D & G.  Replace previous.  */
        flags = deposit32(flags, 31, 1, (d & (g & -g)) != 0);
    }
    return flags;
}

/* The same for a single word predicate.  */
uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
{
    return iter_predtest_fwd(d, g, PREDTEST_INIT);
}
/* The same for a multi-word predicate.  */
uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
{
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg;
    uintptr_t i = 0;

    do {
        flags = iter_predtest_fwd(d[i], g[i], flags);
    } while (++i < words);

    return flags;
}
/* Expand active predicate bits to bytes, for byte elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 */
static inline uint64_t expand_pred_b(uint8_t byte)
{
    static const uint64_t word[256] = {
        0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
        0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
        0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
        0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
        0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
        0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
        0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
        0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
        0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
        0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
        0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
        0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
        0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
        0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
        0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
        0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
        0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
        0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
        0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
        0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
        0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
        0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
        0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
        0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
        0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
        0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
        0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
        0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
        0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
        0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
        0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
        0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
        0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
        0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
        0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
        0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
        0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
        0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
        0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
        0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
        0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
        0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
        0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
        0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
        0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
        0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
        0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
        0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
        0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
        0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
        0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
        0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
        0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
        0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
        0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
        0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
        0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
        0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
        0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
        0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
        0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
        0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
        0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
        0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
        0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
        0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
        0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
        0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
        0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
        0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
        0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
        0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
        0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
        0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
        0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
        0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
        0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
        0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
        0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
        0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
        0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
        0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
        0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
        0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
        0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
        0xffffffffffffffff,
    };
    return word[byte];
}
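
/* For example, expand_pred_b(0x81) == 0xff000000000000ff: predicate
 * bits 0 and 7 expand to byte masks at byte positions 0 and 7.  */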
/* Similarly for half-word elements.
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 *  }
 */
static inline uint64_t expand_pred_h(uint8_t byte)
{
    static const uint64_t word[] = {
        [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
        [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
        [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
        [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
        [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
        [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
        [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
        [0x55] = 0xffffffffffffffff,
    };
    return word[byte & 0x55];
}
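
/* Only the even predicate bits are significant for half-word elements,
 * so the table is sparse and the index is masked with 0x55.  E.g. a
 * predicate byte of 0x1f masks to 0x15, giving 0x0000ffffffffffff:
 * the low three half-word elements active.  */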
/* Similarly for single word elements.  */
static inline uint64_t expand_pred_s(uint8_t byte)
{
    static const uint64_t word[] = {
        [0x01] = 0x00000000ffffffffull,
        [0x10] = 0xffffffff00000000ull,
        [0x11] = 0xffffffffffffffffull,
    };
    return word[byte & 0x11];
}

/* Swap 16-bit words within a 32-bit word.  */
static inline uint32_t hswap32(uint32_t h)
{
    return rol32(h, 16);
}
/* Swap 16-bit words within a 64-bit word.  */
static inline uint64_t hswap64(uint64_t h)
{
    uint64_t m = 0x0000ffff0000ffffull;
    h = rol64(h, 32);
    return ((h & m) << 16) | ((h >> 16) & m);
}
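
/* E.g. hswap64(0x0123456789abcdefull) == 0xcdef89ab45670123ull: the
 * rol64 exchanges the 32-bit halves, then the mask-and-shift exchanges
 * the 16-bit words within each half.  */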
/* Swap 32-bit words within a 64-bit word.  */
static inline uint64_t wswap64(uint64_t h)
{
    return rol64(h, 32);
}
#define LOGICAL_PPPP(NAME, FUNC)                                          \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                         \
    uintptr_t opr_sz = simd_oprsz(desc);                                  \
    uint64_t *d = vd, *n = vn, *m = vm, *g = vg;                          \
    uintptr_t i;                                                          \
    for (i = 0; i < opr_sz / 8; ++i) {                                    \
        d[i] = FUNC(n[i], m[i], g[i]);                                    \
    }                                                                     \
}

#define DO_AND(N, M, G)  (((N) & (M)) & (G))
#define DO_BIC(N, M, G)  (((N) & ~(M)) & (G))
#define DO_EOR(N, M, G)  (((N) ^ (M)) & (G))
#define DO_ORR(N, M, G)  (((N) | (M)) & (G))
#define DO_ORN(N, M, G)  (((N) | ~(M)) & (G))
#define DO_NOR(N, M, G)  (~((N) | (M)) & (G))
#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
#define DO_SEL(N, M, G)  (((N) & (G)) | ((M) & ~(G)))

LOGICAL_PPPP(sve_and_pppp, DO_AND)
LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
LOGICAL_PPPP(sve_nand_pppp, DO_NAND)

#undef DO_AND
#undef DO_BIC
#undef DO_EOR
#undef DO_ORR
#undef DO_ORN
#undef DO_NOR
#undef DO_NAND
#undef DO_SEL
#undef LOGICAL_PPPP
/* Fully general three-operand expander, controlled by a predicate.
 * This is complicated by the host-endian storage of the register file.
 */
/* ??? I don't expect the compiler could ever vectorize this itself.
 * With some tables we can convert bit masks to byte masks, and with
 * extra care wrt byte/word ordering we could use gcc generic vectors
 * and do 16 bytes at a time.
 */
#define DO_ZPZZ(NAME, TYPE, H, OP)                                        \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    for (i = 0; i < opr_sz; ) {                                           \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                   \
        do {                                                              \
            if (pg & 1) {                                                 \
                TYPE nn = *(TYPE *)(vn + H(i));                           \
                TYPE mm = *(TYPE *)(vm + H(i));                           \
                *(TYPE *)(vd + H(i)) = OP(nn, mm);                        \
            }                                                             \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                       \
        } while (i & 15);                                                 \
    }                                                                     \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZZ_D(NAME, TYPE, OP)                                         \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                            \
    TYPE *d = vd, *n = vn, *m = vm;                                       \
    uint8_t *pg = vg;                                                     \
    for (i = 0; i < opr_sz; i += 1) {                                     \
        if (pg[H1(i)] & 1) {                                              \
            TYPE nn = n[i], mm = m[i];                                    \
            d[i] = OP(nn, mm);                                            \
        }                                                                 \
    }                                                                     \
}
#define DO_AND(N, M)  (N & M)
#define DO_EOR(N, M)  (N ^ M)
#define DO_ORR(N, M)  (N | M)
#define DO_BIC(N, M)  (N & ~M)
#define DO_ADD(N, M)  (N + M)
#define DO_SUB(N, M)  (N - M)
#define DO_MAX(N, M)  ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M)  ((N) >= (M) ? (M) : (N))
#define DO_ABD(N, M)  ((N) >= (M) ? (N) - (M) : (M) - (N))
#define DO_MUL(N, M)  (N * M)
/*
 * We must avoid the C undefined behaviour cases: division by
 * zero and signed division of INT_MIN by -1.  Both of these
 * have architecturally defined required results for Arm.
 * We special case all signed divisions by -1 to avoid having
 * to deduce the minimum integer for the type involved.
 */
#define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
#define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
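
/* Thus DO_SDIV(INT32_MIN, -1) takes the -N path and, with wrapping
 * signed arithmetic (QEMU builds with -fwrapv), yields INT32_MIN,
 * and any division by zero yields 0 -- as the architecture requires.  */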
DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)

DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)

DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)

DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)

DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)

DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)

DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)

DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
/* Because the computation type is at least twice as large as required,
   these work for both signed and unsigned source types.  */
static inline uint8_t do_mulh_b(int32_t n, int32_t m)
{
    return (n * m) >> 8;
}

static inline uint16_t do_mulh_h(int32_t n, int32_t m)
{
    return (n * m) >> 16;
}

static inline uint32_t do_mulh_s(int64_t n, int64_t m)
{
    return (n * m) >> 32;
}

static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    muls64(&lo, &hi, n, m);
    return hi;
}

static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    mulu64(&lo, &hi, n, m);
    return hi;
}
DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)

DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)

DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)

DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_SDIV)
DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_SDIV)

DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_UDIV)
DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_UDIV)
/* Note that all bits of the shift are significant
   and not modulo the element size.  */
#define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
#define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
#define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)

DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)

DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)

DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
static inline uint16_t do_sadalp_h(int16_t n, int16_t m)
{
    int8_t n1 = n, n2 = n >> 8;
    return m + n1 + n2;
}

static inline uint32_t do_sadalp_s(int32_t n, int32_t m)
{
    int16_t n1 = n, n2 = n >> 16;
    return m + n1 + n2;
}

static inline uint64_t do_sadalp_d(int64_t n, int64_t m)
{
    int32_t n1 = n, n2 = n >> 32;
    return m + n1 + n2;
}

DO_ZPZZ(sve2_sadalp_zpzz_h, int16_t, H1_2, do_sadalp_h)
DO_ZPZZ(sve2_sadalp_zpzz_s, int32_t, H1_4, do_sadalp_s)
DO_ZPZZ_D(sve2_sadalp_zpzz_d, int64_t, do_sadalp_d)

static inline uint16_t do_uadalp_h(uint16_t n, uint16_t m)
{
    uint8_t n1 = n, n2 = n >> 8;
    return m + n1 + n2;
}

static inline uint32_t do_uadalp_s(uint32_t n, uint32_t m)
{
    uint16_t n1 = n, n2 = n >> 16;
    return m + n1 + n2;
}

static inline uint64_t do_uadalp_d(uint64_t n, uint64_t m)
{
    uint32_t n1 = n, n2 = n >> 32;
    return m + n1 + n2;
}

DO_ZPZZ(sve2_uadalp_zpzz_h, uint16_t, H1_2, do_uadalp_h)
DO_ZPZZ(sve2_uadalp_zpzz_s, uint32_t, H1_4, do_uadalp_s)
DO_ZPZZ_D(sve2_uadalp_zpzz_d, uint64_t, do_uadalp_d)
#define do_srshl_b(n, m) do_sqrshl_bhs(n, m, 8, true, NULL)
#define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL)
#define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL)
#define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL)

DO_ZPZZ(sve2_srshl_zpzz_b, int8_t, H1, do_srshl_b)
DO_ZPZZ(sve2_srshl_zpzz_h, int16_t, H1_2, do_srshl_h)
DO_ZPZZ(sve2_srshl_zpzz_s, int32_t, H1_4, do_srshl_s)
DO_ZPZZ_D(sve2_srshl_zpzz_d, int64_t, do_srshl_d)

#define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
#define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
#define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL)
#define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL)

DO_ZPZZ(sve2_urshl_zpzz_b, uint8_t, H1, do_urshl_b)
DO_ZPZZ(sve2_urshl_zpzz_h, uint16_t, H1_2, do_urshl_h)
DO_ZPZZ(sve2_urshl_zpzz_s, uint32_t, H1_4, do_urshl_s)
DO_ZPZZ_D(sve2_urshl_zpzz_d, uint64_t, do_urshl_d)
/*
 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
 * We pass in a pointer to a dummy saturation field to trigger
 * the saturating arithmetic but discard the information about
 * whether it has occurred.
 */
#define do_sqshl_b(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
#define do_sqshl_h(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
#define do_sqshl_s(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
#define do_sqshl_d(n, m) \
    ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
DO_ZPZZ(sve2_sqshl_zpzz_b, int8_t, H1, do_sqshl_b)
DO_ZPZZ(sve2_sqshl_zpzz_h, int16_t, H1_2, do_sqshl_h)
DO_ZPZZ(sve2_sqshl_zpzz_s, int32_t, H1_4, do_sqshl_s)
DO_ZPZZ_D(sve2_sqshl_zpzz_d, int64_t, do_sqshl_d)

#define do_uqshl_b(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
#define do_uqshl_h(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
#define do_uqshl_s(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
#define do_uqshl_d(n, m) \
    ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })

DO_ZPZZ(sve2_uqshl_zpzz_b, uint8_t, H1, do_uqshl_b)
DO_ZPZZ(sve2_uqshl_zpzz_h, uint16_t, H1_2, do_uqshl_h)
DO_ZPZZ(sve2_uqshl_zpzz_s, uint32_t, H1_4, do_uqshl_s)
DO_ZPZZ_D(sve2_uqshl_zpzz_d, uint64_t, do_uqshl_d)

#define do_sqrshl_b(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
#define do_sqrshl_h(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
#define do_sqrshl_s(n, m) \
    ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
#define do_sqrshl_d(n, m) \
    ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })

DO_ZPZZ(sve2_sqrshl_zpzz_b, int8_t, H1, do_sqrshl_b)
DO_ZPZZ(sve2_sqrshl_zpzz_h, int16_t, H1_2, do_sqrshl_h)
DO_ZPZZ(sve2_sqrshl_zpzz_s, int32_t, H1_4, do_sqrshl_s)
DO_ZPZZ_D(sve2_sqrshl_zpzz_d, int64_t, do_sqrshl_d)

#define do_uqrshl_b(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
#define do_uqrshl_h(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
#define do_uqrshl_s(n, m) \
    ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
#define do_uqrshl_d(n, m) \
    ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })

DO_ZPZZ(sve2_uqrshl_zpzz_b, uint8_t, H1, do_uqrshl_b)
DO_ZPZZ(sve2_uqrshl_zpzz_h, uint16_t, H1_2, do_uqrshl_h)
DO_ZPZZ(sve2_uqrshl_zpzz_s, uint32_t, H1_4, do_uqrshl_s)
DO_ZPZZ_D(sve2_uqrshl_zpzz_d, uint64_t, do_uqrshl_d)
#define DO_HADD_BHS(n, m)  (((int64_t)n + m) >> 1)
#define DO_HADD_D(n, m)    ((n >> 1) + (m >> 1) + (n & m & 1))
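
/* For the 64-bit case the addition must not overflow: halving each
 * operand first discards both low bits, and (n & m & 1) restores the
 * carry they would have produced.  E.g. n = 3, m = 1:
 * (3 >> 1) + (1 >> 1) + (3 & 1 & 1) == 2 == (3 + 1) >> 1.  */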
DO_ZPZZ(sve2_shadd_zpzz_b, int8_t, H1, DO_HADD_BHS)
DO_ZPZZ(sve2_shadd_zpzz_h, int16_t, H1_2, DO_HADD_BHS)
DO_ZPZZ(sve2_shadd_zpzz_s, int32_t, H1_4, DO_HADD_BHS)
DO_ZPZZ_D(sve2_shadd_zpzz_d, int64_t, DO_HADD_D)

DO_ZPZZ(sve2_uhadd_zpzz_b, uint8_t, H1, DO_HADD_BHS)
DO_ZPZZ(sve2_uhadd_zpzz_h, uint16_t, H1_2, DO_HADD_BHS)
DO_ZPZZ(sve2_uhadd_zpzz_s, uint32_t, H1_4, DO_HADD_BHS)
DO_ZPZZ_D(sve2_uhadd_zpzz_d, uint64_t, DO_HADD_D)

#define DO_RHADD_BHS(n, m)  (((int64_t)n + m + 1) >> 1)
#define DO_RHADD_D(n, m)    ((n >> 1) + (m >> 1) + ((n | m) & 1))

DO_ZPZZ(sve2_srhadd_zpzz_b, int8_t, H1, DO_RHADD_BHS)
DO_ZPZZ(sve2_srhadd_zpzz_h, int16_t, H1_2, DO_RHADD_BHS)
DO_ZPZZ(sve2_srhadd_zpzz_s, int32_t, H1_4, DO_RHADD_BHS)
DO_ZPZZ_D(sve2_srhadd_zpzz_d, int64_t, DO_RHADD_D)

DO_ZPZZ(sve2_urhadd_zpzz_b, uint8_t, H1, DO_RHADD_BHS)
DO_ZPZZ(sve2_urhadd_zpzz_h, uint16_t, H1_2, DO_RHADD_BHS)
DO_ZPZZ(sve2_urhadd_zpzz_s, uint32_t, H1_4, DO_RHADD_BHS)
DO_ZPZZ_D(sve2_urhadd_zpzz_d, uint64_t, DO_RHADD_D)

#define DO_HSUB_BHS(n, m)  (((int64_t)n - m) >> 1)
#define DO_HSUB_D(n, m)    ((n >> 1) - (m >> 1) - (~n & m & 1))

DO_ZPZZ(sve2_shsub_zpzz_b, int8_t, H1, DO_HSUB_BHS)
DO_ZPZZ(sve2_shsub_zpzz_h, int16_t, H1_2, DO_HSUB_BHS)
DO_ZPZZ(sve2_shsub_zpzz_s, int32_t, H1_4, DO_HSUB_BHS)
DO_ZPZZ_D(sve2_shsub_zpzz_d, int64_t, DO_HSUB_D)

DO_ZPZZ(sve2_uhsub_zpzz_b, uint8_t, H1, DO_HSUB_BHS)
DO_ZPZZ(sve2_uhsub_zpzz_h, uint16_t, H1_2, DO_HSUB_BHS)
DO_ZPZZ(sve2_uhsub_zpzz_s, uint32_t, H1_4, DO_HSUB_BHS)
DO_ZPZZ_D(sve2_uhsub_zpzz_d, uint64_t, DO_HSUB_D)
static inline int32_t do_sat_bhs(int64_t val, int64_t min, int64_t max)
{
    return val >= max ? max : val <= min ? min : val;
}

#define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)
static inline int64_t do_sqadd_d(int64_t n, int64_t m)
{
    int64_t r = n + m;
    if (((r ^ n) & ~(n ^ m)) < 0) {
        /* Signed overflow.  */
        return r < 0 ? INT64_MAX : INT64_MIN;
    }
    return r;
}
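
/* The overflow test reads: the result sign differs from n (r ^ n < 0)
 * while n and m share a sign (~(n ^ m) < 0); only then did the addition
 * overflow, and the sign of r selects which bound to return.  */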
DO_ZPZZ(sve2_sqadd_zpzz_b, int8_t, H1, DO_SQADD_B)
DO_ZPZZ(sve2_sqadd_zpzz_h, int16_t, H1_2, DO_SQADD_H)
DO_ZPZZ(sve2_sqadd_zpzz_s, int32_t, H1_4, DO_SQADD_S)
DO_ZPZZ_D(sve2_sqadd_zpzz_d, int64_t, do_sqadd_d)

#define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
#define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
#define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)

static inline uint64_t do_uqadd_d(uint64_t n, uint64_t m)
{
    uint64_t r = n + m;
    return r < n ? UINT64_MAX : r;
}
DO_ZPZZ(sve2_uqadd_zpzz_b, uint8_t, H1, DO_UQADD_B)
DO_ZPZZ(sve2_uqadd_zpzz_h, uint16_t, H1_2, DO_UQADD_H)
DO_ZPZZ(sve2_uqadd_zpzz_s, uint32_t, H1_4, DO_UQADD_S)
DO_ZPZZ_D(sve2_uqadd_zpzz_d, uint64_t, do_uqadd_d)

#define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
#define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
#define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)

static inline int64_t do_sqsub_d(int64_t n, int64_t m)
{
    int64_t r = n - m;
    if (((r ^ n) & (n ^ m)) < 0) {
        /* Signed overflow.  */
        return r < 0 ? INT64_MAX : INT64_MIN;
    }
    return r;
}
DO_ZPZZ(sve2_sqsub_zpzz_b, int8_t, H1, DO_SQSUB_B)
DO_ZPZZ(sve2_sqsub_zpzz_h, int16_t, H1_2, DO_SQSUB_H)
DO_ZPZZ(sve2_sqsub_zpzz_s, int32_t, H1_4, DO_SQSUB_S)
DO_ZPZZ_D(sve2_sqsub_zpzz_d, int64_t, do_sqsub_d)

#define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
#define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
#define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)

static inline uint64_t do_uqsub_d(uint64_t n, uint64_t m)
{
    return n > m ? n - m : 0;
}
DO_ZPZZ(sve2_uqsub_zpzz_b, uint8_t, H1, DO_UQSUB_B)
DO_ZPZZ(sve2_uqsub_zpzz_h, uint16_t, H1_2, DO_UQSUB_H)
DO_ZPZZ(sve2_uqsub_zpzz_s, uint32_t, H1_4, DO_UQSUB_S)
DO_ZPZZ_D(sve2_uqsub_zpzz_d, uint64_t, do_uqsub_d)

#define DO_SUQADD_B(n, m) \
    do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
#define DO_SUQADD_H(n, m) \
    do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
#define DO_SUQADD_S(n, m) \
    do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)

static inline int64_t do_suqadd_d(int64_t n, uint64_t m)
{
    uint64_t r = n + m;

    if (n < 0) {
        /* Note that m - abs(n) cannot underflow. */
        if (r > INT64_MAX) {
            /* Result is either very large positive or negative. */
            if (m > -n) {
                /* m > abs(n), so r is a very large positive. */
                return INT64_MAX;
            }
            /* Result is negative. */
        }
    } else {
        /* Both inputs are positive: check for overflow. */
        if (r < m || r > INT64_MAX) {
            return INT64_MAX;
        }
    }
    return r;
}
DO_ZPZZ(sve2_suqadd_zpzz_b, uint8_t, H1, DO_SUQADD_B)
DO_ZPZZ(sve2_suqadd_zpzz_h, uint16_t, H1_2, DO_SUQADD_H)
DO_ZPZZ(sve2_suqadd_zpzz_s, uint32_t, H1_4, DO_SUQADD_S)
DO_ZPZZ_D(sve2_suqadd_zpzz_d, uint64_t, do_suqadd_d)

#define DO_USQADD_B(n, m) \
    do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
#define DO_USQADD_H(n, m) \
    do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
#define DO_USQADD_S(n, m) \
    do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)

static inline uint64_t do_usqadd_d(uint64_t n, int64_t m)
{
    uint64_t r = n + m;

    if (m < 0) {
        return n < -m ? 0 : r;
    }
    return r < n ? UINT64_MAX : r;
}
DO_ZPZZ(sve2_usqadd_zpzz_b, uint8_t, H1, DO_USQADD_B)
DO_ZPZZ(sve2_usqadd_zpzz_h, uint16_t, H1_2, DO_USQADD_H)
DO_ZPZZ(sve2_usqadd_zpzz_s, uint32_t, H1_4, DO_USQADD_S)
DO_ZPZZ_D(sve2_usqadd_zpzz_d, uint64_t, do_usqadd_d)

#undef DO_ZPZZ
#undef DO_ZPZZ_D
/*
 * Three operand expander, operating on element pairs.
 * If the slot I is even, the elements are from VN {I, I+1}.
 * If the slot I is odd, the elements are from VM {I-1, I}.
 * Load all of the input elements in each pair before overwriting output.
 */
#define DO_ZPZZ_PAIR(NAME, TYPE, H, OP)                                   \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            TYPE n0 = *(TYPE *)(vn + H(i));                     \
            TYPE m0 = *(TYPE *)(vm + H(i));                     \
            TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));      \
            TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));      \
            if (pg & 1) {                                       \
                *(TYPE *)(vd + H(i)) = OP(n0, n1);              \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
            if (pg & 1) {                                       \
                *(TYPE *)(vd + H(i)) = OP(m0, m1);              \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 15);                                       \
    }                                                           \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZZ_PAIR_D(NAME, TYPE, OP)                                    \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *n = vn, *m = vm;                             \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 2) {                           \
        TYPE n0 = n[i], n1 = n[i + 1];                          \
        TYPE m0 = m[i], m1 = m[i + 1];                          \
        if (pg[H1(i)] & 1) {                                    \
            d[i] = OP(n0, n1);                                  \
        }                                                       \
        if (pg[H1(i + 1)] & 1) {                                \
            d[i + 1] = OP(m0, m1);                              \
        }                                                       \
    }                                                           \
}
DO_ZPZZ_PAIR(sve2_addp_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ_PAIR(sve2_addp_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ_PAIR(sve2_addp_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ_PAIR(sve2_uminp_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ_PAIR(sve2_uminp_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ_PAIR(sve2_sminp_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ_PAIR(sve2_sminp_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d, int64_t, DO_MIN)

#undef DO_ZPZZ_PAIR
#undef DO_ZPZZ_PAIR_D
#define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP)                      \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,      \
                  void *status, uint32_t desc)                  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            TYPE n0 = *(TYPE *)(vn + H(i));                     \
            TYPE m0 = *(TYPE *)(vm + H(i));                     \
            TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE)));      \
            TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE)));      \
            if (pg & 1) {                                       \
                *(TYPE *)(vd + H(i)) = OP(n0, n1, status);      \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
            if (pg & 1) {                                       \
                *(TYPE *)(vd + H(i)) = OP(m0, m1, status);      \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 15);                                       \
    }                                                           \
}
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h, float16, H1_2, float16_add)
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s, float32, H1_4, float32_add)
DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d, float64, , float64_add)

DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h, float16, H1_2, float16_maxnum)
DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s, float32, H1_4, float32_maxnum)
DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d, float64, , float64_maxnum)

DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h, float16, H1_2, float16_minnum)
DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s, float32, H1_4, float32_minnum)
DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d, float64, , float64_minnum)

DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h, float16, H1_2, float16_max)
DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s, float32, H1_4, float32_max)
DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d, float64, , float64_max)

DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h, float16, H1_2, float16_min)
DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s, float32, H1_4, float32_min)
DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d, float64, , float64_min)

#undef DO_ZPZZ_PAIR_FP
/* Three-operand expander, controlled by a predicate, in which the
 * third operand is "wide".  That is, for D = N op M, the same 64-bit
 * value of M is used with all of the narrower values of N.
 */
#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP)                                 \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    for (i = 0; i < opr_sz; ) {                                 \
        uint8_t pg = *(uint8_t *)(vg + H1(i >> 3));             \
        TYPEW mm = *(TYPEW *)(vm + i);                          \
        do {                                                    \
            if (pg & 1) {                                       \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn, mm);              \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 7);                                        \
    }                                                           \
}
DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZPZW
/* Fully general two-operand expander, controlled by a predicate.
 */
#define DO_ZPZ(NAME, TYPE, H, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            if (pg & 1) {                                       \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn);                  \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 15);                                       \
    }                                                           \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZ_D(NAME, TYPE, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *n = vn;                                      \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 1) {                           \
        if (pg[H1(i)] & 1) {                                    \
            TYPE nn = n[i];                                     \
            d[i] = OP(nn);                                      \
        }                                                       \
    }                                                           \
}
#define DO_CLS_B(N)   (clrsb32(N) - 24)
#define DO_CLS_H(N)   (clrsb32(N) - 16)

DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)

#define DO_CLZ_B(N)   (clz32(N) - 24)
#define DO_CLZ_H(N)   (clz32(N) - 16)

DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
DO_ZPZ_D(sve_clz_d, uint64_t, clz64)

DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)

#define DO_CNOT(N)    (N == 0)

DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)

#define DO_FABS(N)    (N & ((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)

#define DO_FNEG(N)    (N ^ ~((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)

#define DO_NOT(N)    (~N)

DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)

#define DO_SXTB(N)    ((int8_t)N)
#define DO_SXTH(N)    ((int16_t)N)
#define DO_SXTS(N)    ((int32_t)N)
#define DO_UXTB(N)    ((uint8_t)N)
#define DO_UXTH(N)    ((uint16_t)N)
#define DO_UXTS(N)    ((uint32_t)N)

DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)

DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)

#define DO_ABS(N)    (N < 0 ? -N : N)

DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)

#define DO_NEG(N)    (-N)

DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)

DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)

DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)

DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)

DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
#define DO_SQABS(X) \
    ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
       x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })
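
/* E.g. for int8_t, DO_SQABS(INT8_MIN) saturates to INT8_MAX (127),
 * since the true absolute value 128 is not representable.  */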
DO_ZPZ(sve2_sqabs_b, int8_t, H1, DO_SQABS)
DO_ZPZ(sve2_sqabs_h, int16_t, H1_2, DO_SQABS)
DO_ZPZ(sve2_sqabs_s, int32_t, H1_4, DO_SQABS)
DO_ZPZ_D(sve2_sqabs_d, int64_t, DO_SQABS)

#define DO_SQNEG(X) \
    ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
       x_ == min_ ? -min_ - 1 : -x_; })

DO_ZPZ(sve2_sqneg_b, uint8_t, H1, DO_SQNEG)
DO_ZPZ(sve2_sqneg_h, uint16_t, H1_2, DO_SQNEG)
DO_ZPZ(sve2_sqneg_s, uint32_t, H1_4, DO_SQNEG)
DO_ZPZ_D(sve2_sqneg_d, uint64_t, DO_SQNEG)

DO_ZPZ(sve2_urecpe_s, uint32_t, H1_4, helper_recpe_u32)
DO_ZPZ(sve2_ursqrte_s, uint32_t, H1_4, helper_rsqrte_u32)
/* Three-operand expander, unpredicated, in which the third operand is "wide".
 */
#define DO_ZZW(NAME, TYPE, TYPEW, H, OP)                       \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                              \
    intptr_t i, opr_sz = simd_oprsz(desc);                     \
    for (i = 0; i < opr_sz; ) {                                \
        TYPEW mm = *(TYPEW *)(vm + i);                         \
        do {                                                   \
            TYPE nn = *(TYPE *)(vn + H(i));                    \
            *(TYPE *)(vd + H(i)) = OP(nn, mm);                 \
            i += sizeof(TYPE);                                 \
        } while (i & 7);                                       \
    }                                                          \
}
DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZZW
/*
 * Three-operand expander, unpredicated, in which the two inputs are
 * selected from the top or bottom half of the wide column.
 */
#define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                       \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                       \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                            \
    }                                                                   \
}
DO_ZZZ_TB(sve2_saddl_h, int16_t, int8_t, H1_2, H1, DO_ADD)
DO_ZZZ_TB(sve2_saddl_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_TB(sve2_saddl_d, int64_t, int32_t, , H1_4, DO_ADD)

DO_ZZZ_TB(sve2_ssubl_h, int16_t, int8_t, H1_2, H1, DO_SUB)
DO_ZZZ_TB(sve2_ssubl_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_TB(sve2_ssubl_d, int64_t, int32_t, , H1_4, DO_SUB)

DO_ZZZ_TB(sve2_sabdl_h, int16_t, int8_t, H1_2, H1, DO_ABD)
DO_ZZZ_TB(sve2_sabdl_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
DO_ZZZ_TB(sve2_sabdl_d, int64_t, int32_t, , H1_4, DO_ABD)

DO_ZZZ_TB(sve2_uaddl_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
DO_ZZZ_TB(sve2_uaddl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_TB(sve2_uaddl_d, uint64_t, uint32_t, , H1_4, DO_ADD)

DO_ZZZ_TB(sve2_usubl_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
DO_ZZZ_TB(sve2_usubl_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_TB(sve2_usubl_d, uint64_t, uint32_t, , H1_4, DO_SUB)

DO_ZZZ_TB(sve2_uabdl_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
DO_ZZZ_TB(sve2_uabdl_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
DO_ZZZ_TB(sve2_uabdl_d, uint64_t, uint32_t, , H1_4, DO_ABD)

DO_ZZZ_TB(sve2_smull_zzz_h, int16_t, int8_t, H1_2, H1, DO_MUL)
DO_ZZZ_TB(sve2_smull_zzz_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZZ_TB(sve2_smull_zzz_d, int64_t, int32_t, , H1_4, DO_MUL)

DO_ZZZ_TB(sve2_umull_zzz_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
DO_ZZZ_TB(sve2_umull_zzz_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZZ_TB(sve2_umull_zzz_d, uint64_t, uint32_t, , H1_4, DO_MUL)
/* Note that the multiply cannot overflow, but the doubling can.  */
static inline int16_t do_sqdmull_h(int16_t n, int16_t m)
{
    int16_t val = n * m;
    return DO_SQADD_H(val, val);
}
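
/* E.g. for the byte-input case, n = m = -128 gives val = 16384;
 * doubling would be 32768, which saturates to INT16_MAX.  */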
static inline int32_t do_sqdmull_s(int32_t n, int32_t m)
{
    int32_t val = n * m;
    return DO_SQADD_S(val, val);
}

static inline int64_t do_sqdmull_d(int64_t n, int64_t m)
{
    int64_t val = n * m;
    return do_sqadd_d(val, val);
}
DO_ZZZ_TB(sve2_sqdmull_zzz_h, int16_t, int8_t, H1_2, H1, do_sqdmull_h)
DO_ZZZ_TB(sve2_sqdmull_zzz_s, int32_t, int16_t, H1_4, H1_2, do_sqdmull_s)
DO_ZZZ_TB(sve2_sqdmull_zzz_d, int64_t, int32_t, , H1_4, do_sqdmull_d)

#undef DO_ZZZ_TB
#define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)         \
{                                                                      \
    intptr_t i, opr_sz = simd_oprsz(desc);                             \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);    \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                      \
        TYPEW nn = *(TYPEW *)(vn + HW(i));                             \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                      \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm);                           \
    }                                                                  \
}
DO_ZZZ_WTB(sve2_saddw_h, int16_t, int8_t, H1_2, H1, DO_ADD)
DO_ZZZ_WTB(sve2_saddw_s, int32_t, int16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_WTB(sve2_saddw_d, int64_t, int32_t, , H1_4, DO_ADD)

DO_ZZZ_WTB(sve2_ssubw_h, int16_t, int8_t, H1_2, H1, DO_SUB)
DO_ZZZ_WTB(sve2_ssubw_s, int32_t, int16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_WTB(sve2_ssubw_d, int64_t, int32_t, , H1_4, DO_SUB)

DO_ZZZ_WTB(sve2_uaddw_h, uint16_t, uint8_t, H1_2, H1, DO_ADD)
DO_ZZZ_WTB(sve2_uaddw_s, uint32_t, uint16_t, H1_4, H1_2, DO_ADD)
DO_ZZZ_WTB(sve2_uaddw_d, uint64_t, uint32_t, , H1_4, DO_ADD)

DO_ZZZ_WTB(sve2_usubw_h, uint16_t, uint8_t, H1_2, H1, DO_SUB)
DO_ZZZ_WTB(sve2_usubw_s, uint32_t, uint16_t, H1_4, H1_2, DO_SUB)
DO_ZZZ_WTB(sve2_usubw_d, uint64_t, uint32_t, , H1_4, DO_SUB)

#undef DO_ZZZ_WTB
#define DO_ZZZ_NTB(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)              \
{                                                                           \
    intptr_t i, opr_sz = simd_oprsz(desc);                                  \
    intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE);     \
    intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
    for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {                        \
        TYPE nn = *(TYPE *)(vn + H(i + sel1));                              \
        TYPE mm = *(TYPE *)(vm + H(i + sel2));                              \
        *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm);                           \
    }                                                                       \
}
DO_ZZZ_NTB(sve2_eoril_b, uint8_t, H1, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_h, uint16_t, H1_2, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_s, uint32_t, H1_4, DO_EOR)
DO_ZZZ_NTB(sve2_eoril_d, uint64_t, , DO_EOR)

#undef DO_ZZZ_NTB
#define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    intptr_t sel1 = simd_data(desc) * sizeof(TYPEN);                    \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                       \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel1));                       \
        TYPEW aa = *(TYPEW *)(va + HW(i));                              \
        *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa;                       \
    }                                                                   \
}
DO_ZZZW_ACC(sve2_sabal_h, int16_t, int8_t, H1_2, H1, DO_ABD)
DO_ZZZW_ACC(sve2_sabal_s, int32_t, int16_t, H1_4, H1_2, DO_ABD)
DO_ZZZW_ACC(sve2_sabal_d, int64_t, int32_t, , H1_4, DO_ABD)

DO_ZZZW_ACC(sve2_uabal_h, uint16_t, uint8_t, H1_2, H1, DO_ABD)
DO_ZZZW_ACC(sve2_uabal_s, uint32_t, uint16_t, H1_4, H1_2, DO_ABD)
DO_ZZZW_ACC(sve2_uabal_d, uint64_t, uint32_t, , H1_4, DO_ABD)

DO_ZZZW_ACC(sve2_smlal_zzzw_h, int16_t, int8_t, H1_2, H1, DO_MUL)
DO_ZZZW_ACC(sve2_smlal_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_MUL)
DO_ZZZW_ACC(sve2_smlal_zzzw_d, int64_t, int32_t, , H1_4, DO_MUL)

DO_ZZZW_ACC(sve2_umlal_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_MUL)
DO_ZZZW_ACC(sve2_umlal_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_MUL)
DO_ZZZW_ACC(sve2_umlal_zzzw_d, uint64_t, uint32_t, , H1_4, DO_MUL)

#define DO_NMUL(N, M)  -(N * M)

DO_ZZZW_ACC(sve2_smlsl_zzzw_h, int16_t, int8_t, H1_2, H1, DO_NMUL)
DO_ZZZW_ACC(sve2_smlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2, DO_NMUL)
DO_ZZZW_ACC(sve2_smlsl_zzzw_d, int64_t, int32_t, , H1_4, DO_NMUL)

DO_ZZZW_ACC(sve2_umlsl_zzzw_h, uint16_t, uint8_t, H1_2, H1, DO_NMUL)
DO_ZZZW_ACC(sve2_umlsl_zzzw_s, uint32_t, uint16_t, H1_4, H1_2, DO_NMUL)
DO_ZZZW_ACC(sve2_umlsl_zzzw_d, uint64_t, uint32_t, , H1_4, DO_NMUL)

#undef DO_ZZZW_ACC
#define DO_XTNB(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
{                                                            \
    intptr_t i, opr_sz = simd_oprsz(desc);                   \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {             \
        TYPE nn = *(TYPE *)(vn + i);                         \
        nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4);  \
        *(TYPE *)(vd + i) = nn;                              \
    }                                                        \
}

#define DO_XTNT(NAME, TYPE, TYPEN, H, OP)                          \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)               \
{                                                                  \
    intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN)); \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                   \
        TYPE nn = *(TYPE *)(vn + i);                               \
        *(TYPEN *)(vd + i + odd) = OP(nn);                         \
    }                                                              \
}
#define DO_SQXTN_H(n)  do_sat_bhs(n, INT8_MIN, INT8_MAX)
#define DO_SQXTN_S(n)  do_sat_bhs(n, INT16_MIN, INT16_MAX)
#define DO_SQXTN_D(n)  do_sat_bhs(n, INT32_MIN, INT32_MAX)

DO_XTNB(sve2_sqxtnb_h, int16_t, DO_SQXTN_H)
DO_XTNB(sve2_sqxtnb_s, int32_t, DO_SQXTN_S)
DO_XTNB(sve2_sqxtnb_d, int64_t, DO_SQXTN_D)

DO_XTNT(sve2_sqxtnt_h, int16_t, int8_t, H1, DO_SQXTN_H)
DO_XTNT(sve2_sqxtnt_s, int32_t, int16_t, H1_2, DO_SQXTN_S)
DO_XTNT(sve2_sqxtnt_d, int64_t, int32_t, H1_4, DO_SQXTN_D)

#define DO_UQXTN_H(n)  do_sat_bhs(n, 0, UINT8_MAX)
#define DO_UQXTN_S(n)  do_sat_bhs(n, 0, UINT16_MAX)
#define DO_UQXTN_D(n)  do_sat_bhs(n, 0, UINT32_MAX)

DO_XTNB(sve2_uqxtnb_h, uint16_t, DO_UQXTN_H)
DO_XTNB(sve2_uqxtnb_s, uint32_t, DO_UQXTN_S)
DO_XTNB(sve2_uqxtnb_d, uint64_t, DO_UQXTN_D)

DO_XTNT(sve2_uqxtnt_h, uint16_t, uint8_t, H1, DO_UQXTN_H)
DO_XTNT(sve2_uqxtnt_s, uint32_t, uint16_t, H1_2, DO_UQXTN_S)
DO_XTNT(sve2_uqxtnt_d, uint64_t, uint32_t, H1_4, DO_UQXTN_D)

DO_XTNB(sve2_sqxtunb_h, int16_t, DO_UQXTN_H)
DO_XTNB(sve2_sqxtunb_s, int32_t, DO_UQXTN_S)
DO_XTNB(sve2_sqxtunb_d, int64_t, DO_UQXTN_D)

DO_XTNT(sve2_sqxtunt_h, int16_t, int8_t, H1, DO_UQXTN_H)
DO_XTNT(sve2_sqxtunt_s, int32_t, int16_t, H1_2, DO_UQXTN_S)
DO_XTNT(sve2_sqxtunt_d, int64_t, int32_t, H1_4, DO_UQXTN_D)

#undef DO_XTNB
#undef DO_XTNT
void HELPER(sve2_adcl_s)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int sel = H4(extract32(desc, SIMD_DATA_SHIFT, 1));
    uint32_t inv = -extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint32_t *a = va, *n = vn;
    uint64_t *d = vd, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        uint32_t e1 = a[2 * i + H4(0)];
        uint32_t e2 = n[2 * i + sel] ^ inv;
        uint64_t c = extract64(m[i], 32, 1);
        /* Compute and store the entire 33-bit result at once. */
        d[i] = c + e1 + e2;
    }
}
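
/* The sum of two 32-bit values plus a carry fits in 33 bits, so the
 * uint64_t element holds the result in bits [31:0] and the carry-out
 * in bit 32, which a subsequent ADCLB/ADCLT extracts as its carry-in.  */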
void HELPER(sve2_adcl_d)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int sel = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint64_t inv = -(uint64_t)extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    uint64_t *d = vd, *a = va, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; i += 2) {
        Int128 e1 = int128_make64(a[i]);
        Int128 e2 = int128_make64(n[i + sel] ^ inv);
        Int128 c = int128_make64(m[i + 1] & 1);
        Int128 r = int128_add(int128_add(e1, e2), c);
        d[i + 0] = int128_getlo(r);
        d[i + 1] = int128_gethi(r);
    }
}
#define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN);     \
    int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel1));                       \
        TYPEW mm = *(TYPEN *)(vm + HN(i + sel2));                       \
        TYPEW aa = *(TYPEW *)(va + HW(i));                              \
        *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm));           \
    }                                                                   \
}
DO_SQDMLAL(sve2_sqdmlal_zzzw_h, int16_t, int8_t, H1_2, H1,
           do_sqdmull_h, DO_SQADD_H)
DO_SQDMLAL(sve2_sqdmlal_zzzw_s, int32_t, int16_t, H1_4, H1_2,
           do_sqdmull_s, DO_SQADD_S)
DO_SQDMLAL(sve2_sqdmlal_zzzw_d, int64_t, int32_t, , H1_4,
           do_sqdmull_d, do_sqadd_d)

DO_SQDMLAL(sve2_sqdmlsl_zzzw_h, int16_t, int8_t, H1_2, H1,
           do_sqdmull_h, DO_SQSUB_H)
DO_SQDMLAL(sve2_sqdmlsl_zzzw_s, int32_t, int16_t, H1_4, H1_2,
           do_sqdmull_s, DO_SQSUB_S)
DO_SQDMLAL(sve2_sqdmlsl_zzzw_d, int64_t, int32_t, , H1_4,
           do_sqdmull_d, do_sqsub_d)

#undef DO_SQDMLAL
#define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);       \
    int rot = simd_data(desc);                                  \
    int sel_a = rot & 1, sel_b = sel_a ^ 1;                     \
    bool sub_r = rot == 1 || rot == 2;                          \
    bool sub_i = rot >= 2;                                      \
    TYPE *d = vd, *n = vn, *m = vm, *a = va;                    \
    for (i = 0; i < opr_sz; i += 2) {                           \
        TYPE elt1_a = n[H(i + sel_a)];                          \
        TYPE elt2_a = m[H(i + sel_a)];                          \
        TYPE elt2_b = m[H(i + sel_b)];                          \
        d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r);           \
        d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i);   \
    }                                                           \
}

#define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))
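
/* Each pass computes one partial product of the complex multiply-add:
 * rot selects which half of N feeds both lanes (sel_a) and which signs
 * apply to the real and imaginary accumulations (sub_r, sub_i), so two
 * CMLA operations with rotations 0 and 90 (or 180 and 270) combine
 * into a full complex multiply.  */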
DO_CMLA_FUNC(sve2_cmla_zzzz_b, uint8_t, H1, DO_CMLA)
DO_CMLA_FUNC(sve2_cmla_zzzz_h, uint16_t, H2, DO_CMLA)
DO_CMLA_FUNC(sve2_cmla_zzzz_s, uint32_t, H4, DO_CMLA)
DO_CMLA_FUNC(sve2_cmla_zzzz_d, uint64_t, , DO_CMLA)

#define DO_SQRDMLAH_B(N, M, A, S) \
    do_sqrdmlah_b(N, M, A, S, true)
#define DO_SQRDMLAH_H(N, M, A, S) \
    ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
#define DO_SQRDMLAH_S(N, M, A, S) \
    ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
#define DO_SQRDMLAH_D(N, M, A, S) \
    do_sqrdmlah_d(N, M, A, S, true)

DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b, int8_t, H1, DO_SQRDMLAH_B)
DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h, int16_t, H2, DO_SQRDMLAH_H)
DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s, int32_t, H4, DO_SQRDMLAH_S)
DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d, int64_t, , DO_SQRDMLAH_D)

#undef DO_CMLA
#undef DO_CMLA_FUNC
#undef DO_SQRDMLAH_B
#undef DO_SQRDMLAH_H
#undef DO_SQRDMLAH_S
#undef DO_SQRDMLAH_D
#define DO_ZZXZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
{                                                                       \
    intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE);     \
    intptr_t i, j, idx = simd_data(desc);                               \
    TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx);           \
    for (i = 0; i < oprsz / sizeof(TYPE); i += segment) {               \
        TYPE mm = m[i];                                                 \
        for (j = 0; j < segment; j++) {                                 \
            d[i + j] = OP(n[i + j], mm, a[i + j]);                      \
        }                                                               \
    }                                                                   \
}
#define DO_SQRDMLAH_H(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
#define DO_SQRDMLAH_S(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
#define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)

DO_ZZXZ(sve2_sqrdmlah_idx_h, int16_t, H2, DO_SQRDMLAH_H)
DO_ZZXZ(sve2_sqrdmlah_idx_s, int32_t, H4, DO_SQRDMLAH_S)
DO_ZZXZ(sve2_sqrdmlah_idx_d, int64_t, , DO_SQRDMLAH_D)

#define DO_SQRDMLSH_H(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
#define DO_SQRDMLSH_S(N, M, A) \
    ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
#define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)

DO_ZZXZ(sve2_sqrdmlsh_idx_h, int16_t, H2, DO_SQRDMLSH_H)
DO_ZZXZ(sve2_sqrdmlsh_idx_s, int32_t, H4, DO_SQRDMLSH_S)
DO_ZZXZ(sve2_sqrdmlsh_idx_d, int64_t, , DO_SQRDMLSH_D)

#undef DO_ZZXZ
#define DO_BITPERM(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                              \
    intptr_t i, opr_sz = simd_oprsz(desc);                     \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {               \
        TYPE nn = *(TYPE *)(vn + i);                           \
        TYPE mm = *(TYPE *)(vm + i);                           \
        *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8);      \
    }                                                          \
}
static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
{
    uint64_t res = 0;
    int db, rb = 0;

    for (db = 0; db < n; ++db) {
        if ((mask >> db) & 1) {
            res |= ((data >> db) & 1) << rb;
            ++rb;
        }
    }
    return res;
}
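
/* E.g. bitextract(0xc3, 0xaa, 8): the data bits under mask bits
 * 1, 3, 5, 7 are 1, 0, 0, 1, packed to the low end as 0x09.  */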
DO_BITPERM(sve2_bext_b, uint8_t, bitextract)
DO_BITPERM(sve2_bext_h, uint16_t, bitextract)
DO_BITPERM(sve2_bext_s, uint32_t, bitextract)
DO_BITPERM(sve2_bext_d, uint64_t, bitextract)
static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
{
    uint64_t res = 0;
    int rb, db = 0;

    for (rb = 0; rb < n; ++rb) {
        if ((mask >> rb) & 1) {
            res |= ((data >> db) & 1) << rb;
            ++db;
        }
    }
    return res;
}
DO_BITPERM(sve2_bdep_b, uint8_t, bitdeposit)
DO_BITPERM(sve2_bdep_h, uint16_t, bitdeposit)
DO_BITPERM(sve2_bdep_s, uint32_t, bitdeposit)
DO_BITPERM(sve2_bdep_d, uint64_t, bitdeposit)
static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
{
    uint64_t resm = 0, resu = 0;
    int db, rbm = 0, rbu = 0;

    for (db = 0; db < n; ++db) {
        uint64_t val = (data >> db) & 1;
        if ((mask >> db) & 1) {
            resm |= val << rbm++;
        } else {
            resu |= val << rbu++;
        }
    }

    return resm | (resu << rbm);
}
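
/* E.g. bitgroup(0xc3, 0xaa, 8): the bits under the mask (1, 0, 0, 1)
 * pack into the low half and the remaining bits (1, 0, 0, 1) into the
 * high half, giving 0x99.  */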
DO_BITPERM(sve2_bgrp_b, uint8_t, bitgroup)
DO_BITPERM(sve2_bgrp_h, uint16_t, bitgroup)
DO_BITPERM(sve2_bgrp_s, uint32_t, bitgroup)
DO_BITPERM(sve2_bgrp_d, uint64_t, bitgroup)

#undef DO_BITPERM
#define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP)                  \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    int sub_r = simd_data(desc);                                \
    if (sub_r) {                                                \
        for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
            TYPE acc_r = *(TYPE *)(vn + H(i));                  \
            TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
            TYPE el2_r = *(TYPE *)(vm + H(i));                  \
            TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
            acc_r = ADD_OP(acc_r, el2_i);                       \
            acc_i = SUB_OP(acc_i, el2_r);                       \
            *(TYPE *)(vd + H(i)) = acc_r;                       \
            *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
        }                                                       \
    } else {                                                    \
        for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
            TYPE acc_r = *(TYPE *)(vn + H(i));                  \
            TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
            TYPE el2_r = *(TYPE *)(vm + H(i));                  \
            TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
            acc_r = SUB_OP(acc_r, el2_i);                       \
            acc_i = ADD_OP(acc_i, el2_r);                       \
            *(TYPE *)(vd + H(i)) = acc_r;                       \
            *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
        }                                                       \
    }                                                           \
}
1641 DO_CADD(sve2_cadd_b
, int8_t, H1
, DO_ADD
, DO_SUB
)
1642 DO_CADD(sve2_cadd_h
, int16_t, H1_2
, DO_ADD
, DO_SUB
)
1643 DO_CADD(sve2_cadd_s
, int32_t, H1_4
, DO_ADD
, DO_SUB
)
1644 DO_CADD(sve2_cadd_d
, int64_t, , DO_ADD
, DO_SUB
)
1646 DO_CADD(sve2_sqcadd_b
, int8_t, H1
, DO_SQADD_B
, DO_SQSUB_B
)
1647 DO_CADD(sve2_sqcadd_h
, int16_t, H1_2
, DO_SQADD_H
, DO_SQSUB_H
)
1648 DO_CADD(sve2_sqcadd_s
, int32_t, H1_4
, DO_SQADD_S
, DO_SQSUB_S
)
1649 DO_CADD(sve2_sqcadd_d
, int64_t, , do_sqadd_d
, do_sqsub_d
)
#define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN)           \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)      \
{                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                \
    intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN); \
    int shift = simd_data(desc) >> 1;                     \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {         \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel));          \
        *(TYPEW *)(vd + HW(i)) = nn << shift;             \
    }                                                     \
}

DO_ZZI_SHLL(sve2_sshll_h, int16_t, int8_t, H1_2, H1)
DO_ZZI_SHLL(sve2_sshll_s, int32_t, int16_t, H1_4, H1_2)
DO_ZZI_SHLL(sve2_sshll_d, int64_t, int32_t,     , H1_4)

DO_ZZI_SHLL(sve2_ushll_h, uint16_t, uint8_t, H1_2, H1)
DO_ZZI_SHLL(sve2_ushll_s, uint32_t, uint16_t, H1_4, H1_2)
DO_ZZI_SHLL(sve2_ushll_d, uint64_t, uint32_t,     , H1_4)

#undef DO_ZZI_SHLL
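
/*
 * Illustrative note (an assumption about the translator, which is not
 * shown here): simd_data(desc) packs the SHLL operands as
 * (shift << 1) | sel, which is what the macro above decodes.  E.g. a
 * top-half shift-left by 3 on the _h form would use
 * simd_data == (3 << 1) | 1 == 7, so sel is one narrow element and
 * each wide lane reads the high narrow element of its pair.
 */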
/* Two-operand reduction expander, controlled by a predicate.
 * The difference between TYPERED and TYPERET has to do with
 * sign-extension.  E.g. for SMAX, TYPERED must be signed,
 * but TYPERET must be unsigned so that e.g. a 32-bit value
 * is not sign-extended to the ABI uint64_t return type.
 */
/* ??? If we were to vectorize this by hand the reduction ordering
 * would change.  For integer operands, this is perfectly fine.
 */
#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
{                                                          \
    intptr_t i, opr_sz = simd_oprsz(desc);                 \
    TYPERED ret = INIT;                                    \
    for (i = 0; i < opr_sz; ) {                            \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
        do {                                               \
            if (pg & 1) {                                  \
                TYPEELT nn = *(TYPEELT *)(vn + H(i));      \
                ret = OP(ret, nn);                         \
            }                                              \
            i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT);  \
        } while (i & 15);                                  \
    }                                                      \
    return (TYPERET)ret;                                   \
}

#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP)             \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
{                                                          \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;             \
    TYPEE *n = vn;                                         \
    uint8_t *pg = vg;                                      \
    TYPER ret = INIT;                                      \
    for (i = 0; i < opr_sz; i += 1) {                      \
        if (pg[H1(i)] & 1) {                               \
            TYPEE nn = n[i];                               \
            ret = OP(ret, nn);                             \
        }                                                  \
    }                                                      \
    return ret;                                            \
}
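
/*
 * Worked example (illustrative) of the TYPERED/TYPERET distinction:
 * for sve_smaxv_s below, the reduction runs in int32_t so that the
 * comparisons are signed, but the result is cast to uint32_t before
 * widening to the uint64_t ABI return.  A result of -1 therefore comes
 * back as 0x00000000ffffffff rather than sign-extended
 * 0xffffffffffffffff.
 */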
DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)

DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)

DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)

DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)

DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)

DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)

DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)

DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)

DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)

#undef DO_VPZ
#undef DO_VPZ_D
/* Two vector operand, one scalar operand, unpredicated.  */
#define DO_ZZI(NAME, TYPE, OP)                                       \
void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc)   \
{                                                                    \
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);            \
    TYPE s = s64, *d = vd, *n = vn;                                  \
    for (i = 0; i < opr_sz; ++i) {                                   \
        d[i] = OP(n[i], s);                                          \
    }                                                                \
}

#define DO_SUBR(X, Y)   (Y - X)

DO_ZZI(sve_subri_b, uint8_t, DO_SUBR)
DO_ZZI(sve_subri_h, uint16_t, DO_SUBR)
DO_ZZI(sve_subri_s, uint32_t, DO_SUBR)
DO_ZZI(sve_subri_d, uint64_t, DO_SUBR)

DO_ZZI(sve_smaxi_b, int8_t, DO_MAX)
DO_ZZI(sve_smaxi_h, int16_t, DO_MAX)
DO_ZZI(sve_smaxi_s, int32_t, DO_MAX)
DO_ZZI(sve_smaxi_d, int64_t, DO_MAX)

DO_ZZI(sve_smini_b, int8_t, DO_MIN)
DO_ZZI(sve_smini_h, int16_t, DO_MIN)
DO_ZZI(sve_smini_s, int32_t, DO_MIN)
DO_ZZI(sve_smini_d, int64_t, DO_MIN)

DO_ZZI(sve_umaxi_b, uint8_t, DO_MAX)
DO_ZZI(sve_umaxi_h, uint16_t, DO_MAX)
DO_ZZI(sve_umaxi_s, uint32_t, DO_MAX)
DO_ZZI(sve_umaxi_d, uint64_t, DO_MAX)

DO_ZZI(sve_umini_b, uint8_t, DO_MIN)
DO_ZZI(sve_umini_h, uint16_t, DO_MIN)
DO_ZZI(sve_umini_s, uint32_t, DO_MIN)
DO_ZZI(sve_umini_d, uint64_t, DO_MIN)
/* Similar to the ARM LastActiveElement pseudocode function, except the
   result is multiplied by the element size.  This includes the not found
   indication; e.g. not found for esz=3 is -8.  */
static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
{
    uint64_t mask = pred_esz_masks[esz];
    intptr_t i = words;

    do {
        uint64_t this_g = g[--i] & mask;
        if (this_g) {
            return i * 64 + (63 - clz64(this_g));
        }
    } while (i > 0);
    return (intptr_t)-1 << esz;
}
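
/*
 * Worked example (illustrative): with esz == 2 (4-byte elements) the
 * mask is 0x1111111111111111.  If the highest surviving bit is bit 12
 * of word 0, the result is 0 * 64 + (63 - clz64(1 << 12)) == 12, the
 * byte offset of the last active element.  With no active bits the
 * result is -1 << 2 == -4, the scaled "not found" marker.
 */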
uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t pred_desc)
{
    intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg;
    intptr_t i = 0;

    do {
        uint64_t this_d = d[i];
        uint64_t this_g = g[i];

        if (this_g) {
            if (!(flags & 4)) {
                /* Set in D the first bit of G.  */
                this_d |= this_g & -this_g;
                d[i] = this_d;
            }
            flags = iter_predtest_fwd(this_d, this_g, flags);
        }
    } while (++i < words);

    return flags;
}
uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
{
    intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
    intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg, esz_mask;
    intptr_t i, next;

    next = last_active_element(vd, words, esz) + (1 << esz);
    esz_mask = pred_esz_masks[esz];

    /* Similar to the pseudocode for pnext, but scaled by ESZ
       so that we find the correct bit.  */
    if (next < words * 64) {
        uint64_t mask = -1;

        if (next & 63) {
            mask = ~((1ull << (next & 63)) - 1);
            next &= -64;
        }
        do {
            uint64_t this_g = g[next / 64] & esz_mask & mask;
            if (this_g != 0) {
                next = (next & -64) + ctz64(this_g);
                break;
            }
            next += 64;
            mask = -1;
        } while (next < words * 64);
    }

    i = 0;
    do {
        uint64_t this_d = 0;
        if (i == next / 64) {
            this_d = 1ull << (next & 63);
        }
        d[i] = this_d;
        flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
    } while (++i < words);

    return flags;
}
/*
 * Copy Zn into Zd, and store zero into inactive elements.
 * If inv, store zeros into the active elements.
 */
void HELPER(sve_movz_b)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] & (expand_pred_b(pg[H1(i)]) ^ inv);
    }
}

void HELPER(sve_movz_h)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] & (expand_pred_h(pg[H1(i)]) ^ inv);
    }
}

void HELPER(sve_movz_s)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t inv = -(uint64_t)(simd_data(desc) & 1);
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] & (expand_pred_s(pg[H1(i)]) ^ inv);
    }
}

void HELPER(sve_movz_d)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;
    uint8_t inv = simd_data(desc);

    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] & -(uint64_t)((pg[H1(i)] ^ inv) & 1);
    }
}
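
/*
 * Illustrative note: expand_pred_b() yields 0xff per active byte, so
 * with inv == 0 the AND above keeps only active bytes, while inv == -1
 * flips the mask so that only inactive bytes survive.  E.g.
 * expand_pred_b(0x05) == 0x0000000000ff00ff, and XORed with -1 it
 * becomes 0xffffffffff00ff00.
 */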
/* Three-operand expander, immediate operand, controlled by a predicate.
 */
#define DO_ZPZI(NAME, TYPE, H, OP)                              \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    TYPE imm = simd_data(desc);                                 \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            if (pg & 1) {                                       \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn, imm);             \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 15);                                       \
    }                                                           \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZI_D(NAME, TYPE, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *n = vn;                                      \
    TYPE imm = simd_data(desc);                                 \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 1) {                           \
        if (pg[H1(i)] & 1) {                                    \
            TYPE nn = n[i];                                     \
            d[i] = OP(nn, imm);                                 \
        }                                                       \
    }                                                           \
}

#define DO_SHR(N, M)  (N >> M)
#define DO_SHL(N, M)  (N << M)

/* Arithmetic shift right for division.  This rounds negative numbers
   toward zero as per signed division.  Therefore before shifting,
   when N is negative, add 2**M-1.  */
#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
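
/*
 * Worked example (illustrative): DO_ASRD(-5, 1) computes
 * (-5 + (2**1 - 1)) >> 1 == -4 >> 1 == -2, matching C signed division
 * -5 / 2 == -2, whereas the plain arithmetic shift -5 >> 1 == -3.
 */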
static inline uint64_t do_urshr(uint64_t x, unsigned sh)
{
    if (likely(sh < 64)) {
        return (x >> sh) + ((x >> (sh - 1)) & 1);
    } else if (sh == 64) {
        return x >> 63;
    } else {
        return 0;
    }
}

static inline int64_t do_srshr(int64_t x, unsigned sh)
{
    if (likely(sh < 64)) {
        return (x >> sh) + ((x >> (sh - 1)) & 1);
    } else {
        /* Rounding the sign bit always produces 0. */
        return 0;
    }
}
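
/*
 * Worked example (illustrative): do_urshr(7, 1) == (7 >> 1) + (7 & 1)
 * == 3 + 1 == 4, i.e. 7/2 rounded to nearest with ties up.  For
 * sh == 64 only the rounding bit can survive, hence the x >> 63 case;
 * for the signed form, rounding the sign bit always cancels to 0.
 */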
DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)

DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)

DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)

DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)

#undef DO_ZPZI
#undef DO_ZPZI_D
#define DO_SHRNB(NAME, TYPEW, TYPEN, OP)                     \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
{                                                            \
    intptr_t i, opr_sz = simd_oprsz(desc);                   \
    int shift = simd_data(desc);                             \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {            \
        TYPEW nn = *(TYPEW *)(vn + i);                       \
        *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift);           \
    }                                                        \
}

#define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP)                  \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)              \
{                                                                 \
    intptr_t i, opr_sz = simd_oprsz(desc);                        \
    int shift = simd_data(desc);                                  \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                 \
        TYPEW nn = *(TYPEW *)(vn + HW(i));                        \
        *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift);   \
    }                                                             \
}
DO_SHRNB(sve2_shrnb_h, uint16_t, uint8_t, DO_SHR)
DO_SHRNB(sve2_shrnb_s, uint32_t, uint16_t, DO_SHR)
DO_SHRNB(sve2_shrnb_d, uint64_t, uint32_t, DO_SHR)

DO_SHRNT(sve2_shrnt_h, uint16_t, uint8_t, H1_2, H1, DO_SHR)
DO_SHRNT(sve2_shrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_SHR)
DO_SHRNT(sve2_shrnt_d, uint64_t, uint32_t,     , H1_4, DO_SHR)

DO_SHRNB(sve2_rshrnb_h, uint16_t, uint8_t, do_urshr)
DO_SHRNB(sve2_rshrnb_s, uint32_t, uint16_t, do_urshr)
DO_SHRNB(sve2_rshrnb_d, uint64_t, uint32_t, do_urshr)

DO_SHRNT(sve2_rshrnt_h, uint16_t, uint8_t, H1_2, H1, do_urshr)
DO_SHRNT(sve2_rshrnt_s, uint32_t, uint16_t, H1_4, H1_2, do_urshr)
DO_SHRNT(sve2_rshrnt_d, uint64_t, uint32_t,     , H1_4, do_urshr)

#define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
#define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
#define DO_SQSHRUN_D(x, sh) \
    do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)

DO_SHRNB(sve2_sqshrunb_h, int16_t, uint8_t, DO_SQSHRUN_H)
DO_SHRNB(sve2_sqshrunb_s, int32_t, uint16_t, DO_SQSHRUN_S)
DO_SHRNB(sve2_sqshrunb_d, int64_t, uint32_t, DO_SQSHRUN_D)

DO_SHRNT(sve2_sqshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRUN_H)
DO_SHRNT(sve2_sqshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRUN_S)
DO_SHRNT(sve2_sqshrunt_d, int64_t, uint32_t,     , H1_4, DO_SQSHRUN_D)

#define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
#define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
#define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)

DO_SHRNB(sve2_sqrshrunb_h, int16_t, uint8_t, DO_SQRSHRUN_H)
DO_SHRNB(sve2_sqrshrunb_s, int32_t, uint16_t, DO_SQRSHRUN_S)
DO_SHRNB(sve2_sqrshrunb_d, int64_t, uint32_t, DO_SQRSHRUN_D)

DO_SHRNT(sve2_sqrshrunt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRUN_H)
DO_SHRNT(sve2_sqrshrunt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRUN_S)
DO_SHRNT(sve2_sqrshrunt_d, int64_t, uint32_t,     , H1_4, DO_SQRSHRUN_D)

#define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
#define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
#define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)

DO_SHRNB(sve2_sqshrnb_h, int16_t, uint8_t, DO_SQSHRN_H)
DO_SHRNB(sve2_sqshrnb_s, int32_t, uint16_t, DO_SQSHRN_S)
DO_SHRNB(sve2_sqshrnb_d, int64_t, uint32_t, DO_SQSHRN_D)

DO_SHRNT(sve2_sqshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQSHRN_H)
DO_SHRNT(sve2_sqshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQSHRN_S)
DO_SHRNT(sve2_sqshrnt_d, int64_t, uint32_t,     , H1_4, DO_SQSHRN_D)

#define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
#define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
#define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)

DO_SHRNB(sve2_sqrshrnb_h, int16_t, uint8_t, DO_SQRSHRN_H)
DO_SHRNB(sve2_sqrshrnb_s, int32_t, uint16_t, DO_SQRSHRN_S)
DO_SHRNB(sve2_sqrshrnb_d, int64_t, uint32_t, DO_SQRSHRN_D)

DO_SHRNT(sve2_sqrshrnt_h, int16_t, uint8_t, H1_2, H1, DO_SQRSHRN_H)
DO_SHRNT(sve2_sqrshrnt_s, int32_t, uint16_t, H1_4, H1_2, DO_SQRSHRN_S)
DO_SHRNT(sve2_sqrshrnt_d, int64_t, uint32_t,     , H1_4, DO_SQRSHRN_D)

#define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
#define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
#define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)

DO_SHRNB(sve2_uqshrnb_h, uint16_t, uint8_t, DO_UQSHRN_H)
DO_SHRNB(sve2_uqshrnb_s, uint32_t, uint16_t, DO_UQSHRN_S)
DO_SHRNB(sve2_uqshrnb_d, uint64_t, uint32_t, DO_UQSHRN_D)

DO_SHRNT(sve2_uqshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQSHRN_H)
DO_SHRNT(sve2_uqshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQSHRN_S)
DO_SHRNT(sve2_uqshrnt_d, uint64_t, uint32_t,     , H1_4, DO_UQSHRN_D)

#define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
#define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
#define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)

DO_SHRNB(sve2_uqrshrnb_h, uint16_t, uint8_t, DO_UQRSHRN_H)
DO_SHRNB(sve2_uqrshrnb_s, uint32_t, uint16_t, DO_UQRSHRN_S)
DO_SHRNB(sve2_uqrshrnb_d, uint64_t, uint32_t, DO_UQRSHRN_D)

DO_SHRNT(sve2_uqrshrnt_h, uint16_t, uint8_t, H1_2, H1, DO_UQRSHRN_H)
DO_SHRNT(sve2_uqrshrnt_s, uint32_t, uint16_t, H1_4, H1_2, DO_UQRSHRN_S)
DO_SHRNT(sve2_uqrshrnt_d, uint64_t, uint32_t,     , H1_4, DO_UQRSHRN_D)

#undef DO_SHRNB
#undef DO_SHRNT
#define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP)                \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)   \
{                                                                \
    intptr_t i, opr_sz = simd_oprsz(desc);                       \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                \
        TYPEW nn = *(TYPEW *)(vn + i);                           \
        TYPEW mm = *(TYPEW *)(vm + i);                           \
        *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT);           \
    }                                                            \
}

#define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP)          \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)     \
{                                                                  \
    intptr_t i, opr_sz = simd_oprsz(desc);                         \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                  \
        TYPEW nn = *(TYPEW *)(vn + HW(i));                         \
        TYPEW mm = *(TYPEW *)(vm + HW(i));                         \
        *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT); \
    }                                                              \
}

#define DO_ADDHN(N, M, SH)  ((N + M) >> SH)
#define DO_RADDHN(N, M, SH) ((N + M + ((__typeof(N))1 << (SH - 1))) >> SH)
#define DO_SUBHN(N, M, SH)  ((N - M) >> SH)
#define DO_RSUBHN(N, M, SH) ((N - M + ((__typeof(N))1 << (SH - 1))) >> SH)
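
/*
 * Worked example (illustrative), 16-to-8-bit case (SHIFT == 8):
 * DO_ADDHN(0x12b4, 0x0100, 8) == 0x13b4 >> 8 == 0x13, while the
 * rounding form adds half an output LSB first:
 * DO_RADDHN(0x12b4, 0x0100, 8) == (0x13b4 + 0x80) >> 8 == 0x14.
 */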
DO_BINOPNB(sve2_addhnb_h, uint16_t, uint8_t, 8, DO_ADDHN)
DO_BINOPNB(sve2_addhnb_s, uint32_t, uint16_t, 16, DO_ADDHN)
DO_BINOPNB(sve2_addhnb_d, uint64_t, uint32_t, 32, DO_ADDHN)

DO_BINOPNT(sve2_addhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_ADDHN)
DO_BINOPNT(sve2_addhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_ADDHN)
DO_BINOPNT(sve2_addhnt_d, uint64_t, uint32_t, 32,     , H1_4, DO_ADDHN)

DO_BINOPNB(sve2_raddhnb_h, uint16_t, uint8_t, 8, DO_RADDHN)
DO_BINOPNB(sve2_raddhnb_s, uint32_t, uint16_t, 16, DO_RADDHN)
DO_BINOPNB(sve2_raddhnb_d, uint64_t, uint32_t, 32, DO_RADDHN)

DO_BINOPNT(sve2_raddhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RADDHN)
DO_BINOPNT(sve2_raddhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RADDHN)
DO_BINOPNT(sve2_raddhnt_d, uint64_t, uint32_t, 32,     , H1_4, DO_RADDHN)

DO_BINOPNB(sve2_subhnb_h, uint16_t, uint8_t, 8, DO_SUBHN)
DO_BINOPNB(sve2_subhnb_s, uint32_t, uint16_t, 16, DO_SUBHN)
DO_BINOPNB(sve2_subhnb_d, uint64_t, uint32_t, 32, DO_SUBHN)

DO_BINOPNT(sve2_subhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_SUBHN)
DO_BINOPNT(sve2_subhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_SUBHN)
DO_BINOPNT(sve2_subhnt_d, uint64_t, uint32_t, 32,     , H1_4, DO_SUBHN)

DO_BINOPNB(sve2_rsubhnb_h, uint16_t, uint8_t, 8, DO_RSUBHN)
DO_BINOPNB(sve2_rsubhnb_s, uint32_t, uint16_t, 16, DO_RSUBHN)
DO_BINOPNB(sve2_rsubhnb_d, uint64_t, uint32_t, 32, DO_RSUBHN)

DO_BINOPNT(sve2_rsubhnt_h, uint16_t, uint8_t, 8, H1_2, H1, DO_RSUBHN)
DO_BINOPNT(sve2_rsubhnt_s, uint32_t, uint16_t, 16, H1_4, H1_2, DO_RSUBHN)
DO_BINOPNT(sve2_rsubhnt_d, uint64_t, uint32_t, 32,     , H1_4, DO_RSUBHN)

#undef DO_RSUBHN
#undef DO_SUBHN
#undef DO_RADDHN
#undef DO_ADDHN
/* Fully general four-operand expander, controlled by a predicate.
 */
#define DO_ZPZZZ(NAME, TYPE, H, OP)                           \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
                  void *vg, uint32_t desc)                    \
{                                                             \
    intptr_t i, opr_sz = simd_oprsz(desc);                    \
    for (i = 0; i < opr_sz; ) {                               \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));       \
        do {                                                  \
            if (pg & 1) {                                     \
                TYPE nn = *(TYPE *)(vn + H(i));               \
                TYPE mm = *(TYPE *)(vm + H(i));               \
                TYPE aa = *(TYPE *)(va + H(i));               \
                *(TYPE *)(vd + H(i)) = OP(aa, nn, mm);        \
            }                                                 \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);           \
        } while (i & 15);                                     \
    }                                                         \
}

/* Similarly, specialized for 64-bit operands.  */
#define DO_ZPZZZ_D(NAME, TYPE, OP)                            \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
                  void *vg, uint32_t desc)                    \
{                                                             \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                \
    TYPE *d = vd, *a = va, *n = vn, *m = vm;                  \
    uint8_t *pg = vg;                                         \
    for (i = 0; i < opr_sz; i += 1) {                         \
        if (pg[H1(i)] & 1) {                                  \
            TYPE aa = a[i], nn = n[i], mm = m[i];             \
            d[i] = OP(aa, nn, mm);                            \
        }                                                     \
    }                                                         \
}

#define DO_MLA(A, N, M)  (A + N * M)
#define DO_MLS(A, N, M)  (A - N * M)

DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)

DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)

DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)

DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)

#undef DO_MLA
#undef DO_MLS
#undef DO_ZPZZZ
#undef DO_ZPZZZ_D
void HELPER(sve_index_b)(void *vd, uint32_t start,
                         uint32_t incr, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd;
    for (i = 0; i < opr_sz; i += 1) {
        d[H1(i)] = start + i * incr;
    }
}

void HELPER(sve_index_h)(void *vd, uint32_t start,
                         uint32_t incr, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 2;
    uint16_t *d = vd;
    for (i = 0; i < opr_sz; i += 1) {
        d[H2(i)] = start + i * incr;
    }
}

void HELPER(sve_index_s)(void *vd, uint32_t start,
                         uint32_t incr, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    uint32_t *d = vd;
    for (i = 0; i < opr_sz; i += 1) {
        d[H4(i)] = start + i * incr;
    }
}

void HELPER(sve_index_d)(void *vd, uint64_t start,
                         uint64_t incr, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = start + i * incr;
    }
}
void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    uint32_t sh = simd_data(desc);
    uint32_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] + (m[i] << sh);
    }
}

void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t sh = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] + (m[i] << sh);
    }
}

void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t sh = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
    }
}

void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t sh = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
    }
}
void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
{
    /* These constants are cut-and-paste directly from the ARM pseudocode.  */
    static const uint16_t coeff[] = {
        0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
        0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
        0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
        0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / 2;
    uint16_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; i++) {
        uint16_t nn = n[i];
        intptr_t idx = extract32(nn, 0, 5);
        uint16_t exp = extract32(nn, 5, 5);
        d[i] = coeff[idx] | (exp << 10);
    }
}
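
/*
 * Worked example (illustrative): FEXPA input 0x003f has idx == 31 and
 * exp == 1, so the result is coeff[31] | (1 << 10) == 0x03d4 | 0x0400
 * == 0x07d4.  The coefficient supplies the fraction bits of 2**(31/32)
 * and the exp field lands directly in the fp16 exponent.
 */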
void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
{
    /* These constants are cut-and-paste directly from the ARM pseudocode.  */
    static const uint32_t coeff[] = {
        0x000000, 0x0164d2, 0x02cd87, 0x043a29,
        0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
        0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
        0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
        0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
        0x1ef532, 0x20b051, 0x227043, 0x243516,
        0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
        0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
        0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
        0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
        0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
        0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
        0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
        0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
        0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
        0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    uint32_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; i++) {
        uint32_t nn = n[i];
        intptr_t idx = extract32(nn, 0, 6);
        uint32_t exp = extract32(nn, 6, 8);
        d[i] = coeff[idx] | (exp << 23);
    }
}
void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
{
    /* These constants are cut-and-paste directly from the ARM pseudocode.  */
    static const uint64_t coeff[] = {
        0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
        0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
        0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
        0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
        0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
        0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
        0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
        0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
        0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
        0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
        0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
        0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
        0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
        0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
        0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
        0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
        0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
        0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
        0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
        0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
        0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
        0xFA7C1819E90D8ull,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;

    for (i = 0; i < opr_sz; i++) {
        uint64_t nn = n[i];
        intptr_t idx = extract32(nn, 0, 6);
        uint64_t exp = extract32(nn, 6, 11);
        d[i] = coeff[idx] | (exp << 52);
    }
}
void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 2;
    uint16_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        uint16_t nn = n[i];
        uint16_t mm = m[i];
        if (mm & 1) {
            nn = float16_one;
        }
        d[i] = nn ^ (mm & 2) << 14;
    }
}

void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    uint32_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        uint32_t nn = n[i];
        uint32_t mm = m[i];
        if (mm & 1) {
            nn = float32_one;
        }
        d[i] = nn ^ (mm & 2) << 30;
    }
}

void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i];
        uint64_t mm = m[i];
        if (mm & 1) {
            nn = float64_one;
        }
        d[i] = nn ^ (mm & 2) << 62;
    }
}
/*
 * Signed saturating addition with scalar operand.
 */

void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(int8_t)) {
        *(int8_t *)(d + i) = DO_SQADD_B(b, *(int8_t *)(a + i));
    }
}

void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(int16_t)) {
        *(int16_t *)(d + i) = DO_SQADD_H(b, *(int16_t *)(a + i));
    }
}

void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(int32_t)) {
        *(int32_t *)(d + i) = DO_SQADD_S(b, *(int32_t *)(a + i));
    }
}

void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(int64_t)) {
        *(int64_t *)(d + i) = do_sqadd_d(b, *(int64_t *)(a + i));
    }
}
/*
 * Unsigned saturating addition with scalar operand.
 */

void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
        *(uint8_t *)(d + i) = DO_UQADD_B(b, *(uint8_t *)(a + i));
    }
}

void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
        *(uint16_t *)(d + i) = DO_UQADD_H(b, *(uint16_t *)(a + i));
    }
}

void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
        *(uint32_t *)(d + i) = DO_UQADD_S(b, *(uint32_t *)(a + i));
    }
}

void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
        *(uint64_t *)(d + i) = do_uqadd_d(b, *(uint64_t *)(a + i));
    }
}

void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);

    for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
        *(uint64_t *)(d + i) = do_uqsub_d(*(uint64_t *)(a + i), b);
    }
}
/* Two operand predicated copy immediate with merge.  All valid immediates
 * can fit within 17 signed bits in the simd_data field.
 */
void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
                         uint64_t mm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    mm = dup_const(MO_8, mm);
    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i];
        uint64_t pp = expand_pred_b(pg[H1(i)]);
        d[i] = (mm & pp) | (nn & ~pp);
    }
}

void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
                         uint64_t mm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    mm = dup_const(MO_16, mm);
    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i];
        uint64_t pp = expand_pred_h(pg[H1(i)]);
        d[i] = (mm & pp) | (nn & ~pp);
    }
}

void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
                         uint64_t mm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    mm = dup_const(MO_32, mm);
    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i];
        uint64_t pp = expand_pred_s(pg[H1(i)]);
        d[i] = (mm & pp) | (nn & ~pp);
    }
}

void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
                         uint64_t mm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i];
        d[i] = (pg[H1(i)] & 1 ? mm : nn);
    }
}

void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;

    val = dup_const(MO_8, val);
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = val & expand_pred_b(pg[H1(i)]);
    }
}

void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;

    val = dup_const(MO_16, val);
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = val & expand_pred_h(pg[H1(i)]);
    }
}

void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;

    val = dup_const(MO_32, val);
    for (i = 0; i < opr_sz; i += 1) {
        d[i] = val & expand_pred_s(pg[H1(i)]);
    }
}

void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        d[i] = (pg[H1(i)] & 1 ? val : 0);
    }
}
/* Big-endian hosts need to frob the byte indices.  If the copy
 * happens to be 8-byte aligned, then no frobbing necessary.
 */
static void swap_memmove(void *vd, void *vs, size_t n)
{
    uintptr_t d = (uintptr_t)vd;
    uintptr_t s = (uintptr_t)vs;
    uintptr_t o = (d | s | n) & 7;
    size_t i;

#ifndef HOST_WORDS_BIGENDIAN
    o = 0;
#endif
    switch (o) {
    case 0:
        memmove(vd, vs, n);
        break;

    case 4:
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i += 4) {
                *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 4;
                *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
            }
        }
        break;

    case 2:
    case 6:
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i += 2) {
                *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 2;
                *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
            }
        }
        break;

    default:
        if (d < s || d >= s + n) {
            for (i = 0; i < n; i++) {
                *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
            }
        } else {
            for (i = n; i > 0; ) {
                i -= 1;
                *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
            }
        }
        break;
    }
}
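
/*
 * Illustrative note: on a big-endian host the vector bytes live at
 * H1()-adjusted offsets within each 64-bit chunk, so a copy that is
 * only 4-byte aligned must move 32-bit units through H1_4()
 * (offset ^ 4) to land in the correct half of each chunk.  Only a
 * fully 8-byte aligned copy can fall through to plain memmove().
 */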
/* Similarly for memset of 0.  */
static void swap_memzero(void *vd, size_t n)
{
    uintptr_t d = (uintptr_t)vd;
    uintptr_t o = (d | n) & 7;
    size_t i;

    /* Usually, the first bit of a predicate is set, so N is 0.  */
    if (likely(n == 0)) {
        return;
    }

#ifndef HOST_WORDS_BIGENDIAN
    o = 0;
#endif
    switch (o) {
    case 0:
        memset(vd, 0, n);
        break;

    case 4:
        for (i = 0; i < n; i += 4) {
            *(uint32_t *)H1_4(d + i) = 0;
        }
        break;

    case 2:
    case 6:
        for (i = 0; i < n; i += 2) {
            *(uint16_t *)H1_2(d + i) = 0;
        }
        break;

    default:
        for (i = 0; i < n; i++) {
            *(uint8_t *)H1(d + i) = 0;
        }
        break;
    }
}
void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t opr_sz = simd_oprsz(desc);
    size_t n_ofs = simd_data(desc);
    size_t n_siz = opr_sz - n_ofs;

    if (vd != vm) {
        swap_memmove(vd, vn + n_ofs, n_siz);
        swap_memmove(vd + n_siz, vm, n_ofs);
    } else if (vd != vn) {
        swap_memmove(vd + n_siz, vd, n_ofs);
        swap_memmove(vd, vn + n_ofs, n_siz);
    } else {
        /* vd == vn == vm.  Need temp space.  */
        ARMVectorReg tmp;
        swap_memmove(&tmp, vm, n_ofs);
        swap_memmove(vd, vd + n_ofs, n_siz);
        memcpy(vd + n_siz, &tmp, n_ofs);
    }
}
#define DO_INSR(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
{                                                                  \
    intptr_t opr_sz = simd_oprsz(desc);                            \
    swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE));    \
    *(TYPE *)(vd + H(0)) = val;                                    \
}

DO_INSR(sve_insr_b, uint8_t, H1)
DO_INSR(sve_insr_h, uint16_t, H1_2)
DO_INSR(sve_insr_s, uint32_t, H1_4)
DO_INSR(sve_insr_d, uint64_t, )

#undef DO_INSR
void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
        uint64_t f = *(uint64_t *)(vn + i);
        uint64_t b = *(uint64_t *)(vn + j);
        *(uint64_t *)(vd + i) = bswap64(b);
        *(uint64_t *)(vd + j) = bswap64(f);
    }
}

void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
        uint64_t f = *(uint64_t *)(vn + i);
        uint64_t b = *(uint64_t *)(vn + j);
        *(uint64_t *)(vd + i) = hswap64(b);
        *(uint64_t *)(vd + j) = hswap64(f);
    }
}

void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
        uint64_t f = *(uint64_t *)(vn + i);
        uint64_t b = *(uint64_t *)(vn + j);
        *(uint64_t *)(vd + i) = rol64(b, 32);
        *(uint64_t *)(vd + j) = rol64(f, 32);
    }
}

void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
        uint64_t f = *(uint64_t *)(vn + i);
        uint64_t b = *(uint64_t *)(vn + j);
        *(uint64_t *)(vd + i) = b;
        *(uint64_t *)(vd + j) = f;
    }
}
#define DO_TBL(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    uintptr_t elem = opr_sz / sizeof(TYPE);                     \
    TYPE *d = vd, *n = vn, *m = vm;                             \
    ARMVectorReg tmp;                                           \
    if (unlikely(vd == vn)) {                                   \
        n = memcpy(&tmp, vn, opr_sz);                           \
    }                                                           \
    for (i = 0; i < elem; i++) {                                \
        TYPE j = m[H(i)];                                       \
        d[H(i)] = j < elem ? n[H(j)] : 0;                       \
    }                                                           \
}

DO_TBL(sve_tbl_b, uint8_t, H1)
DO_TBL(sve_tbl_h, uint16_t, H2)
DO_TBL(sve_tbl_s, uint32_t, H4)
DO_TBL(sve_tbl_d, uint64_t, )

#undef DO_TBL
#define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)           \
{                                                              \
    intptr_t i, opr_sz = simd_oprsz(desc);                     \
    TYPED *d = vd;                                             \
    TYPES *n = vn;                                             \
    ARMVectorReg tmp;                                          \
    if (unlikely(vn - vd < opr_sz)) {                          \
        n = memcpy(&tmp, n, opr_sz / 2);                       \
    }                                                          \
    for (i = 0; i < opr_sz / sizeof(TYPED); i++) {             \
        d[HD(i)] = n[HS(i)];                                   \
    }                                                          \
}

DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)

DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)

#undef DO_UNPK
/* Mask of bits included in the even numbered predicates of width esz.
 * We also use this for expand_bits/compress_bits, and so extend the
 * same pattern out to 16-bit units.
 */
static const uint64_t even_bit_esz_masks[5] = {
    0x5555555555555555ull,
    0x3333333333333333ull,
    0x0f0f0f0f0f0f0f0full,
    0x00ff00ff00ff00ffull,
    0x0000ffff0000ffffull,
};
/* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
 * For N==0, this corresponds to the operation that in qemu/bitops.h
 * we call half_shuffle64; this algorithm is from Hacker's Delight,
 * section 7-2 Shuffling Bits.
 */
static uint64_t expand_bits(uint64_t x, int n)
{
    int i;

    x &= 0xffffffffu;
    for (i = 4; i >= n; i--) {
        int sh = 1 << i;
        x = ((x << sh) | x) & even_bit_esz_masks[i];
    }
    return x;
}
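
/*
 * Worked example (illustrative): expand_bits(0xb, 0) interleaves a
 * zero above every bit, so 0b1011 becomes 0b01000101 == 0x45.  Each
 * loop iteration doubles the unit size, with the i == 4 step spreading
 * the two 16-bit halves of the 32-bit input across the 64-bit result.
 */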
/* Compress units of 2**(N+1) bits to units of 2**N bits.
 * For N==0, this corresponds to the operation that in qemu/bitops.h
 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
 */
static uint64_t compress_bits(uint64_t x, int n)
{
    int i;

    for (i = n; i <= 4; i++) {
        int sh = 1 << i;
        x &= even_bit_esz_masks[i];
        x = (x >> sh) | x;
    }
    return x & 0xffffffffu;
}
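
/*
 * Worked example (illustrative): compress_bits(0x45, 0) == 0xb, the
 * inverse of the expansion above: the even-numbered bits are kept and
 * the zeros between them are squeezed out.
 */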
void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
    int esize = 1 << esz;
    uint64_t *d = vd;
    intptr_t i;

    if (oprsz <= 8) {
        uint64_t nn = *(uint64_t *)vn;
        uint64_t mm = *(uint64_t *)vm;
        int half = 4 * oprsz;

        nn = extract64(nn, high * half, half);
        mm = extract64(mm, high * half, half);
        nn = expand_bits(nn, esz);
        mm = expand_bits(mm, esz);
        d[0] = nn | (mm << esize);
    } else {
        ARMPredicateReg tmp;

        /* We produce output faster than we consume input.
           Therefore we must be mindful of possible overlap.  */
        if (vd == vn) {
            vn = memcpy(&tmp, vn, oprsz);
            if (vd == vm) {
                vm = vn;
            }
        } else if (vd == vm) {
            vm = memcpy(&tmp, vm, oprsz);
        }
        if (high) {
            high = oprsz >> 1;
        }

        if ((oprsz & 7) == 0) {
            uint32_t *n = vn, *m = vm;
            high >>= 2;

            for (i = 0; i < oprsz / 8; i++) {
                uint64_t nn = n[H4(high + i)];
                uint64_t mm = m[H4(high + i)];

                nn = expand_bits(nn, esz);
                mm = expand_bits(mm, esz);
                d[i] = nn | (mm << esize);
            }
        } else {
            uint8_t *n = vn, *m = vm;
            uint16_t *d16 = vd;

            for (i = 0; i < oprsz / 2; i++) {
                uint16_t nn = n[H1(high + i)];
                uint16_t mm = m[H1(high + i)];

                nn = expand_bits(nn, esz);
                mm = expand_bits(mm, esz);
                d16[H2(i)] = nn | (mm << esize);
            }
        }
    }
}
void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    int odd = FIELD_EX32(pred_desc, PREDDESC, DATA) << esz;
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t l, h;
    intptr_t i;

    if (oprsz <= 8) {
        l = compress_bits(n[0] >> odd, esz);
        h = compress_bits(m[0] >> odd, esz);
        d[0] = l | (h << (4 * oprsz));
    } else {
        ARMPredicateReg tmp_m;
        intptr_t oprsz_16 = oprsz / 16;

        if ((vm - vd) < (uintptr_t)oprsz) {
            m = memcpy(&tmp_m, vm, oprsz);
        }

        for (i = 0; i < oprsz_16; i++) {
            l = n[2 * i + 0];
            h = n[2 * i + 1];
            l = compress_bits(l >> odd, esz);
            h = compress_bits(h >> odd, esz);
            d[i] = l | (h << 32);
        }

        /*
         * For VL which is not a multiple of 512, the results from M do not
         * align nicely with the uint64_t for D.  Put the aligned results
         * from M into TMP_M and then copy it into place afterward.
         */
        if (oprsz & 15) {
            int final_shift = (oprsz & 15) * 2;

            l = n[2 * i + 0];
            h = n[2 * i + 1];
            l = compress_bits(l >> odd, esz);
            h = compress_bits(h >> odd, esz);
            d[i] = l | (h << final_shift);

            for (i = 0; i < oprsz_16; i++) {
                l = m[2 * i + 0];
                h = m[2 * i + 1];
                l = compress_bits(l >> odd, esz);
                h = compress_bits(h >> odd, esz);
                tmp_m.p[i] = l | (h << 32);
            }
            l = m[2 * i + 0];
            h = m[2 * i + 1];
            l = compress_bits(l >> odd, esz);
            h = compress_bits(h >> odd, esz);
            tmp_m.p[i] = l | (h << final_shift);

            swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
        } else {
            for (i = 0; i < oprsz_16; i++) {
                l = m[2 * i + 0];
                h = m[2 * i + 1];
                l = compress_bits(l >> odd, esz);
                h = compress_bits(h >> odd, esz);
                d[oprsz_16 + i] = l | (h << 32);
            }
        }
    }
}
void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    int odd = FIELD_EX32(pred_desc, PREDDESC, DATA);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t mask;
    int shr, shl;
    intptr_t i;

    shl = 1 << esz;
    shr = 0;
    mask = even_bit_esz_masks[esz];
    if (odd) {
        mask <<= shl;
        shr = shl;
        shl = 0;
    }

    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
        uint64_t nn = (n[i] & mask) >> shr;
        uint64_t mm = (m[i] & mask) << shl;
        d[i] = nn | mm;
    }
}
/* Reverse units of 2**N bits.  */
static uint64_t reverse_bits_64(uint64_t x, int n)
{
    int i, sh;

    x = bswap64(x);
    for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
        uint64_t mask = even_bit_esz_masks[i];
        x = ((x & mask) << sh) | ((x >> sh) & mask);
    }
    return x;
}

static uint8_t reverse_bits_8(uint8_t x, int n)
{
    static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
    int i, sh;

    for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
        x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
    }
    return x;
}
void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    int esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    intptr_t i, oprsz_2 = oprsz / 2;

    if (oprsz <= 8) {
        uint64_t l = *(uint64_t *)vn;
        l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
        *(uint64_t *)vd = l;
    } else if ((oprsz & 15) == 0) {
        for (i = 0; i < oprsz_2; i += 8) {
            intptr_t ih = oprsz - 8 - i;
            uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
            uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
            *(uint64_t *)(vd + i) = h;
            *(uint64_t *)(vd + ih) = l;
        }
    } else {
        for (i = 0; i < oprsz_2; i += 1) {
            intptr_t il = H1(i);
            intptr_t ih = H1(oprsz - 1 - i);
            uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
            uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
            *(uint8_t *)(vd + il) = h;
            *(uint8_t *)(vd + ih) = l;
        }
    }
}
void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    intptr_t high = FIELD_EX32(pred_desc, PREDDESC, DATA);
    uint64_t *d = vd;
    intptr_t i;

    if (oprsz <= 8) {
        uint64_t nn = *(uint64_t *)vn;
        int half = 4 * oprsz;

        nn = extract64(nn, high * half, half);
        nn = expand_bits(nn, 0);
        d[0] = nn;
    } else {
        ARMPredicateReg tmp_n;

        /* We produce output faster than we consume input.
           Therefore we must be mindful of possible overlap.  */
        if ((vn - vd) < (uintptr_t)oprsz) {
            vn = memcpy(&tmp_n, vn, oprsz);
        }
        if (high) {
            high = oprsz >> 1;
        }

        if ((oprsz & 7) == 0) {
            uint32_t *n = vn;
            high >>= 2;

            for (i = 0; i < oprsz / 8; i++) {
                uint64_t nn = n[H4(high + i)];
                d[i] = expand_bits(nn, 0);
            }
        } else {
            uint16_t *d16 = vd;
            uint8_t *n = vn;

            for (i = 0; i < oprsz / 2; i++) {
                uint16_t nn = n[H1(high + i)];
                d16[H2(i)] = expand_bits(nn, 0);
            }
        }
    }
}
#define DO_ZIP(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
{                                                                       \
    intptr_t oprsz = simd_oprsz(desc);                                  \
    intptr_t i, oprsz_2 = oprsz / 2;                                    \
    ARMVectorReg tmp_n, tmp_m;                                          \
    /* We produce output faster than we consume input.                  \
       Therefore we must be mindful of possible overlap.  */            \
    if (unlikely((vn - vd) < (uintptr_t)oprsz)) {                       \
        vn = memcpy(&tmp_n, vn, oprsz_2);                               \
    }                                                                   \
    if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                       \
        vm = memcpy(&tmp_m, vm, oprsz_2);                               \
    }                                                                   \
    for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                       \
        *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i));            \
        *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
    }                                                                   \
}

DO_ZIP(sve_zip_b, uint8_t, H1)
DO_ZIP(sve_zip_h, uint16_t, H1_2)
DO_ZIP(sve_zip_s, uint32_t, H1_4)
DO_ZIP(sve_zip_d, uint64_t, )

#define DO_UZP(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
{                                                                       \
    intptr_t oprsz = simd_oprsz(desc);                                  \
    intptr_t oprsz_2 = oprsz / 2;                                       \
    intptr_t odd_ofs = simd_data(desc);                                 \
    intptr_t i;                                                         \
    ARMVectorReg tmp_m;                                                 \
    if (unlikely((vm - vd) < (uintptr_t)oprsz)) {                       \
        vm = memcpy(&tmp_m, vm, oprsz);                                 \
    }                                                                   \
    for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                       \
        *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs));      \
    }                                                                   \
    for (i = 0; i < oprsz_2; i += sizeof(TYPE)) {                       \
        *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
    }                                                                   \
}

DO_UZP(sve_uzp_b, uint8_t, H1)
DO_UZP(sve_uzp_h, uint16_t, H1_2)
DO_UZP(sve_uzp_s, uint32_t, H1_4)
DO_UZP(sve_uzp_d, uint64_t, )

#define DO_TRN(NAME, TYPE, H) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
{                                                                       \
    intptr_t oprsz = simd_oprsz(desc);                                  \
    intptr_t odd_ofs = simd_data(desc);                                 \
    intptr_t i;                                                         \
    for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) {                     \
        TYPE ae = *(TYPE *)(vn + H(i + odd_ofs));                       \
        TYPE be = *(TYPE *)(vm + H(i + odd_ofs));                       \
        *(TYPE *)(vd + H(i + 0)) = ae;                                  \
        *(TYPE *)(vd + H(i + sizeof(TYPE))) = be;                       \
    }                                                                   \
}

DO_TRN(sve_trn_b, uint8_t, H1)
DO_TRN(sve_trn_h, uint16_t, H1_2)
DO_TRN(sve_trn_s, uint32_t, H1_4)
DO_TRN(sve_trn_d, uint64_t, )

#undef DO_ZIP
#undef DO_UZP
#undef DO_TRN
void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
    uint32_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    for (i = j = 0; i < opr_sz; i++) {
        if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
            d[H4(j)] = n[H4(i)];
            j++;
        }
    }
    for (; j < opr_sz; j++) {
        d[H4(j)] = 0;
    }
}

void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn;
    uint8_t *pg = vg;

    for (i = j = 0; i < opr_sz; i++) {
        if (pg[H1(i)] & 1) {
            d[j] = n[i];
            j++;
        }
    }
    for (; j < opr_sz; j++) {
        d[j] = 0;
    }
}
/* Similar to the ARM LastActiveElement pseudocode function, except the
 * result is multiplied by the element size.  This includes the not found
 * indication; e.g. not found for esz=3 is -8.
 */
int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
{
    intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
    intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);

    return last_active_element(vg, words, esz);
}
void HELPER(sve_splice)(void *vd, void *vn, void *vm, void *vg, uint32_t desc)
{
    intptr_t opr_sz = simd_oprsz(desc) / 8;
    int esz = simd_data(desc);
    uint64_t pg, first_g, last_g, len, mask = pred_esz_masks[esz];
    intptr_t i, first_i, last_i;
    ARMVectorReg tmp;

    first_i = last_i = 0;
    first_g = last_g = 0;

    /* Find the extent of the active elements within VG.  */
    for (i = QEMU_ALIGN_UP(opr_sz, 8) - 8; i >= 0; i -= 8) {
        pg = *(uint64_t *)(vg + i) & mask;
        if (pg) {
            if (last_g == 0) {
                last_g = pg;
                last_i = i;
            }
            first_g = pg;
            first_i = i;
        }
    }

    len = 0;
    if (first_g != 0) {
        first_i = first_i * 8 + ctz64(first_g);
        last_i = last_i * 8 + 63 - clz64(last_g);
        len = last_i - first_i + (1 << esz);
        if (vm == vd) {
            vm = memcpy(&tmp, vm, opr_sz * 8);
        }
        swap_memmove(vd, vn + first_i, len);
    }
    swap_memmove(vd + len, vm, opr_sz * 8 - len);
}
void HELPER(sve_sel_zpzz_b)(void *vd, void *vn, void *vm,
                            void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i], mm = m[i];
        uint64_t pp = expand_pred_b(pg[H1(i)]);
        d[i] = (nn & pp) | (mm & ~pp);
    }
}

void HELPER(sve_sel_zpzz_h)(void *vd, void *vn, void *vm,
                            void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i], mm = m[i];
        uint64_t pp = expand_pred_h(pg[H1(i)]);
        d[i] = (nn & pp) | (mm & ~pp);
    }
}

void HELPER(sve_sel_zpzz_s)(void *vd, void *vn, void *vm,
                            void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i], mm = m[i];
        uint64_t pp = expand_pred_s(pg[H1(i)]);
        d[i] = (nn & pp) | (mm & ~pp);
    }
}

void HELPER(sve_sel_zpzz_d)(void *vd, void *vn, void *vm,
                            void *vg, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i += 1) {
        uint64_t nn = n[i], mm = m[i];
        d[i] = (pg[H1(i)] & 1 ? nn : mm);
    }
}
/* Two operand comparison controlled by a predicate.
 * ??? It is very tempting to want to be able to expand this inline
 * with x86 instructions, e.g.
 *
 *    vcmpeqw    zm, zn, %ymm0
 *    vpmovmskb  %ymm0, %eax
 *    and        $0x5555, %eax
 *    and        pg, %eax
 *
 * or even aarch64, e.g.
 *
 *    // mask = 4000 1000 0400 0100 0040 0010 0004 0001
 *    cmeq       v0.8h, zn, zm
 *    and        v0.8h, v0.8h, mask
 *    addv       h0, v0.8h
 *    str        h0, [pg]
 *
 * However, coming up with an abstraction that allows vector inputs and
 * a scalar output, and also handles the byte-ordering of sub-uint64_t
 * scalar outputs, is tricky.
 */
#define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK)                                 \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                            \
    intptr_t opr_sz = simd_oprsz(desc);                                      \
    uint32_t flags = PREDTEST_INIT;                                          \
    intptr_t i = opr_sz;                                                     \
    do {                                                                     \
        uint64_t out = 0, pg;                                                \
        do {                                                                 \
            i -= sizeof(TYPE), out <<= sizeof(TYPE);                         \
            TYPE nn = *(TYPE *)(vn + H(i));                                  \
            TYPE mm = *(TYPE *)(vm + H(i));                                  \
            out |= nn OP mm;                                                 \
        } while (i & 63);                                                    \
        pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
        out &= pg;                                                           \
        *(uint64_t *)(vd + (i >> 3)) = out;                                  \
        flags = iter_predtest_bwd(out, pg, flags);                           \
    } while (i > 0);                                                         \
    return flags;                                                            \
}

#define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
#define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
#define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
    DO_CMP_PPZZ(NAME, TYPE, OP,     , 0x0101010101010101ull)
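
/*
 * Illustrative note: the inner loop packs one boolean per byte of each
 * element, since OUT shifts left by sizeof(TYPE) for every element
 * compared.  For 16-bit elements the per-element results therefore
 * land on every other predicate bit, which is why the _H wrapper masks
 * with 0x5555555555555555ull before storing and computing the
 * PREDTEST flags.
 */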
DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b, uint8_t,  ==)
DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h, uint16_t, ==)
DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s, uint32_t, ==)
DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d, uint64_t, ==)

DO_CMP_PPZZ_B(sve_cmpne_ppzz_b, uint8_t,  !=)
DO_CMP_PPZZ_H(sve_cmpne_ppzz_h, uint16_t, !=)
DO_CMP_PPZZ_S(sve_cmpne_ppzz_s, uint32_t, !=)
DO_CMP_PPZZ_D(sve_cmpne_ppzz_d, uint64_t, !=)

DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b, int8_t,  >)
DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h, int16_t, >)
DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s, int32_t, >)
DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d, int64_t, >)

DO_CMP_PPZZ_B(sve_cmpge_ppzz_b, int8_t,  >=)
DO_CMP_PPZZ_H(sve_cmpge_ppzz_h, int16_t, >=)
DO_CMP_PPZZ_S(sve_cmpge_ppzz_s, int32_t, >=)
DO_CMP_PPZZ_D(sve_cmpge_ppzz_d, int64_t, >=)

DO_CMP_PPZZ_B(sve_cmphi_ppzz_b, uint8_t,  >)
DO_CMP_PPZZ_H(sve_cmphi_ppzz_h, uint16_t, >)
DO_CMP_PPZZ_S(sve_cmphi_ppzz_s, uint32_t, >)
DO_CMP_PPZZ_D(sve_cmphi_ppzz_d, uint64_t, >)

DO_CMP_PPZZ_B(sve_cmphs_ppzz_b, uint8_t,  >=)
DO_CMP_PPZZ_H(sve_cmphs_ppzz_h, uint16_t, >=)
DO_CMP_PPZZ_S(sve_cmphs_ppzz_s, uint32_t, >=)
DO_CMP_PPZZ_D(sve_cmphs_ppzz_d, uint64_t, >=)

#undef DO_CMP_PPZZ_B
#undef DO_CMP_PPZZ_H
#undef DO_CMP_PPZZ_S
#undef DO_CMP_PPZZ_D
/* Similar, but the second source is "wide".  */
#define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK)                          \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                            \
    intptr_t opr_sz = simd_oprsz(desc);                                      \
    uint32_t flags = PREDTEST_INIT;                                          \
    intptr_t i = opr_sz;                                                     \
    do {                                                                     \
        uint64_t out = 0, pg;                                                \
        do {                                                                 \
            TYPEW mm = *(TYPEW *)(vm + i - 8);                               \
            do {                                                             \
                i -= sizeof(TYPE), out <<= sizeof(TYPE);                     \
                TYPE nn = *(TYPE *)(vn + H(i));                              \
                out |= nn OP mm;                                             \
            } while (i & 7);                                                 \
        } while (i & 63);                                                    \
        pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                            \
        out &= pg;                                                           \
        *(uint64_t *)(vd + (i >> 3)) = out;                                  \
        flags = iter_predtest_bwd(out, pg, flags);                           \
    } while (i > 0);                                                         \
    return flags;                                                            \
}
#define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1,   0xffffffffffffffffull)
#define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
    DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)

DO_CMP_PPZW_B(sve_cmpeq_ppzw_b, int8_t,  uint64_t, ==)
DO_CMP_PPZW_H(sve_cmpeq_ppzw_h, int16_t, uint64_t, ==)
DO_CMP_PPZW_S(sve_cmpeq_ppzw_s, int32_t, uint64_t, ==)

DO_CMP_PPZW_B(sve_cmpne_ppzw_b, int8_t,  uint64_t, !=)
DO_CMP_PPZW_H(sve_cmpne_ppzw_h, int16_t, uint64_t, !=)
DO_CMP_PPZW_S(sve_cmpne_ppzw_s, int32_t, uint64_t, !=)

DO_CMP_PPZW_B(sve_cmpgt_ppzw_b, int8_t,   int64_t, >)
DO_CMP_PPZW_H(sve_cmpgt_ppzw_h, int16_t,  int64_t, >)
DO_CMP_PPZW_S(sve_cmpgt_ppzw_s, int32_t,  int64_t, >)

DO_CMP_PPZW_B(sve_cmpge_ppzw_b, int8_t,   int64_t, >=)
DO_CMP_PPZW_H(sve_cmpge_ppzw_h, int16_t,  int64_t, >=)
DO_CMP_PPZW_S(sve_cmpge_ppzw_s, int32_t,  int64_t, >=)

DO_CMP_PPZW_B(sve_cmphi_ppzw_b, uint8_t,  uint64_t, >)
DO_CMP_PPZW_H(sve_cmphi_ppzw_h, uint16_t, uint64_t, >)
DO_CMP_PPZW_S(sve_cmphi_ppzw_s, uint32_t, uint64_t, >)

DO_CMP_PPZW_B(sve_cmphs_ppzw_b, uint8_t,  uint64_t, >=)
DO_CMP_PPZW_H(sve_cmphs_ppzw_h, uint16_t, uint64_t, >=)
DO_CMP_PPZW_S(sve_cmphs_ppzw_s, uint32_t, uint64_t, >=)

DO_CMP_PPZW_B(sve_cmplt_ppzw_b, int8_t,   int64_t, <)
DO_CMP_PPZW_H(sve_cmplt_ppzw_h, int16_t,  int64_t, <)
DO_CMP_PPZW_S(sve_cmplt_ppzw_s, int32_t,  int64_t, <)

DO_CMP_PPZW_B(sve_cmple_ppzw_b, int8_t,   int64_t, <=)
DO_CMP_PPZW_H(sve_cmple_ppzw_h, int16_t,  int64_t, <=)
DO_CMP_PPZW_S(sve_cmple_ppzw_s, int32_t,  int64_t, <=)

DO_CMP_PPZW_B(sve_cmplo_ppzw_b, uint8_t,  uint64_t, <)
DO_CMP_PPZW_H(sve_cmplo_ppzw_h, uint16_t, uint64_t, <)
DO_CMP_PPZW_S(sve_cmplo_ppzw_s, uint32_t, uint64_t, <)

DO_CMP_PPZW_B(sve_cmpls_ppzw_b, uint8_t,  uint64_t, <=)
DO_CMP_PPZW_H(sve_cmpls_ppzw_h, uint16_t, uint64_t, <=)
DO_CMP_PPZW_S(sve_cmpls_ppzw_s, uint32_t, uint64_t, <=)

#undef DO_CMP_PPZW_B
#undef DO_CMP_PPZW_H
#undef DO_CMP_PPZW_S
/* Similar, but the second source is immediate.  */
#define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK)                         \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)   \
{                                                                    \
    intptr_t opr_sz = simd_oprsz(desc);                              \
    uint32_t flags = PREDTEST_INIT;                                  \
    TYPE mm = simd_data(desc);                                       \
    intptr_t i = opr_sz;                                             \
    do {                                                             \
        uint64_t out = 0, pg;                                        \
        do {                                                         \
            i -= sizeof(TYPE), out <<= sizeof(TYPE);                 \
            TYPE nn = *(TYPE *)(vn + H(i));                          \
            out |= nn OP mm;                                         \
        } while (i & 63);                                            \
        pg = *(uint64_t *)(vg + (i >> 3)) & MASK;                    \
        out &= pg;                                                   \
        *(uint64_t *)(vd + (i >> 3)) = out;                          \
        flags = iter_predtest_bwd(out, pg, flags);                   \
    } while (i > 0);                                                 \
    return flags;                                                    \
}
#define DO_CMP_PPZI_B(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1,   0xffffffffffffffffull)
#define DO_CMP_PPZI_H(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
#define DO_CMP_PPZI_S(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
#define DO_CMP_PPZI_D(NAME, TYPE, OP) \
    DO_CMP_PPZI(NAME, TYPE, OP,     , 0x0101010101010101ull)
DO_CMP_PPZI_B(sve_cmpeq_ppzi_b, uint8_t,  ==)
DO_CMP_PPZI_H(sve_cmpeq_ppzi_h, uint16_t, ==)
DO_CMP_PPZI_S(sve_cmpeq_ppzi_s, uint32_t, ==)
DO_CMP_PPZI_D(sve_cmpeq_ppzi_d, uint64_t, ==)

DO_CMP_PPZI_B(sve_cmpne_ppzi_b, uint8_t,  !=)
DO_CMP_PPZI_H(sve_cmpne_ppzi_h, uint16_t, !=)
DO_CMP_PPZI_S(sve_cmpne_ppzi_s, uint32_t, !=)
DO_CMP_PPZI_D(sve_cmpne_ppzi_d, uint64_t, !=)

DO_CMP_PPZI_B(sve_cmpgt_ppzi_b, int8_t,  >)
DO_CMP_PPZI_H(sve_cmpgt_ppzi_h, int16_t, >)
DO_CMP_PPZI_S(sve_cmpgt_ppzi_s, int32_t, >)
DO_CMP_PPZI_D(sve_cmpgt_ppzi_d, int64_t, >)

DO_CMP_PPZI_B(sve_cmpge_ppzi_b, int8_t,  >=)
DO_CMP_PPZI_H(sve_cmpge_ppzi_h, int16_t, >=)
DO_CMP_PPZI_S(sve_cmpge_ppzi_s, int32_t, >=)
DO_CMP_PPZI_D(sve_cmpge_ppzi_d, int64_t, >=)

DO_CMP_PPZI_B(sve_cmphi_ppzi_b, uint8_t,  >)
DO_CMP_PPZI_H(sve_cmphi_ppzi_h, uint16_t, >)
DO_CMP_PPZI_S(sve_cmphi_ppzi_s, uint32_t, >)
DO_CMP_PPZI_D(sve_cmphi_ppzi_d, uint64_t, >)

DO_CMP_PPZI_B(sve_cmphs_ppzi_b, uint8_t,  >=)
DO_CMP_PPZI_H(sve_cmphs_ppzi_h, uint16_t, >=)
DO_CMP_PPZI_S(sve_cmphs_ppzi_s, uint32_t, >=)
DO_CMP_PPZI_D(sve_cmphs_ppzi_d, uint64_t, >=)

DO_CMP_PPZI_B(sve_cmplt_ppzi_b, int8_t,  <)
DO_CMP_PPZI_H(sve_cmplt_ppzi_h, int16_t, <)
DO_CMP_PPZI_S(sve_cmplt_ppzi_s, int32_t, <)
DO_CMP_PPZI_D(sve_cmplt_ppzi_d, int64_t, <)

DO_CMP_PPZI_B(sve_cmple_ppzi_b, int8_t,  <=)
DO_CMP_PPZI_H(sve_cmple_ppzi_h, int16_t, <=)
DO_CMP_PPZI_S(sve_cmple_ppzi_s, int32_t, <=)
DO_CMP_PPZI_D(sve_cmple_ppzi_d, int64_t, <=)

DO_CMP_PPZI_B(sve_cmplo_ppzi_b, uint8_t,  <)
DO_CMP_PPZI_H(sve_cmplo_ppzi_h, uint16_t, <)
DO_CMP_PPZI_S(sve_cmplo_ppzi_s, uint32_t, <)
DO_CMP_PPZI_D(sve_cmplo_ppzi_d, uint64_t, <)

DO_CMP_PPZI_B(sve_cmpls_ppzi_b, uint8_t,  <=)
DO_CMP_PPZI_H(sve_cmpls_ppzi_h, uint16_t, <=)
DO_CMP_PPZI_S(sve_cmpls_ppzi_s, uint32_t, <=)
DO_CMP_PPZI_D(sve_cmpls_ppzi_d, uint64_t, <=)

#undef DO_CMP_PPZI_B
#undef DO_CMP_PPZI_H
#undef DO_CMP_PPZI_S
#undef DO_CMP_PPZI_D
/* Similar to the ARM LastActive pseudocode function. */
static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
{
    intptr_t i;

    for (i = QEMU_ALIGN_UP(oprsz, 8) - 8; i >= 0; i -= 8) {
        uint64_t pg = *(uint64_t *)(vg + i);
        if (pg) {
            return (pow2floor(pg) & *(uint64_t *)(vd + i)) != 0;
        }
    }
    return 0;
}
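/* Worked example (not part of the build): for a guard word
 * pg = 0x2c (bits 2, 3 and 5 set), pow2floor(0x2c) = 0x20 isolates the
 * most significant guard bit, so the predicate word in @vd is tested
 * only at the last guarded position, per the LastActive pseudocode.
 */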
/* Compute a mask into RETB that is true for all G, up to and including
 * (if after) or excluding (if !after) the first G & N.
 * Return true if BRK found.
 */
static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
                        bool brk, bool after)
{
    uint64_t b;

    if (brk) {
        b = 0;
    } else if ((g & n) == 0) {
        /* For all G, no N are set; break not found. */
        b = g;
    } else {
        /* Break somewhere in N.  Locate it. */
        b = g & n;            /* guard true, pred true */
        b = b & -b;           /* first such */
        if (after) {
            b = b | (b - 1);  /* break after same */
        } else {
            b = b - 1;        /* break before same */
        }
        brk = true;
    }

    *retb = b;
    return brk;
}
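/* Worked example (not part of the build): with g = 0xff and n = 0x10,
 * the first guarded true predicate bit is bit 4, so b & -b = 0x10.
 * For the break-after form, b | (b - 1) = 0x1f keeps bits 0-4,
 * including the match; for break-before, b - 1 = 0x0f keeps only
 * bits 0-3.
 */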
/* Compute a zeroing BRK. */
static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
                          intptr_t oprsz, bool after)
{
    bool brk = false;
    intptr_t i;

    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
        uint64_t this_b, this_g = g[i];

        brk = compute_brk(&this_b, n[i], this_g, brk, after);
        d[i] = this_b & this_g;
    }
}
/* Likewise, but also compute flags. */
static uint32_t compute_brks_z(uint64_t *d, uint64_t *n, uint64_t *g,
                               intptr_t oprsz, bool after)
{
    uint32_t flags = PREDTEST_INIT;
    bool brk = false;
    intptr_t i;

    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
        uint64_t this_b, this_d, this_g = g[i];

        brk = compute_brk(&this_b, n[i], this_g, brk, after);
        d[i] = this_d = this_b & this_g;
        flags = iter_predtest_fwd(this_d, this_g, flags);
    }
    return flags;
}
/* Compute a merging BRK. */
static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
                          intptr_t oprsz, bool after)
{
    bool brk = false;
    intptr_t i;

    for (i = 0; i < DIV_ROUND_UP(oprsz, 8); ++i) {
        uint64_t this_b, this_g = g[i];

        brk = compute_brk(&this_b, n[i], this_g, brk, after);
        d[i] = (this_b & this_g) | (d[i] & ~this_g);
    }
}
/* Likewise, but also compute flags. */
static uint32_t compute_brks_m(uint64_t *d, uint64_t *n, uint64_t *g,
                               intptr_t oprsz, bool after)
{
    uint32_t flags = PREDTEST_INIT;
    bool brk = false;
    intptr_t i;

    for (i = 0; i < oprsz / 8; ++i) {
        uint64_t this_b, this_d = d[i], this_g = g[i];

        brk = compute_brk(&this_b, n[i], this_g, brk, after);
        d[i] = this_d = (this_b & this_g) | (this_d & ~this_g);
        flags = iter_predtest_fwd(this_d, this_g, flags);
    }
    return flags;
}
static uint32_t do_zero(ARMPredicateReg *d, intptr_t oprsz)
{
    /* It is quicker to zero the whole predicate than loop on OPRSZ.
     * The compiler should turn this into 4 64-bit integer stores.
     */
    memset(d, 0, sizeof(ARMPredicateReg));
    return PREDTEST_INIT;
}
void HELPER(sve_brkpa)(void *vd, void *vn, void *vm, void *vg,
                       uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    if (last_active_pred(vn, vg, oprsz)) {
        compute_brk_z(vd, vm, vg, oprsz, true);
    } else {
        do_zero(vd, oprsz);
    }
}

uint32_t HELPER(sve_brkpas)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    if (last_active_pred(vn, vg, oprsz)) {
        return compute_brks_z(vd, vm, vg, oprsz, true);
    } else {
        return do_zero(vd, oprsz);
    }
}
void HELPER(sve_brkpb)(void *vd, void *vn, void *vm, void *vg,
                       uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    if (last_active_pred(vn, vg, oprsz)) {
        compute_brk_z(vd, vm, vg, oprsz, false);
    } else {
        do_zero(vd, oprsz);
    }
}

uint32_t HELPER(sve_brkpbs)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    if (last_active_pred(vn, vg, oprsz)) {
        return compute_brks_z(vd, vm, vg, oprsz, false);
    } else {
        return do_zero(vd, oprsz);
    }
}
void HELPER(sve_brka_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    compute_brk_z(vd, vn, vg, oprsz, true);
}

uint32_t HELPER(sve_brkas_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    return compute_brks_z(vd, vn, vg, oprsz, true);
}

void HELPER(sve_brkb_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    compute_brk_z(vd, vn, vg, oprsz, false);
}

uint32_t HELPER(sve_brkbs_z)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    return compute_brks_z(vd, vn, vg, oprsz, false);
}
void HELPER(sve_brka_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    compute_brk_m(vd, vn, vg, oprsz, true);
}

uint32_t HELPER(sve_brkas_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    return compute_brks_m(vd, vn, vg, oprsz, true);
}

void HELPER(sve_brkb_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    compute_brk_m(vd, vn, vg, oprsz, false);
}

uint32_t HELPER(sve_brkbs_m)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    return compute_brks_m(vd, vn, vg, oprsz, false);
}
void HELPER(sve_brkn)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    if (!last_active_pred(vn, vg, oprsz)) {
        do_zero(vd, oprsz);
    }
}
/* As if PredTest(Ones(PL), D, esz). */
static uint32_t predtest_ones(ARMPredicateReg *d, intptr_t oprsz,
                              uint64_t esz_mask)
{
    uint32_t flags = PREDTEST_INIT;
    intptr_t i;

    for (i = 0; i < oprsz / 8; i++) {
        flags = iter_predtest_fwd(d->p[i], esz_mask, flags);
    }
    if (oprsz & 7) {
        uint64_t mask = ~(-1ULL << (8 * (oprsz & 7)));
        flags = iter_predtest_fwd(d->p[i], esz_mask & mask, flags);
    }
    return flags;
}
uint32_t HELPER(sve_brkns)(void *vd, void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    if (last_active_pred(vn, vg, oprsz)) {
        return predtest_ones(vd, oprsz, -1);
    } else {
        return do_zero(vd, oprsz);
    }
}
uint64_t HELPER(sve_cntp)(void *vn, void *vg, uint32_t pred_desc)
{
    intptr_t words = DIV_ROUND_UP(FIELD_EX32(pred_desc, PREDDESC, OPRSZ), 8);
    intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    uint64_t *n = vn, *g = vg, sum = 0, mask = pred_esz_masks[esz];
    intptr_t i;

    for (i = 0; i < words; ++i) {
        uint64_t t = n[i] & g[i] & mask;
        sum += ctpop64(t);
    }
    return sum;
}
uint32_t HELPER(sve_whilel)(void *vd, uint32_t count, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    uint64_t esz_mask = pred_esz_masks[esz];
    ARMPredicateReg *d = vd;
    uint32_t flags;
    intptr_t i;

    /* Begin with a zero predicate register. */
    flags = do_zero(d, oprsz);
    if (count == 0) {
        return flags;
    }

    /* Set all of the requested bits. */
    for (i = 0; i < count / 64; ++i) {
        d->p[i] = esz_mask;
    }
    if (count & 63) {
        d->p[i] = MAKE_64BIT_MASK(0, count & 63) & esz_mask;
    }

    return predtest_ones(d, oprsz, esz_mask);
}
uint32_t HELPER(sve_whileg)(void *vd, uint32_t count, uint32_t pred_desc)
{
    intptr_t oprsz = FIELD_EX32(pred_desc, PREDDESC, OPRSZ);
    intptr_t esz = FIELD_EX32(pred_desc, PREDDESC, ESZ);
    uint64_t esz_mask = pred_esz_masks[esz];
    ARMPredicateReg *d = vd;
    intptr_t i, invcount, oprbits;
    uint64_t bits;

    if (count == 0) {
        return do_zero(d, oprsz);
    }

    oprbits = oprsz * 8;
    tcg_debug_assert(count <= oprbits);

    bits = esz_mask;
    if (oprbits & 63) {
        bits &= MAKE_64BIT_MASK(0, oprbits & 63);
    }

    invcount = oprbits - count;
    for (i = (oprsz - 1) / 8; i > invcount / 64; --i) {
        d->p[i] = bits;
        bits = esz_mask;
    }

    d->p[i] = bits & MAKE_64BIT_MASK(invcount & 63, 64);

    while (--i >= 0) {
        d->p[i] = 0;
    }

    return predtest_ones(d, oprsz, esz_mask);
}
/* Recursive reduction on a function;
 * C.f. the ARM ARM function ReducePredicated.
 *
 * While it would be possible to write this without the DATA temporary,
 * it is much simpler to process the predicate register this way.
 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
 * little to gain with a more complex non-recursive form.
 */
#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT)                         \
static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
{                                                                     \
    if (n == 1) {                                                     \
        return *data;                                                 \
    } else {                                                          \
        uintptr_t half = n / 2;                                       \
        TYPE lo = NAME##_reduce(data, status, half);                  \
        TYPE hi = NAME##_reduce(data + half, status, half);           \
        return TYPE##_##FUNC(lo, hi, status);                         \
    }                                                                 \
}                                                                     \
uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc)    \
{                                                                     \
    uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc);   \
    TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)];                   \
    for (i = 0; i < oprsz; ) {                                        \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));               \
        do {                                                          \
            TYPE nn = *(TYPE *)(vn + H(i));                           \
            *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT);      \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                   \
        } while (i & 15);                                             \
    }                                                                 \
    for (; i < maxsz; i += sizeof(TYPE)) {                            \
        *(TYPE *)((void *)data + i) = IDENT;                          \
    }                                                                 \
    return NAME##_reduce(data, vs, maxsz / sizeof(TYPE));             \
}
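/* For illustration: the reduction is a balanced tree rather than a
 * serial fold, so for four active elements the result is computed as
 * ((d0 FUNC d1) FUNC (d2 FUNC d3)).  Inactive lanes are seeded with
 * IDENT, which leaves the result unchanged for these operations.
 */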
DO_REDUCE(sve_faddv_h, float16, H1_2, add, float16_zero)
DO_REDUCE(sve_faddv_s, float32, H1_4, add, float32_zero)
DO_REDUCE(sve_faddv_d, float64,     , add, float64_zero)

/* Identity is floatN_default_nan, without the function call. */
DO_REDUCE(sve_fminnmv_h, float16, H1_2, minnum, 0x7E00)
DO_REDUCE(sve_fminnmv_s, float32, H1_4, minnum, 0x7FC00000)
DO_REDUCE(sve_fminnmv_d, float64,     , minnum, 0x7FF8000000000000ULL)

DO_REDUCE(sve_fmaxnmv_h, float16, H1_2, maxnum, 0x7E00)
DO_REDUCE(sve_fmaxnmv_s, float32, H1_4, maxnum, 0x7FC00000)
DO_REDUCE(sve_fmaxnmv_d, float64,     , maxnum, 0x7FF8000000000000ULL)

DO_REDUCE(sve_fminv_h, float16, H1_2, min, float16_infinity)
DO_REDUCE(sve_fminv_s, float32, H1_4, min, float32_infinity)
DO_REDUCE(sve_fminv_d, float64,     , min, float64_infinity)

DO_REDUCE(sve_fmaxv_h, float16, H1_2, max, float16_chs(float16_infinity))
DO_REDUCE(sve_fmaxv_s, float32, H1_4, max, float32_chs(float32_infinity))
DO_REDUCE(sve_fmaxv_d, float64,     , max, float64_chs(float64_infinity))

#undef DO_REDUCE
uint64_t HELPER(sve_fadda_h)(uint64_t nn, void *vm, void *vg,
                             void *status, uint32_t desc)
{
    intptr_t i = 0, opr_sz = simd_oprsz(desc);
    float16 result = nn;

    do {
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
        do {
            if (pg & 1) {
                float16 mm = *(float16 *)(vm + H1_2(i));
                result = float16_add(result, mm, status);
            }
            i += sizeof(float16), pg >>= sizeof(float16);
        } while (i & 15);
    } while (i < opr_sz);

    return result;
}
uint64_t HELPER(sve_fadda_s)(uint64_t nn, void *vm, void *vg,
                             void *status, uint32_t desc)
{
    intptr_t i = 0, opr_sz = simd_oprsz(desc);
    float32 result = nn;

    do {
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));
        do {
            if (pg & 1) {
                float32 mm = *(float32 *)(vm + H1_2(i));
                result = float32_add(result, mm, status);
            }
            i += sizeof(float32), pg >>= sizeof(float32);
        } while (i & 15);
    } while (i < opr_sz);

    return result;
}
uint64_t HELPER(sve_fadda_d)(uint64_t nn, void *vm, void *vg,
                             void *status, uint32_t desc)
{
    intptr_t i = 0, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *m = vm;
    uint8_t *pg = vg;

    for (i = 0; i < opr_sz; i++) {
        if (pg[H1(i)] & 1) {
            nn = float64_add(nn, m[i], status);
        }
    }

    return nn;
}
/* Fully general three-operand expander, controlled by a predicate,
 * With the extra float_status parameter.
 */
#define DO_ZPZZ_FP(NAME, TYPE, H, OP)                           \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,       \
                  void *status, uint32_t desc)                  \
{                                                               \
    intptr_t i = simd_oprsz(desc);                              \
    uint64_t *g = vg;                                           \
    do {                                                        \
        uint64_t pg = g[(i - 1) >> 6];                          \
        do {                                                    \
            i -= sizeof(TYPE);                                  \
            if (likely((pg >> (i & 63)) & 1)) {                 \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                TYPE mm = *(TYPE *)(vm + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn, mm, status);      \
            }                                                   \
        } while (i & 63);                                       \
    } while (i != 0);                                           \
}
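/* For illustration: the governing predicate has one bit per byte, so
 * the element at byte offset I is controlled by bit (I & 63) of
 * predicate word I >> 6.  E.g. a 64-bit element at byte offset 24 is
 * governed by bit 24 of g[0]; the expander reads each predicate word
 * once per 64 bytes of vector.
 */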
DO_ZPZZ_FP(sve_fadd_h, uint16_t, H1_2, float16_add)
DO_ZPZZ_FP(sve_fadd_s, uint32_t, H1_4, float32_add)
DO_ZPZZ_FP(sve_fadd_d, uint64_t,     , float64_add)

DO_ZPZZ_FP(sve_fsub_h, uint16_t, H1_2, float16_sub)
DO_ZPZZ_FP(sve_fsub_s, uint32_t, H1_4, float32_sub)
DO_ZPZZ_FP(sve_fsub_d, uint64_t,     , float64_sub)

DO_ZPZZ_FP(sve_fmul_h, uint16_t, H1_2, float16_mul)
DO_ZPZZ_FP(sve_fmul_s, uint32_t, H1_4, float32_mul)
DO_ZPZZ_FP(sve_fmul_d, uint64_t,     , float64_mul)

DO_ZPZZ_FP(sve_fdiv_h, uint16_t, H1_2, float16_div)
DO_ZPZZ_FP(sve_fdiv_s, uint32_t, H1_4, float32_div)
DO_ZPZZ_FP(sve_fdiv_d, uint64_t,     , float64_div)

DO_ZPZZ_FP(sve_fmin_h, uint16_t, H1_2, float16_min)
DO_ZPZZ_FP(sve_fmin_s, uint32_t, H1_4, float32_min)
DO_ZPZZ_FP(sve_fmin_d, uint64_t,     , float64_min)

DO_ZPZZ_FP(sve_fmax_h, uint16_t, H1_2, float16_max)
DO_ZPZZ_FP(sve_fmax_s, uint32_t, H1_4, float32_max)
DO_ZPZZ_FP(sve_fmax_d, uint64_t,     , float64_max)

DO_ZPZZ_FP(sve_fminnum_h, uint16_t, H1_2, float16_minnum)
DO_ZPZZ_FP(sve_fminnum_s, uint32_t, H1_4, float32_minnum)
DO_ZPZZ_FP(sve_fminnum_d, uint64_t,     , float64_minnum)

DO_ZPZZ_FP(sve_fmaxnum_h, uint16_t, H1_2, float16_maxnum)
DO_ZPZZ_FP(sve_fmaxnum_s, uint32_t, H1_4, float32_maxnum)
DO_ZPZZ_FP(sve_fmaxnum_d, uint64_t,     , float64_maxnum)
static inline float16 abd_h(float16 a, float16 b, float_status *s)
{
    return float16_abs(float16_sub(a, b, s));
}

static inline float32 abd_s(float32 a, float32 b, float_status *s)
{
    return float32_abs(float32_sub(a, b, s));
}

static inline float64 abd_d(float64 a, float64 b, float_status *s)
{
    return float64_abs(float64_sub(a, b, s));
}

DO_ZPZZ_FP(sve_fabd_h, uint16_t, H1_2, abd_h)
DO_ZPZZ_FP(sve_fabd_s, uint32_t, H1_4, abd_s)
DO_ZPZZ_FP(sve_fabd_d, uint64_t,     , abd_d)
static inline float64 scalbn_d(float64 a, int64_t b, float_status *s)
{
    int b_int = MIN(MAX(b, INT_MIN), INT_MAX);
    return float64_scalbn(a, b_int, s);
}

DO_ZPZZ_FP(sve_fscalbn_h, int16_t, H1_2, float16_scalbn)
DO_ZPZZ_FP(sve_fscalbn_s, int32_t, H1_4, float32_scalbn)
DO_ZPZZ_FP(sve_fscalbn_d, int64_t,     , scalbn_d)

DO_ZPZZ_FP(sve_fmulx_h, uint16_t, H1_2, helper_advsimd_mulxh)
DO_ZPZZ_FP(sve_fmulx_s, uint32_t, H1_4, helper_vfp_mulxs)
DO_ZPZZ_FP(sve_fmulx_d, uint64_t,     , helper_vfp_mulxd)

#undef DO_ZPZZ_FP
/* Three-operand expander, with one scalar operand, controlled by
 * a predicate, with the extra float_status parameter.
 */
#define DO_ZPZS_FP(NAME, TYPE, H, OP)                             \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar,  \
                  void *status, uint32_t desc)                    \
{                                                                 \
    intptr_t i = simd_oprsz(desc);                                \
    uint64_t *g = vg;                                             \
    TYPE mm = scalar;                                             \
    do {                                                          \
        uint64_t pg = g[(i - 1) >> 6];                            \
        do {                                                      \
            i -= sizeof(TYPE);                                    \
            if (likely((pg >> (i & 63)) & 1)) {                   \
                TYPE nn = *(TYPE *)(vn + H(i));                   \
                *(TYPE *)(vd + H(i)) = OP(nn, mm, status);        \
            }                                                     \
        } while (i & 63);                                         \
    } while (i != 0);                                             \
}
DO_ZPZS_FP(sve_fadds_h, float16, H1_2, float16_add)
DO_ZPZS_FP(sve_fadds_s, float32, H1_4, float32_add)
DO_ZPZS_FP(sve_fadds_d, float64,     , float64_add)

DO_ZPZS_FP(sve_fsubs_h, float16, H1_2, float16_sub)
DO_ZPZS_FP(sve_fsubs_s, float32, H1_4, float32_sub)
DO_ZPZS_FP(sve_fsubs_d, float64,     , float64_sub)

DO_ZPZS_FP(sve_fmuls_h, float16, H1_2, float16_mul)
DO_ZPZS_FP(sve_fmuls_s, float32, H1_4, float32_mul)
DO_ZPZS_FP(sve_fmuls_d, float64,     , float64_mul)
static inline float16 subr_h(float16 a, float16 b, float_status *s)
{
    return float16_sub(b, a, s);
}

static inline float32 subr_s(float32 a, float32 b, float_status *s)
{
    return float32_sub(b, a, s);
}

static inline float64 subr_d(float64 a, float64 b, float_status *s)
{
    return float64_sub(b, a, s);
}

DO_ZPZS_FP(sve_fsubrs_h, float16, H1_2, subr_h)
DO_ZPZS_FP(sve_fsubrs_s, float32, H1_4, subr_s)
DO_ZPZS_FP(sve_fsubrs_d, float64,     , subr_d)
DO_ZPZS_FP(sve_fmaxnms_h, float16, H1_2, float16_maxnum)
DO_ZPZS_FP(sve_fmaxnms_s, float32, H1_4, float32_maxnum)
DO_ZPZS_FP(sve_fmaxnms_d, float64,     , float64_maxnum)

DO_ZPZS_FP(sve_fminnms_h, float16, H1_2, float16_minnum)
DO_ZPZS_FP(sve_fminnms_s, float32, H1_4, float32_minnum)
DO_ZPZS_FP(sve_fminnms_d, float64,     , float64_minnum)

DO_ZPZS_FP(sve_fmaxs_h, float16, H1_2, float16_max)
DO_ZPZS_FP(sve_fmaxs_s, float32, H1_4, float32_max)
DO_ZPZS_FP(sve_fmaxs_d, float64,     , float64_max)

DO_ZPZS_FP(sve_fmins_h, float16, H1_2, float16_min)
DO_ZPZS_FP(sve_fmins_s, float32, H1_4, float32_min)
DO_ZPZS_FP(sve_fmins_d, float64,     , float64_min)
/* Fully general two-operand expander, controlled by a predicate,
 * With the extra float_status parameter.
 */
#define DO_ZPZ_FP(NAME, TYPE, H, OP)                                  \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
{                                                                     \
    intptr_t i = simd_oprsz(desc);                                    \
    uint64_t *g = vg;                                                 \
    do {                                                              \
        uint64_t pg = g[(i - 1) >> 6];                                \
        do {                                                          \
            i -= sizeof(TYPE);                                        \
            if (likely((pg >> (i & 63)) & 1)) {                       \
                TYPE nn = *(TYPE *)(vn + H(i));                       \
                *(TYPE *)(vd + H(i)) = OP(nn, status);                \
            }                                                         \
        } while (i & 63);                                             \
    } while (i != 0);                                                 \
}
/* SVE fp16 conversions always use IEEE mode.  Like AdvSIMD, they ignore
 * FZ16.  When converting from fp16, this affects flushing input denormals;
 * when converting to fp16, this affects flushing output denormals.
 */
static inline float32 sve_f16_to_f32(float16 f, float_status *fpst)
{
    bool save = get_flush_inputs_to_zero(fpst);
    float32 ret;

    set_flush_inputs_to_zero(false, fpst);
    ret = float16_to_float32(f, true, fpst);
    set_flush_inputs_to_zero(save, fpst);
    return ret;
}

static inline float64 sve_f16_to_f64(float16 f, float_status *fpst)
{
    bool save = get_flush_inputs_to_zero(fpst);
    float64 ret;

    set_flush_inputs_to_zero(false, fpst);
    ret = float16_to_float64(f, true, fpst);
    set_flush_inputs_to_zero(save, fpst);
    return ret;
}

static inline float16 sve_f32_to_f16(float32 f, float_status *fpst)
{
    bool save = get_flush_to_zero(fpst);
    float16 ret;

    set_flush_to_zero(false, fpst);
    ret = float32_to_float16(f, true, fpst);
    set_flush_to_zero(save, fpst);
    return ret;
}

static inline float16 sve_f64_to_f16(float64 f, float_status *fpst)
{
    bool save = get_flush_to_zero(fpst);
    float16 ret;

    set_flush_to_zero(false, fpst);
    ret = float64_to_float16(f, true, fpst);
    set_flush_to_zero(save, fpst);
    return ret;
}
static inline int16_t vfp_float16_to_int16_rtz(float16 f, float_status *s)
{
    if (float16_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float16_to_int16_round_to_zero(f, s);
}

static inline int64_t vfp_float16_to_int64_rtz(float16 f, float_status *s)
{
    if (float16_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float16_to_int64_round_to_zero(f, s);
}

static inline int64_t vfp_float32_to_int64_rtz(float32 f, float_status *s)
{
    if (float32_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float32_to_int64_round_to_zero(f, s);
}

static inline int64_t vfp_float64_to_int64_rtz(float64 f, float_status *s)
{
    if (float64_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float64_to_int64_round_to_zero(f, s);
}

static inline uint16_t vfp_float16_to_uint16_rtz(float16 f, float_status *s)
{
    if (float16_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float16_to_uint16_round_to_zero(f, s);
}

static inline uint64_t vfp_float16_to_uint64_rtz(float16 f, float_status *s)
{
    if (float16_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float16_to_uint64_round_to_zero(f, s);
}

static inline uint64_t vfp_float32_to_uint64_rtz(float32 f, float_status *s)
{
    if (float32_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float32_to_uint64_round_to_zero(f, s);
}

static inline uint64_t vfp_float64_to_uint64_rtz(float64 f, float_status *s)
{
    if (float64_is_any_nan(f)) {
        float_raise(float_flag_invalid, s);
        return 0;
    }
    return float64_to_uint64_round_to_zero(f, s);
}
DO_ZPZ_FP(sve_fcvt_sh, uint32_t, H1_4, sve_f32_to_f16)
DO_ZPZ_FP(sve_fcvt_hs, uint32_t, H1_4, sve_f16_to_f32)
DO_ZPZ_FP(sve_fcvt_dh, uint64_t,     , sve_f64_to_f16)
DO_ZPZ_FP(sve_fcvt_hd, uint64_t,     , sve_f16_to_f64)
DO_ZPZ_FP(sve_fcvt_ds, uint64_t,     , float64_to_float32)
DO_ZPZ_FP(sve_fcvt_sd, uint64_t,     , float32_to_float64)

DO_ZPZ_FP(sve_fcvtzs_hh, uint16_t, H1_2, vfp_float16_to_int16_rtz)
DO_ZPZ_FP(sve_fcvtzs_hs, uint32_t, H1_4, helper_vfp_tosizh)
DO_ZPZ_FP(sve_fcvtzs_ss, uint32_t, H1_4, helper_vfp_tosizs)
DO_ZPZ_FP(sve_fcvtzs_hd, uint64_t,     , vfp_float16_to_int64_rtz)
DO_ZPZ_FP(sve_fcvtzs_sd, uint64_t,     , vfp_float32_to_int64_rtz)
DO_ZPZ_FP(sve_fcvtzs_ds, uint64_t,     , helper_vfp_tosizd)
DO_ZPZ_FP(sve_fcvtzs_dd, uint64_t,     , vfp_float64_to_int64_rtz)

DO_ZPZ_FP(sve_fcvtzu_hh, uint16_t, H1_2, vfp_float16_to_uint16_rtz)
DO_ZPZ_FP(sve_fcvtzu_hs, uint32_t, H1_4, helper_vfp_touizh)
DO_ZPZ_FP(sve_fcvtzu_ss, uint32_t, H1_4, helper_vfp_touizs)
DO_ZPZ_FP(sve_fcvtzu_hd, uint64_t,     , vfp_float16_to_uint64_rtz)
DO_ZPZ_FP(sve_fcvtzu_sd, uint64_t,     , vfp_float32_to_uint64_rtz)
DO_ZPZ_FP(sve_fcvtzu_ds, uint64_t,     , helper_vfp_touizd)
DO_ZPZ_FP(sve_fcvtzu_dd, uint64_t,     , vfp_float64_to_uint64_rtz)

DO_ZPZ_FP(sve_frint_h, uint16_t, H1_2, helper_advsimd_rinth)
DO_ZPZ_FP(sve_frint_s, uint32_t, H1_4, helper_rints)
DO_ZPZ_FP(sve_frint_d, uint64_t,     , helper_rintd)

DO_ZPZ_FP(sve_frintx_h, uint16_t, H1_2, float16_round_to_int)
DO_ZPZ_FP(sve_frintx_s, uint32_t, H1_4, float32_round_to_int)
DO_ZPZ_FP(sve_frintx_d, uint64_t,     , float64_round_to_int)

DO_ZPZ_FP(sve_frecpx_h, uint16_t, H1_2, helper_frecpx_f16)
DO_ZPZ_FP(sve_frecpx_s, uint32_t, H1_4, helper_frecpx_f32)
DO_ZPZ_FP(sve_frecpx_d, uint64_t,     , helper_frecpx_f64)

DO_ZPZ_FP(sve_fsqrt_h, uint16_t, H1_2, float16_sqrt)
DO_ZPZ_FP(sve_fsqrt_s, uint32_t, H1_4, float32_sqrt)
DO_ZPZ_FP(sve_fsqrt_d, uint64_t,     , float64_sqrt)

DO_ZPZ_FP(sve_scvt_hh, uint16_t, H1_2, int16_to_float16)
DO_ZPZ_FP(sve_scvt_sh, uint32_t, H1_4, int32_to_float16)
DO_ZPZ_FP(sve_scvt_ss, uint32_t, H1_4, int32_to_float32)
DO_ZPZ_FP(sve_scvt_sd, uint64_t,     , int32_to_float64)
DO_ZPZ_FP(sve_scvt_dh, uint64_t,     , int64_to_float16)
DO_ZPZ_FP(sve_scvt_ds, uint64_t,     , int64_to_float32)
DO_ZPZ_FP(sve_scvt_dd, uint64_t,     , int64_to_float64)

DO_ZPZ_FP(sve_ucvt_hh, uint16_t, H1_2, uint16_to_float16)
DO_ZPZ_FP(sve_ucvt_sh, uint32_t, H1_4, uint32_to_float16)
DO_ZPZ_FP(sve_ucvt_ss, uint32_t, H1_4, uint32_to_float32)
DO_ZPZ_FP(sve_ucvt_sd, uint64_t,     , uint32_to_float64)
DO_ZPZ_FP(sve_ucvt_dh, uint64_t,     , uint64_to_float16)
DO_ZPZ_FP(sve_ucvt_ds, uint64_t,     , uint64_to_float32)
DO_ZPZ_FP(sve_ucvt_dd, uint64_t,     , uint64_to_float64)

#undef DO_ZPZ_FP
static void do_fmla_zpzzz_h(void *vd, void *vn, void *vm, void *va, void *vg,
                            float_status *status, uint32_t desc,
                            uint16_t neg1, uint16_t neg3)
{
    intptr_t i = simd_oprsz(desc);
    uint64_t *g = vg;

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            i -= 2;
            if (likely((pg >> (i & 63)) & 1)) {
                float16 e1, e2, e3, r;

                e1 = *(uint16_t *)(vn + H1_2(i)) ^ neg1;
                e2 = *(uint16_t *)(vm + H1_2(i));
                e3 = *(uint16_t *)(va + H1_2(i)) ^ neg3;
                r = float16_muladd(e1, e2, e3, 0, status);
                *(uint16_t *)(vd + H1_2(i)) = r;
            }
        } while (i & 63);
    } while (i != 0);
}
void HELPER(sve_fmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0);
}

void HELPER(sve_fmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0);
}

void HELPER(sve_fnmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0x8000, 0x8000);
}

void HELPER(sve_fnmls_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_h(vd, vn, vm, va, vg, status, desc, 0, 0x8000);
}
static void do_fmla_zpzzz_s(void *vd, void *vn, void *vm, void *va, void *vg,
                            float_status *status, uint32_t desc,
                            uint32_t neg1, uint32_t neg3)
{
    intptr_t i = simd_oprsz(desc);
    uint64_t *g = vg;

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            i -= 4;
            if (likely((pg >> (i & 63)) & 1)) {
                float32 e1, e2, e3, r;

                e1 = *(uint32_t *)(vn + H1_4(i)) ^ neg1;
                e2 = *(uint32_t *)(vm + H1_4(i));
                e3 = *(uint32_t *)(va + H1_4(i)) ^ neg3;
                r = float32_muladd(e1, e2, e3, 0, status);
                *(uint32_t *)(vd + H1_4(i)) = r;
            }
        } while (i & 63);
    } while (i != 0);
}
)(void *vd
, void *vn
, void *vm
, void *va
,
4556 void *vg
, void *status
, uint32_t desc
)
4558 do_fmla_zpzzz_s(vd
, vn
, vm
, va
, vg
, status
, desc
, 0, 0);
4561 void HELPER(sve_fmls_zpzzz_s
)(void *vd
, void *vn
, void *vm
, void *va
,
4562 void *vg
, void *status
, uint32_t desc
)
4564 do_fmla_zpzzz_s(vd
, vn
, vm
, va
, vg
, status
, desc
, 0x80000000, 0);
4567 void HELPER(sve_fnmla_zpzzz_s
)(void *vd
, void *vn
, void *vm
, void *va
,
4568 void *vg
, void *status
, uint32_t desc
)
4570 do_fmla_zpzzz_s(vd
, vn
, vm
, va
, vg
, status
, desc
, 0x80000000, 0x80000000);
4573 void HELPER(sve_fnmls_zpzzz_s
)(void *vd
, void *vn
, void *vm
, void *va
,
4574 void *vg
, void *status
, uint32_t desc
)
4576 do_fmla_zpzzz_s(vd
, vn
, vm
, va
, vg
, status
, desc
, 0, 0x80000000);
static void do_fmla_zpzzz_d(void *vd, void *vn, void *vm, void *va, void *vg,
                            float_status *status, uint32_t desc,
                            uint64_t neg1, uint64_t neg3)
{
    intptr_t i = simd_oprsz(desc);
    uint64_t *g = vg;

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            i -= 8;
            if (likely((pg >> (i & 63)) & 1)) {
                float64 e1, e2, e3, r;

                e1 = *(uint64_t *)(vn + i) ^ neg1;
                e2 = *(uint64_t *)(vm + i);
                e3 = *(uint64_t *)(va + i) ^ neg3;
                r = float64_muladd(e1, e2, e3, 0, status);
                *(uint64_t *)(vd + i) = r;
            }
        } while (i & 63);
    } while (i != 0);
}
void HELPER(sve_fmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, 0);
}

void HELPER(sve_fmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                              void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, 0);
}

void HELPER(sve_fnmla_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, INT64_MIN, INT64_MIN);
}

void HELPER(sve_fnmls_zpzzz_d)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    do_fmla_zpzzz_d(vd, vn, vm, va, vg, status, desc, 0, INT64_MIN);
}
/* Two operand floating-point comparison controlled by a predicate.
 * Unlike the integer version, we are not allowed to optimistically
 * compare operands, since the comparison may have side effects wrt
 * the FPSR.
 */
#define DO_FPCMP_PPZZ(NAME, TYPE, H, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
                  void *status, uint32_t desc)                          \
{                                                                       \
    intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;                    \
    uint64_t *d = vd, *g = vg;                                          \
    do {                                                                \
        uint64_t out = 0, pg = g[j];                                    \
        do {                                                            \
            i -= sizeof(TYPE), out <<= sizeof(TYPE);                    \
            if (likely((pg >> (i & 63)) & 1)) {                         \
                TYPE nn = *(TYPE *)(vn + H(i));                         \
                TYPE mm = *(TYPE *)(vm + H(i));                         \
                out |= OP(TYPE, nn, mm, status);                        \
            }                                                           \
        } while (i & 63);                                               \
        d[j--] = out;                                                   \
    } while (i > 0);                                                    \
}
#define DO_FPCMP_PPZZ_H(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
#define DO_FPCMP_PPZZ_S(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
#define DO_FPCMP_PPZZ_D(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_d, float64,     , OP)

#define DO_FPCMP_PPZZ_ALL(NAME, OP) \
    DO_FPCMP_PPZZ_H(NAME, OP)   \
    DO_FPCMP_PPZZ_S(NAME, OP)   \
    DO_FPCMP_PPZZ_D(NAME, OP)

#define DO_FCMGE(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) <= 0
#define DO_FCMGT(TYPE, X, Y, ST)  TYPE##_compare(Y, X, ST) < 0
#define DO_FCMLE(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) <= 0
#define DO_FCMLT(TYPE, X, Y, ST)  TYPE##_compare(X, Y, ST) < 0
#define DO_FCMEQ(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) == 0
#define DO_FCMNE(TYPE, X, Y, ST)  TYPE##_compare_quiet(X, Y, ST) != 0
#define DO_FCMUO(TYPE, X, Y, ST)  \
    TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered
#define DO_FACGE(TYPE, X, Y, ST)  \
    TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0
#define DO_FACGT(TYPE, X, Y, ST)  \
    TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0
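/* Note on the operand order above: softfloat compares return
 * float_relation_less (-1), _equal (0), _greater (1) or
 * _unordered (2).  Writing GE as compare(Y, X) <= 0 makes the
 * unordered result (2) compare false, so any NaN operand correctly
 * fails GE/GT/LE/LT without a separate check.
 */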
DO_FPCMP_PPZZ_ALL(sve_fcmge, DO_FCMGE)
DO_FPCMP_PPZZ_ALL(sve_fcmgt, DO_FCMGT)
DO_FPCMP_PPZZ_ALL(sve_fcmeq, DO_FCMEQ)
DO_FPCMP_PPZZ_ALL(sve_fcmne, DO_FCMNE)
DO_FPCMP_PPZZ_ALL(sve_fcmuo, DO_FCMUO)
DO_FPCMP_PPZZ_ALL(sve_facge, DO_FACGE)
DO_FPCMP_PPZZ_ALL(sve_facgt, DO_FACGT)

#undef DO_FPCMP_PPZZ_ALL
#undef DO_FPCMP_PPZZ_D
#undef DO_FPCMP_PPZZ_S
#undef DO_FPCMP_PPZZ_H
#undef DO_FPCMP_PPZZ
/* One operand floating-point comparison against zero, controlled
 * by a predicate.
 */
#define DO_FPCMP_PPZ0(NAME, TYPE, H, OP)                   \
void HELPER(NAME)(void *vd, void *vn, void *vg,            \
                  void *status, uint32_t desc)             \
{                                                          \
    intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;       \
    uint64_t *d = vd, *g = vg;                             \
    do {                                                   \
        uint64_t out = 0, pg = g[j];                       \
        do {                                               \
            i -= sizeof(TYPE), out <<= sizeof(TYPE);       \
            if ((pg >> (i & 63)) & 1) {                    \
                TYPE nn = *(TYPE *)(vn + H(i));            \
                out |= OP(TYPE, nn, 0, status);            \
            }                                              \
        } while (i & 63);                                  \
        d[j--] = out;                                      \
    } while (i > 0);                                       \
}

#define DO_FPCMP_PPZ0_H(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
#define DO_FPCMP_PPZ0_S(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
#define DO_FPCMP_PPZ0_D(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_d, float64,     , OP)

#define DO_FPCMP_PPZ0_ALL(NAME, OP) \
    DO_FPCMP_PPZ0_H(NAME, OP)   \
    DO_FPCMP_PPZ0_S(NAME, OP)   \
    DO_FPCMP_PPZ0_D(NAME, OP)
DO_FPCMP_PPZ0_ALL(sve_fcmge0, DO_FCMGE)
DO_FPCMP_PPZ0_ALL(sve_fcmgt0, DO_FCMGT)
DO_FPCMP_PPZ0_ALL(sve_fcmle0, DO_FCMLE)
DO_FPCMP_PPZ0_ALL(sve_fcmlt0, DO_FCMLT)
DO_FPCMP_PPZ0_ALL(sve_fcmeq0, DO_FCMEQ)
DO_FPCMP_PPZ0_ALL(sve_fcmne0, DO_FCMNE)
/* FP Trig Multiply-Add. */

void HELPER(sve_ftmad_h)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
{
    static const float16 coeff[16] = {
        0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
        0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float16);
    intptr_t x = simd_data(desc);
    float16 *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i++) {
        float16 mm = m[i];
        intptr_t xx = x;
        if (float16_is_neg(mm)) {
            mm = float16_abs(mm);
            xx += 8;
        }
        d[i] = float16_muladd(n[i], mm, coeff[xx], 0, vs);
    }
}
void HELPER(sve_ftmad_s)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
{
    static const float32 coeff[16] = {
        0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
        0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
        0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
        0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float32);
    intptr_t x = simd_data(desc);
    float32 *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i++) {
        float32 mm = m[i];
        intptr_t xx = x;
        if (float32_is_neg(mm)) {
            mm = float32_abs(mm);
            xx += 8;
        }
        d[i] = float32_muladd(n[i], mm, coeff[xx], 0, vs);
    }
}
void HELPER(sve_ftmad_d)(void *vd, void *vn, void *vm, void *vs, uint32_t desc)
{
    static const float64 coeff[16] = {
        0x3ff0000000000000ull, 0xbfc5555555555543ull,
        0x3f8111111110f30cull, 0xbf2a01a019b92fc6ull,
        0x3ec71de351f3d22bull, 0xbe5ae5e2b60f7b91ull,
        0x3de5d8408868552full, 0x0000000000000000ull,
        0x3ff0000000000000ull, 0xbfe0000000000000ull,
        0x3fa5555555555536ull, 0xbf56c16c16c13a0bull,
        0x3efa01a019b1e8d8ull, 0xbe927e4f7282f468ull,
        0x3e21ee96d2641b13ull, 0xbda8f76380fbb401ull,
    };
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(float64);
    intptr_t x = simd_data(desc);
    float64 *d = vd, *n = vn, *m = vm;
    for (i = 0; i < opr_sz; i++) {
        float64 mm = m[i];
        intptr_t xx = x;
        if (float64_is_neg(mm)) {
            mm = float64_abs(mm);
            xx += 8;
        }
        d[i] = float64_muladd(n[i], mm, coeff[xx], 0, vs);
    }
}
void HELPER(sve_fcadd_h)(void *vd, void *vn, void *vm, void *vg,
                         void *vs, uint32_t desc)
{
    intptr_t j, i = simd_oprsz(desc);
    uint64_t *g = vg;
    float16 neg_imag = float16_set_sign(0, simd_data(desc));
    float16 neg_real = float16_chs(neg_imag);

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            float16 e0, e1, e2, e3;

            /* I holds the real index; J holds the imag index. */
            j = i - sizeof(float16);
            i -= 2 * sizeof(float16);

            e0 = *(float16 *)(vn + H1_2(i));
            e1 = *(float16 *)(vm + H1_2(j)) ^ neg_real;
            e2 = *(float16 *)(vn + H1_2(j));
            e3 = *(float16 *)(vm + H1_2(i)) ^ neg_imag;

            if (likely((pg >> (i & 63)) & 1)) {
                *(float16 *)(vd + H1_2(i)) = float16_add(e0, e1, vs);
            }
            if (likely((pg >> (j & 63)) & 1)) {
                *(float16 *)(vd + H1_2(j)) = float16_add(e2, e3, vs);
            }
        } while (i & 63);
    } while (i != 0);
}
)(void *vd
, void *vn
, void *vm
, void *vg
,
4839 void *vs
, uint32_t desc
)
4841 intptr_t j
, i
= simd_oprsz(desc
);
4843 float32 neg_imag
= float32_set_sign(0, simd_data(desc
));
4844 float32 neg_real
= float32_chs(neg_imag
);
4847 uint64_t pg
= g
[(i
- 1) >> 6];
4849 float32 e0
, e1
, e2
, e3
;
4851 /* I holds the real index; J holds the imag index. */
4852 j
= i
- sizeof(float32
);
4853 i
-= 2 * sizeof(float32
);
4855 e0
= *(float32
*)(vn
+ H1_2(i
));
4856 e1
= *(float32
*)(vm
+ H1_2(j
)) ^ neg_real
;
4857 e2
= *(float32
*)(vn
+ H1_2(j
));
4858 e3
= *(float32
*)(vm
+ H1_2(i
)) ^ neg_imag
;
4860 if (likely((pg
>> (i
& 63)) & 1)) {
4861 *(float32
*)(vd
+ H1_2(i
)) = float32_add(e0
, e1
, vs
);
4863 if (likely((pg
>> (j
& 63)) & 1)) {
4864 *(float32
*)(vd
+ H1_2(j
)) = float32_add(e2
, e3
, vs
);
4870 void HELPER(sve_fcadd_d
)(void *vd
, void *vn
, void *vm
, void *vg
,
4871 void *vs
, uint32_t desc
)
4873 intptr_t j
, i
= simd_oprsz(desc
);
4875 float64 neg_imag
= float64_set_sign(0, simd_data(desc
));
4876 float64 neg_real
= float64_chs(neg_imag
);
4879 uint64_t pg
= g
[(i
- 1) >> 6];
4881 float64 e0
, e1
, e2
, e3
;
4883 /* I holds the real index; J holds the imag index. */
4884 j
= i
- sizeof(float64
);
4885 i
-= 2 * sizeof(float64
);
4887 e0
= *(float64
*)(vn
+ H1_2(i
));
4888 e1
= *(float64
*)(vm
+ H1_2(j
)) ^ neg_real
;
4889 e2
= *(float64
*)(vn
+ H1_2(j
));
4890 e3
= *(float64
*)(vm
+ H1_2(i
)) ^ neg_imag
;
4892 if (likely((pg
>> (i
& 63)) & 1)) {
4893 *(float64
*)(vd
+ H1_2(i
)) = float64_add(e0
, e1
, vs
);
4895 if (likely((pg
>> (j
& 63)) & 1)) {
4896 *(float64
*)(vd
+ H1_2(j
)) = float64_add(e2
, e3
, vs
);
/*
 * FP Complex Multiply
 */

void HELPER(sve_fcmla_zpzzz_h)(void *vd, void *vn, void *vm, void *va,
                               void *vg, void *status, uint32_t desc)
{
    intptr_t j, i = simd_oprsz(desc);
    unsigned rot = simd_data(desc);
    bool flip = rot & 1;
    float16 neg_imag, neg_real;
    uint64_t *g = vg;

    neg_imag = float16_set_sign(0, (rot & 2) != 0);
    neg_real = float16_set_sign(0, rot == 1 || rot == 2);

    do {
        uint64_t pg = g[(i - 1) >> 6];
        do {
            float16 e1, e2, e3, e4, nr, ni, mr, mi, d;

            /* I holds the real index; J holds the imag index. */
            j = i - sizeof(float16);
            i -= 2 * sizeof(float16);

            nr = *(float16 *)(vn + H1_2(i));
            ni = *(float16 *)(vn + H1_2(j));
            mr = *(float16 *)(vm + H1_2(i));
            mi = *(float16 *)(vm + H1_2(j));

            e2 = (flip ? ni : nr);
            e1 = (flip ? mi : mr) ^ neg_real;
            e4 = e2;
            e3 = (flip ? mr : mi) ^ neg_imag;

            if (likely((pg >> (i & 63)) & 1)) {
                d = *(float16 *)(va + H1_2(i));
                d = float16_muladd(e2, e1, d, 0, status);
                *(float16 *)(vd + H1_2(i)) = d;
            }
            if (likely((pg >> (j & 63)) & 1)) {
                d = *(float16 *)(va + H1_2(j));
                d = float16_muladd(e4, e3, d, 0, status);
                *(float16 *)(vd + H1_2(j)) = d;
            }
        } while (i & 63);
    } while (i != 0);
}
)(void *vd
, void *vn
, void *vm
, void *va
,
4952 void *vg
, void *status
, uint32_t desc
)
4954 intptr_t j
, i
= simd_oprsz(desc
);
4955 unsigned rot
= simd_data(desc
);
4956 bool flip
= rot
& 1;
4957 float32 neg_imag
, neg_real
;
4960 neg_imag
= float32_set_sign(0, (rot
& 2) != 0);
4961 neg_real
= float32_set_sign(0, rot
== 1 || rot
== 2);
4964 uint64_t pg
= g
[(i
- 1) >> 6];
4966 float32 e1
, e2
, e3
, e4
, nr
, ni
, mr
, mi
, d
;
4968 /* I holds the real index; J holds the imag index. */
4969 j
= i
- sizeof(float32
);
4970 i
-= 2 * sizeof(float32
);
4972 nr
= *(float32
*)(vn
+ H1_2(i
));
4973 ni
= *(float32
*)(vn
+ H1_2(j
));
4974 mr
= *(float32
*)(vm
+ H1_2(i
));
4975 mi
= *(float32
*)(vm
+ H1_2(j
));
4977 e2
= (flip
? ni
: nr
);
4978 e1
= (flip
? mi
: mr
) ^ neg_real
;
4980 e3
= (flip
? mr
: mi
) ^ neg_imag
;
4982 if (likely((pg
>> (i
& 63)) & 1)) {
4983 d
= *(float32
*)(va
+ H1_2(i
));
4984 d
= float32_muladd(e2
, e1
, d
, 0, status
);
4985 *(float32
*)(vd
+ H1_2(i
)) = d
;
4987 if (likely((pg
>> (j
& 63)) & 1)) {
4988 d
= *(float32
*)(va
+ H1_2(j
));
4989 d
= float32_muladd(e4
, e3
, d
, 0, status
);
4990 *(float32
*)(vd
+ H1_2(j
)) = d
;
4996 void HELPER(sve_fcmla_zpzzz_d
)(void *vd
, void *vn
, void *vm
, void *va
,
4997 void *vg
, void *status
, uint32_t desc
)
4999 intptr_t j
, i
= simd_oprsz(desc
);
5000 unsigned rot
= simd_data(desc
);
5001 bool flip
= rot
& 1;
5002 float64 neg_imag
, neg_real
;
5005 neg_imag
= float64_set_sign(0, (rot
& 2) != 0);
5006 neg_real
= float64_set_sign(0, rot
== 1 || rot
== 2);
5009 uint64_t pg
= g
[(i
- 1) >> 6];
5011 float64 e1
, e2
, e3
, e4
, nr
, ni
, mr
, mi
, d
;
5013 /* I holds the real index; J holds the imag index. */
5014 j
= i
- sizeof(float64
);
5015 i
-= 2 * sizeof(float64
);
5017 nr
= *(float64
*)(vn
+ H1_2(i
));
5018 ni
= *(float64
*)(vn
+ H1_2(j
));
5019 mr
= *(float64
*)(vm
+ H1_2(i
));
5020 mi
= *(float64
*)(vm
+ H1_2(j
));
5022 e2
= (flip
? ni
: nr
);
5023 e1
= (flip
? mi
: mr
) ^ neg_real
;
5025 e3
= (flip
? mr
: mi
) ^ neg_imag
;
5027 if (likely((pg
>> (i
& 63)) & 1)) {
5028 d
= *(float64
*)(va
+ H1_2(i
));
5029 d
= float64_muladd(e2
, e1
, d
, 0, status
);
5030 *(float64
*)(vd
+ H1_2(i
)) = d
;
5032 if (likely((pg
>> (j
& 63)) & 1)) {
5033 d
= *(float64
*)(va
+ H1_2(j
));
5034 d
= float64_muladd(e4
, e3
, d
, 0, status
);
5035 *(float64
*)(vd
+ H1_2(j
)) = d
;
/*
 * Load contiguous data, protected by a governing predicate.
 */

/*
 * Load one element into @vd + @reg_off from @host.
 * The controlling predicate is known to be true.
 */
typedef void sve_ldst1_host_fn(void *vd, intptr_t reg_off, void *host);

/*
 * Load one element into @vd + @reg_off from (@env, @vaddr, @ra).
 * The controlling predicate is known to be true.
 */
typedef void sve_ldst1_tlb_fn(CPUARMState *env, void *vd, intptr_t reg_off,
                              target_ulong vaddr, uintptr_t retaddr);

/*
 * Generate the above primitives.
 */

#define DO_LD_HOST(NAME, H, TYPEE, TYPEM, HOST)                        \
static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host)  \
{                                                                      \
    TYPEM val = HOST(host);                                            \
    *(TYPEE *)(vd + H(reg_off)) = val;                                 \
}

#define DO_ST_HOST(NAME, H, TYPEE, TYPEM, HOST)                        \
static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host)  \
{ HOST(host, (TYPEM)*(TYPEE *)(vd + H(reg_off))); }

#define DO_LD_TLB(NAME, H, TYPEE, TYPEM, TLB)                               \
static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
                             target_ulong addr, uintptr_t ra)               \
{                                                                           \
    *(TYPEE *)(vd + H(reg_off)) =                                           \
        (TYPEM)TLB(env, useronly_clean_ptr(addr), ra);                      \
}

#define DO_ST_TLB(NAME, H, TYPEE, TYPEM, TLB)                               \
static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
                             target_ulong addr, uintptr_t ra)               \
{                                                                           \
    TLB(env, useronly_clean_ptr(addr),                                      \
        (TYPEM)*(TYPEE *)(vd + H(reg_off)), ra);                            \
}
#define DO_LD_PRIM_1(NAME, H, TE, TM)                   \
    DO_LD_HOST(NAME, H, TE, TM, ldub_p)                 \
    DO_LD_TLB(NAME, H, TE, TM, cpu_ldub_data_ra)

DO_LD_PRIM_1(ld1bb,  H1,   uint8_t,  uint8_t)
DO_LD_PRIM_1(ld1bhu, H1_2, uint16_t, uint8_t)
DO_LD_PRIM_1(ld1bhs, H1_2, uint16_t,  int8_t)
DO_LD_PRIM_1(ld1bsu, H1_4, uint32_t, uint8_t)
DO_LD_PRIM_1(ld1bss, H1_4, uint32_t,  int8_t)
DO_LD_PRIM_1(ld1bdu,     , uint64_t, uint8_t)
DO_LD_PRIM_1(ld1bds,     , uint64_t,  int8_t)

#define DO_ST_PRIM_1(NAME, H, TE, TM)                   \
    DO_ST_HOST(st1##NAME, H, TE, TM, stb_p)             \
    DO_ST_TLB(st1##NAME, H, TE, TM, cpu_stb_data_ra)

DO_ST_PRIM_1(bb,   H1,  uint8_t, uint8_t)
DO_ST_PRIM_1(bh, H1_2, uint16_t, uint8_t)
DO_ST_PRIM_1(bs, H1_4, uint32_t, uint8_t)
DO_ST_PRIM_1(bd,     , uint64_t, uint8_t)

#define DO_LD_PRIM_2(NAME, H, TE, TM, LD)                               \
    DO_LD_HOST(ld1##NAME##_be, H, TE, TM, LD##_be_p)                    \
    DO_LD_HOST(ld1##NAME##_le, H, TE, TM, LD##_le_p)                    \
    DO_LD_TLB(ld1##NAME##_be, H, TE, TM, cpu_##LD##_be_data_ra)         \
    DO_LD_TLB(ld1##NAME##_le, H, TE, TM, cpu_##LD##_le_data_ra)

#define DO_ST_PRIM_2(NAME, H, TE, TM, ST)                               \
    DO_ST_HOST(st1##NAME##_be, H, TE, TM, ST##_be_p)                    \
    DO_ST_HOST(st1##NAME##_le, H, TE, TM, ST##_le_p)                    \
    DO_ST_TLB(st1##NAME##_be, H, TE, TM, cpu_##ST##_be_data_ra)         \
    DO_ST_TLB(st1##NAME##_le, H, TE, TM, cpu_##ST##_le_data_ra)

DO_LD_PRIM_2(hh,  H1_2, uint16_t, uint16_t, lduw)
DO_LD_PRIM_2(hsu, H1_4, uint32_t, uint16_t, lduw)
DO_LD_PRIM_2(hss, H1_4, uint32_t,  int16_t, lduw)
DO_LD_PRIM_2(hdu,     , uint64_t, uint16_t, lduw)
DO_LD_PRIM_2(hds,     , uint64_t,  int16_t, lduw)

DO_ST_PRIM_2(hh, H1_2, uint16_t, uint16_t, stw)
DO_ST_PRIM_2(hs, H1_4, uint32_t, uint16_t, stw)
DO_ST_PRIM_2(hd,     , uint64_t, uint16_t, stw)

DO_LD_PRIM_2(ss,  H1_4, uint32_t, uint32_t, ldl)
DO_LD_PRIM_2(sdu,     , uint64_t, uint32_t, ldl)
DO_LD_PRIM_2(sds,     , uint64_t,  int32_t, ldl)

DO_ST_PRIM_2(ss, H1_4, uint32_t, uint32_t, stl)
DO_ST_PRIM_2(sd,     , uint64_t, uint32_t, stl)

DO_LD_PRIM_2(dd,     , uint64_t, uint64_t, ldq)
DO_ST_PRIM_2(dd,     , uint64_t, uint64_t, stq)

#undef DO_LD_TLB
#undef DO_ST_TLB
#undef DO_LD_HOST
#undef DO_LD_PRIM_1
#undef DO_ST_PRIM_1
#undef DO_LD_PRIM_2
#undef DO_ST_PRIM_2
/*
 * Skip through a sequence of inactive elements in the guarding predicate @vg,
 * beginning at @reg_off bounded by @reg_max.  Return the offset of the active
 * element >= @reg_off, or @reg_max if there were no active elements at all.
 */
static intptr_t find_next_active(uint64_t *vg, intptr_t reg_off,
                                 intptr_t reg_max, int esz)
{
    uint64_t pg_mask = pred_esz_masks[esz];
    uint64_t pg = (vg[reg_off >> 6] & pg_mask) >> (reg_off & 63);

    /* In normal usage, the first element is active. */
    if (likely(pg & 1)) {
        return reg_off;
    }

    if (pg == 0) {
        reg_off &= -64;
        do {
            reg_off += 64;
            if (unlikely(reg_off >= reg_max)) {
                /* The entire predicate was false. */
                return reg_max;
            }
            pg = vg[reg_off >> 6] & pg_mask;
        } while (pg == 0);
    }
    reg_off += ctz64(pg);

    /* We should never see an out of range predicate bit set. */
    tcg_debug_assert(reg_off < reg_max);
    return reg_off;
}
/*
 * Resolve the guest virtual address to info->host and info->flags.
 * If @nofault, return false if the page is invalid, otherwise
 * exit via page fault exception.
 */

typedef struct {
    void *host;
    int flags;
    MemTxAttrs attrs;
} SVEHostPage;

static bool sve_probe_page(SVEHostPage *info, bool nofault,
                           CPUARMState *env, target_ulong addr,
                           int mem_off, MMUAccessType access_type,
                           int mmu_idx, uintptr_t retaddr)
{
    int flags;

    addr += mem_off;

    /*
     * User-only currently always issues with TBI.  See the comment
     * above useronly_clean_ptr.  Usually we clean this top byte away
     * during translation, but we can't do that for e.g. vector + imm
     * addressing modes.
     *
     * We currently always enable TBI for user-only, and do not provide
     * a way to turn it off.  So clean the pointer unconditionally here,
     * rather than look it up here, or pass it down from above.
     */
    addr = useronly_clean_ptr(addr);

    flags = probe_access_flags(env, addr, access_type, mmu_idx, nofault,
                               &info->host, retaddr);
    info->flags = flags;

    if (flags & TLB_INVALID_MASK) {
        g_assert(nofault);
        return false;
    }

    /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
    info->host -= mem_off;

#ifdef CONFIG_USER_ONLY
    memset(&info->attrs, 0, sizeof(info->attrs));
#else
    /*
     * Find the iotlbentry for addr and return the transaction attributes.
     * This *must* be present in the TLB because we just found the mapping.
     */
    {
        uintptr_t index = tlb_index(env, mmu_idx, addr);

# ifdef CONFIG_DEBUG_TCG
        CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
        target_ulong comparator = (access_type == MMU_DATA_LOAD
                                   ? entry->addr_read
                                   : tlb_addr_write(entry));
        g_assert(tlb_hit(comparator, addr));
# endif

        CPUIOTLBEntry *iotlbentry = &env_tlb(env)->d[mmu_idx].iotlb[index];
        info->attrs = iotlbentry->attrs;
    }
#endif

    return true;
}
/*
 * Analyse contiguous data, protected by a governing predicate.
 */

typedef enum {
    FAULT_NO,
    FAULT_FIRST,
    FAULT_ALL,
} SVEContFault;

typedef struct {
    /*
     * First and last element wholly contained within the two pages.
     * mem_off_first[0] and reg_off_first[0] are always set >= 0.
     * reg_off_last[0] may be < 0 if the first element crosses pages.
     * All of mem_off_first[1], reg_off_first[1] and reg_off_last[1]
     * are set >= 0 only if there are complete elements on a second page.
     *
     * The reg_off_* offsets are relative to the internal vector register.
     * The mem_off_first offset is relative to the memory address; the
     * two offsets are different when a load operation extends, a store
     * operation truncates, or for multi-register operations.
     */
    int16_t mem_off_first[2];
    int16_t reg_off_first[2];
    int16_t reg_off_last[2];

    /*
     * One element that is misaligned and spans both pages,
     * or -1 if there is no such active element.
     */
    int16_t mem_off_split;
    int16_t reg_off_split;

    /*
     * The byte offset at which the entire operation crosses a page boundary.
     * Set >= 0 if and only if the entire operation spans two pages.
     */
    int16_t page_split;

    /* TLB data for the two pages. */
    SVEHostPage page[2];
} SVEContLdSt;
5305 static bool sve_cont_ldst_elements(SVEContLdSt
*info
, target_ulong addr
,
5306 uint64_t *vg
, intptr_t reg_max
,
5309 const int esize
= 1 << esz
;
5310 const uint64_t pg_mask
= pred_esz_masks
[esz
];
5311 intptr_t reg_off_first
= -1, reg_off_last
= -1, reg_off_split
;
5312 intptr_t mem_off_last
, mem_off_split
;
5313 intptr_t page_split
, elt_split
;
5316 /* Set all of the element indices to -1, and the TLB data to 0. */
5317 memset(info
, -1, offsetof(SVEContLdSt
, page
));
5318 memset(info
->page
, 0, sizeof(info
->page
));
5320 /* Gross scan over the entire predicate to find bounds. */
5323 uint64_t pg
= vg
[i
] & pg_mask
;
5325 reg_off_last
= i
* 64 + 63 - clz64(pg
);
5326 if (reg_off_first
< 0) {
5327 reg_off_first
= i
* 64 + ctz64(pg
);
5330 } while (++i
* 64 < reg_max
);
5332 if (unlikely(reg_off_first
< 0)) {
5333 /* No active elements, no pages touched. */
5336 tcg_debug_assert(reg_off_last
>= 0 && reg_off_last
< reg_max
);
5338 info
->reg_off_first
[0] = reg_off_first
;
5339 info
->mem_off_first
[0] = (reg_off_first
>> esz
) * msize
;
5340 mem_off_last
= (reg_off_last
>> esz
) * msize
;
5342 page_split
= -(addr
| TARGET_PAGE_MASK
);
5343 if (likely(mem_off_last
+ msize
<= page_split
)) {
5344 /* The entire operation fits within a single page. */
5345 info
->reg_off_last
[0] = reg_off_last
;
5349 info
->page_split
= page_split
;
5350 elt_split
= page_split
/ msize
;
5351 reg_off_split
= elt_split
<< esz
;
5352 mem_off_split
= elt_split
* msize
;
5355 * This is the last full element on the first page, but it is not
5356 * necessarily active. If there is no full element, i.e. the first
5357 * active element is the one that's split, this value remains -1.
5358 * It is useful as iteration bounds.
5360 if (elt_split
!= 0) {
5361 info
->reg_off_last
[0] = reg_off_split
- esize
;
5364 /* Determine if an unaligned element spans the pages. */
5365 if (page_split
% msize
!= 0) {
5366 /* It is helpful to know if the split element is active. */
5367 if ((vg
[reg_off_split
>> 6] >> (reg_off_split
& 63)) & 1) {
5368 info
->reg_off_split
= reg_off_split
;
5369 info
->mem_off_split
= mem_off_split
;
5371 if (reg_off_split
== reg_off_last
) {
5372 /* The page crossing element is last. */
5376 reg_off_split
+= esize
;
5377 mem_off_split
+= msize
;
5381 * We do want the first active element on the second page, because
5382 * this may affect the address reported in an exception.
5384 reg_off_split
= find_next_active(vg
, reg_off_split
, reg_max
, esz
);
5385 tcg_debug_assert(reg_off_split
<= reg_off_last
);
5386 info
->reg_off_first
[1] = reg_off_split
;
5387 info
->mem_off_first
[1] = (reg_off_split
>> esz
) * msize
;
5388 info
->reg_off_last
[1] = reg_off_last
;
/*
 * Resolve the guest virtual addresses to info->page[].
 * Control the generation of page faults with @fault.  Return false if
 * there is no work to do, which can only happen with @fault == FAULT_NO.
 */
static bool sve_cont_ldst_pages(SVEContLdSt *info, SVEContFault fault,
                                CPUARMState *env, target_ulong addr,
                                MMUAccessType access_type, uintptr_t retaddr)
{
    int mmu_idx = cpu_mmu_index(env, false);
    int mem_off = info->mem_off_first[0];
    bool nofault = fault == FAULT_NO;
    bool have_work = true;

    if (!sve_probe_page(&info->page[0], nofault, env, addr, mem_off,
                        access_type, mmu_idx, retaddr)) {
        /* No work to be done. */
        return false;
    }

    if (likely(info->page_split < 0)) {
        /* The entire operation was on the one page. */
        return true;
    }

    /*
     * If the second page is invalid, then we want the fault address to be
     * the first byte on that page which is accessed.
     */
    if (info->mem_off_split >= 0) {
        /*
         * There is an element split across the pages.  The fault address
         * should be the first byte of the second page.
         */
        mem_off = info->page_split;
        /*
         * If the split element is also the first active element
         * of the vector, then:  For first-fault we should continue
         * to generate faults for the second page.  For no-fault,
         * we have work only if the second page is valid.
         */
        if (info->mem_off_first[0] < info->mem_off_split) {
            nofault = FAULT_FIRST;
            have_work = false;
        }
    } else {
        /*
         * There is no element split across the pages.  The fault address
         * should be the first active element on the second page.
         */
        mem_off = info->mem_off_first[1];
        /*
         * There must have been one active element on the first page,
         * so we're out of first-fault territory.
         */
        nofault = fault != FAULT_ALL;
    }

    have_work |= sve_probe_page(&info->page[1], nofault, env, addr, mem_off,
                                access_type, mmu_idx, retaddr);
    return have_work;
}
static void sve_cont_ldst_watchpoints(SVEContLdSt *info, CPUARMState *env,
                                      uint64_t *vg, target_ulong addr,
                                      int esize, int msize, int wp_access,
                                      uintptr_t retaddr)
{
#ifndef CONFIG_USER_ONLY
    intptr_t mem_off, reg_off, reg_last;
    int flags0 = info->page[0].flags;
    int flags1 = info->page[1].flags;

    if (likely(!((flags0 | flags1) & TLB_WATCHPOINT))) {
        return;
    }

    /* Indicate that watchpoints are handled. */
    info->page[0].flags = flags0 & ~TLB_WATCHPOINT;
    info->page[1].flags = flags1 & ~TLB_WATCHPOINT;

    if (flags0 & TLB_WATCHPOINT) {
        mem_off = info->mem_off_first[0];
        reg_off = info->reg_off_first[0];
        reg_last = info->reg_off_last[0];

        while (reg_off <= reg_last) {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    cpu_check_watchpoint(env_cpu(env), addr + mem_off,
                                         msize, info->page[0].attrs,
                                         wp_access, retaddr);
                }
                reg_off += esize;
                mem_off += msize;
            } while (reg_off <= reg_last && (reg_off & 63));
        }
    }

    mem_off = info->mem_off_split;
    if (mem_off >= 0) {
        cpu_check_watchpoint(env_cpu(env), addr + mem_off, msize,
                             info->page[0].attrs, wp_access, retaddr);
    }

    mem_off = info->mem_off_first[1];
    if ((flags1 & TLB_WATCHPOINT) && mem_off >= 0) {
        reg_off = info->reg_off_first[1];
        reg_last = info->reg_off_last[1];

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    cpu_check_watchpoint(env_cpu(env), addr + mem_off,
                                         msize, info->page[1].attrs,
                                         wp_access, retaddr);
                }
                reg_off += esize;
                mem_off += msize;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
#endif
}
static void sve_cont_ldst_mte_check(SVEContLdSt *info, CPUARMState *env,
                                    uint64_t *vg, target_ulong addr, int esize,
                                    int msize, uint32_t mtedesc, uintptr_t ra)
{
    intptr_t mem_off, reg_off, reg_last;

    /* Process the page only if MemAttr == Tagged. */
    if (arm_tlb_mte_tagged(&info->page[0].attrs)) {
        mem_off = info->mem_off_first[0];
        reg_off = info->reg_off_first[0];
        reg_last = info->reg_off_split;
        if (reg_last < 0) {
            reg_last = info->reg_off_last[0];
        }

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    mte_check(env, mtedesc, addr, ra);
                }
                reg_off += esize;
                mem_off += msize;
            } while (reg_off <= reg_last && (reg_off & 63));
        } while (reg_off <= reg_last);
    }

    mem_off = info->mem_off_first[1];
    if (mem_off >= 0 && arm_tlb_mte_tagged(&info->page[1].attrs)) {
        reg_off = info->reg_off_first[1];
        reg_last = info->reg_off_last[1];

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    mte_check(env, mtedesc, addr, ra);
                }
                reg_off += esize;
                mem_off += msize;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}
/*
 * Common helper for all contiguous 1,2,3,4-register predicated loads.
 */
static inline QEMU_ALWAYS_INLINE
void sve_ldN_r(CPUARMState *env, uint64_t *vg, const target_ulong addr,
               uint32_t desc, const uintptr_t retaddr,
               const int esz, const int msz, const int N, uint32_t mtedesc,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const unsigned rd = simd_data(desc);
    const intptr_t reg_max = simd_oprsz(desc);
    intptr_t reg_off, reg_last, mem_off;
    SVEContLdSt info;
    void *host;
    int flags, i;

    /* Find the active elements. */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
        /* The entire predicate was false; no load occurs. */
        for (i = 0; i < N; ++i) {
            memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
        }
        return;
    }

    /* Probe the page(s).  Exit with exception for any invalid page. */
    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_LOAD, retaddr);

    /* Handle watchpoints for all active elements. */
    sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
                              BP_MEM_READ, retaddr);

    /*
     * Handle mte checks for all active elements.
     * Since TBI must be set for MTE, !mtedesc => !mte_active.
     */
    if (mtedesc) {
        sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
                                mtedesc, retaddr);
    }

    flags = info.page[0].flags | info.page[1].flags;
    if (unlikely(flags != 0)) {
#ifdef CONFIG_USER_ONLY
        g_assert_not_reached();
#else
        /*
         * At least one page includes MMIO.
         * Any bus operation can fail with cpu_transaction_failed,
         * which for ARM will raise SyncExternal.  Perform the load
         * into scratch memory to preserve register state until the end.
         */
        ARMVectorReg scratch[4] = { };

        mem_off = info.mem_off_first[0];
        reg_off = info.reg_off_first[0];
        reg_last = info.reg_off_last[1];
        if (reg_last < 0) {
            reg_last = info.reg_off_split;
            if (reg_last < 0) {
                reg_last = info.reg_off_last[0];
            }
        }

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        tlb_fn(env, &scratch[i], reg_off,
                               addr + mem_off + (i << msz), retaddr);
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);

        for (i = 0; i < N; ++i) {
            memcpy(&env->vfp.zregs[(rd + i) & 31], &scratch[i], reg_max);
        }
        return;
#endif
    }

    /* The entire operation is in RAM, on valid pages. */

    for (i = 0; i < N; ++i) {
        memset(&env->vfp.zregs[(rd + i) & 31], 0, reg_max);
    }

    mem_off = info.mem_off_first[0];
    reg_off = info.reg_off_first[0];
    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    while (reg_off <= reg_last) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                for (i = 0; i < N; ++i) {
                    host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                            host + mem_off + (i << msz));
                }
            }
            reg_off += 1 << esz;
            mem_off += N << msz;
        } while (reg_off <= reg_last && (reg_off & 63));
    }

    /*
     * Use the slow path to manage the cross-page misalignment.
     * But we know this is RAM and cannot trap.
     */
    mem_off = info.mem_off_split;
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_split;
        for (i = 0; i < N; ++i) {
            tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
                   addr + mem_off + (i << msz), retaddr);
        }
    }

    mem_off = info.mem_off_first[1];
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_first[1];
        reg_last = info.reg_off_last[1];
        host = info.page[1].host;

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                                host + mem_off + (i << msz));
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}
static inline QEMU_ALWAYS_INLINE
void sve_ldN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
                   uint32_t desc, const uintptr_t ra,
                   const int esz, const int msz, const int N,
                   sve_ldst1_host_fn *host_fn,
                   sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    int bit55 = extract64(addr, 55, 1);

    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /* Perform gross MTE suppression early. */
    if (!tbi_check(desc, bit55) ||
        tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
        mtedesc = 0;
    }

    sve_ldN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
}
#define DO_LD1_1(NAME, ESZ)                                             \
void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg,                 \
                            target_ulong addr, uint32_t desc)           \
{                                                                       \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0,            \
              sve_##NAME##_host, sve_##NAME##_tlb);                     \
}                                                                       \
void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg,             \
                                target_ulong addr, uint32_t desc)       \
{                                                                       \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1,           \
                  sve_##NAME##_host, sve_##NAME##_tlb);                 \
}

#define DO_LD1_2(NAME, ESZ, MSZ)                                        \
void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg,              \
                               target_ulong addr, uint32_t desc)        \
{                                                                       \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
              sve_##NAME##_le_host, sve_##NAME##_le_tlb);               \
}                                                                       \
void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg,              \
                               target_ulong addr, uint32_t desc)        \
{                                                                       \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0,             \
              sve_##NAME##_be_host, sve_##NAME##_be_tlb);               \
}                                                                       \
void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg,          \
                                   target_ulong addr, uint32_t desc)    \
{                                                                       \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
                  sve_##NAME##_le_host, sve_##NAME##_le_tlb);           \
}                                                                       \
void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg,          \
                                   target_ulong addr, uint32_t desc)    \
{                                                                       \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1,            \
                  sve_##NAME##_be_host, sve_##NAME##_be_tlb);           \
}
DO_LD1_1(ld1bb,  MO_8)
DO_LD1_1(ld1bhu, MO_16)
DO_LD1_1(ld1bhs, MO_16)
DO_LD1_1(ld1bsu, MO_32)
DO_LD1_1(ld1bss, MO_32)
DO_LD1_1(ld1bdu, MO_64)
DO_LD1_1(ld1bds, MO_64)

DO_LD1_2(ld1hh,  MO_16, MO_16)
DO_LD1_2(ld1hsu, MO_32, MO_16)
DO_LD1_2(ld1hss, MO_32, MO_16)
DO_LD1_2(ld1hdu, MO_64, MO_16)
DO_LD1_2(ld1hds, MO_64, MO_16)

DO_LD1_2(ld1ss,  MO_32, MO_32)
DO_LD1_2(ld1sdu, MO_64, MO_32)
DO_LD1_2(ld1sds, MO_64, MO_32)

DO_LD1_2(ld1dd,  MO_64, MO_64)

#undef DO_LD1_1
#undef DO_LD1_2
#define DO_LDN_1(N)                                                     \
void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg,                \
                             target_ulong addr, uint32_t desc)          \
{                                                                       \
    sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0,           \
              sve_ld1bb_host, sve_ld1bb_tlb);                           \
}                                                                       \
void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg,            \
                                 target_ulong addr, uint32_t desc)      \
{                                                                       \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N,          \
                  sve_ld1bb_host, sve_ld1bb_tlb);                       \
}

#define DO_LDN_2(N, SUFF, ESZ)                                          \
void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
              sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);         \
}                                                                       \
void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0,             \
              sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);         \
}                                                                       \
void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
                  sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb);     \
}                                                                       \
void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N,            \
                  sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb);     \
}

DO_LDN_1(2)
DO_LDN_1(3)
DO_LDN_1(4)

DO_LDN_2(2, hh, MO_16)
DO_LDN_2(3, hh, MO_16)
DO_LDN_2(4, hh, MO_16)

DO_LDN_2(2, ss, MO_32)
DO_LDN_2(3, ss, MO_32)
DO_LDN_2(4, ss, MO_32)

DO_LDN_2(2, dd, MO_64)
DO_LDN_2(3, dd, MO_64)
DO_LDN_2(4, dd, MO_64)

#undef DO_LDN_1
#undef DO_LDN_2
/*
 * Load contiguous data, first-fault and no-fault.
 *
 * For user-only, one could argue that we should hold the mmap_lock during
 * the operation so that there is no race between page_check_range and the
 * load operation.  However, unmapping pages out from under a running thread
 * is extraordinarily unlikely.  This theoretical race condition also affects
 * linux-user/ in its get_user/put_user macros.
 *
 * TODO: Construct some helpers, written in assembly, that interact with
 * handle_cpu_signal to produce memory ops which can properly report errors
 * without racing.
 */
/* Fault on byte I.  All bits in FFR from I are cleared.  The vector
 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
 * option, which leaves subsequent data unchanged.
 */
static void record_fault(CPUARMState *env, uintptr_t i, uintptr_t oprsz)
{
    uint64_t *ffr = env->vfp.pregs[FFR_PRED_NUM].p;

    if (i & 63) {
        ffr[i / 64] &= MAKE_64BIT_MASK(0, i & 63);
        i = ROUND_UP(i, 64);
    }
    for (; i < oprsz; i += 64) {
        ffr[i / 64] = 0;
    }
}
/*
 * Common helper for all contiguous no-fault and first-fault loads.
 */
static inline QEMU_ALWAYS_INLINE
void sve_ldnfff1_r(CPUARMState *env, void *vg, const target_ulong addr,
                   uint32_t desc, const uintptr_t retaddr, uint32_t mtedesc,
                   const int esz, const int msz, const SVEContFault fault,
                   sve_ldst1_host_fn *host_fn,
                   sve_ldst1_tlb_fn *tlb_fn)
{
    const unsigned rd = simd_data(desc);
    void *vd = &env->vfp.zregs[rd];
    const intptr_t reg_max = simd_oprsz(desc);
    intptr_t reg_off, mem_off, reg_last;
    SVEContLdSt info;
    int flags;
    void *host;

    /* Find the active elements. */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, 1 << msz)) {
        /* The entire predicate was false; no load occurs. */
        memset(vd, 0, reg_max);
        return;
    }
    reg_off = info.reg_off_first[0];

    /* Probe the page(s). */
    if (!sve_cont_ldst_pages(&info, fault, env, addr, MMU_DATA_LOAD, retaddr)) {
        /* Fault on first element. */
        tcg_debug_assert(fault == FAULT_NO);
        memset(vd, 0, reg_max);
        goto do_fault;
    }

    mem_off = info.mem_off_first[0];
    flags = info.page[0].flags;

    /*
     * Disable MTE checking if the Tagged bit is not set.  Since TBI must
     * be set within MTEDESC for MTE, !mtedesc => !mte_active.
     */
    if (!arm_tlb_mte_tagged(&info.page[0].attrs)) {
        mtedesc = 0;
    }

    if (fault == FAULT_FIRST) {
        /* Trapping mte check for the first-fault element. */
        if (mtedesc) {
            mte_check(env, mtedesc, addr + mem_off, retaddr);
        }

        /*
         * Special handling of the first active element,
         * if it crosses a page boundary or is MMIO.
         */
        bool is_split = mem_off == info.mem_off_split;
        if (unlikely(flags != 0) || unlikely(is_split)) {
            /*
             * Use the slow path for cross-page handling.
             * Might trap for MMIO or watchpoints.
             */
            tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);

            /* After any fault, zero the other elements. */
            swap_memzero(vd, reg_off);
            reg_off += 1 << esz;
            mem_off += 1 << msz;
            swap_memzero(vd + reg_off, reg_max - reg_off);

            if (is_split) {
                goto second_page;
            }
        } else {
            memset(vd, 0, reg_max);
        }
    } else {
        memset(vd, 0, reg_max);
        if (unlikely(mem_off == info.mem_off_split)) {
            /* The first active element crosses a page boundary. */
            flags |= info.page[1].flags;
            if (unlikely(flags & TLB_MMIO)) {
                /* Some page is MMIO, see below. */
                goto do_fault;
            }
            if (unlikely(flags & TLB_WATCHPOINT) &&
                (cpu_watchpoint_address_matches
                 (env_cpu(env), addr + mem_off, 1 << msz)
                 & BP_MEM_READ)) {
                /* Watchpoint hit, see below. */
                goto do_fault;
            }
            if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
                goto do_fault;
            }
            /*
             * Use the slow path for cross-page handling.
             * This is RAM, without a watchpoint, and will not trap.
             */
            tlb_fn(env, vd, reg_off, addr + mem_off, retaddr);
            goto second_page;
        }
    }

    /*
     * From this point on, all memory operations are MemSingleNF.
     *
     * Per the MemSingleNF pseudocode, a no-fault load from Device memory
     * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
     *
     * Unfortunately we do not have access to the memory attributes from the
     * PTE to tell Device memory from Normal memory.  So we make a mostly
     * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
     * This gives the right answer for the common cases of "Normal memory,
     * backed by host RAM" and "Device memory, backed by MMIO".
     * The architecture allows us to suppress an NF load and return
     * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
     * case of "Normal memory, backed by MMIO" is permitted.  The case we
     * get wrong is "Device memory, backed by host RAM", for which we
     * should return (UNKNOWN, FAULT) but do not.
     *
     * Similarly, CPU_BP breakpoints would raise exceptions, and so
     * return (UNKNOWN, FAULT).  For simplicity, we consider gdb and
     * architectural breakpoints the same.
     */
    if (unlikely(flags & TLB_MMIO)) {
        goto do_fault;
    }

    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    do {
        uint64_t pg = *(uint64_t *)(vg + (reg_off >> 3));
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                if (unlikely(flags & TLB_WATCHPOINT) &&
                    (cpu_watchpoint_address_matches
                     (env_cpu(env), addr + mem_off, 1 << msz)
                     & BP_MEM_READ)) {
                    goto do_fault;
                }
                if (mtedesc && !mte_probe(env, mtedesc, addr + mem_off)) {
                    goto do_fault;
                }
                host_fn(vd, reg_off, host + mem_off);
            }
            reg_off += 1 << esz;
            mem_off += 1 << msz;
        } while (reg_off <= reg_last && (reg_off & 63));
    } while (reg_off <= reg_last);

    /*
     * MemSingleNF is allowed to fail for any reason.  We have special
     * code above to handle the first element crossing a page boundary.
     * As an implementation choice, decline to handle a cross-page element
     * in any other position.
     */
    reg_off = info.reg_off_split;
    if (reg_off >= 0) {
        goto do_fault;
    }

 second_page:
    reg_off = info.reg_off_first[1];
    if (likely(reg_off < 0)) {
        /* No active elements on the second page.  All done. */
        return;
    }

    /*
     * MemSingleNF is allowed to fail for any reason.  As an implementation
     * choice, decline to handle elements on the second page.  This should
     * be low frequency as the guest walks through memory -- the next
     * iteration of the guest's loop should be aligned on the page boundary,
     * and then all following iterations will stay aligned.
     */

 do_fault:
    record_fault(env, reg_off, reg_max);
}
static inline QEMU_ALWAYS_INLINE
void sve_ldnfff1_r_mte(CPUARMState *env, void *vg, target_ulong addr,
                       uint32_t desc, const uintptr_t retaddr,
                       const int esz, const int msz, const SVEContFault fault,
                       sve_ldst1_host_fn *host_fn,
                       sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    int bit55 = extract64(addr, 55, 1);

    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /* Perform gross MTE suppression early. */
    if (!tbi_check(desc, bit55) ||
        tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
        mtedesc = 0;
    }

    sve_ldnfff1_r(env, vg, addr, desc, retaddr, mtedesc,
                  esz, msz, fault, host_fn, tlb_fn);
}
#define DO_LDFF1_LDNF1_1(PART, ESZ)                                        \
void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg,               \
                                 target_ulong addr, uint32_t desc)         \
{                                                                          \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
                  sve_ld1##PART##_host, sve_ld1##PART##_tlb);              \
}                                                                          \
void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg,               \
                                 target_ulong addr, uint32_t desc)         \
{                                                                          \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO,    \
                  sve_ld1##PART##_host, sve_ld1##PART##_tlb);              \
}                                                                          \
void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg,           \
                                     target_ulong addr, uint32_t desc)     \
{                                                                          \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
                      sve_ld1##PART##_host, sve_ld1##PART##_tlb);          \
}                                                                          \
void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg,           \
                                     target_ulong addr, uint32_t desc)     \
{                                                                          \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO,   \
                      sve_ld1##PART##_host, sve_ld1##PART##_tlb);          \
}

#define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ)                                   \
void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg,            \
                                    target_ulong addr, uint32_t desc)      \
{                                                                          \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST,  \
                  sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);        \
}                                                                          \
void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg,            \
                                    target_ulong addr, uint32_t desc)      \
{                                                                          \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,     \
                  sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);        \
}                                                                          \
void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg,            \
                                    target_ulong addr, uint32_t desc)      \
{                                                                          \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST,  \
                  sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);        \
}                                                                          \
void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg,            \
                                    target_ulong addr, uint32_t desc)      \
{                                                                          \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO,     \
                  sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);        \
}                                                                          \
void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg,        \
                                        target_ulong addr, uint32_t desc)  \
{                                                                          \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
                      sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);    \
}                                                                          \
void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg,        \
                                        target_ulong addr, uint32_t desc)  \
{                                                                          \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO,    \
                      sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb);    \
}                                                                          \
void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg,        \
                                        target_ulong addr, uint32_t desc)  \
{                                                                          \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
                      sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);    \
}                                                                          \
void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg,        \
                                        target_ulong addr, uint32_t desc)  \
{                                                                          \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO,    \
                      sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb);    \
}
DO_LDFF1_LDNF1_1(bb,  MO_8)
DO_LDFF1_LDNF1_1(bhu, MO_16)
DO_LDFF1_LDNF1_1(bhs, MO_16)
DO_LDFF1_LDNF1_1(bsu, MO_32)
DO_LDFF1_LDNF1_1(bss, MO_32)
DO_LDFF1_LDNF1_1(bdu, MO_64)
DO_LDFF1_LDNF1_1(bds, MO_64)

DO_LDFF1_LDNF1_2(hh,  MO_16, MO_16)
DO_LDFF1_LDNF1_2(hsu, MO_32, MO_16)
DO_LDFF1_LDNF1_2(hss, MO_32, MO_16)
DO_LDFF1_LDNF1_2(hdu, MO_64, MO_16)
DO_LDFF1_LDNF1_2(hds, MO_64, MO_16)

DO_LDFF1_LDNF1_2(ss,  MO_32, MO_32)
DO_LDFF1_LDNF1_2(sdu, MO_64, MO_32)
DO_LDFF1_LDNF1_2(sds, MO_64, MO_32)

DO_LDFF1_LDNF1_2(dd,  MO_64, MO_64)

#undef DO_LDFF1_LDNF1_1
#undef DO_LDFF1_LDNF1_2
/*
 * Common helper for all contiguous 1,2,3,4-register predicated stores.
 */
static inline QEMU_ALWAYS_INLINE
void sve_stN_r(CPUARMState *env, uint64_t *vg, target_ulong addr,
               uint32_t desc, const uintptr_t retaddr,
               const int esz, const int msz, const int N, uint32_t mtedesc,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const unsigned rd = simd_data(desc);
    const intptr_t reg_max = simd_oprsz(desc);
    intptr_t reg_off, reg_last, mem_off;
    SVEContLdSt info;
    void *host;
    int i, flags;

    /* Find the active elements. */
    if (!sve_cont_ldst_elements(&info, addr, vg, reg_max, esz, N << msz)) {
        /* The entire predicate was false; no store occurs. */
        return;
    }

    /* Probe the page(s).  Exit with exception for any invalid page. */
    sve_cont_ldst_pages(&info, FAULT_ALL, env, addr, MMU_DATA_STORE, retaddr);

    /* Handle watchpoints for all active elements. */
    sve_cont_ldst_watchpoints(&info, env, vg, addr, 1 << esz, N << msz,
                              BP_MEM_WRITE, retaddr);

    /*
     * Handle mte checks for all active elements.
     * Since TBI must be set for MTE, !mtedesc => !mte_active.
     */
    if (mtedesc) {
        sve_cont_ldst_mte_check(&info, env, vg, addr, 1 << esz, N << msz,
                                mtedesc, retaddr);
    }

    flags = info.page[0].flags | info.page[1].flags;
    if (unlikely(flags != 0)) {
#ifdef CONFIG_USER_ONLY
        g_assert_not_reached();
#else
        /*
         * At least one page includes MMIO.
         * Any bus operation can fail with cpu_transaction_failed,
         * which for ARM will raise SyncExternal.  We cannot avoid
         * this fault and will leave with the store incomplete.
         */
        mem_off = info.mem_off_first[0];
        reg_off = info.reg_off_first[0];
        reg_last = info.reg_off_last[1];
        if (reg_last < 0) {
            reg_last = info.reg_off_split;
            if (reg_last < 0) {
                reg_last = info.reg_off_last[0];
            }
        }

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
                               addr + mem_off + (i << msz), retaddr);
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
        return;
#endif
    }

    mem_off = info.mem_off_first[0];
    reg_off = info.reg_off_first[0];
    reg_last = info.reg_off_last[0];
    host = info.page[0].host;

    while (reg_off <= reg_last) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if ((pg >> (reg_off & 63)) & 1) {
                for (i = 0; i < N; ++i) {
                    host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                            host + mem_off + (i << msz));
                }
            }
            reg_off += 1 << esz;
            mem_off += N << msz;
        } while (reg_off <= reg_last && (reg_off & 63));
    }

    /*
     * Use the slow path to manage the cross-page misalignment.
     * But we know this is RAM and cannot trap.
     */
    mem_off = info.mem_off_split;
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_split;
        for (i = 0; i < N; ++i) {
            tlb_fn(env, &env->vfp.zregs[(rd + i) & 31], reg_off,
                   addr + mem_off + (i << msz), retaddr);
        }
    }

    mem_off = info.mem_off_first[1];
    if (unlikely(mem_off >= 0)) {
        reg_off = info.reg_off_first[1];
        reg_last = info.reg_off_last[1];
        host = info.page[1].host;

        do {
            uint64_t pg = vg[reg_off >> 6];
            do {
                if ((pg >> (reg_off & 63)) & 1) {
                    for (i = 0; i < N; ++i) {
                        host_fn(&env->vfp.zregs[(rd + i) & 31], reg_off,
                                host + mem_off + (i << msz));
                    }
                }
                reg_off += 1 << esz;
                mem_off += N << msz;
            } while (reg_off & 63);
        } while (reg_off <= reg_last);
    }
}
static inline QEMU_ALWAYS_INLINE
void sve_stN_r_mte(CPUARMState *env, uint64_t *vg, target_ulong addr,
                   uint32_t desc, const uintptr_t ra,
                   const int esz, const int msz, const int N,
                   sve_ldst1_host_fn *host_fn,
                   sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    int bit55 = extract64(addr, 55, 1);

    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /* Perform gross MTE suppression early. */
    if (!tbi_check(desc, bit55) ||
        tcma_check(desc, bit55, allocation_tag_from_addr(addr))) {
        mtedesc = 0;
    }

    sve_stN_r(env, vg, addr, desc, ra, esz, msz, N, mtedesc, host_fn, tlb_fn);
}
#define DO_STN_1(N, NAME, ESZ)                                          \
void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg,            \
                                 target_ulong addr, uint32_t desc)      \
{                                                                       \
    sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0,            \
              sve_st1##NAME##_host, sve_st1##NAME##_tlb);               \
}                                                                       \
void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg,        \
                                     target_ulong addr, uint32_t desc)  \
{                                                                       \
    sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N,           \
                  sve_st1##NAME##_host, sve_st1##NAME##_tlb);           \
}

#define DO_STN_2(N, NAME, ESZ, MSZ)                                     \
void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
              sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);         \
}                                                                       \
void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg,         \
                                    target_ulong addr, uint32_t desc)   \
{                                                                       \
    sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0,             \
              sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);         \
}                                                                       \
void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
                  sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb);     \
}                                                                       \
void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg,     \
                                        target_ulong addr, uint32_t desc) \
{                                                                       \
    sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N,            \
                  sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb);     \
}
DO_STN_1(1, bb, MO_8)
DO_STN_1(1, bh, MO_16)
DO_STN_1(1, bs, MO_32)
DO_STN_1(1, bd, MO_64)
DO_STN_1(2, bb, MO_8)
DO_STN_1(3, bb, MO_8)
DO_STN_1(4, bb, MO_8)

DO_STN_2(1, hh, MO_16, MO_16)
DO_STN_2(1, hs, MO_32, MO_16)
DO_STN_2(1, hd, MO_64, MO_16)
DO_STN_2(2, hh, MO_16, MO_16)
DO_STN_2(3, hh, MO_16, MO_16)
DO_STN_2(4, hh, MO_16, MO_16)

DO_STN_2(1, ss, MO_32, MO_32)
DO_STN_2(1, sd, MO_64, MO_32)
DO_STN_2(2, ss, MO_32, MO_32)
DO_STN_2(3, ss, MO_32, MO_32)
DO_STN_2(4, ss, MO_32, MO_32)

DO_STN_2(1, dd, MO_64, MO_64)
DO_STN_2(2, dd, MO_64, MO_64)
DO_STN_2(3, dd, MO_64, MO_64)
DO_STN_2(4, dd, MO_64, MO_64)

#undef DO_STN_1
#undef DO_STN_2
/*
 * Loads with a vector index.
 */

/*
 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
 */
typedef target_ulong zreg_off_fn(void *reg, intptr_t reg_ofs);

static target_ulong off_zsu_s(void *reg, intptr_t reg_ofs)
{
    return *(uint32_t *)(reg + H1_4(reg_ofs));
}

static target_ulong off_zss_s(void *reg, intptr_t reg_ofs)
{
    return *(int32_t *)(reg + H1_4(reg_ofs));
}

static target_ulong off_zsu_d(void *reg, intptr_t reg_ofs)
{
    return (uint32_t)*(uint64_t *)(reg + reg_ofs);
}

static target_ulong off_zss_d(void *reg, intptr_t reg_ofs)
{
    return (int32_t)*(uint64_t *)(reg + reg_ofs);
}

static target_ulong off_zd_d(void *reg, intptr_t reg_ofs)
{
    return *(uint64_t *)(reg + reg_ofs);
}
static inline QEMU_ALWAYS_INLINE
void sve_ld1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
               target_ulong base, uint32_t desc, uintptr_t retaddr,
               uint32_t mtedesc, int esize, int msize,
               zreg_off_fn *off_fn,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const int mmu_idx = cpu_mmu_index(env, false);
    const intptr_t reg_max = simd_oprsz(desc);
    const int scale = simd_data(desc);
    ARMVectorReg scratch;
    intptr_t reg_off;
    SVEHostPage info, info2;

    memset(&scratch, 0, reg_max);
    reg_off = 0;
    do {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if (likely(pg & 1)) {
                target_ulong addr = base + (off_fn(vm, reg_off) << scale);
                target_ulong in_page = -(addr | TARGET_PAGE_MASK);

                sve_probe_page(&info, false, env, addr, 0, MMU_DATA_LOAD,
                               mmu_idx, retaddr);

                if (likely(in_page >= msize)) {
                    if (unlikely(info.flags & TLB_WATCHPOINT)) {
                        cpu_check_watchpoint(env_cpu(env), addr, msize,
                                             info.attrs, BP_MEM_READ, retaddr);
                    }
                    if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
                        mte_check(env, mtedesc, addr, retaddr);
                    }
                    host_fn(&scratch, reg_off, info.host);
                } else {
                    /* Element crosses the page boundary. */
                    sve_probe_page(&info2, false, env, addr + in_page, 0,
                                   MMU_DATA_LOAD, mmu_idx, retaddr);
                    if (unlikely((info.flags | info2.flags) & TLB_WATCHPOINT)) {
                        cpu_check_watchpoint(env_cpu(env), addr,
                                             msize, info.attrs,
                                             BP_MEM_READ, retaddr);
                    }
                    if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
                        mte_check(env, mtedesc, addr, retaddr);
                    }
                    tlb_fn(env, &scratch, reg_off, addr, retaddr);
                }
            }
            reg_off += esize;
            pg >>= esize;
        } while (reg_off & 63);
    } while (reg_off < reg_max);

    /* Wait until all exceptions have been raised to write back. */
    memcpy(vd, &scratch, reg_max);
}
static inline QEMU_ALWAYS_INLINE
void sve_ld1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                   target_ulong base, uint32_t desc, uintptr_t retaddr,
                   int esize, int msize, zreg_off_fn *off_fn,
                   sve_ldst1_host_fn *host_fn,
                   sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /*
     * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
     * offset base entirely over the address space hole to change the
     * pointer tag, or change the bit55 selector.  So we could here
     * examine TBI + TCMA like we do for sve_ldN_r_mte().
     */
    sve_ld1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
              esize, msize, off_fn, host_fn, tlb_fn);
}
#define DO_LD1_ZPZ_S(MEM, OFS, MSZ)                                     \
void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,     \
              off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);  \
}                                                                       \
void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,    \
                  off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}

#define DO_LD1_ZPZ_D(MEM, OFS, MSZ)                                     \
void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,     \
              off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb);  \
}                                                                       \
void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,    \
                  off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}
DO_LD1_ZPZ_S(bsu, zsu, MO_8)
DO_LD1_ZPZ_S(bsu, zss, MO_8)
DO_LD1_ZPZ_D(bdu, zsu, MO_8)
DO_LD1_ZPZ_D(bdu, zss, MO_8)
DO_LD1_ZPZ_D(bdu, zd, MO_8)

DO_LD1_ZPZ_S(bss, zsu, MO_8)
DO_LD1_ZPZ_S(bss, zss, MO_8)
DO_LD1_ZPZ_D(bds, zsu, MO_8)
DO_LD1_ZPZ_D(bds, zss, MO_8)
DO_LD1_ZPZ_D(bds, zd, MO_8)

DO_LD1_ZPZ_S(hsu_le, zsu, MO_16)
DO_LD1_ZPZ_S(hsu_le, zss, MO_16)
DO_LD1_ZPZ_D(hdu_le, zsu, MO_16)
DO_LD1_ZPZ_D(hdu_le, zss, MO_16)
DO_LD1_ZPZ_D(hdu_le, zd, MO_16)

DO_LD1_ZPZ_S(hsu_be, zsu, MO_16)
DO_LD1_ZPZ_S(hsu_be, zss, MO_16)
DO_LD1_ZPZ_D(hdu_be, zsu, MO_16)
DO_LD1_ZPZ_D(hdu_be, zss, MO_16)
DO_LD1_ZPZ_D(hdu_be, zd, MO_16)

DO_LD1_ZPZ_S(hss_le, zsu, MO_16)
DO_LD1_ZPZ_S(hss_le, zss, MO_16)
DO_LD1_ZPZ_D(hds_le, zsu, MO_16)
DO_LD1_ZPZ_D(hds_le, zss, MO_16)
DO_LD1_ZPZ_D(hds_le, zd, MO_16)

DO_LD1_ZPZ_S(hss_be, zsu, MO_16)
DO_LD1_ZPZ_S(hss_be, zss, MO_16)
DO_LD1_ZPZ_D(hds_be, zsu, MO_16)
DO_LD1_ZPZ_D(hds_be, zss, MO_16)
DO_LD1_ZPZ_D(hds_be, zd, MO_16)

DO_LD1_ZPZ_S(ss_le, zsu, MO_32)
DO_LD1_ZPZ_S(ss_le, zss, MO_32)
DO_LD1_ZPZ_D(sdu_le, zsu, MO_32)
DO_LD1_ZPZ_D(sdu_le, zss, MO_32)
DO_LD1_ZPZ_D(sdu_le, zd, MO_32)

DO_LD1_ZPZ_S(ss_be, zsu, MO_32)
DO_LD1_ZPZ_S(ss_be, zss, MO_32)
DO_LD1_ZPZ_D(sdu_be, zsu, MO_32)
DO_LD1_ZPZ_D(sdu_be, zss, MO_32)
DO_LD1_ZPZ_D(sdu_be, zd, MO_32)

DO_LD1_ZPZ_D(sds_le, zsu, MO_32)
DO_LD1_ZPZ_D(sds_le, zss, MO_32)
DO_LD1_ZPZ_D(sds_le, zd, MO_32)

DO_LD1_ZPZ_D(sds_be, zsu, MO_32)
DO_LD1_ZPZ_D(sds_be, zss, MO_32)
DO_LD1_ZPZ_D(sds_be, zd, MO_32)

DO_LD1_ZPZ_D(dd_le, zsu, MO_64)
DO_LD1_ZPZ_D(dd_le, zss, MO_64)
DO_LD1_ZPZ_D(dd_le, zd, MO_64)

DO_LD1_ZPZ_D(dd_be, zsu, MO_64)
DO_LD1_ZPZ_D(dd_be, zss, MO_64)
DO_LD1_ZPZ_D(dd_be, zd, MO_64)

#undef DO_LD1_ZPZ_S
#undef DO_LD1_ZPZ_D
/* First fault loads with a vector index. */

/*
 * Common helpers for all gather first-faulting loads.
 */

static inline QEMU_ALWAYS_INLINE
void sve_ldff1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                 target_ulong base, uint32_t desc, uintptr_t retaddr,
                 uint32_t mtedesc, const int esz, const int msz,
                 zreg_off_fn *off_fn,
                 sve_ldst1_host_fn *host_fn,
                 sve_ldst1_tlb_fn *tlb_fn)
{
    const int mmu_idx = cpu_mmu_index(env, false);
    const intptr_t reg_max = simd_oprsz(desc);
    const int scale = simd_data(desc);
    const int esize = 1 << esz;
    const int msize = 1 << msz;
    intptr_t reg_off;
    SVEHostPage info;
    target_ulong addr, in_page;

    /* Skip to the first true predicate. */
    reg_off = find_next_active(vg, 0, reg_max, esz);
    if (unlikely(reg_off >= reg_max)) {
        /* The entire predicate was false; no load occurs. */
        memset(vd, 0, reg_max);
        return;
    }

    /*
     * Probe the first element, allowing faults.
     */
    addr = base + (off_fn(vm, reg_off) << scale);
    if (mtedesc) {
        mte_check(env, mtedesc, addr, retaddr);
    }
    tlb_fn(env, vd, reg_off, addr, retaddr);

    /* After any fault, zero the other elements. */
    swap_memzero(vd, reg_off);
    reg_off += esize;
    swap_memzero(vd + reg_off, reg_max - reg_off);

    /*
     * Probe the remaining elements, not allowing faults.
     */
    while (reg_off < reg_max) {
        uint64_t pg = vg[reg_off >> 6];
        do {
            if (likely((pg >> (reg_off & 63)) & 1)) {
                addr = base + (off_fn(vm, reg_off) << scale);
                in_page = -(addr | TARGET_PAGE_MASK);

                if (unlikely(in_page < msize)) {
                    /* Stop if the element crosses a page boundary. */
                    goto fault;
                }

                sve_probe_page(&info, true, env, addr, 0, MMU_DATA_LOAD,
                               mmu_idx, 0);
                if (unlikely(info.flags & (TLB_INVALID_MASK | TLB_MMIO))) {
                    goto fault;
                }
                if (unlikely(info.flags & TLB_WATCHPOINT) &&
                    (cpu_watchpoint_address_matches
                     (env_cpu(env), addr, msize) & BP_MEM_READ)) {
                    goto fault;
                }
                if (mtedesc &&
                    arm_tlb_mte_tagged(&info.attrs) &&
                    !mte_probe(env, mtedesc, addr)) {
                    goto fault;
                }

                host_fn(vd, reg_off, info.host);
            }
            reg_off += esize;
        } while (reg_off & 63);
    }
    return;

 fault:
    record_fault(env, reg_off, reg_max);
}
static inline QEMU_ALWAYS_INLINE
void sve_ldff1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                     target_ulong base, uint32_t desc, uintptr_t retaddr,
                     const int esz, const int msz,
                     zreg_off_fn *off_fn,
                     sve_ldst1_host_fn *host_fn,
                     sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /*
     * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
     * offset base entirely over the address space hole to change the
     * pointer tag, or change the bit55 selector.  So we could here
     * examine TBI + TCMA like we do for sve_ldN_r_mte().
     */
    sve_ldff1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
                esz, msz, off_fn, host_fn, tlb_fn);
}
#define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ)                                   \
void HELPER(sve_ldff##MEM##_##OFS)                                      \
    (CPUARMState *env, void *vd, void *vg,                              \
     void *vm, target_ulong base, uint32_t desc)                        \
{                                                                       \
    sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ,    \
                off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}                                                                       \
void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
    (CPUARMState *env, void *vd, void *vg,                              \
     void *vm, target_ulong base, uint32_t desc)                        \
{                                                                       \
    sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ,   \
                    off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}

#define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ)                                   \
void HELPER(sve_ldff##MEM##_##OFS)                                      \
    (CPUARMState *env, void *vd, void *vg,                              \
     void *vm, target_ulong base, uint32_t desc)                        \
{                                                                       \
    sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ,    \
                off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}                                                                       \
void HELPER(sve_ldff##MEM##_##OFS##_mte)                                \
    (CPUARMState *env, void *vd, void *vg,                              \
     void *vm, target_ulong base, uint32_t desc)                        \
{                                                                       \
    sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ,   \
                    off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}
DO_LDFF1_ZPZ_S(bsu, zsu, MO_8)
DO_LDFF1_ZPZ_S(bsu, zss, MO_8)
DO_LDFF1_ZPZ_D(bdu, zsu, MO_8)
DO_LDFF1_ZPZ_D(bdu, zss, MO_8)
DO_LDFF1_ZPZ_D(bdu, zd, MO_8)

DO_LDFF1_ZPZ_S(bss, zsu, MO_8)
DO_LDFF1_ZPZ_S(bss, zss, MO_8)
DO_LDFF1_ZPZ_D(bds, zsu, MO_8)
DO_LDFF1_ZPZ_D(bds, zss, MO_8)
DO_LDFF1_ZPZ_D(bds, zd, MO_8)

DO_LDFF1_ZPZ_S(hsu_le, zsu, MO_16)
DO_LDFF1_ZPZ_S(hsu_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_le, zsu, MO_16)
DO_LDFF1_ZPZ_D(hdu_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_le, zd, MO_16)

DO_LDFF1_ZPZ_S(hsu_be, zsu, MO_16)
DO_LDFF1_ZPZ_S(hsu_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_be, zsu, MO_16)
DO_LDFF1_ZPZ_D(hdu_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hdu_be, zd, MO_16)

DO_LDFF1_ZPZ_S(hss_le, zsu, MO_16)
DO_LDFF1_ZPZ_S(hss_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_le, zsu, MO_16)
DO_LDFF1_ZPZ_D(hds_le, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_le, zd, MO_16)

DO_LDFF1_ZPZ_S(hss_be, zsu, MO_16)
DO_LDFF1_ZPZ_S(hss_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_be, zsu, MO_16)
DO_LDFF1_ZPZ_D(hds_be, zss, MO_16)
DO_LDFF1_ZPZ_D(hds_be, zd, MO_16)

DO_LDFF1_ZPZ_S(ss_le, zsu, MO_32)
DO_LDFF1_ZPZ_S(ss_le, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_le, zsu, MO_32)
DO_LDFF1_ZPZ_D(sdu_le, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_le, zd, MO_32)

DO_LDFF1_ZPZ_S(ss_be, zsu, MO_32)
DO_LDFF1_ZPZ_S(ss_be, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_be, zsu, MO_32)
DO_LDFF1_ZPZ_D(sdu_be, zss, MO_32)
DO_LDFF1_ZPZ_D(sdu_be, zd, MO_32)

DO_LDFF1_ZPZ_D(sds_le, zsu, MO_32)
DO_LDFF1_ZPZ_D(sds_le, zss, MO_32)
DO_LDFF1_ZPZ_D(sds_le, zd, MO_32)

DO_LDFF1_ZPZ_D(sds_be, zsu, MO_32)
DO_LDFF1_ZPZ_D(sds_be, zss, MO_32)
DO_LDFF1_ZPZ_D(sds_be, zd, MO_32)

DO_LDFF1_ZPZ_D(dd_le, zsu, MO_64)
DO_LDFF1_ZPZ_D(dd_le, zss, MO_64)
DO_LDFF1_ZPZ_D(dd_le, zd, MO_64)

DO_LDFF1_ZPZ_D(dd_be, zsu, MO_64)
DO_LDFF1_ZPZ_D(dd_be, zss, MO_64)
DO_LDFF1_ZPZ_D(dd_be, zd, MO_64)
/* Stores with a vector index. */

static inline QEMU_ALWAYS_INLINE
void sve_st1_z(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
               target_ulong base, uint32_t desc, uintptr_t retaddr,
               uint32_t mtedesc, int esize, int msize,
               zreg_off_fn *off_fn,
               sve_ldst1_host_fn *host_fn,
               sve_ldst1_tlb_fn *tlb_fn)
{
    const int mmu_idx = cpu_mmu_index(env, false);
    const intptr_t reg_max = simd_oprsz(desc);
    const int scale = simd_data(desc);
    void *host[ARM_MAX_VQ * 4];
    intptr_t reg_off, i;
    SVEHostPage info, info2;

    /*
     * Probe all of the elements for host addresses and flags.
     */
    i = reg_off = 0;
    do {
        uint64_t pg = vg[reg_off >> 6];
        do {
            target_ulong addr = base + (off_fn(vm, reg_off) << scale);
            target_ulong in_page = -(addr | TARGET_PAGE_MASK);

            host[i] = NULL;
            if (likely((pg >> (reg_off & 63)) & 1)) {
                if (likely(in_page >= msize)) {
                    sve_probe_page(&info, false, env, addr, 0, MMU_DATA_STORE,
                                   mmu_idx, retaddr);
                    host[i] = info.host;
                } else {
                    /*
                     * Element crosses the page boundary.
                     * Probe both pages, but do not record the host address,
                     * so that we use the slow path.
                     */
                    sve_probe_page(&info, false, env, addr, 0,
                                   MMU_DATA_STORE, mmu_idx, retaddr);
                    sve_probe_page(&info2, false, env, addr + in_page, 0,
                                   MMU_DATA_STORE, mmu_idx, retaddr);
                    info.flags |= info2.flags;
                }

                if (unlikely(info.flags & TLB_WATCHPOINT)) {
                    cpu_check_watchpoint(env_cpu(env), addr, msize,
                                         info.attrs, BP_MEM_WRITE, retaddr);
                }

                if (mtedesc && arm_tlb_mte_tagged(&info.attrs)) {
                    mte_check(env, mtedesc, addr, retaddr);
                }
            }
            i += 1;
            reg_off += esize;
        } while (reg_off & 63);
    } while (reg_off < reg_max);

    /*
     * Now that we have recognized all exceptions except SyncExternal
     * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
     *
     * Note for the common case of an element in RAM, not crossing a page
     * boundary, we have stored the host address in host[].  This doubles
     * as a first-level check against the predicate, since only enabled
     * elements have non-null host addresses.
     */
    i = reg_off = 0;
    do {
        void *h = host[i];
        if (likely(h != NULL)) {
            host_fn(vd, reg_off, h);
        } else if ((vg[reg_off >> 6] >> (reg_off & 63)) & 1) {
            target_ulong addr = base + (off_fn(vm, reg_off) << scale);
            tlb_fn(env, vd, reg_off, addr, retaddr);
        }
        i += 1;
        reg_off += esize;
    } while (reg_off < reg_max);
}
static inline QEMU_ALWAYS_INLINE
void sve_st1_z_mte(CPUARMState *env, void *vd, uint64_t *vg, void *vm,
                   target_ulong base, uint32_t desc, uintptr_t retaddr,
                   int esize, int msize, zreg_off_fn *off_fn,
                   sve_ldst1_host_fn *host_fn,
                   sve_ldst1_tlb_fn *tlb_fn)
{
    uint32_t mtedesc = desc >> (SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);
    /* Remove mtedesc from the normal sve descriptor. */
    desc = extract32(desc, 0, SIMD_DATA_SHIFT + SVE_MTEDESC_SHIFT);

    /*
     * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
     * offset base entirely over the address space hole to change the
     * pointer tag, or change the bit55 selector.  So we could here
     * examine TBI + TCMA like we do for sve_ldN_r_mte().
     */
    sve_st1_z(env, vd, vg, vm, base, desc, retaddr, mtedesc,
              esize, msize, off_fn, host_fn, tlb_fn);
}
#define DO_ST1_ZPZ_S(MEM, OFS, MSZ)                                     \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << MSZ,     \
              off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
}                                                                       \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << MSZ,    \
                  off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
}

#define DO_ST1_ZPZ_D(MEM, OFS, MSZ)                                     \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg,  \
                                 void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << MSZ,     \
              off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb);  \
}                                                                       \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
                                       void *vm, target_ulong base, uint32_t desc) \
{                                                                       \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << MSZ,    \
                  off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
}
DO_ST1_ZPZ_S(bs, zsu, MO_8)
DO_ST1_ZPZ_S(hs_le, zsu, MO_16)
DO_ST1_ZPZ_S(hs_be, zsu, MO_16)
DO_ST1_ZPZ_S(ss_le, zsu, MO_32)
DO_ST1_ZPZ_S(ss_be, zsu, MO_32)

DO_ST1_ZPZ_S(bs, zss, MO_8)
DO_ST1_ZPZ_S(hs_le, zss, MO_16)
DO_ST1_ZPZ_S(hs_be, zss, MO_16)
DO_ST1_ZPZ_S(ss_le, zss, MO_32)
DO_ST1_ZPZ_S(ss_be, zss, MO_32)

DO_ST1_ZPZ_D(bd, zsu, MO_8)
DO_ST1_ZPZ_D(hd_le, zsu, MO_16)
DO_ST1_ZPZ_D(hd_be, zsu, MO_16)
DO_ST1_ZPZ_D(sd_le, zsu, MO_32)
DO_ST1_ZPZ_D(sd_be, zsu, MO_32)
DO_ST1_ZPZ_D(dd_le, zsu, MO_64)
DO_ST1_ZPZ_D(dd_be, zsu, MO_64)

DO_ST1_ZPZ_D(bd, zss, MO_8)
DO_ST1_ZPZ_D(hd_le, zss, MO_16)
DO_ST1_ZPZ_D(hd_be, zss, MO_16)
DO_ST1_ZPZ_D(sd_le, zss, MO_32)
DO_ST1_ZPZ_D(sd_be, zss, MO_32)
DO_ST1_ZPZ_D(dd_le, zss, MO_64)
DO_ST1_ZPZ_D(dd_be, zss, MO_64)

DO_ST1_ZPZ_D(bd, zd, MO_8)
DO_ST1_ZPZ_D(hd_le, zd, MO_16)
DO_ST1_ZPZ_D(hd_be, zd, MO_16)
DO_ST1_ZPZ_D(sd_le, zd, MO_32)
DO_ST1_ZPZ_D(sd_be, zd, MO_32)
DO_ST1_ZPZ_D(dd_le, zd, MO_64)
DO_ST1_ZPZ_D(dd_be, zd, MO_64)

#undef DO_ST1_ZPZ_S
#undef DO_ST1_ZPZ_D
void HELPER(sve2_eor3)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = n[i] ^ m[i] ^ k[i];
    }
}

void HELPER(sve2_bcax)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = n[i] ^ (m[i] & ~k[i]);
    }
}

void HELPER(sve2_bsl1n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = (~n[i] & k[i]) | (m[i] & ~k[i]);
    }
}

void HELPER(sve2_bsl2n)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = (n[i] & k[i]) | (~m[i] & ~k[i]);
    }
}

void HELPER(sve2_nbsl)(void *vd, void *vn, void *vm, void *vk, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    uint64_t *d = vd, *n = vn, *m = vm, *k = vk;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ~((n[i] & k[i]) | (m[i] & ~k[i]));
    }
}
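
/*
 * Editorial summary of the SVE2 bitwise ternaries above (per bit, with k
 * acting as the selector; not from the upstream file):
 *     EOR3:  n ^ m ^ k
 *     BCAX:  n ^ (m & ~k)
 *     BSL1N: (~n & k) | (m & ~k)    -- bitwise select with n inverted
 *     BSL2N: (n & k) | (~m & ~k)    -- bitwise select with m inverted
 *     NBSL:  ~((n & k) | (m & ~k))  -- bitwise select, result inverted
 */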
/*
 * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n.
 * See hasless(v,1) from
 *   https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
 */
static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
{
    int bits = 8 << esz;
    uint64_t ones = dup_const(esz, 1);
    uint64_t signs = ones << (bits - 1);
    uint64_t cmp0, cmp1;

    cmp1 = dup_const(esz, n);
    cmp0 = cmp1 ^ m0;
    cmp1 = cmp1 ^ m1;
    cmp0 = (cmp0 - ones) & ~cmp0;
    cmp1 = (cmp1 - ones) & ~cmp1;
    return (cmp0 | cmp1) & signs;
}
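
/*
 * Worked example (editorial, illustrative) for esz = MO_8: to find
 * n = 0x42 within m0, we form cmp0 = dup_const(MO_8, 0x42) ^ m0, which
 * has a 0x00 byte exactly where m0 contains 0x42.  The classic zero-byte
 * test then fires: a zero byte borrows in the subtraction, so
 * (cmp0 - ones) sets its msb while ~cmp0 keeps that msb set, whereas any
 * non-zero byte fails one of the two conditions; masking with "signs"
 * leaves a non-zero result iff some byte (or halfword) was zero.
 */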
static inline uint32_t do_match(void *vd, void *vn, void *vm, void *vg,
                                uint32_t desc, int esz, bool nmatch)
{
    uint16_t esz_mask = pred_esz_masks[esz];
    intptr_t opr_sz = simd_oprsz(desc);
    uint32_t flags = PREDTEST_INIT;
    intptr_t i, j, k;

    for (i = 0; i < opr_sz; i += 16) {
        uint64_t m0 = *(uint64_t *)(vm + i);
        uint64_t m1 = *(uint64_t *)(vm + i + 8);
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)) & esz_mask;
        uint16_t out = 0;

        for (j = 0; j < 16; j += 8) {
            uint64_t n = *(uint64_t *)(vn + i + j);

            for (k = 0; k < 8; k += 1 << esz) {
                if (pg & (1 << (j + k))) {
                    bool o = do_match2(n >> (k * 8), m0, m1, esz);
                    out |= (o ^ nmatch) << (j + k);
                }
            }
        }
        *(uint16_t *)(vd + H1_2(i >> 3)) = out;
        flags = iter_predtest_fwd(out, pg, flags);
    }
    return flags;
}

#define DO_PPZZ_MATCH(NAME, ESZ, INV)                                         \
uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{                                                                             \
    return do_match(vd, vn, vm, vg, desc, ESZ, INV);                          \
}

DO_PPZZ_MATCH(sve2_match_ppzz_b, MO_8, false)
DO_PPZZ_MATCH(sve2_match_ppzz_h, MO_16, false)

DO_PPZZ_MATCH(sve2_nmatch_ppzz_b, MO_8, true)
DO_PPZZ_MATCH(sve2_nmatch_ppzz_h, MO_16, true)

#undef DO_PPZZ_MATCH
void HELPER(sve2_histcnt_s)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t desc)
{
    ARMVectorReg scratch;
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    if (d == n) {
        n = memcpy(&scratch, n, opr_sz);
        if (d == m) {
            m = n;
        }
    } else if (d == m) {
        m = memcpy(&scratch, m, opr_sz);
    }

    for (i = 0; i < opr_sz; i += 4) {
        uint64_t count = 0;
        uint8_t pred;

        pred = pg[H1(i >> 3)] >> (i & 7);
        if (pred & 1) {
            uint32_t nn = n[H4(i >> 2)];

            for (j = 0; j <= i; j += 4) {
                pred = pg[H1(j >> 3)] >> (j & 7);
                if ((pred & 1) && nn == m[H4(j >> 2)]) {
                    ++count;
                }
            }
        }
        d[H4(i >> 2)] = count;
    }
}

void HELPER(sve2_histcnt_d)(void *vd, void *vn, void *vm, void *vg,
                            uint32_t desc)
{
    ARMVectorReg scratch;
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint8_t *pg = vg;

    if (d == n) {
        n = memcpy(&scratch, n, opr_sz);
        if (d == m) {
            m = n;
        }
    } else if (d == m) {
        m = memcpy(&scratch, m, opr_sz);
    }

    for (i = 0; i < opr_sz / 8; ++i) {
        uint64_t count = 0;
        if (pg[H1(i)] & 1) {
            uint64_t nn = n[i];
            for (j = 0; j <= i; ++j) {
                if ((pg[H1(j)] & 1) && nn == m[j]) {
                    ++count;
                }
            }
        }
        d[i] = count;
    }
}
/*
 * Returns the number of bytes in m0 and m1 that match n.
 * Unlike do_match2 we don't just need true/false, we need an exact count.
 * This requires two extra logical operations.
 */
static inline uint64_t do_histseg_cnt(uint8_t n, uint64_t m0, uint64_t m1)
{
    const uint64_t mask = dup_const(MO_8, 0x7f);
    uint64_t cmp0, cmp1;

    cmp1 = dup_const(MO_8, n);
    cmp0 = cmp1 ^ m0;
    cmp1 = cmp1 ^ m1;

    /*
     * 1: clear msb of each byte to avoid carry to next byte (& mask)
     * 2: carry in to msb if byte != 0 (+ mask)
     * 3: set msb if cmp has msb set (| cmp)
     * 4: set ~msb to ignore them (| mask)
     * We now have 0xff for byte != 0 or 0x7f for byte == 0.
     * 5: invert, resulting in 0x80 if and only if byte == 0.
     */
    cmp0 = ~(((cmp0 & mask) + mask) | cmp0 | mask);
    cmp1 = ~(((cmp1 & mask) + mask) | cmp1 | mask);

    /*
     * Combine the two compares in a way that the bits do
     * not overlap, and so preserves the count of set bits.
     * If the host has an efficient instruction for ctpop,
     * then ctpop(x) + ctpop(y) has the same number of
     * operations as ctpop(x | (y >> 1)).  If the host does
     * not have an efficient ctpop, then we only want to
     * use it once.
     */
    return ctpop64(cmp0 | (cmp1 >> 1));
}
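
/*
 * Worked example (editorial, illustrative): for n = 0x42 with
 *     m0 = 0x0000004200000042ull, m1 = 0x4200000000000000ull,
 * the xors leave zero bytes in cmp0 at byte positions 0 and 4 and in
 * cmp1 at byte position 7; steps 1-5 turn exactly those bytes into 0x80.
 * Shifting cmp1 right by one moves its markers to bit 6 of each byte, so
 * the two marker sets cannot overlap and
 *     ctpop64(cmp0 | (cmp1 >> 1)) == 3
 * which is the number of bytes of m0:m1 equal to n.
 */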
void HELPER(sve2_histseg)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j;
    intptr_t opr_sz = simd_oprsz(desc);

    for (i = 0; i < opr_sz; i += 16) {
        uint64_t n0 = *(uint64_t *)(vn + i);
        uint64_t m0 = *(uint64_t *)(vm + i);
        uint64_t n1 = *(uint64_t *)(vn + i + 8);
        uint64_t m1 = *(uint64_t *)(vm + i + 8);
        uint64_t out0 = 0;
        uint64_t out1 = 0;

        for (j = 0; j < 64; j += 8) {
            uint64_t cnt0 = do_histseg_cnt(n0 >> j, m0, m1);
            uint64_t cnt1 = do_histseg_cnt(n1 >> j, m0, m1);
            out0 |= cnt0 << j;
            out1 |= cnt1 << j;
        }

        *(uint64_t *)(vd + i) = out0;
        *(uint64_t *)(vd + i + 8) = out1;
    }
}
void HELPER(sve2_xar_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    int shr = simd_data(desc);
    int shl = 8 - shr;
    uint64_t mask = dup_const(MO_8, 0xff >> shr);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        uint64_t t = n[i] ^ m[i];
        d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
    }
}

void HELPER(sve2_xar_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    int shr = simd_data(desc);
    int shl = 16 - shr;
    uint64_t mask = dup_const(MO_16, 0xffff >> shr);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        uint64_t t = n[i] ^ m[i];
        d[i] = ((t >> shr) & mask) | ((t << shl) & ~mask);
    }
}

void HELPER(sve2_xar_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 4;
    int shr = simd_data(desc);
    uint32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ror32(n[i] ^ m[i], shr);
    }
}
)(void *vd
, void *vn
, void *vm
, void *va
,
7282 void *status
, uint32_t desc
)
7284 intptr_t s
, opr_sz
= simd_oprsz(desc
) / (sizeof(float32
) * 4);
7286 for (s
= 0; s
< opr_sz
; ++s
) {
7287 float32
*n
= vn
+ s
* sizeof(float32
) * 4;
7288 float32
*m
= vm
+ s
* sizeof(float32
) * 4;
7289 float32
*a
= va
+ s
* sizeof(float32
) * 4;
7290 float32
*d
= vd
+ s
* sizeof(float32
) * 4;
7291 float32 n00
= n
[H4(0)], n01
= n
[H4(1)];
7292 float32 n10
= n
[H4(2)], n11
= n
[H4(3)];
7293 float32 m00
= m
[H4(0)], m01
= m
[H4(1)];
7294 float32 m10
= m
[H4(2)], m11
= m
[H4(3)];
7298 p0
= float32_mul(n00
, m00
, status
);
7299 p1
= float32_mul(n01
, m01
, status
);
7300 d
[H4(0)] = float32_add(a
[H4(0)], float32_add(p0
, p1
, status
), status
);
7303 p0
= float32_mul(n00
, m10
, status
);
7304 p1
= float32_mul(n01
, m11
, status
);
7305 d
[H4(1)] = float32_add(a
[H4(1)], float32_add(p0
, p1
, status
), status
);
7308 p0
= float32_mul(n10
, m00
, status
);
7309 p1
= float32_mul(n11
, m01
, status
);
7310 d
[H4(2)] = float32_add(a
[H4(2)], float32_add(p0
, p1
, status
), status
);
7313 p0
= float32_mul(n10
, m10
, status
);
7314 p1
= float32_mul(n11
, m11
, status
);
7315 d
[H4(3)] = float32_add(a
[H4(3)], float32_add(p0
, p1
, status
), status
);
7319 void HELPER(fmmla_d
)(void *vd
, void *vn
, void *vm
, void *va
,
7320 void *status
, uint32_t desc
)
7322 intptr_t s
, opr_sz
= simd_oprsz(desc
) / (sizeof(float64
) * 4);
7324 for (s
= 0; s
< opr_sz
; ++s
) {
7325 float64
*n
= vn
+ s
* sizeof(float64
) * 4;
7326 float64
*m
= vm
+ s
* sizeof(float64
) * 4;
7327 float64
*a
= va
+ s
* sizeof(float64
) * 4;
7328 float64
*d
= vd
+ s
* sizeof(float64
) * 4;
7329 float64 n00
= n
[0], n01
= n
[1], n10
= n
[2], n11
= n
[3];
7330 float64 m00
= m
[0], m01
= m
[1], m10
= m
[2], m11
= m
[3];
7334 p0
= float64_mul(n00
, m00
, status
);
7335 p1
= float64_mul(n01
, m01
, status
);
7336 d
[0] = float64_add(a
[0], float64_add(p0
, p1
, status
), status
);
7339 p0
= float64_mul(n00
, m10
, status
);
7340 p1
= float64_mul(n01
, m11
, status
);
7341 d
[1] = float64_add(a
[1], float64_add(p0
, p1
, status
), status
);
7344 p0
= float64_mul(n10
, m00
, status
);
7345 p1
= float64_mul(n11
, m01
, status
);
7346 d
[2] = float64_add(a
[2], float64_add(p0
, p1
, status
), status
);
7349 p0
= float64_mul(n10
, m10
, status
);
7350 p1
= float64_mul(n11
, m11
, status
);
7351 d
[3] = float64_add(a
[3], float64_add(p0
, p1
, status
), status
);