4 * Copyright (c) 2018 Linaro, Ltd.
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
20 #include "qemu/osdep.h"
22 #include "internals.h"
23 #include "exec/exec-all.h"
24 #include "exec/cpu_ldst.h"
25 #include "exec/helper-proto.h"
26 #include "tcg/tcg-gvec-desc.h"
27 #include "fpu/softfloat.h"
29 #include "vec_internal.h"
/* Note that vector data is stored in host-endian 64-bit chunks,
   so addressing units smaller than that need a host-endian fixup.  */
34 #ifdef HOST_WORDS_BIGENDIAN
35 #define H1(x) ((x) ^ 7)
36 #define H1_2(x) ((x) ^ 6)
37 #define H1_4(x) ((x) ^ 4)
38 #define H2(x) ((x) ^ 3)
39 #define H4(x) ((x) ^ 1)
48 /* Return a value for NZCV as per the ARM PredTest pseudofunction.
50 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
51 * and bit 0 set if C is set. Compare the definitions of these variables
55 /* For no G bits set, NZCV = C. */
56 #define PREDTEST_INIT 1
58 /* This is an iterative function, called for each Pd and Pg word
61 static uint32_t iter_predtest_fwd(uint64_t d
, uint64_t g
, uint32_t flags
)
64 /* Compute N from first D & G.
65 Use bit 2 to signal first G bit seen. */
67 flags
|= ((d
& (g
& -g
)) != 0) << 31;
71 /* Accumulate Z from each D & G. */
72 flags
|= ((d
& g
) != 0) << 1;
74 /* Compute C from last !(D & G). Replace previous. */
75 flags
= deposit32(flags
, 0, 1, (d
& pow2floor(g
)) == 0);
80 /* This is an iterative function, called for each Pd and Pg word
83 static uint32_t iter_predtest_bwd(uint64_t d
, uint64_t g
, uint32_t flags
)
86 /* Compute C from first (i.e last) !(D & G).
87 Use bit 2 to signal first G bit seen. */
89 flags
+= 4 - 1; /* add bit 2, subtract C from PREDTEST_INIT */
90 flags
|= (d
& pow2floor(g
)) == 0;
93 /* Accumulate Z from each D & G. */
94 flags
|= ((d
& g
) != 0) << 1;
96 /* Compute N from last (i.e first) D & G. Replace previous. */
97 flags
= deposit32(flags
, 31, 1, (d
& (g
& -g
)) != 0);
102 /* The same for a single word predicate. */
103 uint32_t HELPER(sve_predtest1
)(uint64_t d
, uint64_t g
)
105 return iter_predtest_fwd(d
, g
, PREDTEST_INIT
);
108 /* The same for a multi-word predicate. */
109 uint32_t HELPER(sve_predtest
)(void *vd
, void *vg
, uint32_t words
)
111 uint32_t flags
= PREDTEST_INIT
;
112 uint64_t *d
= vd
, *g
= vg
;
116 flags
= iter_predtest_fwd(d
[i
], g
[i
], flags
);
117 } while (++i
< words
);
122 /* Expand active predicate bits to bytes, for byte elements.
123 * for (i = 0; i < 256; ++i) {
124 * unsigned long m = 0;
125 * for (j = 0; j < 8; j++) {
126 * if ((i >> j) & 1) {
127 * m |= 0xfful << (j << 3);
130 * printf("0x%016lx,\n", m);
133 static inline uint64_t expand_pred_b(uint8_t byte
)
135 static const uint64_t word
[256] = {
136 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
137 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
138 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
139 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
140 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
141 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
142 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
143 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
144 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
145 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
146 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
147 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
148 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
149 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
150 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
151 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
152 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
153 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
154 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
155 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
156 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
157 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
158 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
159 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
160 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
161 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
162 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
163 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
164 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
165 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
166 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
167 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
168 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
169 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
170 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
171 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
172 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
173 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
174 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
175 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
176 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
177 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
178 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
179 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
180 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
181 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
182 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
183 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
184 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
185 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
186 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
187 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
188 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
189 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
190 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
191 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
192 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
193 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
194 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
195 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
196 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
197 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
198 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
199 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
200 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
201 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
202 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
203 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
204 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
205 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
206 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
207 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
208 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
209 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
210 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
211 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
212 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
213 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
214 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
215 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
216 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
217 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
218 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
219 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
220 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
226 /* Similarly for half-word elements.
227 * for (i = 0; i < 256; ++i) {
228 * unsigned long m = 0;
232 * for (j = 0; j < 8; j += 2) {
233 * if ((i >> j) & 1) {
234 * m |= 0xfffful << (j << 3);
237 * printf("[0x%x] = 0x%016lx,\n", i, m);
240 static inline uint64_t expand_pred_h(uint8_t byte
)
242 static const uint64_t word
[] = {
243 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
244 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
245 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
246 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
247 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
248 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
249 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
250 [0x55] = 0xffffffffffffffff,
252 return word
[byte
& 0x55];
255 /* Similarly for single word elements. */
256 static inline uint64_t expand_pred_s(uint8_t byte
)
258 static const uint64_t word
[] = {
259 [0x01] = 0x00000000ffffffffull
,
260 [0x10] = 0xffffffff00000000ull
,
261 [0x11] = 0xffffffffffffffffull
,
263 return word
[byte
& 0x11];
266 /* Swap 16-bit words within a 32-bit word. */
267 static inline uint32_t hswap32(uint32_t h
)
272 /* Swap 16-bit words within a 64-bit word. */
273 static inline uint64_t hswap64(uint64_t h
)
275 uint64_t m
= 0x0000ffff0000ffffull
;
277 return ((h
& m
) << 16) | ((h
>> 16) & m
);
280 /* Swap 32-bit words within a 64-bit word. */
281 static inline uint64_t wswap64(uint64_t h
)
286 #define LOGICAL_PPPP(NAME, FUNC) \
287 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
289 uintptr_t opr_sz = simd_oprsz(desc); \
290 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
292 for (i = 0; i < opr_sz / 8; ++i) { \
293 d[i] = FUNC(n[i], m[i], g[i]); \
297 #define DO_AND(N, M, G) (((N) & (M)) & (G))
298 #define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
299 #define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
300 #define DO_ORR(N, M, G) (((N) | (M)) & (G))
301 #define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
302 #define DO_NOR(N, M, G) (~((N) | (M)) & (G))
303 #define DO_NAND(N, M, G) (~((N) & (M)) & (G))
304 #define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
306 LOGICAL_PPPP(sve_and_pppp
, DO_AND
)
307 LOGICAL_PPPP(sve_bic_pppp
, DO_BIC
)
308 LOGICAL_PPPP(sve_eor_pppp
, DO_EOR
)
309 LOGICAL_PPPP(sve_sel_pppp
, DO_SEL
)
310 LOGICAL_PPPP(sve_orr_pppp
, DO_ORR
)
311 LOGICAL_PPPP(sve_orn_pppp
, DO_ORN
)
312 LOGICAL_PPPP(sve_nor_pppp
, DO_NOR
)
313 LOGICAL_PPPP(sve_nand_pppp
, DO_NAND
)
325 /* Fully general three-operand expander, controlled by a predicate.
326 * This is complicated by the host-endian storage of the register file.
328 /* ??? I don't expect the compiler could ever vectorize this itself.
329 * With some tables we can convert bit masks to byte masks, and with
330 * extra care wrt byte/word ordering we could use gcc generic vectors
331 * and do 16 bytes at a time.
333 #define DO_ZPZZ(NAME, TYPE, H, OP) \
334 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
336 intptr_t i, opr_sz = simd_oprsz(desc); \
337 for (i = 0; i < opr_sz; ) { \
338 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
341 TYPE nn = *(TYPE *)(vn + H(i)); \
342 TYPE mm = *(TYPE *)(vm + H(i)); \
343 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
345 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
350 /* Similarly, specialized for 64-bit operands. */
351 #define DO_ZPZZ_D(NAME, TYPE, OP) \
352 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
354 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
355 TYPE *d = vd, *n = vn, *m = vm; \
357 for (i = 0; i < opr_sz; i += 1) { \
358 if (pg[H1(i)] & 1) { \
359 TYPE nn = n[i], mm = m[i]; \
365 #define DO_AND(N, M) (N & M)
366 #define DO_EOR(N, M) (N ^ M)
367 #define DO_ORR(N, M) (N | M)
368 #define DO_BIC(N, M) (N & ~M)
369 #define DO_ADD(N, M) (N + M)
370 #define DO_SUB(N, M) (N - M)
371 #define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
372 #define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
373 #define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
374 #define DO_MUL(N, M) (N * M)
378 * We must avoid the C undefined behaviour cases: division by
379 * zero and signed division of INT_MIN by -1. Both of these
380 * have architecturally defined required results for Arm.
381 * We special case all signed divisions by -1 to avoid having
382 * to deduce the minimum integer for the type involved.
384 #define DO_SDIV(N, M) (unlikely(M == 0) ? 0 : unlikely(M == -1) ? -N : N / M)
385 #define DO_UDIV(N, M) (unlikely(M == 0) ? 0 : N / M)
387 DO_ZPZZ(sve_and_zpzz_b
, uint8_t, H1
, DO_AND
)
388 DO_ZPZZ(sve_and_zpzz_h
, uint16_t, H1_2
, DO_AND
)
389 DO_ZPZZ(sve_and_zpzz_s
, uint32_t, H1_4
, DO_AND
)
390 DO_ZPZZ_D(sve_and_zpzz_d
, uint64_t, DO_AND
)
392 DO_ZPZZ(sve_orr_zpzz_b
, uint8_t, H1
, DO_ORR
)
393 DO_ZPZZ(sve_orr_zpzz_h
, uint16_t, H1_2
, DO_ORR
)
394 DO_ZPZZ(sve_orr_zpzz_s
, uint32_t, H1_4
, DO_ORR
)
395 DO_ZPZZ_D(sve_orr_zpzz_d
, uint64_t, DO_ORR
)
397 DO_ZPZZ(sve_eor_zpzz_b
, uint8_t, H1
, DO_EOR
)
398 DO_ZPZZ(sve_eor_zpzz_h
, uint16_t, H1_2
, DO_EOR
)
399 DO_ZPZZ(sve_eor_zpzz_s
, uint32_t, H1_4
, DO_EOR
)
400 DO_ZPZZ_D(sve_eor_zpzz_d
, uint64_t, DO_EOR
)
402 DO_ZPZZ(sve_bic_zpzz_b
, uint8_t, H1
, DO_BIC
)
403 DO_ZPZZ(sve_bic_zpzz_h
, uint16_t, H1_2
, DO_BIC
)
404 DO_ZPZZ(sve_bic_zpzz_s
, uint32_t, H1_4
, DO_BIC
)
405 DO_ZPZZ_D(sve_bic_zpzz_d
, uint64_t, DO_BIC
)
407 DO_ZPZZ(sve_add_zpzz_b
, uint8_t, H1
, DO_ADD
)
408 DO_ZPZZ(sve_add_zpzz_h
, uint16_t, H1_2
, DO_ADD
)
409 DO_ZPZZ(sve_add_zpzz_s
, uint32_t, H1_4
, DO_ADD
)
410 DO_ZPZZ_D(sve_add_zpzz_d
, uint64_t, DO_ADD
)
412 DO_ZPZZ(sve_sub_zpzz_b
, uint8_t, H1
, DO_SUB
)
413 DO_ZPZZ(sve_sub_zpzz_h
, uint16_t, H1_2
, DO_SUB
)
414 DO_ZPZZ(sve_sub_zpzz_s
, uint32_t, H1_4
, DO_SUB
)
415 DO_ZPZZ_D(sve_sub_zpzz_d
, uint64_t, DO_SUB
)
417 DO_ZPZZ(sve_smax_zpzz_b
, int8_t, H1
, DO_MAX
)
418 DO_ZPZZ(sve_smax_zpzz_h
, int16_t, H1_2
, DO_MAX
)
419 DO_ZPZZ(sve_smax_zpzz_s
, int32_t, H1_4
, DO_MAX
)
420 DO_ZPZZ_D(sve_smax_zpzz_d
, int64_t, DO_MAX
)
422 DO_ZPZZ(sve_umax_zpzz_b
, uint8_t, H1
, DO_MAX
)
423 DO_ZPZZ(sve_umax_zpzz_h
, uint16_t, H1_2
, DO_MAX
)
424 DO_ZPZZ(sve_umax_zpzz_s
, uint32_t, H1_4
, DO_MAX
)
425 DO_ZPZZ_D(sve_umax_zpzz_d
, uint64_t, DO_MAX
)
427 DO_ZPZZ(sve_smin_zpzz_b
, int8_t, H1
, DO_MIN
)
428 DO_ZPZZ(sve_smin_zpzz_h
, int16_t, H1_2
, DO_MIN
)
429 DO_ZPZZ(sve_smin_zpzz_s
, int32_t, H1_4
, DO_MIN
)
430 DO_ZPZZ_D(sve_smin_zpzz_d
, int64_t, DO_MIN
)
432 DO_ZPZZ(sve_umin_zpzz_b
, uint8_t, H1
, DO_MIN
)
433 DO_ZPZZ(sve_umin_zpzz_h
, uint16_t, H1_2
, DO_MIN
)
434 DO_ZPZZ(sve_umin_zpzz_s
, uint32_t, H1_4
, DO_MIN
)
435 DO_ZPZZ_D(sve_umin_zpzz_d
, uint64_t, DO_MIN
)
437 DO_ZPZZ(sve_sabd_zpzz_b
, int8_t, H1
, DO_ABD
)
438 DO_ZPZZ(sve_sabd_zpzz_h
, int16_t, H1_2
, DO_ABD
)
439 DO_ZPZZ(sve_sabd_zpzz_s
, int32_t, H1_4
, DO_ABD
)
440 DO_ZPZZ_D(sve_sabd_zpzz_d
, int64_t, DO_ABD
)
442 DO_ZPZZ(sve_uabd_zpzz_b
, uint8_t, H1
, DO_ABD
)
443 DO_ZPZZ(sve_uabd_zpzz_h
, uint16_t, H1_2
, DO_ABD
)
444 DO_ZPZZ(sve_uabd_zpzz_s
, uint32_t, H1_4
, DO_ABD
)
445 DO_ZPZZ_D(sve_uabd_zpzz_d
, uint64_t, DO_ABD
)
447 /* Because the computation type is at least twice as large as required,
448 these work for both signed and unsigned source types. */
449 static inline uint8_t do_mulh_b(int32_t n
, int32_t m
)
/*
 * Return the high 16 bits of the 32-bit product of two 16-bit elements.
 *
 * The operands arrive already sign- or zero-extended to 32 bits, so the
 * same helper serves both the signed and the unsigned multiply-high.
 * The product is formed in uint32_t: arithmetic modulo 2^32 yields the
 * same low 32 bits for either signedness, whereas an int32_t product of
 * two unsigned 16-bit values (e.g. 65535 * 65535) would overflow a
 * signed int, which is undefined behaviour in ISO C.
 */
static inline uint16_t do_mulh_h(int32_t n, int32_t m)
{
    uint32_t product = (uint32_t)n * (uint32_t)m;

    /* Bits [31:16] of the product; the return type truncates to 16 bits. */
    return product >> 16;
}
/*
 * Return the high 32 bits of the 64-bit product of two 32-bit elements.
 *
 * The operands arrive already sign- or zero-extended to 64 bits, so the
 * same helper serves both the signed and the unsigned multiply-high.
 * The product is formed in uint64_t: arithmetic modulo 2^64 yields the
 * same low 64 bits for either signedness, whereas an int64_t product of
 * two large unsigned 32-bit values (e.g. 0xffffffff * 0xffffffff) would
 * overflow a signed type, which is undefined behaviour in ISO C.
 */
static inline uint32_t do_mulh_s(int64_t n, int64_t m)
{
    uint64_t product = (uint64_t)n * (uint64_t)m;

    /* Bits [63:32] of the product; the return type truncates to 32 bits. */
    return product >> 32;
}
464 static inline uint64_t do_smulh_d(uint64_t n
, uint64_t m
)
467 muls64(&lo
, &hi
, n
, m
);
471 static inline uint64_t do_umulh_d(uint64_t n
, uint64_t m
)
474 mulu64(&lo
, &hi
, n
, m
);
478 DO_ZPZZ(sve_mul_zpzz_b
, uint8_t, H1
, DO_MUL
)
479 DO_ZPZZ(sve_mul_zpzz_h
, uint16_t, H1_2
, DO_MUL
)
480 DO_ZPZZ(sve_mul_zpzz_s
, uint32_t, H1_4
, DO_MUL
)
481 DO_ZPZZ_D(sve_mul_zpzz_d
, uint64_t, DO_MUL
)
483 DO_ZPZZ(sve_smulh_zpzz_b
, int8_t, H1
, do_mulh_b
)
484 DO_ZPZZ(sve_smulh_zpzz_h
, int16_t, H1_2
, do_mulh_h
)
485 DO_ZPZZ(sve_smulh_zpzz_s
, int32_t, H1_4
, do_mulh_s
)
486 DO_ZPZZ_D(sve_smulh_zpzz_d
, uint64_t, do_smulh_d
)
488 DO_ZPZZ(sve_umulh_zpzz_b
, uint8_t, H1
, do_mulh_b
)
489 DO_ZPZZ(sve_umulh_zpzz_h
, uint16_t, H1_2
, do_mulh_h
)
490 DO_ZPZZ(sve_umulh_zpzz_s
, uint32_t, H1_4
, do_mulh_s
)
491 DO_ZPZZ_D(sve_umulh_zpzz_d
, uint64_t, do_umulh_d
)
493 DO_ZPZZ(sve_sdiv_zpzz_s
, int32_t, H1_4
, DO_SDIV
)
494 DO_ZPZZ_D(sve_sdiv_zpzz_d
, int64_t, DO_SDIV
)
496 DO_ZPZZ(sve_udiv_zpzz_s
, uint32_t, H1_4
, DO_UDIV
)
497 DO_ZPZZ_D(sve_udiv_zpzz_d
, uint64_t, DO_UDIV
)
499 /* Note that all bits of the shift are significant
500 and not modulo the element size. */
501 #define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
502 #define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
503 #define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
505 DO_ZPZZ(sve_asr_zpzz_b
, int8_t, H1
, DO_ASR
)
506 DO_ZPZZ(sve_lsr_zpzz_b
, uint8_t, H1_2
, DO_LSR
)
507 DO_ZPZZ(sve_lsl_zpzz_b
, uint8_t, H1_4
, DO_LSL
)
509 DO_ZPZZ(sve_asr_zpzz_h
, int16_t, H1
, DO_ASR
)
510 DO_ZPZZ(sve_lsr_zpzz_h
, uint16_t, H1_2
, DO_LSR
)
511 DO_ZPZZ(sve_lsl_zpzz_h
, uint16_t, H1_4
, DO_LSL
)
513 DO_ZPZZ(sve_asr_zpzz_s
, int32_t, H1
, DO_ASR
)
514 DO_ZPZZ(sve_lsr_zpzz_s
, uint32_t, H1_2
, DO_LSR
)
515 DO_ZPZZ(sve_lsl_zpzz_s
, uint32_t, H1_4
, DO_LSL
)
517 DO_ZPZZ_D(sve_asr_zpzz_d
, int64_t, DO_ASR
)
518 DO_ZPZZ_D(sve_lsr_zpzz_d
, uint64_t, DO_LSR
)
519 DO_ZPZZ_D(sve_lsl_zpzz_d
, uint64_t, DO_LSL
)
521 static inline uint16_t do_sadalp_h(int16_t n
, int16_t m
)
523 int8_t n1
= n
, n2
= n
>> 8;
527 static inline uint32_t do_sadalp_s(int32_t n
, int32_t m
)
529 int16_t n1
= n
, n2
= n
>> 16;
533 static inline uint64_t do_sadalp_d(int64_t n
, int64_t m
)
535 int32_t n1
= n
, n2
= n
>> 32;
539 DO_ZPZZ(sve2_sadalp_zpzz_h
, int16_t, H1_2
, do_sadalp_h
)
540 DO_ZPZZ(sve2_sadalp_zpzz_s
, int32_t, H1_4
, do_sadalp_s
)
541 DO_ZPZZ_D(sve2_sadalp_zpzz_d
, int64_t, do_sadalp_d
)
543 static inline uint16_t do_uadalp_h(uint16_t n
, uint16_t m
)
545 uint8_t n1
= n
, n2
= n
>> 8;
549 static inline uint32_t do_uadalp_s(uint32_t n
, uint32_t m
)
551 uint16_t n1
= n
, n2
= n
>> 16;
555 static inline uint64_t do_uadalp_d(uint64_t n
, uint64_t m
)
557 uint32_t n1
= n
, n2
= n
>> 32;
561 DO_ZPZZ(sve2_uadalp_zpzz_h
, uint16_t, H1_2
, do_uadalp_h
)
562 DO_ZPZZ(sve2_uadalp_zpzz_s
, uint32_t, H1_4
, do_uadalp_s
)
563 DO_ZPZZ_D(sve2_uadalp_zpzz_d
, uint64_t, do_uadalp_d
)
565 #define do_srshl_b(n, m) do_sqrshl_bhs(n, m, 8, true, NULL)
566 #define do_srshl_h(n, m) do_sqrshl_bhs(n, m, 16, true, NULL)
567 #define do_srshl_s(n, m) do_sqrshl_bhs(n, m, 32, true, NULL)
568 #define do_srshl_d(n, m) do_sqrshl_d(n, m, true, NULL)
570 DO_ZPZZ(sve2_srshl_zpzz_b
, int8_t, H1
, do_srshl_b
)
571 DO_ZPZZ(sve2_srshl_zpzz_h
, int16_t, H1_2
, do_srshl_h
)
572 DO_ZPZZ(sve2_srshl_zpzz_s
, int32_t, H1_4
, do_srshl_s
)
573 DO_ZPZZ_D(sve2_srshl_zpzz_d
, int64_t, do_srshl_d
)
575 #define do_urshl_b(n, m) do_uqrshl_bhs(n, (int8_t)m, 8, true, NULL)
576 #define do_urshl_h(n, m) do_uqrshl_bhs(n, (int16_t)m, 16, true, NULL)
577 #define do_urshl_s(n, m) do_uqrshl_bhs(n, m, 32, true, NULL)
578 #define do_urshl_d(n, m) do_uqrshl_d(n, m, true, NULL)
580 DO_ZPZZ(sve2_urshl_zpzz_b
, uint8_t, H1
, do_urshl_b
)
581 DO_ZPZZ(sve2_urshl_zpzz_h
, uint16_t, H1_2
, do_urshl_h
)
582 DO_ZPZZ(sve2_urshl_zpzz_s
, uint32_t, H1_4
, do_urshl_s
)
583 DO_ZPZZ_D(sve2_urshl_zpzz_d
, uint64_t, do_urshl_d
)
586 * Unlike the NEON and AdvSIMD versions, there is no QC bit to set.
587 * We pass in a pointer to a dummy saturation field to trigger
588 * the saturating arithmetic but discard the information about
589 * whether it has occurred.
591 #define do_sqshl_b(n, m) \
592 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, false, &discard); })
593 #define do_sqshl_h(n, m) \
594 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, false, &discard); })
595 #define do_sqshl_s(n, m) \
596 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, false, &discard); })
597 #define do_sqshl_d(n, m) \
598 ({ uint32_t discard; do_sqrshl_d(n, m, false, &discard); })
600 DO_ZPZZ(sve2_sqshl_zpzz_b
, int8_t, H1_2
, do_sqshl_b
)
601 DO_ZPZZ(sve2_sqshl_zpzz_h
, int16_t, H1_2
, do_sqshl_h
)
602 DO_ZPZZ(sve2_sqshl_zpzz_s
, int32_t, H1_4
, do_sqshl_s
)
603 DO_ZPZZ_D(sve2_sqshl_zpzz_d
, int64_t, do_sqshl_d
)
605 #define do_uqshl_b(n, m) \
606 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, false, &discard); })
607 #define do_uqshl_h(n, m) \
608 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, false, &discard); })
609 #define do_uqshl_s(n, m) \
610 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, false, &discard); })
611 #define do_uqshl_d(n, m) \
612 ({ uint32_t discard; do_uqrshl_d(n, m, false, &discard); })
614 DO_ZPZZ(sve2_uqshl_zpzz_b
, uint8_t, H1_2
, do_uqshl_b
)
615 DO_ZPZZ(sve2_uqshl_zpzz_h
, uint16_t, H1_2
, do_uqshl_h
)
616 DO_ZPZZ(sve2_uqshl_zpzz_s
, uint32_t, H1_4
, do_uqshl_s
)
617 DO_ZPZZ_D(sve2_uqshl_zpzz_d
, uint64_t, do_uqshl_d
)
619 #define do_sqrshl_b(n, m) \
620 ({ uint32_t discard; do_sqrshl_bhs(n, m, 8, true, &discard); })
621 #define do_sqrshl_h(n, m) \
622 ({ uint32_t discard; do_sqrshl_bhs(n, m, 16, true, &discard); })
623 #define do_sqrshl_s(n, m) \
624 ({ uint32_t discard; do_sqrshl_bhs(n, m, 32, true, &discard); })
625 #define do_sqrshl_d(n, m) \
626 ({ uint32_t discard; do_sqrshl_d(n, m, true, &discard); })
628 DO_ZPZZ(sve2_sqrshl_zpzz_b
, int8_t, H1_2
, do_sqrshl_b
)
629 DO_ZPZZ(sve2_sqrshl_zpzz_h
, int16_t, H1_2
, do_sqrshl_h
)
630 DO_ZPZZ(sve2_sqrshl_zpzz_s
, int32_t, H1_4
, do_sqrshl_s
)
631 DO_ZPZZ_D(sve2_sqrshl_zpzz_d
, int64_t, do_sqrshl_d
)
635 #define do_uqrshl_b(n, m) \
636 ({ uint32_t discard; do_uqrshl_bhs(n, (int8_t)m, 8, true, &discard); })
637 #define do_uqrshl_h(n, m) \
638 ({ uint32_t discard; do_uqrshl_bhs(n, (int16_t)m, 16, true, &discard); })
639 #define do_uqrshl_s(n, m) \
640 ({ uint32_t discard; do_uqrshl_bhs(n, m, 32, true, &discard); })
641 #define do_uqrshl_d(n, m) \
642 ({ uint32_t discard; do_uqrshl_d(n, m, true, &discard); })
644 DO_ZPZZ(sve2_uqrshl_zpzz_b
, uint8_t, H1_2
, do_uqrshl_b
)
645 DO_ZPZZ(sve2_uqrshl_zpzz_h
, uint16_t, H1_2
, do_uqrshl_h
)
646 DO_ZPZZ(sve2_uqrshl_zpzz_s
, uint32_t, H1_4
, do_uqrshl_s
)
647 DO_ZPZZ_D(sve2_uqrshl_zpzz_d
, uint64_t, do_uqrshl_d
)
651 #define DO_HADD_BHS(n, m) (((int64_t)n + m) >> 1)
652 #define DO_HADD_D(n, m) ((n >> 1) + (m >> 1) + (n & m & 1))
654 DO_ZPZZ(sve2_shadd_zpzz_b
, int8_t, H1
, DO_HADD_BHS
)
655 DO_ZPZZ(sve2_shadd_zpzz_h
, int16_t, H1_2
, DO_HADD_BHS
)
656 DO_ZPZZ(sve2_shadd_zpzz_s
, int32_t, H1_4
, DO_HADD_BHS
)
657 DO_ZPZZ_D(sve2_shadd_zpzz_d
, int64_t, DO_HADD_D
)
659 DO_ZPZZ(sve2_uhadd_zpzz_b
, uint8_t, H1
, DO_HADD_BHS
)
660 DO_ZPZZ(sve2_uhadd_zpzz_h
, uint16_t, H1_2
, DO_HADD_BHS
)
661 DO_ZPZZ(sve2_uhadd_zpzz_s
, uint32_t, H1_4
, DO_HADD_BHS
)
662 DO_ZPZZ_D(sve2_uhadd_zpzz_d
, uint64_t, DO_HADD_D
)
664 #define DO_RHADD_BHS(n, m) (((int64_t)n + m + 1) >> 1)
665 #define DO_RHADD_D(n, m) ((n >> 1) + (m >> 1) + ((n | m) & 1))
667 DO_ZPZZ(sve2_srhadd_zpzz_b
, int8_t, H1
, DO_RHADD_BHS
)
668 DO_ZPZZ(sve2_srhadd_zpzz_h
, int16_t, H1_2
, DO_RHADD_BHS
)
669 DO_ZPZZ(sve2_srhadd_zpzz_s
, int32_t, H1_4
, DO_RHADD_BHS
)
670 DO_ZPZZ_D(sve2_srhadd_zpzz_d
, int64_t, DO_RHADD_D
)
672 DO_ZPZZ(sve2_urhadd_zpzz_b
, uint8_t, H1
, DO_RHADD_BHS
)
673 DO_ZPZZ(sve2_urhadd_zpzz_h
, uint16_t, H1_2
, DO_RHADD_BHS
)
674 DO_ZPZZ(sve2_urhadd_zpzz_s
, uint32_t, H1_4
, DO_RHADD_BHS
)
675 DO_ZPZZ_D(sve2_urhadd_zpzz_d
, uint64_t, DO_RHADD_D
)
677 #define DO_HSUB_BHS(n, m) (((int64_t)n - m) >> 1)
678 #define DO_HSUB_D(n, m) ((n >> 1) - (m >> 1) - (~n & m & 1))
680 DO_ZPZZ(sve2_shsub_zpzz_b
, int8_t, H1
, DO_HSUB_BHS
)
681 DO_ZPZZ(sve2_shsub_zpzz_h
, int16_t, H1_2
, DO_HSUB_BHS
)
682 DO_ZPZZ(sve2_shsub_zpzz_s
, int32_t, H1_4
, DO_HSUB_BHS
)
683 DO_ZPZZ_D(sve2_shsub_zpzz_d
, int64_t, DO_HSUB_D
)
685 DO_ZPZZ(sve2_uhsub_zpzz_b
, uint8_t, H1
, DO_HSUB_BHS
)
686 DO_ZPZZ(sve2_uhsub_zpzz_h
, uint16_t, H1_2
, DO_HSUB_BHS
)
687 DO_ZPZZ(sve2_uhsub_zpzz_s
, uint32_t, H1_4
, DO_HSUB_BHS
)
688 DO_ZPZZ_D(sve2_uhsub_zpzz_d
, uint64_t, DO_HSUB_D
)
690 static inline int32_t do_sat_bhs(int64_t val
, int64_t min
, int64_t max
)
692 return val
>= max
? max
: val
<= min
? min
: val
;
695 #define DO_SQADD_B(n, m) do_sat_bhs((int64_t)n + m, INT8_MIN, INT8_MAX)
696 #define DO_SQADD_H(n, m) do_sat_bhs((int64_t)n + m, INT16_MIN, INT16_MAX)
697 #define DO_SQADD_S(n, m) do_sat_bhs((int64_t)n + m, INT32_MIN, INT32_MAX)
699 static inline int64_t do_sqadd_d(int64_t n
, int64_t m
)
702 if (((r
^ n
) & ~(n
^ m
)) < 0) {
703 /* Signed overflow. */
704 return r
< 0 ? INT64_MAX
: INT64_MIN
;
709 DO_ZPZZ(sve2_sqadd_zpzz_b
, int8_t, H1
, DO_SQADD_B
)
710 DO_ZPZZ(sve2_sqadd_zpzz_h
, int16_t, H1_2
, DO_SQADD_H
)
711 DO_ZPZZ(sve2_sqadd_zpzz_s
, int32_t, H1_4
, DO_SQADD_S
)
712 DO_ZPZZ_D(sve2_sqadd_zpzz_d
, int64_t, do_sqadd_d
)
714 #define DO_UQADD_B(n, m) do_sat_bhs((int64_t)n + m, 0, UINT8_MAX)
715 #define DO_UQADD_H(n, m) do_sat_bhs((int64_t)n + m, 0, UINT16_MAX)
716 #define DO_UQADD_S(n, m) do_sat_bhs((int64_t)n + m, 0, UINT32_MAX)
718 static inline uint64_t do_uqadd_d(uint64_t n
, uint64_t m
)
721 return r
< n
? UINT64_MAX
: r
;
724 DO_ZPZZ(sve2_uqadd_zpzz_b
, uint8_t, H1
, DO_UQADD_B
)
725 DO_ZPZZ(sve2_uqadd_zpzz_h
, uint16_t, H1_2
, DO_UQADD_H
)
726 DO_ZPZZ(sve2_uqadd_zpzz_s
, uint32_t, H1_4
, DO_UQADD_S
)
727 DO_ZPZZ_D(sve2_uqadd_zpzz_d
, uint64_t, do_uqadd_d
)
729 #define DO_SQSUB_B(n, m) do_sat_bhs((int64_t)n - m, INT8_MIN, INT8_MAX)
730 #define DO_SQSUB_H(n, m) do_sat_bhs((int64_t)n - m, INT16_MIN, INT16_MAX)
731 #define DO_SQSUB_S(n, m) do_sat_bhs((int64_t)n - m, INT32_MIN, INT32_MAX)
733 static inline int64_t do_sqsub_d(int64_t n
, int64_t m
)
736 if (((r
^ n
) & (n
^ m
)) < 0) {
737 /* Signed overflow. */
738 return r
< 0 ? INT64_MAX
: INT64_MIN
;
743 DO_ZPZZ(sve2_sqsub_zpzz_b
, int8_t, H1
, DO_SQSUB_B
)
744 DO_ZPZZ(sve2_sqsub_zpzz_h
, int16_t, H1_2
, DO_SQSUB_H
)
745 DO_ZPZZ(sve2_sqsub_zpzz_s
, int32_t, H1_4
, DO_SQSUB_S
)
746 DO_ZPZZ_D(sve2_sqsub_zpzz_d
, int64_t, do_sqsub_d
)
748 #define DO_UQSUB_B(n, m) do_sat_bhs((int64_t)n - m, 0, UINT8_MAX)
749 #define DO_UQSUB_H(n, m) do_sat_bhs((int64_t)n - m, 0, UINT16_MAX)
750 #define DO_UQSUB_S(n, m) do_sat_bhs((int64_t)n - m, 0, UINT32_MAX)
752 static inline uint64_t do_uqsub_d(uint64_t n
, uint64_t m
)
754 return n
> m
? n
- m
: 0;
757 DO_ZPZZ(sve2_uqsub_zpzz_b
, uint8_t, H1
, DO_UQSUB_B
)
758 DO_ZPZZ(sve2_uqsub_zpzz_h
, uint16_t, H1_2
, DO_UQSUB_H
)
759 DO_ZPZZ(sve2_uqsub_zpzz_s
, uint32_t, H1_4
, DO_UQSUB_S
)
760 DO_ZPZZ_D(sve2_uqsub_zpzz_d
, uint64_t, do_uqsub_d
)
762 #define DO_SUQADD_B(n, m) \
763 do_sat_bhs((int64_t)(int8_t)n + m, INT8_MIN, INT8_MAX)
764 #define DO_SUQADD_H(n, m) \
765 do_sat_bhs((int64_t)(int16_t)n + m, INT16_MIN, INT16_MAX)
766 #define DO_SUQADD_S(n, m) \
767 do_sat_bhs((int64_t)(int32_t)n + m, INT32_MIN, INT32_MAX)
769 static inline int64_t do_suqadd_d(int64_t n
, uint64_t m
)
774 /* Note that m - abs(n) cannot underflow. */
776 /* Result is either very large positive or negative. */
778 /* m > abs(n), so r is a very large positive. */
781 /* Result is negative. */
784 /* Both inputs are positive: check for overflow. */
785 if (r
< m
|| r
> INT64_MAX
) {
792 DO_ZPZZ(sve2_suqadd_zpzz_b
, uint8_t, H1
, DO_SUQADD_B
)
793 DO_ZPZZ(sve2_suqadd_zpzz_h
, uint16_t, H1_2
, DO_SUQADD_H
)
794 DO_ZPZZ(sve2_suqadd_zpzz_s
, uint32_t, H1_4
, DO_SUQADD_S
)
795 DO_ZPZZ_D(sve2_suqadd_zpzz_d
, uint64_t, do_suqadd_d
)
797 #define DO_USQADD_B(n, m) \
798 do_sat_bhs((int64_t)n + (int8_t)m, 0, UINT8_MAX)
799 #define DO_USQADD_H(n, m) \
800 do_sat_bhs((int64_t)n + (int16_t)m, 0, UINT16_MAX)
801 #define DO_USQADD_S(n, m) \
802 do_sat_bhs((int64_t)n + (int32_t)m, 0, UINT32_MAX)
804 static inline uint64_t do_usqadd_d(uint64_t n
, int64_t m
)
809 return n
< -m
? 0 : r
;
811 return r
< n
? UINT64_MAX
: r
;
814 DO_ZPZZ(sve2_usqadd_zpzz_b
, uint8_t, H1
, DO_USQADD_B
)
815 DO_ZPZZ(sve2_usqadd_zpzz_h
, uint16_t, H1_2
, DO_USQADD_H
)
816 DO_ZPZZ(sve2_usqadd_zpzz_s
, uint32_t, H1_4
, DO_USQADD_S
)
817 DO_ZPZZ_D(sve2_usqadd_zpzz_d
, uint64_t, do_usqadd_d
)
823 * Three operand expander, operating on element pairs.
 * If the slot I is even, the elements come from VN {I, I+1}.
 * If the slot I is odd, the elements come from VM {I-1, I}.
826 * Load all of the input elements in each pair before overwriting output.
828 #define DO_ZPZZ_PAIR(NAME, TYPE, H, OP) \
829 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
831 intptr_t i, opr_sz = simd_oprsz(desc); \
832 for (i = 0; i < opr_sz; ) { \
833 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
835 TYPE n0 = *(TYPE *)(vn + H(i)); \
836 TYPE m0 = *(TYPE *)(vm + H(i)); \
837 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
838 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
840 *(TYPE *)(vd + H(i)) = OP(n0, n1); \
842 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
844 *(TYPE *)(vd + H(i)) = OP(m0, m1); \
846 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
851 /* Similarly, specialized for 64-bit operands. */
852 #define DO_ZPZZ_PAIR_D(NAME, TYPE, OP) \
853 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
855 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
856 TYPE *d = vd, *n = vn, *m = vm; \
858 for (i = 0; i < opr_sz; i += 2) { \
859 TYPE n0 = n[i], n1 = n[i + 1]; \
860 TYPE m0 = m[i], m1 = m[i + 1]; \
861 if (pg[H1(i)] & 1) { \
864 if (pg[H1(i + 1)] & 1) { \
865 d[i + 1] = OP(m0, m1); \
870 DO_ZPZZ_PAIR(sve2_addp_zpzz_b
, uint8_t, H1
, DO_ADD
)
871 DO_ZPZZ_PAIR(sve2_addp_zpzz_h
, uint16_t, H1_2
, DO_ADD
)
872 DO_ZPZZ_PAIR(sve2_addp_zpzz_s
, uint32_t, H1_4
, DO_ADD
)
873 DO_ZPZZ_PAIR_D(sve2_addp_zpzz_d
, uint64_t, DO_ADD
)
875 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_b
, uint8_t, H1
, DO_MAX
)
876 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_h
, uint16_t, H1_2
, DO_MAX
)
877 DO_ZPZZ_PAIR(sve2_umaxp_zpzz_s
, uint32_t, H1_4
, DO_MAX
)
878 DO_ZPZZ_PAIR_D(sve2_umaxp_zpzz_d
, uint64_t, DO_MAX
)
880 DO_ZPZZ_PAIR(sve2_uminp_zpzz_b
, uint8_t, H1
, DO_MIN
)
881 DO_ZPZZ_PAIR(sve2_uminp_zpzz_h
, uint16_t, H1_2
, DO_MIN
)
882 DO_ZPZZ_PAIR(sve2_uminp_zpzz_s
, uint32_t, H1_4
, DO_MIN
)
883 DO_ZPZZ_PAIR_D(sve2_uminp_zpzz_d
, uint64_t, DO_MIN
)
885 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_b
, int8_t, H1
, DO_MAX
)
886 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_h
, int16_t, H1_2
, DO_MAX
)
887 DO_ZPZZ_PAIR(sve2_smaxp_zpzz_s
, int32_t, H1_4
, DO_MAX
)
888 DO_ZPZZ_PAIR_D(sve2_smaxp_zpzz_d
, int64_t, DO_MAX
)
890 DO_ZPZZ_PAIR(sve2_sminp_zpzz_b
, int8_t, H1
, DO_MIN
)
891 DO_ZPZZ_PAIR(sve2_sminp_zpzz_h
, int16_t, H1_2
, DO_MIN
)
892 DO_ZPZZ_PAIR(sve2_sminp_zpzz_s
, int32_t, H1_4
, DO_MIN
)
893 DO_ZPZZ_PAIR_D(sve2_sminp_zpzz_d
, int64_t, DO_MIN
)
896 #undef DO_ZPZZ_PAIR_D
898 #define DO_ZPZZ_PAIR_FP(NAME, TYPE, H, OP) \
899 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, \
900 void *status, uint32_t desc) \
902 intptr_t i, opr_sz = simd_oprsz(desc); \
903 for (i = 0; i < opr_sz; ) { \
904 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
906 TYPE n0 = *(TYPE *)(vn + H(i)); \
907 TYPE m0 = *(TYPE *)(vm + H(i)); \
908 TYPE n1 = *(TYPE *)(vn + H(i + sizeof(TYPE))); \
909 TYPE m1 = *(TYPE *)(vm + H(i + sizeof(TYPE))); \
911 *(TYPE *)(vd + H(i)) = OP(n0, n1, status); \
913 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
915 *(TYPE *)(vd + H(i)) = OP(m0, m1, status); \
917 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
922 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_h
, float16
, H1_2
, float16_add
)
923 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_s
, float32
, H1_4
, float32_add
)
924 DO_ZPZZ_PAIR_FP(sve2_faddp_zpzz_d
, float64
, , float64_add
)
926 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_h
, float16
, H1_2
, float16_maxnum
)
927 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_s
, float32
, H1_4
, float32_maxnum
)
928 DO_ZPZZ_PAIR_FP(sve2_fmaxnmp_zpzz_d
, float64
, , float64_maxnum
)
930 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_h
, float16
, H1_2
, float16_minnum
)
931 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_s
, float32
, H1_4
, float32_minnum
)
932 DO_ZPZZ_PAIR_FP(sve2_fminnmp_zpzz_d
, float64
, , float64_minnum
)
934 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_h
, float16
, H1_2
, float16_max
)
935 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_s
, float32
, H1_4
, float32_max
)
936 DO_ZPZZ_PAIR_FP(sve2_fmaxp_zpzz_d
, float64
, , float64_max
)
938 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_h
, float16
, H1_2
, float16_min
)
939 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_s
, float32
, H1_4
, float32_min
)
940 DO_ZPZZ_PAIR_FP(sve2_fminp_zpzz_d
, float64
, , float64_min
)
942 #undef DO_ZPZZ_PAIR_FP
944 /* Three-operand expander, controlled by a predicate, in which the
945 * third operand is "wide". That is, for D = N op M, the same 64-bit
946 * value of M is used with all of the narrower values of N.
948 #define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
949 void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
951 intptr_t i, opr_sz = simd_oprsz(desc); \
952 for (i = 0; i < opr_sz; ) { \
953 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
954 TYPEW mm = *(TYPEW *)(vm + i); \
957 TYPE nn = *(TYPE *)(vn + H(i)); \
958 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
960 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
965 DO_ZPZW(sve_asr_zpzw_b
, int8_t, uint64_t, H1
, DO_ASR
)
966 DO_ZPZW(sve_lsr_zpzw_b
, uint8_t, uint64_t, H1
, DO_LSR
)
967 DO_ZPZW(sve_lsl_zpzw_b
, uint8_t, uint64_t, H1
, DO_LSL
)
969 DO_ZPZW(sve_asr_zpzw_h
, int16_t, uint64_t, H1_2
, DO_ASR
)
970 DO_ZPZW(sve_lsr_zpzw_h
, uint16_t, uint64_t, H1_2
, DO_LSR
)
971 DO_ZPZW(sve_lsl_zpzw_h
, uint16_t, uint64_t, H1_2
, DO_LSL
)
973 DO_ZPZW(sve_asr_zpzw_s
, int32_t, uint64_t, H1_4
, DO_ASR
)
974 DO_ZPZW(sve_lsr_zpzw_s
, uint32_t, uint64_t, H1_4
, DO_LSR
)
975 DO_ZPZW(sve_lsl_zpzw_s
, uint32_t, uint64_t, H1_4
, DO_LSL
)
/* Fully general two-operand expander, controlled by a predicate.
 *
 * A 16-bit predicate word is read per 16 bytes of vector.  After each
 * element the predicate is shifted by the element size, so bit 0 is
 * always the governing bit of the element at byte offset I.  Only
 * active elements are read from N and written to D.
 */
#define DO_ZPZ(NAME, TYPE, H, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            if (pg & 1) {                                       \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn);                  \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 15);                                       \
    }                                                           \
}

/* Similarly, specialized for 64-bit operands.
 * Each 64-bit element is governed by bit 0 of its own predicate byte,
 * so the vector can be indexed directly as an array of TYPE.
 */
#define DO_ZPZ_D(NAME, TYPE, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *n = vn;                                      \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 1) {                           \
        if (pg[H1(i)] & 1) {                                    \
            TYPE nn = n[i];                                     \
            d[i] = OP(nn);                                      \
        }                                                       \
    }                                                           \
}

1012 #define DO_CLS_B(N) (clrsb32(N) - 24)
1013 #define DO_CLS_H(N) (clrsb32(N) - 16)
1015 DO_ZPZ(sve_cls_b
, int8_t, H1
, DO_CLS_B
)
1016 DO_ZPZ(sve_cls_h
, int16_t, H1_2
, DO_CLS_H
)
1017 DO_ZPZ(sve_cls_s
, int32_t, H1_4
, clrsb32
)
1018 DO_ZPZ_D(sve_cls_d
, int64_t, clrsb64
)
1020 #define DO_CLZ_B(N) (clz32(N) - 24)
1021 #define DO_CLZ_H(N) (clz32(N) - 16)
1023 DO_ZPZ(sve_clz_b
, uint8_t, H1
, DO_CLZ_B
)
1024 DO_ZPZ(sve_clz_h
, uint16_t, H1_2
, DO_CLZ_H
)
1025 DO_ZPZ(sve_clz_s
, uint32_t, H1_4
, clz32
)
1026 DO_ZPZ_D(sve_clz_d
, uint64_t, clz64
)
1028 DO_ZPZ(sve_cnt_zpz_b
, uint8_t, H1
, ctpop8
)
1029 DO_ZPZ(sve_cnt_zpz_h
, uint16_t, H1_2
, ctpop16
)
1030 DO_ZPZ(sve_cnt_zpz_s
, uint32_t, H1_4
, ctpop32
)
1031 DO_ZPZ_D(sve_cnt_zpz_d
, uint64_t, ctpop64
)
1033 #define DO_CNOT(N) (N == 0)
1035 DO_ZPZ(sve_cnot_b
, uint8_t, H1
, DO_CNOT
)
1036 DO_ZPZ(sve_cnot_h
, uint16_t, H1_2
, DO_CNOT
)
1037 DO_ZPZ(sve_cnot_s
, uint32_t, H1_4
, DO_CNOT
)
1038 DO_ZPZ_D(sve_cnot_d
, uint64_t, DO_CNOT
)
1040 #define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
1042 DO_ZPZ(sve_fabs_h
, uint16_t, H1_2
, DO_FABS
)
1043 DO_ZPZ(sve_fabs_s
, uint32_t, H1_4
, DO_FABS
)
1044 DO_ZPZ_D(sve_fabs_d
, uint64_t, DO_FABS
)
1046 #define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
1048 DO_ZPZ(sve_fneg_h
, uint16_t, H1_2
, DO_FNEG
)
1049 DO_ZPZ(sve_fneg_s
, uint32_t, H1_4
, DO_FNEG
)
1050 DO_ZPZ_D(sve_fneg_d
, uint64_t, DO_FNEG
)
1052 #define DO_NOT(N) (~N)
1054 DO_ZPZ(sve_not_zpz_b
, uint8_t, H1
, DO_NOT
)
1055 DO_ZPZ(sve_not_zpz_h
, uint16_t, H1_2
, DO_NOT
)
1056 DO_ZPZ(sve_not_zpz_s
, uint32_t, H1_4
, DO_NOT
)
1057 DO_ZPZ_D(sve_not_zpz_d
, uint64_t, DO_NOT
)
1059 #define DO_SXTB(N) ((int8_t)N)
1060 #define DO_SXTH(N) ((int16_t)N)
1061 #define DO_SXTS(N) ((int32_t)N)
1062 #define DO_UXTB(N) ((uint8_t)N)
1063 #define DO_UXTH(N) ((uint16_t)N)
1064 #define DO_UXTS(N) ((uint32_t)N)
1066 DO_ZPZ(sve_sxtb_h
, uint16_t, H1_2
, DO_SXTB
)
1067 DO_ZPZ(sve_sxtb_s
, uint32_t, H1_4
, DO_SXTB
)
1068 DO_ZPZ(sve_sxth_s
, uint32_t, H1_4
, DO_SXTH
)
1069 DO_ZPZ_D(sve_sxtb_d
, uint64_t, DO_SXTB
)
1070 DO_ZPZ_D(sve_sxth_d
, uint64_t, DO_SXTH
)
1071 DO_ZPZ_D(sve_sxtw_d
, uint64_t, DO_SXTS
)
1073 DO_ZPZ(sve_uxtb_h
, uint16_t, H1_2
, DO_UXTB
)
1074 DO_ZPZ(sve_uxtb_s
, uint32_t, H1_4
, DO_UXTB
)
1075 DO_ZPZ(sve_uxth_s
, uint32_t, H1_4
, DO_UXTH
)
1076 DO_ZPZ_D(sve_uxtb_d
, uint64_t, DO_UXTB
)
1077 DO_ZPZ_D(sve_uxth_d
, uint64_t, DO_UXTH
)
1078 DO_ZPZ_D(sve_uxtw_d
, uint64_t, DO_UXTS
)
1080 #define DO_ABS(N) (N < 0 ? -N : N)
1082 DO_ZPZ(sve_abs_b
, int8_t, H1
, DO_ABS
)
1083 DO_ZPZ(sve_abs_h
, int16_t, H1_2
, DO_ABS
)
1084 DO_ZPZ(sve_abs_s
, int32_t, H1_4
, DO_ABS
)
1085 DO_ZPZ_D(sve_abs_d
, int64_t, DO_ABS
)
1087 #define DO_NEG(N) (-N)
1089 DO_ZPZ(sve_neg_b
, uint8_t, H1
, DO_NEG
)
1090 DO_ZPZ(sve_neg_h
, uint16_t, H1_2
, DO_NEG
)
1091 DO_ZPZ(sve_neg_s
, uint32_t, H1_4
, DO_NEG
)
1092 DO_ZPZ_D(sve_neg_d
, uint64_t, DO_NEG
)
1094 DO_ZPZ(sve_revb_h
, uint16_t, H1_2
, bswap16
)
1095 DO_ZPZ(sve_revb_s
, uint32_t, H1_4
, bswap32
)
1096 DO_ZPZ_D(sve_revb_d
, uint64_t, bswap64
)
1098 DO_ZPZ(sve_revh_s
, uint32_t, H1_4
, hswap32
)
1099 DO_ZPZ_D(sve_revh_d
, uint64_t, hswap64
)
1101 DO_ZPZ_D(sve_revw_d
, uint64_t, wswap64
)
1103 DO_ZPZ(sve_rbit_b
, uint8_t, H1
, revbit8
)
1104 DO_ZPZ(sve_rbit_h
, uint16_t, H1_2
, revbit16
)
1105 DO_ZPZ(sve_rbit_s
, uint32_t, H1_4
, revbit32
)
1106 DO_ZPZ_D(sve_rbit_d
, uint64_t, revbit64
)
1108 #define DO_SQABS(X) \
1109 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
1110 x_ >= 0 ? x_ : x_ == min_ ? -min_ - 1 : -x_; })
1112 DO_ZPZ(sve2_sqabs_b
, int8_t, H1
, DO_SQABS
)
1113 DO_ZPZ(sve2_sqabs_h
, int16_t, H1_2
, DO_SQABS
)
1114 DO_ZPZ(sve2_sqabs_s
, int32_t, H1_4
, DO_SQABS
)
1115 DO_ZPZ_D(sve2_sqabs_d
, int64_t, DO_SQABS
)
1117 #define DO_SQNEG(X) \
1118 ({ __typeof(X) x_ = (X), min_ = 1ull << (sizeof(X) * 8 - 1); \
1119 x_ == min_ ? -min_ - 1 : -x_; })
1121 DO_ZPZ(sve2_sqneg_b
, uint8_t, H1
, DO_SQNEG
)
1122 DO_ZPZ(sve2_sqneg_h
, uint16_t, H1_2
, DO_SQNEG
)
1123 DO_ZPZ(sve2_sqneg_s
, uint32_t, H1_4
, DO_SQNEG
)
1124 DO_ZPZ_D(sve2_sqneg_d
, uint64_t, DO_SQNEG
)
1126 DO_ZPZ(sve2_urecpe_s
, uint32_t, H1_4
, helper_recpe_u32
)
1127 DO_ZPZ(sve2_ursqrte_s
, uint32_t, H1_4
, helper_rsqrte_u32
)
1129 /* Three-operand expander, unpredicated, in which the third operand is "wide".
1131 #define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
1132 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1134 intptr_t i, opr_sz = simd_oprsz(desc); \
1135 for (i = 0; i < opr_sz; ) { \
1136 TYPEW mm = *(TYPEW *)(vm + i); \
1138 TYPE nn = *(TYPE *)(vn + H(i)); \
1139 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
1140 i += sizeof(TYPE); \
1145 DO_ZZW(sve_asr_zzw_b
, int8_t, uint64_t, H1
, DO_ASR
)
1146 DO_ZZW(sve_lsr_zzw_b
, uint8_t, uint64_t, H1
, DO_LSR
)
1147 DO_ZZW(sve_lsl_zzw_b
, uint8_t, uint64_t, H1
, DO_LSL
)
1149 DO_ZZW(sve_asr_zzw_h
, int16_t, uint64_t, H1_2
, DO_ASR
)
1150 DO_ZZW(sve_lsr_zzw_h
, uint16_t, uint64_t, H1_2
, DO_LSR
)
1151 DO_ZZW(sve_lsl_zzw_h
, uint16_t, uint64_t, H1_2
, DO_LSL
)
1153 DO_ZZW(sve_asr_zzw_s
, int32_t, uint64_t, H1_4
, DO_ASR
)
1154 DO_ZZW(sve_lsr_zzw_s
, uint32_t, uint64_t, H1_4
, DO_LSR
)
1155 DO_ZZW(sve_lsl_zzw_s
, uint32_t, uint64_t, H1_4
, DO_LSL
)
1172 * Three-operand expander, unpredicated, in which the two inputs are
1173 * selected from the top or bottom half of the wide column.
1175 #define DO_ZZZ_TB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1176 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1178 intptr_t i, opr_sz = simd_oprsz(desc); \
1179 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1180 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1181 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1182 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1183 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1184 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
1188 DO_ZZZ_TB(sve2_saddl_h
, int16_t, int8_t, H1_2
, H1
, DO_ADD
)
1189 DO_ZZZ_TB(sve2_saddl_s
, int32_t, int16_t, H1_4
, H1_2
, DO_ADD
)
1190 DO_ZZZ_TB(sve2_saddl_d
, int64_t, int32_t, , H1_4
, DO_ADD
)
1192 DO_ZZZ_TB(sve2_ssubl_h
, int16_t, int8_t, H1_2
, H1
, DO_SUB
)
1193 DO_ZZZ_TB(sve2_ssubl_s
, int32_t, int16_t, H1_4
, H1_2
, DO_SUB
)
1194 DO_ZZZ_TB(sve2_ssubl_d
, int64_t, int32_t, , H1_4
, DO_SUB
)
1196 DO_ZZZ_TB(sve2_sabdl_h
, int16_t, int8_t, H1_2
, H1
, DO_ABD
)
1197 DO_ZZZ_TB(sve2_sabdl_s
, int32_t, int16_t, H1_4
, H1_2
, DO_ABD
)
1198 DO_ZZZ_TB(sve2_sabdl_d
, int64_t, int32_t, , H1_4
, DO_ABD
)
1200 DO_ZZZ_TB(sve2_uaddl_h
, uint16_t, uint8_t, H1_2
, H1
, DO_ADD
)
1201 DO_ZZZ_TB(sve2_uaddl_s
, uint32_t, uint16_t, H1_4
, H1_2
, DO_ADD
)
1202 DO_ZZZ_TB(sve2_uaddl_d
, uint64_t, uint32_t, , H1_4
, DO_ADD
)
1204 DO_ZZZ_TB(sve2_usubl_h
, uint16_t, uint8_t, H1_2
, H1
, DO_SUB
)
1205 DO_ZZZ_TB(sve2_usubl_s
, uint32_t, uint16_t, H1_4
, H1_2
, DO_SUB
)
1206 DO_ZZZ_TB(sve2_usubl_d
, uint64_t, uint32_t, , H1_4
, DO_SUB
)
1208 DO_ZZZ_TB(sve2_uabdl_h
, uint16_t, uint8_t, H1_2
, H1
, DO_ABD
)
1209 DO_ZZZ_TB(sve2_uabdl_s
, uint32_t, uint16_t, H1_4
, H1_2
, DO_ABD
)
1210 DO_ZZZ_TB(sve2_uabdl_d
, uint64_t, uint32_t, , H1_4
, DO_ABD
)
1212 DO_ZZZ_TB(sve2_smull_zzz_h
, int16_t, int8_t, H1_2
, H1
, DO_MUL
)
1213 DO_ZZZ_TB(sve2_smull_zzz_s
, int32_t, int16_t, H1_4
, H1_2
, DO_MUL
)
1214 DO_ZZZ_TB(sve2_smull_zzz_d
, int64_t, int32_t, , H1_4
, DO_MUL
)
1216 DO_ZZZ_TB(sve2_umull_zzz_h
, uint16_t, uint8_t, H1_2
, H1
, DO_MUL
)
1217 DO_ZZZ_TB(sve2_umull_zzz_s
, uint32_t, uint16_t, H1_4
, H1_2
, DO_MUL
)
1218 DO_ZZZ_TB(sve2_umull_zzz_d
, uint64_t, uint32_t, , H1_4
, DO_MUL
)
1220 /* Note that the multiply cannot overflow, but the doubling can. */
1221 static inline int16_t do_sqdmull_h(int16_t n
, int16_t m
)
1223 int16_t val
= n
* m
;
1224 return DO_SQADD_H(val
, val
);
1227 static inline int32_t do_sqdmull_s(int32_t n
, int32_t m
)
1229 int32_t val
= n
* m
;
1230 return DO_SQADD_S(val
, val
);
1233 static inline int64_t do_sqdmull_d(int64_t n
, int64_t m
)
1235 int64_t val
= n
* m
;
1236 return do_sqadd_d(val
, val
);
1239 DO_ZZZ_TB(sve2_sqdmull_zzz_h
, int16_t, int8_t, H1_2
, H1
, do_sqdmull_h
)
1240 DO_ZZZ_TB(sve2_sqdmull_zzz_s
, int32_t, int16_t, H1_4
, H1_2
, do_sqdmull_s
)
1241 DO_ZZZ_TB(sve2_sqdmull_zzz_d
, int64_t, int32_t, , H1_4
, do_sqdmull_d
)
1245 #define DO_ZZZ_WTB(NAME, TYPEW, TYPEN, HW, HN, OP) \
1246 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1248 intptr_t i, opr_sz = simd_oprsz(desc); \
1249 int sel2 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1250 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1251 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
1252 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1253 *(TYPEW *)(vd + HW(i)) = OP(nn, mm); \
1257 DO_ZZZ_WTB(sve2_saddw_h
, int16_t, int8_t, H1_2
, H1
, DO_ADD
)
1258 DO_ZZZ_WTB(sve2_saddw_s
, int32_t, int16_t, H1_4
, H1_2
, DO_ADD
)
1259 DO_ZZZ_WTB(sve2_saddw_d
, int64_t, int32_t, , H1_4
, DO_ADD
)
1261 DO_ZZZ_WTB(sve2_ssubw_h
, int16_t, int8_t, H1_2
, H1
, DO_SUB
)
1262 DO_ZZZ_WTB(sve2_ssubw_s
, int32_t, int16_t, H1_4
, H1_2
, DO_SUB
)
1263 DO_ZZZ_WTB(sve2_ssubw_d
, int64_t, int32_t, , H1_4
, DO_SUB
)
1265 DO_ZZZ_WTB(sve2_uaddw_h
, uint16_t, uint8_t, H1_2
, H1
, DO_ADD
)
1266 DO_ZZZ_WTB(sve2_uaddw_s
, uint32_t, uint16_t, H1_4
, H1_2
, DO_ADD
)
1267 DO_ZZZ_WTB(sve2_uaddw_d
, uint64_t, uint32_t, , H1_4
, DO_ADD
)
1269 DO_ZZZ_WTB(sve2_usubw_h
, uint16_t, uint8_t, H1_2
, H1
, DO_SUB
)
1270 DO_ZZZ_WTB(sve2_usubw_s
, uint32_t, uint16_t, H1_4
, H1_2
, DO_SUB
)
1271 DO_ZZZ_WTB(sve2_usubw_d
, uint64_t, uint32_t, , H1_4
, DO_SUB
)
1275 #define DO_ZZZ_NTB(NAME, TYPE, H, OP) \
1276 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1278 intptr_t i, opr_sz = simd_oprsz(desc); \
1279 intptr_t sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPE); \
1280 intptr_t sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPE); \
1281 for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) { \
1282 TYPE nn = *(TYPE *)(vn + H(i + sel1)); \
1283 TYPE mm = *(TYPE *)(vm + H(i + sel2)); \
1284 *(TYPE *)(vd + H(i + sel1)) = OP(nn, mm); \
1288 DO_ZZZ_NTB(sve2_eoril_b
, uint8_t, H1
, DO_EOR
)
1289 DO_ZZZ_NTB(sve2_eoril_h
, uint16_t, H1_2
, DO_EOR
)
1290 DO_ZZZ_NTB(sve2_eoril_s
, uint32_t, H1_4
, DO_EOR
)
1291 DO_ZZZ_NTB(sve2_eoril_d
, uint64_t, , DO_EOR
)
1295 #define DO_ZZZW_ACC(NAME, TYPEW, TYPEN, HW, HN, OP) \
1296 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1298 intptr_t i, opr_sz = simd_oprsz(desc); \
1299 intptr_t sel1 = simd_data(desc) * sizeof(TYPEN); \
1300 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1301 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1302 TYPEW mm = *(TYPEN *)(vm + HN(i + sel1)); \
1303 TYPEW aa = *(TYPEW *)(va + HW(i)); \
1304 *(TYPEW *)(vd + HW(i)) = OP(nn, mm) + aa; \
1308 DO_ZZZW_ACC(sve2_sabal_h
, int16_t, int8_t, H1_2
, H1
, DO_ABD
)
1309 DO_ZZZW_ACC(sve2_sabal_s
, int32_t, int16_t, H1_4
, H1_2
, DO_ABD
)
1310 DO_ZZZW_ACC(sve2_sabal_d
, int64_t, int32_t, , H1_4
, DO_ABD
)
1312 DO_ZZZW_ACC(sve2_uabal_h
, uint16_t, uint8_t, H1_2
, H1
, DO_ABD
)
1313 DO_ZZZW_ACC(sve2_uabal_s
, uint32_t, uint16_t, H1_4
, H1_2
, DO_ABD
)
1314 DO_ZZZW_ACC(sve2_uabal_d
, uint64_t, uint32_t, , H1_4
, DO_ABD
)
1316 DO_ZZZW_ACC(sve2_smlal_zzzw_h
, int16_t, int8_t, H1_2
, H1
, DO_MUL
)
1317 DO_ZZZW_ACC(sve2_smlal_zzzw_s
, int32_t, int16_t, H1_4
, H1_2
, DO_MUL
)
1318 DO_ZZZW_ACC(sve2_smlal_zzzw_d
, int64_t, int32_t, , H1_4
, DO_MUL
)
1320 DO_ZZZW_ACC(sve2_umlal_zzzw_h
, uint16_t, uint8_t, H1_2
, H1
, DO_MUL
)
1321 DO_ZZZW_ACC(sve2_umlal_zzzw_s
, uint32_t, uint16_t, H1_4
, H1_2
, DO_MUL
)
1322 DO_ZZZW_ACC(sve2_umlal_zzzw_d
, uint64_t, uint32_t, , H1_4
, DO_MUL
)
1324 #define DO_NMUL(N, M) -(N * M)
1326 DO_ZZZW_ACC(sve2_smlsl_zzzw_h
, int16_t, int8_t, H1_2
, H1
, DO_NMUL
)
1327 DO_ZZZW_ACC(sve2_smlsl_zzzw_s
, int32_t, int16_t, H1_4
, H1_2
, DO_NMUL
)
1328 DO_ZZZW_ACC(sve2_smlsl_zzzw_d
, int64_t, int32_t, , H1_4
, DO_NMUL
)
1330 DO_ZZZW_ACC(sve2_umlsl_zzzw_h
, uint16_t, uint8_t, H1_2
, H1
, DO_NMUL
)
1331 DO_ZZZW_ACC(sve2_umlsl_zzzw_s
, uint32_t, uint16_t, H1_4
, H1_2
, DO_NMUL
)
1332 DO_ZZZW_ACC(sve2_umlsl_zzzw_d
, uint64_t, uint32_t, , H1_4
, DO_NMUL
)
/*
 * Narrowing expander, "bottom" form: OP is applied to each wide element,
 * the result is masked to the low half of the element width, and the
 * whole wide element is stored, which zeroes the high half.
 */
#define DO_XTNB(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
{                                                            \
    intptr_t i, opr_sz = simd_oprsz(desc);                   \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {             \
        TYPE nn = *(TYPE *)(vn + i);                         \
        nn = OP(nn) & MAKE_64BIT_MASK(0, sizeof(TYPE) * 4);  \
        *(TYPE *)(vd + i) = nn;                              \
    }                                                        \
}

/*
 * Narrowing expander, "top" form: OP is applied to each wide element and
 * only the narrow element at host-adjusted offset H(sizeof(TYPEN)) inside
 * it is stored; the other narrow element of the destination is untouched.
 */
#define DO_XTNT(NAME, TYPE, TYPEN, H, OP)                               \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)                    \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc), odd = H(sizeof(TYPEN));      \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {                        \
        TYPE nn = *(TYPE *)(vn + i);                                    \
        *(TYPEN *)(vd + i + odd) = OP(nn);                              \
    }                                                                   \
}

1357 #define DO_SQXTN_H(n) do_sat_bhs(n, INT8_MIN, INT8_MAX)
1358 #define DO_SQXTN_S(n) do_sat_bhs(n, INT16_MIN, INT16_MAX)
1359 #define DO_SQXTN_D(n) do_sat_bhs(n, INT32_MIN, INT32_MAX)
1361 DO_XTNB(sve2_sqxtnb_h
, int16_t, DO_SQXTN_H
)
1362 DO_XTNB(sve2_sqxtnb_s
, int32_t, DO_SQXTN_S
)
1363 DO_XTNB(sve2_sqxtnb_d
, int64_t, DO_SQXTN_D
)
1365 DO_XTNT(sve2_sqxtnt_h
, int16_t, int8_t, H1
, DO_SQXTN_H
)
1366 DO_XTNT(sve2_sqxtnt_s
, int32_t, int16_t, H1_2
, DO_SQXTN_S
)
1367 DO_XTNT(sve2_sqxtnt_d
, int64_t, int32_t, H1_4
, DO_SQXTN_D
)
1369 #define DO_UQXTN_H(n) do_sat_bhs(n, 0, UINT8_MAX)
1370 #define DO_UQXTN_S(n) do_sat_bhs(n, 0, UINT16_MAX)
1371 #define DO_UQXTN_D(n) do_sat_bhs(n, 0, UINT32_MAX)
1373 DO_XTNB(sve2_uqxtnb_h
, uint16_t, DO_UQXTN_H
)
1374 DO_XTNB(sve2_uqxtnb_s
, uint32_t, DO_UQXTN_S
)
1375 DO_XTNB(sve2_uqxtnb_d
, uint64_t, DO_UQXTN_D
)
1377 DO_XTNT(sve2_uqxtnt_h
, uint16_t, uint8_t, H1
, DO_UQXTN_H
)
1378 DO_XTNT(sve2_uqxtnt_s
, uint32_t, uint16_t, H1_2
, DO_UQXTN_S
)
1379 DO_XTNT(sve2_uqxtnt_d
, uint64_t, uint32_t, H1_4
, DO_UQXTN_D
)
1381 DO_XTNB(sve2_sqxtunb_h
, int16_t, DO_UQXTN_H
)
1382 DO_XTNB(sve2_sqxtunb_s
, int32_t, DO_UQXTN_S
)
1383 DO_XTNB(sve2_sqxtunb_d
, int64_t, DO_UQXTN_D
)
1385 DO_XTNT(sve2_sqxtunt_h
, int16_t, int8_t, H1
, DO_UQXTN_H
)
1386 DO_XTNT(sve2_sqxtunt_s
, int32_t, int16_t, H1_2
, DO_UQXTN_S
)
1387 DO_XTNT(sve2_sqxtunt_d
, int64_t, int32_t, H1_4
, DO_UQXTN_D
)
1392 void HELPER(sve2_adcl_s
)(void *vd
, void *vn
, void *vm
, void *va
, uint32_t desc
)
1394 intptr_t i
, opr_sz
= simd_oprsz(desc
);
1395 int sel
= H4(extract32(desc
, SIMD_DATA_SHIFT
, 1));
1396 uint32_t inv
= -extract32(desc
, SIMD_DATA_SHIFT
+ 1, 1);
1397 uint32_t *a
= va
, *n
= vn
;
1398 uint64_t *d
= vd
, *m
= vm
;
1400 for (i
= 0; i
< opr_sz
/ 8; ++i
) {
1401 uint32_t e1
= a
[2 * i
+ H4(0)];
1402 uint32_t e2
= n
[2 * i
+ sel
] ^ inv
;
1403 uint64_t c
= extract64(m
[i
], 32, 1);
1404 /* Compute and store the entire 33-bit result at once. */
1409 void HELPER(sve2_adcl_d
)(void *vd
, void *vn
, void *vm
, void *va
, uint32_t desc
)
1411 intptr_t i
, opr_sz
= simd_oprsz(desc
);
1412 int sel
= extract32(desc
, SIMD_DATA_SHIFT
, 1);
1413 uint64_t inv
= -(uint64_t)extract32(desc
, SIMD_DATA_SHIFT
+ 1, 1);
1414 uint64_t *d
= vd
, *a
= va
, *n
= vn
, *m
= vm
;
1416 for (i
= 0; i
< opr_sz
/ 8; i
+= 2) {
1417 Int128 e1
= int128_make64(a
[i
]);
1418 Int128 e2
= int128_make64(n
[i
+ sel
] ^ inv
);
1419 Int128 c
= int128_make64(m
[i
+ 1] & 1);
1420 Int128 r
= int128_add(int128_add(e1
, e2
), c
);
1421 d
[i
+ 0] = int128_getlo(r
);
1422 d
[i
+ 1] = int128_gethi(r
);
1426 #define DO_SQDMLAL(NAME, TYPEW, TYPEN, HW, HN, DMUL_OP, SUM_OP) \
1427 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1429 intptr_t i, opr_sz = simd_oprsz(desc); \
1430 int sel1 = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1431 int sel2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(TYPEN); \
1432 for (i = 0; i < opr_sz; i += sizeof(TYPEW)) { \
1433 TYPEW nn = *(TYPEN *)(vn + HN(i + sel1)); \
1434 TYPEW mm = *(TYPEN *)(vm + HN(i + sel2)); \
1435 TYPEW aa = *(TYPEW *)(va + HW(i)); \
1436 *(TYPEW *)(vd + HW(i)) = SUM_OP(aa, DMUL_OP(nn, mm)); \
1440 DO_SQDMLAL(sve2_sqdmlal_zzzw_h
, int16_t, int8_t, H1_2
, H1
,
1441 do_sqdmull_h
, DO_SQADD_H
)
1442 DO_SQDMLAL(sve2_sqdmlal_zzzw_s
, int32_t, int16_t, H1_4
, H1_2
,
1443 do_sqdmull_s
, DO_SQADD_S
)
1444 DO_SQDMLAL(sve2_sqdmlal_zzzw_d
, int64_t, int32_t, , H1_4
,
1445 do_sqdmull_d
, do_sqadd_d
)
1447 DO_SQDMLAL(sve2_sqdmlsl_zzzw_h
, int16_t, int8_t, H1_2
, H1
,
1448 do_sqdmull_h
, DO_SQSUB_H
)
1449 DO_SQDMLAL(sve2_sqdmlsl_zzzw_s
, int32_t, int16_t, H1_4
, H1_2
,
1450 do_sqdmull_s
, DO_SQSUB_S
)
1451 DO_SQDMLAL(sve2_sqdmlsl_zzzw_d
, int64_t, int32_t, , H1_4
,
1452 do_sqdmull_d
, do_sqsub_d
)
1456 #define DO_CMLA_FUNC(NAME, TYPE, H, OP) \
1457 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1459 intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE); \
1460 int rot = simd_data(desc); \
1461 int sel_a = rot & 1, sel_b = sel_a ^ 1; \
1462 bool sub_r = rot == 1 || rot == 2; \
1463 bool sub_i = rot >= 2; \
1464 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1465 for (i = 0; i < opr_sz; i += 2) { \
1466 TYPE elt1_a = n[H(i + sel_a)]; \
1467 TYPE elt2_a = m[H(i + sel_a)]; \
1468 TYPE elt2_b = m[H(i + sel_b)]; \
1469 d[H(i)] = OP(elt1_a, elt2_a, a[H(i)], sub_r); \
1470 d[H(i + 1)] = OP(elt1_a, elt2_b, a[H(i + 1)], sub_i); \
1474 #define DO_CMLA(N, M, A, S) (A + (N * M) * (S ? -1 : 1))
1476 DO_CMLA_FUNC(sve2_cmla_zzzz_b
, uint8_t, H1
, DO_CMLA
)
1477 DO_CMLA_FUNC(sve2_cmla_zzzz_h
, uint16_t, H2
, DO_CMLA
)
1478 DO_CMLA_FUNC(sve2_cmla_zzzz_s
, uint32_t, H4
, DO_CMLA
)
1479 DO_CMLA_FUNC(sve2_cmla_zzzz_d
, uint64_t, , DO_CMLA
)
1481 #define DO_SQRDMLAH_B(N, M, A, S) \
1482 do_sqrdmlah_b(N, M, A, S, true)
1483 #define DO_SQRDMLAH_H(N, M, A, S) \
1484 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, S, true, &discard); })
1485 #define DO_SQRDMLAH_S(N, M, A, S) \
1486 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, S, true, &discard); })
1487 #define DO_SQRDMLAH_D(N, M, A, S) \
1488 do_sqrdmlah_d(N, M, A, S, true)
1490 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_b
, int8_t, H1
, DO_SQRDMLAH_B
)
1491 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_h
, int16_t, H2
, DO_SQRDMLAH_H
)
1492 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_s
, int32_t, H4
, DO_SQRDMLAH_S
)
1493 DO_CMLA_FUNC(sve2_sqrdcmlah_zzzz_d
, int64_t, , DO_SQRDMLAH_D
)
1495 #define DO_CMLA_IDX_FUNC(NAME, TYPE, H, OP) \
1496 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1498 intptr_t i, j, oprsz = simd_oprsz(desc); \
1499 int rot = extract32(desc, SIMD_DATA_SHIFT, 2); \
1500 int idx = extract32(desc, SIMD_DATA_SHIFT + 2, 2) * 2; \
1501 int sel_a = rot & 1, sel_b = sel_a ^ 1; \
1502 bool sub_r = rot == 1 || rot == 2; \
1503 bool sub_i = rot >= 2; \
1504 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1505 for (i = 0; i < oprsz / sizeof(TYPE); i += 16 / sizeof(TYPE)) { \
1506 TYPE elt2_a = m[H(i + idx + sel_a)]; \
1507 TYPE elt2_b = m[H(i + idx + sel_b)]; \
1508 for (j = 0; j < 16 / sizeof(TYPE); j += 2) { \
1509 TYPE elt1_a = n[H(i + j + sel_a)]; \
1510 d[H2(i + j)] = OP(elt1_a, elt2_a, a[H(i + j)], sub_r); \
1511 d[H2(i + j + 1)] = OP(elt1_a, elt2_b, a[H(i + j + 1)], sub_i); \
1516 DO_CMLA_IDX_FUNC(sve2_cmla_idx_h
, int16_t, H2
, DO_CMLA
)
1517 DO_CMLA_IDX_FUNC(sve2_cmla_idx_s
, int32_t, H4
, DO_CMLA
)
1519 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_h
, int16_t, H2
, DO_SQRDMLAH_H
)
1520 DO_CMLA_IDX_FUNC(sve2_sqrdcmlah_idx_s
, int32_t, H4
, DO_SQRDMLAH_S
)
1524 #undef DO_CMLA_IDX_FUNC
1525 #undef DO_SQRDMLAH_B
1526 #undef DO_SQRDMLAH_H
1527 #undef DO_SQRDMLAH_S
1528 #undef DO_SQRDMLAH_D
/* Note N and M are 4 elements bundled into one unit. */
/*
 * Complex dot product step for one 32-bit lane.  N and M each hold two
 * (real, imaginary) pairs of signed bytes.  For each pair, the real byte
 * of N is multiplied by byte SEL_A of the M pair and the imaginary byte
 * of N by byte SEL_B of the M pair, the latter scaled by SUB_I (+1 or
 * -1 as passed by the callers).  Returns A plus the sum over both pairs.
 */
static int32_t do_cdot_s(uint32_t n, uint32_t m, int32_t a,
                         int sel_a, int sel_b, int sub_i)
{
    for (int i = 0; i <= 1; i++) {
        int32_t elt1_r = (int8_t)(n >> (16 * i));
        int32_t elt1_i = (int8_t)(n >> (16 * i + 8));
        int32_t elt2_a = (int8_t)(m >> (16 * i + 8 * sel_a));
        int32_t elt2_b = (int8_t)(m >> (16 * i + 8 * sel_b));

        a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
    }
    return a;
}

/*
 * Complex dot product step for one 64-bit lane.  Same scheme as the
 * 32-bit variant, with two (real, imaginary) pairs of signed 16-bit
 * values packed into N and M.  Returns A plus the sum over both pairs.
 */
static int64_t do_cdot_d(uint64_t n, uint64_t m, int64_t a,
                         int sel_a, int sel_b, int sub_i)
{
    for (int i = 0; i <= 1; i++) {
        int64_t elt1_r = (int16_t)(n >> (32 * i + 0));
        int64_t elt1_i = (int16_t)(n >> (32 * i + 16));
        int64_t elt2_a = (int16_t)(m >> (32 * i + 16 * sel_a));
        int64_t elt2_b = (int16_t)(m >> (32 * i + 16 * sel_b));

        a += elt1_r * elt2_a + elt1_i * elt2_b * sub_i;
    }
    return a;
}

1559 void HELPER(sve2_cdot_zzzz_s
)(void *vd
, void *vn
, void *vm
,
1560 void *va
, uint32_t desc
)
1562 int opr_sz
= simd_oprsz(desc
);
1563 int rot
= simd_data(desc
);
1564 int sel_a
= rot
& 1;
1565 int sel_b
= sel_a
^ 1;
1566 int sub_i
= (rot
== 0 || rot
== 3 ? -1 : 1);
1567 uint32_t *d
= vd
, *n
= vn
, *m
= vm
, *a
= va
;
1569 for (int e
= 0; e
< opr_sz
/ 4; e
++) {
1570 d
[e
] = do_cdot_s(n
[e
], m
[e
], a
[e
], sel_a
, sel_b
, sub_i
);
1574 void HELPER(sve2_cdot_zzzz_d
)(void *vd
, void *vn
, void *vm
,
1575 void *va
, uint32_t desc
)
1577 int opr_sz
= simd_oprsz(desc
);
1578 int rot
= simd_data(desc
);
1579 int sel_a
= rot
& 1;
1580 int sel_b
= sel_a
^ 1;
1581 int sub_i
= (rot
== 0 || rot
== 3 ? -1 : 1);
1582 uint64_t *d
= vd
, *n
= vn
, *m
= vm
, *a
= va
;
1584 for (int e
= 0; e
< opr_sz
/ 8; e
++) {
1585 d
[e
] = do_cdot_d(n
[e
], m
[e
], a
[e
], sel_a
, sel_b
, sub_i
);
1589 void HELPER(sve2_cdot_idx_s
)(void *vd
, void *vn
, void *vm
,
1590 void *va
, uint32_t desc
)
1592 int opr_sz
= simd_oprsz(desc
);
1593 int rot
= extract32(desc
, SIMD_DATA_SHIFT
, 2);
1594 int idx
= H4(extract32(desc
, SIMD_DATA_SHIFT
+ 2, 2));
1595 int sel_a
= rot
& 1;
1596 int sel_b
= sel_a
^ 1;
1597 int sub_i
= (rot
== 0 || rot
== 3 ? -1 : 1);
1598 uint32_t *d
= vd
, *n
= vn
, *m
= vm
, *a
= va
;
1600 for (int seg
= 0; seg
< opr_sz
/ 4; seg
+= 4) {
1601 uint32_t seg_m
= m
[seg
+ idx
];
1602 for (int e
= 0; e
< 4; e
++) {
1603 d
[seg
+ e
] = do_cdot_s(n
[seg
+ e
], seg_m
, a
[seg
+ e
],
1604 sel_a
, sel_b
, sub_i
);
1609 void HELPER(sve2_cdot_idx_d
)(void *vd
, void *vn
, void *vm
,
1610 void *va
, uint32_t desc
)
1612 int seg
, opr_sz
= simd_oprsz(desc
);
1613 int rot
= extract32(desc
, SIMD_DATA_SHIFT
, 2);
1614 int idx
= extract32(desc
, SIMD_DATA_SHIFT
+ 2, 2);
1615 int sel_a
= rot
& 1;
1616 int sel_b
= sel_a
^ 1;
1617 int sub_i
= (rot
== 0 || rot
== 3 ? -1 : 1);
1618 uint64_t *d
= vd
, *n
= vn
, *m
= vm
, *a
= va
;
1620 for (seg
= 0; seg
< opr_sz
/ 8; seg
+= 2) {
1621 uint64_t seg_m
= m
[seg
+ idx
];
1622 for (int e
= 0; e
< 2; e
++) {
1623 d
[seg
+ e
] = do_cdot_d(n
[seg
+ e
], seg_m
, a
[seg
+ e
],
1624 sel_a
, sel_b
, sub_i
);
1629 #define DO_ZZXZ(NAME, TYPE, H, OP) \
1630 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1632 intptr_t oprsz = simd_oprsz(desc), segment = 16 / sizeof(TYPE); \
1633 intptr_t i, j, idx = simd_data(desc); \
1634 TYPE *d = vd, *a = va, *n = vn, *m = (TYPE *)vm + H(idx); \
1635 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1637 for (j = 0; j < segment; j++) { \
1638 d[i + j] = OP(n[i + j], mm, a[i + j]); \
1643 #define DO_SQRDMLAH_H(N, M, A) \
1644 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, false, true, &discard); })
1645 #define DO_SQRDMLAH_S(N, M, A) \
1646 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, false, true, &discard); })
1647 #define DO_SQRDMLAH_D(N, M, A) do_sqrdmlah_d(N, M, A, false, true)
1649 DO_ZZXZ(sve2_sqrdmlah_idx_h
, int16_t, H2
, DO_SQRDMLAH_H
)
1650 DO_ZZXZ(sve2_sqrdmlah_idx_s
, int32_t, H4
, DO_SQRDMLAH_S
)
1651 DO_ZZXZ(sve2_sqrdmlah_idx_d
, int64_t, , DO_SQRDMLAH_D
)
1653 #define DO_SQRDMLSH_H(N, M, A) \
1654 ({ uint32_t discard; do_sqrdmlah_h(N, M, A, true, true, &discard); })
1655 #define DO_SQRDMLSH_S(N, M, A) \
1656 ({ uint32_t discard; do_sqrdmlah_s(N, M, A, true, true, &discard); })
1657 #define DO_SQRDMLSH_D(N, M, A) do_sqrdmlah_d(N, M, A, true, true)
1659 DO_ZZXZ(sve2_sqrdmlsh_idx_h
, int16_t, H2
, DO_SQRDMLSH_H
)
1660 DO_ZZXZ(sve2_sqrdmlsh_idx_s
, int32_t, H4
, DO_SQRDMLSH_S
)
1661 DO_ZZXZ(sve2_sqrdmlsh_idx_d
, int64_t, , DO_SQRDMLSH_D
)
1665 #define DO_ZZXW(NAME, TYPEW, TYPEN, HW, HN, OP) \
1666 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1668 intptr_t i, j, oprsz = simd_oprsz(desc); \
1669 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1670 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1671 for (i = 0; i < oprsz; i += 16) { \
1672 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
1673 for (j = 0; j < 16; j += sizeof(TYPEW)) { \
1674 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \
1675 TYPEW aa = *(TYPEW *)(va + HW(i + j)); \
1676 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm, aa); \
1681 #define DO_MLA(N, M, A) (A + N * M)
1683 DO_ZZXW(sve2_smlal_idx_s
, int32_t, int16_t, H1_4
, H1_2
, DO_MLA
)
1684 DO_ZZXW(sve2_smlal_idx_d
, int64_t, int32_t, , H1_4
, DO_MLA
)
1685 DO_ZZXW(sve2_umlal_idx_s
, uint32_t, uint16_t, H1_4
, H1_2
, DO_MLA
)
1686 DO_ZZXW(sve2_umlal_idx_d
, uint64_t, uint32_t, , H1_4
, DO_MLA
)
1688 #define DO_MLS(N, M, A) (A - N * M)
1690 DO_ZZXW(sve2_smlsl_idx_s
, int32_t, int16_t, H1_4
, H1_2
, DO_MLS
)
1691 DO_ZZXW(sve2_smlsl_idx_d
, int64_t, int32_t, , H1_4
, DO_MLS
)
1692 DO_ZZXW(sve2_umlsl_idx_s
, uint32_t, uint16_t, H1_4
, H1_2
, DO_MLS
)
1693 DO_ZZXW(sve2_umlsl_idx_d
, uint64_t, uint32_t, , H1_4
, DO_MLS
)
1695 #define DO_SQDMLAL_S(N, M, A) DO_SQADD_S(A, do_sqdmull_s(N, M))
1696 #define DO_SQDMLAL_D(N, M, A) do_sqadd_d(A, do_sqdmull_d(N, M))
1698 DO_ZZXW(sve2_sqdmlal_idx_s
, int32_t, int16_t, H1_4
, H1_2
, DO_SQDMLAL_S
)
1699 DO_ZZXW(sve2_sqdmlal_idx_d
, int64_t, int32_t, , H1_4
, DO_SQDMLAL_D
)
1701 #define DO_SQDMLSL_S(N, M, A) DO_SQSUB_S(A, do_sqdmull_s(N, M))
1702 #define DO_SQDMLSL_D(N, M, A) do_sqsub_d(A, do_sqdmull_d(N, M))
1704 DO_ZZXW(sve2_sqdmlsl_idx_s
, int32_t, int16_t, H1_4
, H1_2
, DO_SQDMLSL_S
)
1705 DO_ZZXW(sve2_sqdmlsl_idx_d
, int64_t, int32_t, , H1_4
, DO_SQDMLSL_D
)
1711 #define DO_ZZX(NAME, TYPEW, TYPEN, HW, HN, OP) \
1712 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1714 intptr_t i, j, oprsz = simd_oprsz(desc); \
1715 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1) * sizeof(TYPEN); \
1716 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 1, 3) * sizeof(TYPEN); \
1717 for (i = 0; i < oprsz; i += 16) { \
1718 TYPEW mm = *(TYPEN *)(vm + HN(i + idx)); \
1719 for (j = 0; j < 16; j += sizeof(TYPEW)) { \
1720 TYPEW nn = *(TYPEN *)(vn + HN(i + j + sel)); \
1721 *(TYPEW *)(vd + HW(i + j)) = OP(nn, mm); \
1726 DO_ZZX(sve2_sqdmull_idx_s
, int32_t, int16_t, H1_4
, H1_2
, do_sqdmull_s
)
1727 DO_ZZX(sve2_sqdmull_idx_d
, int64_t, int32_t, , H1_4
, do_sqdmull_d
)
1729 DO_ZZX(sve2_smull_idx_s
, int32_t, int16_t, H1_4
, H1_2
, DO_MUL
)
1730 DO_ZZX(sve2_smull_idx_d
, int64_t, int32_t, , H1_4
, DO_MUL
)
1732 DO_ZZX(sve2_umull_idx_s
, uint32_t, uint16_t, H1_4
, H1_2
, DO_MUL
)
1733 DO_ZZX(sve2_umull_idx_d
, uint64_t, uint32_t, , H1_4
, DO_MUL
)
/*
 * Bit-permute expander, unpredicated: each element of D is
 * OP(N element, M element, element width in bits).
 */
#define DO_BITPERM(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{                                                              \
    intptr_t i, opr_sz = simd_oprsz(desc);                     \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {               \
        TYPE nn = *(TYPE *)(vn + i);                           \
        TYPE mm = *(TYPE *)(vm + i);                           \
        *(TYPE *)(vd + i) = OP(nn, mm, sizeof(TYPE) * 8);      \
    }                                                          \
}

/*
 * Gather the bits of DATA selected by MASK into the low bits of the
 * result, preserving their order.  Only the low N bits of DATA and MASK
 * are examined; unused high result bits are zero.
 */
static uint64_t bitextract(uint64_t data, uint64_t mask, int n)
{
    uint64_t res = 0;
    int db, rb = 0;

    for (db = 0; db < n; ++db) {
        if ((mask >> db) & 1) {
            res |= ((data >> db) & 1) << rb;
            ++rb;
        }
    }
    return res;
}

1762 DO_BITPERM(sve2_bext_b
, uint8_t, bitextract
)
1763 DO_BITPERM(sve2_bext_h
, uint16_t, bitextract
)
1764 DO_BITPERM(sve2_bext_s
, uint32_t, bitextract
)
1765 DO_BITPERM(sve2_bext_d
, uint64_t, bitextract
)
/*
 * Scatter the low bits of DATA, in order, to the bit positions set in
 * MASK.  Only the low N bits of MASK are examined; result bits whose
 * mask bit is clear are zero.
 */
static uint64_t bitdeposit(uint64_t data, uint64_t mask, int n)
{
    uint64_t res = 0;
    int rb, db = 0;

    for (rb = 0; rb < n; ++rb) {
        if ((mask >> rb) & 1) {
            res |= ((data >> db) & 1) << rb;
            ++db;
        }
    }
    return res;
}

1781 DO_BITPERM(sve2_bdep_b
, uint8_t, bitdeposit
)
1782 DO_BITPERM(sve2_bdep_h
, uint16_t, bitdeposit
)
1783 DO_BITPERM(sve2_bdep_s
, uint32_t, bitdeposit
)
1784 DO_BITPERM(sve2_bdep_d
, uint64_t, bitdeposit
)
/*
 * Partition the low N bits of DATA by MASK: the bits whose mask bit is
 * set are packed, in order, into the low end of the result, and the bits
 * whose mask bit is clear are packed, in order, immediately above them.
 */
static uint64_t bitgroup(uint64_t data, uint64_t mask, int n)
{
    uint64_t resm = 0, resu = 0;
    int db, rbm = 0, rbu = 0;

    for (db = 0; db < n; ++db) {
        uint64_t val = (data >> db) & 1;
        if ((mask >> db) & 1) {
            resm |= val << rbm++;
        } else {
            resu |= val << rbu++;
        }
    }

    /*
     * When all 64 bits were selected rbm is 64 and resu is necessarily 0;
     * skip the shift, since shifting a 64-bit value by 64 is undefined.
     */
    return rbm < 64 ? resm | (resu << rbm) : resm;
}

1803 DO_BITPERM(sve2_bgrp_b
, uint8_t, bitgroup
)
1804 DO_BITPERM(sve2_bgrp_h
, uint16_t, bitgroup
)
1805 DO_BITPERM(sve2_bgrp_s
, uint32_t, bitgroup
)
1806 DO_BITPERM(sve2_bgrp_d
, uint64_t, bitgroup
)
/*
 * Expander for helpers that treat the vectors as (first, second)
 * element pairs and combine each element of N with the opposite
 * element of the matching pair of M.
 *
 * If simd_data(desc) is non-zero:
 *     d.first  = ADD_OP(n.first,  m.second)
 *     d.second = SUB_OP(n.second, m.first)
 * otherwise ADD_OP and SUB_OP are exchanged.
 * All inputs of a pair are loaded before either result is stored.
 */
#define DO_CADD(NAME, TYPE, H, ADD_OP, SUB_OP)                  \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    int sub_r = simd_data(desc);                                \
    if (sub_r) {                                                \
        for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
            TYPE acc_r = *(TYPE *)(vn + H(i));                  \
            TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
            TYPE el2_r = *(TYPE *)(vm + H(i));                  \
            TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
            acc_r = ADD_OP(acc_r, el2_i);                       \
            acc_i = SUB_OP(acc_i, el2_r);                       \
            *(TYPE *)(vd + H(i)) = acc_r;                       \
            *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
        }                                                       \
    } else {                                                    \
        for (i = 0; i < opr_sz; i += 2 * sizeof(TYPE)) {        \
            TYPE acc_r = *(TYPE *)(vn + H(i));                  \
            TYPE acc_i = *(TYPE *)(vn + H(i + sizeof(TYPE)));   \
            TYPE el2_r = *(TYPE *)(vm + H(i));                  \
            TYPE el2_i = *(TYPE *)(vm + H(i + sizeof(TYPE)));   \
            acc_r = SUB_OP(acc_r, el2_i);                       \
            acc_i = ADD_OP(acc_i, el2_r);                       \
            *(TYPE *)(vd + H(i)) = acc_r;                       \
            *(TYPE *)(vd + H(i + sizeof(TYPE))) = acc_i;        \
        }                                                       \
    }                                                           \
}
1840 DO_CADD(sve2_cadd_b
, int8_t, H1
, DO_ADD
, DO_SUB
)
1841 DO_CADD(sve2_cadd_h
, int16_t, H1_2
, DO_ADD
, DO_SUB
)
1842 DO_CADD(sve2_cadd_s
, int32_t, H1_4
, DO_ADD
, DO_SUB
)
1843 DO_CADD(sve2_cadd_d
, int64_t, , DO_ADD
, DO_SUB
)
1845 DO_CADD(sve2_sqcadd_b
, int8_t, H1
, DO_SQADD_B
, DO_SQSUB_B
)
1846 DO_CADD(sve2_sqcadd_h
, int16_t, H1_2
, DO_SQADD_H
, DO_SQSUB_H
)
1847 DO_CADD(sve2_sqcadd_s
, int32_t, H1_4
, DO_SQADD_S
, DO_SQSUB_S
)
1848 DO_CADD(sve2_sqcadd_d
, int64_t, , do_sqadd_d
, do_sqsub_d
)
/*
 * Expander for widening shift-left helpers.
 *
 * simd_data(desc) packs two fields: bit 0 selects which narrow (TYPEN)
 * element of each wide (TYPEW) slot is read, as a byte offset of 0 or
 * sizeof(TYPEN); the remaining bits are the shift count.  The narrow
 * value is converted to TYPEW before shifting, so the signedness of
 * the two types decides between sign- and zero-extension.
 */
#define DO_ZZI_SHLL(NAME, TYPEW, TYPEN, HW, HN)                 \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)            \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    intptr_t sel = (simd_data(desc) & 1) * sizeof(TYPEN);       \
    int shift = simd_data(desc) >> 1;                           \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {               \
        TYPEW nn = *(TYPEN *)(vn + HN(i + sel));                \
        *(TYPEW *)(vd + HW(i)) = nn << shift;                   \
    }                                                           \
}
1864 DO_ZZI_SHLL(sve2_sshll_h
, int16_t, int8_t, H1_2
, H1
)
1865 DO_ZZI_SHLL(sve2_sshll_s
, int32_t, int16_t, H1_4
, H1_2
)
1866 DO_ZZI_SHLL(sve2_sshll_d
, int64_t, int32_t, , H1_4
)
1868 DO_ZZI_SHLL(sve2_ushll_h
, uint16_t, uint8_t, H1_2
, H1
)
1869 DO_ZZI_SHLL(sve2_ushll_s
, uint32_t, uint16_t, H1_4
, H1_2
)
1870 DO_ZZI_SHLL(sve2_ushll_d
, uint64_t, uint32_t, , H1_4
)
/* Two-operand reduction expander, controlled by a predicate.
 * The difference between TYPERED and TYPERET has to do with
 * sign-extension.  E.g. for SMAX, TYPERED must be signed,
 * but TYPERET must be unsigned so that e.g. a 32-bit value
 * is not sign-extended to the ABI uint64_t return type.
 */
/* ??? If we were to vectorize this by hand the reduction ordering
 * would change.  For integer operands, this is perfectly fine.
 */
/*
 * The predicate is consumed 16 bits at a time, one bit per vector
 * byte: after each element the predicate word is shifted by the
 * element size, and a fresh word is loaded every 16 bytes of vector.
 * Only elements whose (lowest) predicate bit is set are folded into
 * the accumulator, which starts at INIT.
 */
#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
{                                                          \
    intptr_t i, opr_sz = simd_oprsz(desc);                 \
    TYPERED ret = INIT;                                    \
    for (i = 0; i < opr_sz; ) {                            \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));    \
        do {                                               \
            if (pg & 1) {                                  \
                TYPEELT nn = *(TYPEELT *)(vn + H(i));      \
                ret = OP(ret, nn);                         \
            }                                              \
            i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT);  \
        } while (i & 15);                                  \
    }                                                      \
    return (TYPERET)ret;                                   \
}
/*
 * Predicated reduction expander specialized for 64-bit elements.
 * Each element is governed by bit 0 of its own predicate byte; the
 * accumulator starts at INIT and is returned unchanged when no
 * element is active.
 */
#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP)             \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc)   \
{                                                          \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;             \
    TYPEE *n = vn;                                         \
    uint8_t *pg = vg;                                      \
    TYPER ret = INIT;                                      \
    for (i = 0; i < opr_sz; i += 1) {                      \
        if (pg[H1(i)] & 1) {                               \
            TYPEE nn = n[i];                               \
            ret = OP(ret, nn);                             \
        }                                                  \
    }                                                      \
    return ret;                                            \
}
1917 DO_VPZ(sve_orv_b
, uint8_t, uint8_t, uint8_t, H1
, 0, DO_ORR
)
1918 DO_VPZ(sve_orv_h
, uint16_t, uint16_t, uint16_t, H1_2
, 0, DO_ORR
)
1919 DO_VPZ(sve_orv_s
, uint32_t, uint32_t, uint32_t, H1_4
, 0, DO_ORR
)
1920 DO_VPZ_D(sve_orv_d
, uint64_t, uint64_t, 0, DO_ORR
)
1922 DO_VPZ(sve_eorv_b
, uint8_t, uint8_t, uint8_t, H1
, 0, DO_EOR
)
1923 DO_VPZ(sve_eorv_h
, uint16_t, uint16_t, uint16_t, H1_2
, 0, DO_EOR
)
1924 DO_VPZ(sve_eorv_s
, uint32_t, uint32_t, uint32_t, H1_4
, 0, DO_EOR
)
1925 DO_VPZ_D(sve_eorv_d
, uint64_t, uint64_t, 0, DO_EOR
)
1927 DO_VPZ(sve_andv_b
, uint8_t, uint8_t, uint8_t, H1
, -1, DO_AND
)
1928 DO_VPZ(sve_andv_h
, uint16_t, uint16_t, uint16_t, H1_2
, -1, DO_AND
)
1929 DO_VPZ(sve_andv_s
, uint32_t, uint32_t, uint32_t, H1_4
, -1, DO_AND
)
1930 DO_VPZ_D(sve_andv_d
, uint64_t, uint64_t, -1, DO_AND
)
1932 DO_VPZ(sve_saddv_b
, int8_t, uint64_t, uint64_t, H1
, 0, DO_ADD
)
1933 DO_VPZ(sve_saddv_h
, int16_t, uint64_t, uint64_t, H1_2
, 0, DO_ADD
)
1934 DO_VPZ(sve_saddv_s
, int32_t, uint64_t, uint64_t, H1_4
, 0, DO_ADD
)
1936 DO_VPZ(sve_uaddv_b
, uint8_t, uint64_t, uint64_t, H1
, 0, DO_ADD
)
1937 DO_VPZ(sve_uaddv_h
, uint16_t, uint64_t, uint64_t, H1_2
, 0, DO_ADD
)
1938 DO_VPZ(sve_uaddv_s
, uint32_t, uint64_t, uint64_t, H1_4
, 0, DO_ADD
)
1939 DO_VPZ_D(sve_uaddv_d
, uint64_t, uint64_t, 0, DO_ADD
)
1941 DO_VPZ(sve_smaxv_b
, int8_t, int8_t, uint8_t, H1
, INT8_MIN
, DO_MAX
)
1942 DO_VPZ(sve_smaxv_h
, int16_t, int16_t, uint16_t, H1_2
, INT16_MIN
, DO_MAX
)
1943 DO_VPZ(sve_smaxv_s
, int32_t, int32_t, uint32_t, H1_4
, INT32_MIN
, DO_MAX
)
1944 DO_VPZ_D(sve_smaxv_d
, int64_t, int64_t, INT64_MIN
, DO_MAX
)
1946 DO_VPZ(sve_umaxv_b
, uint8_t, uint8_t, uint8_t, H1
, 0, DO_MAX
)
1947 DO_VPZ(sve_umaxv_h
, uint16_t, uint16_t, uint16_t, H1_2
, 0, DO_MAX
)
1948 DO_VPZ(sve_umaxv_s
, uint32_t, uint32_t, uint32_t, H1_4
, 0, DO_MAX
)
1949 DO_VPZ_D(sve_umaxv_d
, uint64_t, uint64_t, 0, DO_MAX
)
1951 DO_VPZ(sve_sminv_b
, int8_t, int8_t, uint8_t, H1
, INT8_MAX
, DO_MIN
)
1952 DO_VPZ(sve_sminv_h
, int16_t, int16_t, uint16_t, H1_2
, INT16_MAX
, DO_MIN
)
1953 DO_VPZ(sve_sminv_s
, int32_t, int32_t, uint32_t, H1_4
, INT32_MAX
, DO_MIN
)
1954 DO_VPZ_D(sve_sminv_d
, int64_t, int64_t, INT64_MAX
, DO_MIN
)
1956 DO_VPZ(sve_uminv_b
, uint8_t, uint8_t, uint8_t, H1
, -1, DO_MIN
)
1957 DO_VPZ(sve_uminv_h
, uint16_t, uint16_t, uint16_t, H1_2
, -1, DO_MIN
)
1958 DO_VPZ(sve_uminv_s
, uint32_t, uint32_t, uint32_t, H1_4
, -1, DO_MIN
)
1959 DO_VPZ_D(sve_uminv_d
, uint64_t, uint64_t, -1, DO_MIN
)
/* Two vector operand, one scalar operand, unpredicated.
 * The 64-bit scalar is truncated to TYPE once, then OP(element, scalar)
 * is applied to every element of the vector.
 */
#define DO_ZZI(NAME, TYPE, OP)                                       \
void HELPER(NAME)(void *vd, void *vn, uint64_t s64, uint32_t desc)   \
{                                                                    \
    intptr_t i, opr_sz = simd_oprsz(desc) / sizeof(TYPE);            \
    TYPE s = s64, *d = vd, *n = vn;                                  \
    for (i = 0; i < opr_sz; ++i) {                                   \
        d[i] = OP(n[i], s);                                          \
    }                                                                \
}
/* Reversed subtraction: second operand minus first.  Arguments are
 * parenthesized so that compound expressions keep their grouping.
 */
#define DO_SUBR(X, Y)   ((Y) - (X))
1977 DO_ZZI(sve_subri_b
, uint8_t, DO_SUBR
)
1978 DO_ZZI(sve_subri_h
, uint16_t, DO_SUBR
)
1979 DO_ZZI(sve_subri_s
, uint32_t, DO_SUBR
)
1980 DO_ZZI(sve_subri_d
, uint64_t, DO_SUBR
)
1982 DO_ZZI(sve_smaxi_b
, int8_t, DO_MAX
)
1983 DO_ZZI(sve_smaxi_h
, int16_t, DO_MAX
)
1984 DO_ZZI(sve_smaxi_s
, int32_t, DO_MAX
)
1985 DO_ZZI(sve_smaxi_d
, int64_t, DO_MAX
)
1987 DO_ZZI(sve_smini_b
, int8_t, DO_MIN
)
1988 DO_ZZI(sve_smini_h
, int16_t, DO_MIN
)
1989 DO_ZZI(sve_smini_s
, int32_t, DO_MIN
)
1990 DO_ZZI(sve_smini_d
, int64_t, DO_MIN
)
1992 DO_ZZI(sve_umaxi_b
, uint8_t, DO_MAX
)
1993 DO_ZZI(sve_umaxi_h
, uint16_t, DO_MAX
)
1994 DO_ZZI(sve_umaxi_s
, uint32_t, DO_MAX
)
1995 DO_ZZI(sve_umaxi_d
, uint64_t, DO_MAX
)
1997 DO_ZZI(sve_umini_b
, uint8_t, DO_MIN
)
1998 DO_ZZI(sve_umini_h
, uint16_t, DO_MIN
)
1999 DO_ZZI(sve_umini_s
, uint32_t, DO_MIN
)
2000 DO_ZZI(sve_umini_d
, uint64_t, DO_MIN
)
2020 /* Similar to the ARM LastActiveElement pseudocode function, except the
2021 result is multiplied by the element size. This includes the not found
2022 indication; e.g. not found for esz=3 is -8. */
2023 static intptr_t last_active_element(uint64_t *g
, intptr_t words
, intptr_t esz
)
2025 uint64_t mask
= pred_esz_masks
[esz
];
2029 uint64_t this_g
= g
[--i
] & mask
;
2031 return i
* 64 + (63 - clz64(this_g
));
2034 return (intptr_t)-1 << esz
;
2037 uint32_t HELPER(sve_pfirst
)(void *vd
, void *vg
, uint32_t pred_desc
)
2039 intptr_t words
= DIV_ROUND_UP(FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
), 8);
2040 uint32_t flags
= PREDTEST_INIT
;
2041 uint64_t *d
= vd
, *g
= vg
;
2045 uint64_t this_d
= d
[i
];
2046 uint64_t this_g
= g
[i
];
2050 /* Set in D the first bit of G. */
2051 this_d
|= this_g
& -this_g
;
2054 flags
= iter_predtest_fwd(this_d
, this_g
, flags
);
2056 } while (++i
< words
);
2061 uint32_t HELPER(sve_pnext
)(void *vd
, void *vg
, uint32_t pred_desc
)
2063 intptr_t words
= DIV_ROUND_UP(FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
), 8);
2064 intptr_t esz
= FIELD_EX32(pred_desc
, PREDDESC
, ESZ
);
2065 uint32_t flags
= PREDTEST_INIT
;
2066 uint64_t *d
= vd
, *g
= vg
, esz_mask
;
2069 next
= last_active_element(vd
, words
, esz
) + (1 << esz
);
2070 esz_mask
= pred_esz_masks
[esz
];
2072 /* Similar to the pseudocode for pnext, but scaled by ESZ
2073 so that we find the correct bit. */
2074 if (next
< words
* 64) {
2078 mask
= ~((1ull << (next
& 63)) - 1);
2082 uint64_t this_g
= g
[next
/ 64] & esz_mask
& mask
;
2084 next
= (next
& -64) + ctz64(this_g
);
2089 } while (next
< words
* 64);
2094 uint64_t this_d
= 0;
2095 if (i
== next
/ 64) {
2096 this_d
= 1ull << (next
& 63);
2099 flags
= iter_predtest_fwd(this_d
, g
[i
] & esz_mask
, flags
);
2100 } while (++i
< words
);
2106 * Copy Zn into Zd, and store zero into inactive elements.
2107 * If inv, store zeros into the active elements.
2109 void HELPER(sve_movz_b
)(void *vd
, void *vn
, void *vg
, uint32_t desc
)
2111 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2112 uint64_t inv
= -(uint64_t)(simd_data(desc
) & 1);
2113 uint64_t *d
= vd
, *n
= vn
;
2116 for (i
= 0; i
< opr_sz
; i
+= 1) {
2117 d
[i
] = n
[i
] & (expand_pred_b(pg
[H1(i
)]) ^ inv
);
2121 void HELPER(sve_movz_h
)(void *vd
, void *vn
, void *vg
, uint32_t desc
)
2123 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2124 uint64_t inv
= -(uint64_t)(simd_data(desc
) & 1);
2125 uint64_t *d
= vd
, *n
= vn
;
2128 for (i
= 0; i
< opr_sz
; i
+= 1) {
2129 d
[i
] = n
[i
] & (expand_pred_h(pg
[H1(i
)]) ^ inv
);
2133 void HELPER(sve_movz_s
)(void *vd
, void *vn
, void *vg
, uint32_t desc
)
2135 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2136 uint64_t inv
= -(uint64_t)(simd_data(desc
) & 1);
2137 uint64_t *d
= vd
, *n
= vn
;
2140 for (i
= 0; i
< opr_sz
; i
+= 1) {
2141 d
[i
] = n
[i
] & (expand_pred_s(pg
[H1(i
)]) ^ inv
);
2145 void HELPER(sve_movz_d
)(void *vd
, void *vn
, void *vg
, uint32_t desc
)
2147 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2148 uint64_t *d
= vd
, *n
= vn
;
2150 uint8_t inv
= simd_data(desc
);
2152 for (i
= 0; i
< opr_sz
; i
+= 1) {
2153 d
[i
] = n
[i
] & -(uint64_t)((pg
[H1(i
)] ^ inv
) & 1);
/* Three-operand expander, immediate operand, controlled by a predicate.
 * The immediate is simd_data(desc) converted to TYPE.  Only elements
 * whose predicate bit is set are written; the others keep whatever
 * VD already holds.  The predicate is read 16 bits at a time and
 * shifted by the element size after each element.
 */
#define DO_ZPZI(NAME, TYPE, H, OP)                              \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    TYPE imm = simd_data(desc);                                 \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            if (pg & 1) {                                       \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn, imm);             \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 15);                                       \
    }                                                           \
}
/* Similarly, specialized for 64-bit operands.
 * Each element is governed by bit 0 of its own predicate byte.
 */
#define DO_ZPZI_D(NAME, TYPE, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *n = vn;                                      \
    TYPE imm = simd_data(desc);                                 \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 1) {                           \
        if (pg[H1(i)] & 1) {                                    \
            TYPE nn = n[i];                                     \
            d[i] = OP(nn, imm);                                 \
        }                                                       \
    }                                                           \
}
/* Plain shifts.  Whether DO_SHR is arithmetic or logical follows from
 * the type of N.  Arguments are parenthesized because shift operators
 * bind more tightly than the bitwise and conditional operators.
 */
#define DO_SHR(N, M)  ((N) >> (M))
#define DO_SHL(N, M)  ((N) << (M))
/* Arithmetic shift right for division.  This rounds negative numbers
   toward zero as per signed division.  Therefore before shifting,
   when N is negative, add 2**M-1.
   M must be smaller than the width of N after integer promotion.
   N and M are each evaluated more than once.  */
#define DO_ASRD(N, M) \
    (((N) + ((N) < 0 ? ((__typeof(N))1 << (M)) - 1 : 0)) >> (M))
/*
 * Unsigned rounding shift right: X / 2**SH rounded to nearest, with
 * halves rounded up.  The rounding increment is bit SH-1 of X.
 *
 * SH == 0 returns X unchanged (there is no rounding bit).
 * SH == 64 leaves only the rounding bit, i.e. bit 63 of X.
 * SH > 64 yields 0.
 */
static inline uint64_t do_urshr(uint64_t x, unsigned sh)
{
    if (sh == 0) {
        /* Avoid computing x >> (sh - 1) with a wrapped shift count. */
        return x;
    }
    if (sh < 64) {
        return (x >> sh) + ((x >> (sh - 1)) & 1);
    } else if (sh == 64) {
        return x >> 63;
    } else {
        return 0;
    }
}
/*
 * Signed rounding shift right: X / 2**SH rounded to nearest, with
 * halves rounded toward positive infinity.  The rounding increment is
 * bit SH-1 of X.
 *
 * SH == 0 returns X unchanged (there is no rounding bit).
 * SH >= 64 yields 0.
 */
static inline int64_t do_srshr(int64_t x, unsigned sh)
{
    if (sh == 0) {
        /* Avoid computing x >> (sh - 1) with a wrapped shift count. */
        return x;
    }
    if (sh < 64) {
        return (x >> sh) + ((x >> (sh - 1)) & 1);
    } else {
        /* Rounding the sign bit always produces 0. */
        return 0;
    }
}
2221 DO_ZPZI(sve_asr_zpzi_b
, int8_t, H1
, DO_SHR
)
2222 DO_ZPZI(sve_asr_zpzi_h
, int16_t, H1_2
, DO_SHR
)
2223 DO_ZPZI(sve_asr_zpzi_s
, int32_t, H1_4
, DO_SHR
)
2224 DO_ZPZI_D(sve_asr_zpzi_d
, int64_t, DO_SHR
)
2226 DO_ZPZI(sve_lsr_zpzi_b
, uint8_t, H1
, DO_SHR
)
2227 DO_ZPZI(sve_lsr_zpzi_h
, uint16_t, H1_2
, DO_SHR
)
2228 DO_ZPZI(sve_lsr_zpzi_s
, uint32_t, H1_4
, DO_SHR
)
2229 DO_ZPZI_D(sve_lsr_zpzi_d
, uint64_t, DO_SHR
)
2231 DO_ZPZI(sve_lsl_zpzi_b
, uint8_t, H1
, DO_SHL
)
2232 DO_ZPZI(sve_lsl_zpzi_h
, uint16_t, H1_2
, DO_SHL
)
2233 DO_ZPZI(sve_lsl_zpzi_s
, uint32_t, H1_4
, DO_SHL
)
2234 DO_ZPZI_D(sve_lsl_zpzi_d
, uint64_t, DO_SHL
)
2236 DO_ZPZI(sve_asrd_b
, int8_t, H1
, DO_ASRD
)
2237 DO_ZPZI(sve_asrd_h
, int16_t, H1_2
, DO_ASRD
)
2238 DO_ZPZI(sve_asrd_s
, int32_t, H1_4
, DO_ASRD
)
2239 DO_ZPZI_D(sve_asrd_d
, int64_t, DO_ASRD
)
/*
 * Expander for narrowing shift helpers that write the whole wide slot.
 * Each TYPEW element of VN is passed to OP together with the shift
 * count from simd_data(desc); the result is converted to TYPEN and
 * stored back as a full TYPEW element of VD.
 */
#define DO_SHRNB(NAME, TYPEW, TYPEN, OP)                     \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)         \
{                                                            \
    intptr_t i, opr_sz = simd_oprsz(desc);                   \
    int shift = simd_data(desc);                             \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {            \
        TYPEW nn = *(TYPEW *)(vn + i);                       \
        *(TYPEW *)(vd + i) = (TYPEN)OP(nn, shift);           \
    }                                                        \
}
/*
 * Expander for narrowing shift helpers that write only one narrow
 * element per wide slot.  Each TYPEW element of VN is passed to OP
 * together with the shift count from simd_data(desc); the result is
 * stored as a TYPEN at byte offset sizeof(TYPEN) within the matching
 * slot of VD.  The other half of the slot is not written.
 */
#define DO_SHRNT(NAME, TYPEW, TYPEN, HW, HN, OP)                  \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)              \
{                                                                 \
    intptr_t i, opr_sz = simd_oprsz(desc);                        \
    int shift = simd_data(desc);                                  \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                 \
        TYPEW nn = *(TYPEW *)(vn + HW(i));                        \
        *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, shift);   \
    }                                                             \
}
2267 DO_SHRNB(sve2_shrnb_h
, uint16_t, uint8_t, DO_SHR
)
2268 DO_SHRNB(sve2_shrnb_s
, uint32_t, uint16_t, DO_SHR
)
2269 DO_SHRNB(sve2_shrnb_d
, uint64_t, uint32_t, DO_SHR
)
2271 DO_SHRNT(sve2_shrnt_h
, uint16_t, uint8_t, H1_2
, H1
, DO_SHR
)
2272 DO_SHRNT(sve2_shrnt_s
, uint32_t, uint16_t, H1_4
, H1_2
, DO_SHR
)
2273 DO_SHRNT(sve2_shrnt_d
, uint64_t, uint32_t, , H1_4
, DO_SHR
)
2275 DO_SHRNB(sve2_rshrnb_h
, uint16_t, uint8_t, do_urshr
)
2276 DO_SHRNB(sve2_rshrnb_s
, uint32_t, uint16_t, do_urshr
)
2277 DO_SHRNB(sve2_rshrnb_d
, uint64_t, uint32_t, do_urshr
)
2279 DO_SHRNT(sve2_rshrnt_h
, uint16_t, uint8_t, H1_2
, H1
, do_urshr
)
2280 DO_SHRNT(sve2_rshrnt_s
, uint32_t, uint16_t, H1_4
, H1_2
, do_urshr
)
2281 DO_SHRNT(sve2_rshrnt_d
, uint64_t, uint32_t, , H1_4
, do_urshr
)
2283 #define DO_SQSHRUN_H(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT8_MAX)
2284 #define DO_SQSHRUN_S(x, sh) do_sat_bhs((int64_t)(x) >> sh, 0, UINT16_MAX)
2285 #define DO_SQSHRUN_D(x, sh) \
2286 do_sat_bhs((int64_t)(x) >> (sh < 64 ? sh : 63), 0, UINT32_MAX)
2288 DO_SHRNB(sve2_sqshrunb_h
, int16_t, uint8_t, DO_SQSHRUN_H
)
2289 DO_SHRNB(sve2_sqshrunb_s
, int32_t, uint16_t, DO_SQSHRUN_S
)
2290 DO_SHRNB(sve2_sqshrunb_d
, int64_t, uint32_t, DO_SQSHRUN_D
)
2292 DO_SHRNT(sve2_sqshrunt_h
, int16_t, uint8_t, H1_2
, H1
, DO_SQSHRUN_H
)
2293 DO_SHRNT(sve2_sqshrunt_s
, int32_t, uint16_t, H1_4
, H1_2
, DO_SQSHRUN_S
)
2294 DO_SHRNT(sve2_sqshrunt_d
, int64_t, uint32_t, , H1_4
, DO_SQSHRUN_D
)
2296 #define DO_SQRSHRUN_H(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT8_MAX)
2297 #define DO_SQRSHRUN_S(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT16_MAX)
2298 #define DO_SQRSHRUN_D(x, sh) do_sat_bhs(do_srshr(x, sh), 0, UINT32_MAX)
2300 DO_SHRNB(sve2_sqrshrunb_h
, int16_t, uint8_t, DO_SQRSHRUN_H
)
2301 DO_SHRNB(sve2_sqrshrunb_s
, int32_t, uint16_t, DO_SQRSHRUN_S
)
2302 DO_SHRNB(sve2_sqrshrunb_d
, int64_t, uint32_t, DO_SQRSHRUN_D
)
2304 DO_SHRNT(sve2_sqrshrunt_h
, int16_t, uint8_t, H1_2
, H1
, DO_SQRSHRUN_H
)
2305 DO_SHRNT(sve2_sqrshrunt_s
, int32_t, uint16_t, H1_4
, H1_2
, DO_SQRSHRUN_S
)
2306 DO_SHRNT(sve2_sqrshrunt_d
, int64_t, uint32_t, , H1_4
, DO_SQRSHRUN_D
)
2308 #define DO_SQSHRN_H(x, sh) do_sat_bhs(x >> sh, INT8_MIN, INT8_MAX)
2309 #define DO_SQSHRN_S(x, sh) do_sat_bhs(x >> sh, INT16_MIN, INT16_MAX)
2310 #define DO_SQSHRN_D(x, sh) do_sat_bhs(x >> sh, INT32_MIN, INT32_MAX)
2312 DO_SHRNB(sve2_sqshrnb_h
, int16_t, uint8_t, DO_SQSHRN_H
)
2313 DO_SHRNB(sve2_sqshrnb_s
, int32_t, uint16_t, DO_SQSHRN_S
)
2314 DO_SHRNB(sve2_sqshrnb_d
, int64_t, uint32_t, DO_SQSHRN_D
)
2316 DO_SHRNT(sve2_sqshrnt_h
, int16_t, uint8_t, H1_2
, H1
, DO_SQSHRN_H
)
2317 DO_SHRNT(sve2_sqshrnt_s
, int32_t, uint16_t, H1_4
, H1_2
, DO_SQSHRN_S
)
2318 DO_SHRNT(sve2_sqshrnt_d
, int64_t, uint32_t, , H1_4
, DO_SQSHRN_D
)
2320 #define DO_SQRSHRN_H(x, sh) do_sat_bhs(do_srshr(x, sh), INT8_MIN, INT8_MAX)
2321 #define DO_SQRSHRN_S(x, sh) do_sat_bhs(do_srshr(x, sh), INT16_MIN, INT16_MAX)
2322 #define DO_SQRSHRN_D(x, sh) do_sat_bhs(do_srshr(x, sh), INT32_MIN, INT32_MAX)
2324 DO_SHRNB(sve2_sqrshrnb_h
, int16_t, uint8_t, DO_SQRSHRN_H
)
2325 DO_SHRNB(sve2_sqrshrnb_s
, int32_t, uint16_t, DO_SQRSHRN_S
)
2326 DO_SHRNB(sve2_sqrshrnb_d
, int64_t, uint32_t, DO_SQRSHRN_D
)
2328 DO_SHRNT(sve2_sqrshrnt_h
, int16_t, uint8_t, H1_2
, H1
, DO_SQRSHRN_H
)
2329 DO_SHRNT(sve2_sqrshrnt_s
, int32_t, uint16_t, H1_4
, H1_2
, DO_SQRSHRN_S
)
2330 DO_SHRNT(sve2_sqrshrnt_d
, int64_t, uint32_t, , H1_4
, DO_SQRSHRN_D
)
2332 #define DO_UQSHRN_H(x, sh) MIN(x >> sh, UINT8_MAX)
2333 #define DO_UQSHRN_S(x, sh) MIN(x >> sh, UINT16_MAX)
2334 #define DO_UQSHRN_D(x, sh) MIN(x >> sh, UINT32_MAX)
2336 DO_SHRNB(sve2_uqshrnb_h
, uint16_t, uint8_t, DO_UQSHRN_H
)
2337 DO_SHRNB(sve2_uqshrnb_s
, uint32_t, uint16_t, DO_UQSHRN_S
)
2338 DO_SHRNB(sve2_uqshrnb_d
, uint64_t, uint32_t, DO_UQSHRN_D
)
2340 DO_SHRNT(sve2_uqshrnt_h
, uint16_t, uint8_t, H1_2
, H1
, DO_UQSHRN_H
)
2341 DO_SHRNT(sve2_uqshrnt_s
, uint32_t, uint16_t, H1_4
, H1_2
, DO_UQSHRN_S
)
2342 DO_SHRNT(sve2_uqshrnt_d
, uint64_t, uint32_t, , H1_4
, DO_UQSHRN_D
)
2344 #define DO_UQRSHRN_H(x, sh) MIN(do_urshr(x, sh), UINT8_MAX)
2345 #define DO_UQRSHRN_S(x, sh) MIN(do_urshr(x, sh), UINT16_MAX)
2346 #define DO_UQRSHRN_D(x, sh) MIN(do_urshr(x, sh), UINT32_MAX)
2348 DO_SHRNB(sve2_uqrshrnb_h
, uint16_t, uint8_t, DO_UQRSHRN_H
)
2349 DO_SHRNB(sve2_uqrshrnb_s
, uint32_t, uint16_t, DO_UQRSHRN_S
)
2350 DO_SHRNB(sve2_uqrshrnb_d
, uint64_t, uint32_t, DO_UQRSHRN_D
)
2352 DO_SHRNT(sve2_uqrshrnt_h
, uint16_t, uint8_t, H1_2
, H1
, DO_UQRSHRN_H
)
2353 DO_SHRNT(sve2_uqrshrnt_s
, uint32_t, uint16_t, H1_4
, H1_2
, DO_UQRSHRN_S
)
2354 DO_SHRNT(sve2_uqrshrnt_d
, uint64_t, uint32_t, , H1_4
, DO_UQRSHRN_D
)
/*
 * Expander for narrowing two-operand helpers that write the whole wide
 * slot.  OP receives the two TYPEW inputs and SHIFT; its result is
 * converted to TYPEN and stored back as a full TYPEW element of VD.
 */
#define DO_BINOPNB(NAME, TYPEW, TYPEN, SHIFT, OP)                     \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)        \
{                                                                     \
    intptr_t i, opr_sz = simd_oprsz(desc);                            \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                     \
        TYPEW nn = *(TYPEW *)(vn + i);                                \
        TYPEW mm = *(TYPEW *)(vm + i);                                \
        *(TYPEW *)(vd + i) = (TYPEN)OP(nn, mm, SHIFT);                \
    }                                                                 \
}
/*
 * Expander for narrowing two-operand helpers that write only one
 * narrow element per wide slot.  OP receives the two TYPEW inputs and
 * SHIFT; its result is stored as a TYPEN at byte offset sizeof(TYPEN)
 * within the matching slot of VD.  The other half is not written.
 */
#define DO_BINOPNT(NAME, TYPEW, TYPEN, SHIFT, HW, HN, OP)               \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)          \
{                                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);                              \
    for (i = 0; i < opr_sz; i += sizeof(TYPEW)) {                       \
        TYPEW nn = *(TYPEW *)(vn + HW(i));                              \
        TYPEW mm = *(TYPEW *)(vm + HW(i));                              \
        *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, mm, SHIFT);     \
    }                                                                   \
}
/*
 * Add/subtract and keep the bits from SH upward.  The R variants first
 * add half of 2**SH, i.e. they round the discarded low part to nearest.
 * Arguments are parenthesized so compound expressions keep their
 * grouping; N (and SH in the R variants) is evaluated more than once.
 */
#define DO_ADDHN(N, M, SH)  (((N) + (M)) >> (SH))
#define DO_RADDHN(N, M, SH) \
    (((N) + (M) + ((__typeof(N))1 << ((SH) - 1))) >> (SH))
#define DO_SUBHN(N, M, SH)  (((N) - (M)) >> (SH))
#define DO_RSUBHN(N, M, SH) \
    (((N) - (M) + ((__typeof(N))1 << ((SH) - 1))) >> (SH))
2386 DO_BINOPNB(sve2_addhnb_h
, uint16_t, uint8_t, 8, DO_ADDHN
)
2387 DO_BINOPNB(sve2_addhnb_s
, uint32_t, uint16_t, 16, DO_ADDHN
)
2388 DO_BINOPNB(sve2_addhnb_d
, uint64_t, uint32_t, 32, DO_ADDHN
)
2390 DO_BINOPNT(sve2_addhnt_h
, uint16_t, uint8_t, 8, H1_2
, H1
, DO_ADDHN
)
2391 DO_BINOPNT(sve2_addhnt_s
, uint32_t, uint16_t, 16, H1_4
, H1_2
, DO_ADDHN
)
2392 DO_BINOPNT(sve2_addhnt_d
, uint64_t, uint32_t, 32, , H1_4
, DO_ADDHN
)
2394 DO_BINOPNB(sve2_raddhnb_h
, uint16_t, uint8_t, 8, DO_RADDHN
)
2395 DO_BINOPNB(sve2_raddhnb_s
, uint32_t, uint16_t, 16, DO_RADDHN
)
2396 DO_BINOPNB(sve2_raddhnb_d
, uint64_t, uint32_t, 32, DO_RADDHN
)
2398 DO_BINOPNT(sve2_raddhnt_h
, uint16_t, uint8_t, 8, H1_2
, H1
, DO_RADDHN
)
2399 DO_BINOPNT(sve2_raddhnt_s
, uint32_t, uint16_t, 16, H1_4
, H1_2
, DO_RADDHN
)
2400 DO_BINOPNT(sve2_raddhnt_d
, uint64_t, uint32_t, 32, , H1_4
, DO_RADDHN
)
2402 DO_BINOPNB(sve2_subhnb_h
, uint16_t, uint8_t, 8, DO_SUBHN
)
2403 DO_BINOPNB(sve2_subhnb_s
, uint32_t, uint16_t, 16, DO_SUBHN
)
2404 DO_BINOPNB(sve2_subhnb_d
, uint64_t, uint32_t, 32, DO_SUBHN
)
2406 DO_BINOPNT(sve2_subhnt_h
, uint16_t, uint8_t, 8, H1_2
, H1
, DO_SUBHN
)
2407 DO_BINOPNT(sve2_subhnt_s
, uint32_t, uint16_t, 16, H1_4
, H1_2
, DO_SUBHN
)
2408 DO_BINOPNT(sve2_subhnt_d
, uint64_t, uint32_t, 32, , H1_4
, DO_SUBHN
)
2410 DO_BINOPNB(sve2_rsubhnb_h
, uint16_t, uint8_t, 8, DO_RSUBHN
)
2411 DO_BINOPNB(sve2_rsubhnb_s
, uint32_t, uint16_t, 16, DO_RSUBHN
)
2412 DO_BINOPNB(sve2_rsubhnb_d
, uint64_t, uint32_t, 32, DO_RSUBHN
)
2414 DO_BINOPNT(sve2_rsubhnt_h
, uint16_t, uint8_t, 8, H1_2
, H1
, DO_RSUBHN
)
2415 DO_BINOPNT(sve2_rsubhnt_s
, uint32_t, uint16_t, 16, H1_4
, H1_2
, DO_RSUBHN
)
2416 DO_BINOPNT(sve2_rsubhnt_d
, uint64_t, uint32_t, 32, , H1_4
, DO_RSUBHN
)
/* Fully general four-operand expander, controlled by a predicate.
 * For each active element, D = OP(A, N, M); inactive elements of VD
 * are not written.  The predicate is read 16 bits at a time and
 * shifted by the element size after each element.
 */
#define DO_ZPZZZ(NAME, TYPE, H, OP)                           \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
                  void *vg, uint32_t desc)                    \
{                                                             \
    intptr_t i, opr_sz = simd_oprsz(desc);                    \
    for (i = 0; i < opr_sz; ) {                               \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));       \
        do {                                                  \
            if (pg & 1) {                                     \
                TYPE nn = *(TYPE *)(vn + H(i));               \
                TYPE mm = *(TYPE *)(vm + H(i));               \
                TYPE aa = *(TYPE *)(va + H(i));               \
                *(TYPE *)(vd + H(i)) = OP(aa, nn, mm);        \
            }                                                 \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);           \
        } while (i & 15);                                     \
    }                                                         \
}
/* Similarly, specialized for 64-bit operands.
 * Each element is governed by bit 0 of its own predicate byte.
 */
#define DO_ZPZZZ_D(NAME, TYPE, OP)                            \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
                  void *vg, uint32_t desc)                    \
{                                                             \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                \
    TYPE *d = vd, *a = va, *n = vn, *m = vm;                  \
    uint8_t *pg = vg;                                         \
    for (i = 0; i < opr_sz; i += 1) {                         \
        if (pg[H1(i)] & 1) {                                  \
            TYPE aa = a[i], nn = n[i], mm = m[i];             \
            d[i] = OP(aa, nn, mm);                            \
        }                                                     \
    }                                                         \
}
/* Multiply-accumulate and multiply-subtract: A +/- N * M.  Arguments
 * are parenthesized so compound expressions keep their grouping.
 */
#define DO_MLA(A, N, M)  ((A) + (N) * (M))
#define DO_MLS(A, N, M)  ((A) - (N) * (M))
2465 DO_ZPZZZ(sve_mla_b
, uint8_t, H1
, DO_MLA
)
2466 DO_ZPZZZ(sve_mls_b
, uint8_t, H1
, DO_MLS
)
2468 DO_ZPZZZ(sve_mla_h
, uint16_t, H1_2
, DO_MLA
)
2469 DO_ZPZZZ(sve_mls_h
, uint16_t, H1_2
, DO_MLS
)
2471 DO_ZPZZZ(sve_mla_s
, uint32_t, H1_4
, DO_MLA
)
2472 DO_ZPZZZ(sve_mls_s
, uint32_t, H1_4
, DO_MLS
)
2474 DO_ZPZZZ_D(sve_mla_d
, uint64_t, DO_MLA
)
2475 DO_ZPZZZ_D(sve_mls_d
, uint64_t, DO_MLS
)
2482 void HELPER(sve_index_b
)(void *vd
, uint32_t start
,
2483 uint32_t incr
, uint32_t desc
)
2485 intptr_t i
, opr_sz
= simd_oprsz(desc
);
2487 for (i
= 0; i
< opr_sz
; i
+= 1) {
2488 d
[H1(i
)] = start
+ i
* incr
;
2492 void HELPER(sve_index_h
)(void *vd
, uint32_t start
,
2493 uint32_t incr
, uint32_t desc
)
2495 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 2;
2497 for (i
= 0; i
< opr_sz
; i
+= 1) {
2498 d
[H2(i
)] = start
+ i
* incr
;
2502 void HELPER(sve_index_s
)(void *vd
, uint32_t start
,
2503 uint32_t incr
, uint32_t desc
)
2505 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 4;
2507 for (i
= 0; i
< opr_sz
; i
+= 1) {
2508 d
[H4(i
)] = start
+ i
* incr
;
2512 void HELPER(sve_index_d
)(void *vd
, uint64_t start
,
2513 uint64_t incr
, uint32_t desc
)
2515 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2517 for (i
= 0; i
< opr_sz
; i
+= 1) {
2518 d
[i
] = start
+ i
* incr
;
2522 void HELPER(sve_adr_p32
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
2524 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 4;
2525 uint32_t sh
= simd_data(desc
);
2526 uint32_t *d
= vd
, *n
= vn
, *m
= vm
;
2527 for (i
= 0; i
< opr_sz
; i
+= 1) {
2528 d
[i
] = n
[i
] + (m
[i
] << sh
);
2532 void HELPER(sve_adr_p64
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
2534 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2535 uint64_t sh
= simd_data(desc
);
2536 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
2537 for (i
= 0; i
< opr_sz
; i
+= 1) {
2538 d
[i
] = n
[i
] + (m
[i
] << sh
);
2542 void HELPER(sve_adr_s32
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
2544 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2545 uint64_t sh
= simd_data(desc
);
2546 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
2547 for (i
= 0; i
< opr_sz
; i
+= 1) {
2548 d
[i
] = n
[i
] + ((uint64_t)(int32_t)m
[i
] << sh
);
2552 void HELPER(sve_adr_u32
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
2554 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2555 uint64_t sh
= simd_data(desc
);
2556 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
2557 for (i
= 0; i
< opr_sz
; i
+= 1) {
2558 d
[i
] = n
[i
] + ((uint64_t)(uint32_t)m
[i
] << sh
);
2562 void HELPER(sve_fexpa_h
)(void *vd
, void *vn
, uint32_t desc
)
2564 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2565 static const uint16_t coeff
[] = {
2566 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
2567 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
2568 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
2569 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
2571 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 2;
2572 uint16_t *d
= vd
, *n
= vn
;
2574 for (i
= 0; i
< opr_sz
; i
++) {
2576 intptr_t idx
= extract32(nn
, 0, 5);
2577 uint16_t exp
= extract32(nn
, 5, 5);
2578 d
[i
] = coeff
[idx
] | (exp
<< 10);
2582 void HELPER(sve_fexpa_s
)(void *vd
, void *vn
, uint32_t desc
)
2584 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2585 static const uint32_t coeff
[] = {
2586 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
2587 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
2588 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
2589 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
2590 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
2591 0x1ef532, 0x20b051, 0x227043, 0x243516,
2592 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
2593 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
2594 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
2595 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
2596 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
2597 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
2598 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
2599 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
2600 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
2601 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
2603 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 4;
2604 uint32_t *d
= vd
, *n
= vn
;
2606 for (i
= 0; i
< opr_sz
; i
++) {
2608 intptr_t idx
= extract32(nn
, 0, 6);
2609 uint32_t exp
= extract32(nn
, 6, 8);
2610 d
[i
] = coeff
[idx
] | (exp
<< 23);
2614 void HELPER(sve_fexpa_d
)(void *vd
, void *vn
, uint32_t desc
)
2616 /* These constants are cut-and-paste directly from the ARM pseudocode. */
2617 static const uint64_t coeff
[] = {
2618 0x0000000000000ull
, 0x02C9A3E778061ull
, 0x059B0D3158574ull
,
2619 0x0874518759BC8ull
, 0x0B5586CF9890Full
, 0x0E3EC32D3D1A2ull
,
2620 0x11301D0125B51ull
, 0x1429AAEA92DE0ull
, 0x172B83C7D517Bull
,
2621 0x1A35BEB6FCB75ull
, 0x1D4873168B9AAull
, 0x2063B88628CD6ull
,
2622 0x2387A6E756238ull
, 0x26B4565E27CDDull
, 0x29E9DF51FDEE1ull
,
2623 0x2D285A6E4030Bull
, 0x306FE0A31B715ull
, 0x33C08B26416FFull
,
2624 0x371A7373AA9CBull
, 0x3A7DB34E59FF7ull
, 0x3DEA64C123422ull
,
2625 0x4160A21F72E2Aull
, 0x44E086061892Dull
, 0x486A2B5C13CD0ull
,
2626 0x4BFDAD5362A27ull
, 0x4F9B2769D2CA7ull
, 0x5342B569D4F82ull
,
2627 0x56F4736B527DAull
, 0x5AB07DD485429ull
, 0x5E76F15AD2148ull
,
2628 0x6247EB03A5585ull
, 0x6623882552225ull
, 0x6A09E667F3BCDull
,
2629 0x6DFB23C651A2Full
, 0x71F75E8EC5F74ull
, 0x75FEB564267C9ull
,
2630 0x7A11473EB0187ull
, 0x7E2F336CF4E62ull
, 0x82589994CCE13ull
,
2631 0x868D99B4492EDull
, 0x8ACE5422AA0DBull
, 0x8F1AE99157736ull
,
2632 0x93737B0CDC5E5ull
, 0x97D829FDE4E50ull
, 0x9C49182A3F090ull
,
2633 0xA0C667B5DE565ull
, 0xA5503B23E255Dull
, 0xA9E6B5579FDBFull
,
2634 0xAE89F995AD3ADull
, 0xB33A2B84F15FBull
, 0xB7F76F2FB5E47ull
,
2635 0xBCC1E904BC1D2ull
, 0xC199BDD85529Cull
, 0xC67F12E57D14Bull
,
2636 0xCB720DCEF9069ull
, 0xD072D4A07897Cull
, 0xD5818DCFBA487ull
,
2637 0xDA9E603DB3285ull
, 0xDFC97337B9B5Full
, 0xE502EE78B3FF6ull
,
2638 0xEA4AFA2A490DAull
, 0xEFA1BEE615A27ull
, 0xF50765B6E4540ull
,
2641 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2642 uint64_t *d
= vd
, *n
= vn
;
2644 for (i
= 0; i
< opr_sz
; i
++) {
2646 intptr_t idx
= extract32(nn
, 0, 6);
2647 uint64_t exp
= extract32(nn
, 6, 11);
2648 d
[i
] = coeff
[idx
] | (exp
<< 52);
2652 void HELPER(sve_ftssel_h
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
2654 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 2;
2655 uint16_t *d
= vd
, *n
= vn
, *m
= vm
;
2656 for (i
= 0; i
< opr_sz
; i
+= 1) {
2662 d
[i
] = nn
^ (mm
& 2) << 14;
2666 void HELPER(sve_ftssel_s
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
2668 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 4;
2669 uint32_t *d
= vd
, *n
= vn
, *m
= vm
;
2670 for (i
= 0; i
< opr_sz
; i
+= 1) {
2676 d
[i
] = nn
^ (mm
& 2) << 30;
2680 void HELPER(sve_ftssel_d
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
2682 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2683 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
2684 for (i
= 0; i
< opr_sz
; i
+= 1) {
2690 d
[i
] = nn
^ (mm
& 2) << 62;
2695 * Signed saturating addition with scalar operand.
2698 void HELPER(sve_sqaddi_b
)(void *d
, void *a
, int32_t b
, uint32_t desc
)
2700 intptr_t i
, oprsz
= simd_oprsz(desc
);
2702 for (i
= 0; i
< oprsz
; i
+= sizeof(int8_t)) {
2703 *(int8_t *)(d
+ i
) = DO_SQADD_B(b
, *(int8_t *)(a
+ i
));
2707 void HELPER(sve_sqaddi_h
)(void *d
, void *a
, int32_t b
, uint32_t desc
)
2709 intptr_t i
, oprsz
= simd_oprsz(desc
);
2711 for (i
= 0; i
< oprsz
; i
+= sizeof(int16_t)) {
2712 *(int16_t *)(d
+ i
) = DO_SQADD_H(b
, *(int16_t *)(a
+ i
));
2716 void HELPER(sve_sqaddi_s
)(void *d
, void *a
, int64_t b
, uint32_t desc
)
2718 intptr_t i
, oprsz
= simd_oprsz(desc
);
2720 for (i
= 0; i
< oprsz
; i
+= sizeof(int32_t)) {
2721 *(int32_t *)(d
+ i
) = DO_SQADD_S(b
, *(int32_t *)(a
+ i
));
2725 void HELPER(sve_sqaddi_d
)(void *d
, void *a
, int64_t b
, uint32_t desc
)
2727 intptr_t i
, oprsz
= simd_oprsz(desc
);
2729 for (i
= 0; i
< oprsz
; i
+= sizeof(int64_t)) {
2730 *(int64_t *)(d
+ i
) = do_sqadd_d(b
, *(int64_t *)(a
+ i
));
2735 * Unsigned saturating addition with scalar operand.
2738 void HELPER(sve_uqaddi_b
)(void *d
, void *a
, int32_t b
, uint32_t desc
)
2740 intptr_t i
, oprsz
= simd_oprsz(desc
);
2742 for (i
= 0; i
< oprsz
; i
+= sizeof(uint8_t)) {
2743 *(uint8_t *)(d
+ i
) = DO_UQADD_B(b
, *(uint8_t *)(a
+ i
));
2747 void HELPER(sve_uqaddi_h
)(void *d
, void *a
, int32_t b
, uint32_t desc
)
2749 intptr_t i
, oprsz
= simd_oprsz(desc
);
2751 for (i
= 0; i
< oprsz
; i
+= sizeof(uint16_t)) {
2752 *(uint16_t *)(d
+ i
) = DO_UQADD_H(b
, *(uint16_t *)(a
+ i
));
2756 void HELPER(sve_uqaddi_s
)(void *d
, void *a
, int64_t b
, uint32_t desc
)
2758 intptr_t i
, oprsz
= simd_oprsz(desc
);
2760 for (i
= 0; i
< oprsz
; i
+= sizeof(uint32_t)) {
2761 *(uint32_t *)(d
+ i
) = DO_UQADD_S(b
, *(uint32_t *)(a
+ i
));
2765 void HELPER(sve_uqaddi_d
)(void *d
, void *a
, uint64_t b
, uint32_t desc
)
2767 intptr_t i
, oprsz
= simd_oprsz(desc
);
2769 for (i
= 0; i
< oprsz
; i
+= sizeof(uint64_t)) {
2770 *(uint64_t *)(d
+ i
) = do_uqadd_d(b
, *(uint64_t *)(a
+ i
));
2774 void HELPER(sve_uqsubi_d
)(void *d
, void *a
, uint64_t b
, uint32_t desc
)
2776 intptr_t i
, oprsz
= simd_oprsz(desc
);
2778 for (i
= 0; i
< oprsz
; i
+= sizeof(uint64_t)) {
2779 *(uint64_t *)(d
+ i
) = do_uqsub_d(*(uint64_t *)(a
+ i
), b
);
2783 /* Two operand predicated copy immediate with merge. All valid immediates
2784 * can fit within 17 signed bits in the simd_data field.
2786 void HELPER(sve_cpy_m_b
)(void *vd
, void *vn
, void *vg
,
2787 uint64_t mm
, uint32_t desc
)
2789 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2790 uint64_t *d
= vd
, *n
= vn
;
2793 mm
= dup_const(MO_8
, mm
);
2794 for (i
= 0; i
< opr_sz
; i
+= 1) {
2796 uint64_t pp
= expand_pred_b(pg
[H1(i
)]);
2797 d
[i
] = (mm
& pp
) | (nn
& ~pp
);
2801 void HELPER(sve_cpy_m_h
)(void *vd
, void *vn
, void *vg
,
2802 uint64_t mm
, uint32_t desc
)
2804 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2805 uint64_t *d
= vd
, *n
= vn
;
2808 mm
= dup_const(MO_16
, mm
);
2809 for (i
= 0; i
< opr_sz
; i
+= 1) {
2811 uint64_t pp
= expand_pred_h(pg
[H1(i
)]);
2812 d
[i
] = (mm
& pp
) | (nn
& ~pp
);
2816 void HELPER(sve_cpy_m_s
)(void *vd
, void *vn
, void *vg
,
2817 uint64_t mm
, uint32_t desc
)
2819 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2820 uint64_t *d
= vd
, *n
= vn
;
2823 mm
= dup_const(MO_32
, mm
);
2824 for (i
= 0; i
< opr_sz
; i
+= 1) {
2826 uint64_t pp
= expand_pred_s(pg
[H1(i
)]);
2827 d
[i
] = (mm
& pp
) | (nn
& ~pp
);
2831 void HELPER(sve_cpy_m_d
)(void *vd
, void *vn
, void *vg
,
2832 uint64_t mm
, uint32_t desc
)
2834 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2835 uint64_t *d
= vd
, *n
= vn
;
2838 for (i
= 0; i
< opr_sz
; i
+= 1) {
2840 d
[i
] = (pg
[H1(i
)] & 1 ? mm
: nn
);
2844 void HELPER(sve_cpy_z_b
)(void *vd
, void *vg
, uint64_t val
, uint32_t desc
)
2846 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2850 val
= dup_const(MO_8
, val
);
2851 for (i
= 0; i
< opr_sz
; i
+= 1) {
2852 d
[i
] = val
& expand_pred_b(pg
[H1(i
)]);
2856 void HELPER(sve_cpy_z_h
)(void *vd
, void *vg
, uint64_t val
, uint32_t desc
)
2858 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2862 val
= dup_const(MO_16
, val
);
2863 for (i
= 0; i
< opr_sz
; i
+= 1) {
2864 d
[i
] = val
& expand_pred_h(pg
[H1(i
)]);
2868 void HELPER(sve_cpy_z_s
)(void *vd
, void *vg
, uint64_t val
, uint32_t desc
)
2870 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2874 val
= dup_const(MO_32
, val
);
2875 for (i
= 0; i
< opr_sz
; i
+= 1) {
2876 d
[i
] = val
& expand_pred_s(pg
[H1(i
)]);
2880 void HELPER(sve_cpy_z_d
)(void *vd
, void *vg
, uint64_t val
, uint32_t desc
)
2882 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
2886 for (i
= 0; i
< opr_sz
; i
+= 1) {
2887 d
[i
] = (pg
[H1(i
)] & 1 ? val
: 0);
/* Big-endian hosts need to frob the byte indices.  If the copy
 * happens to be 8-byte aligned, then no frobbing necessary.
 *
 * The low three bits of (dest | src | length) pick the widest unit
 * that keeps every access aligned: 8 bytes (plain memmove), 4, 2 or 1.
 * On little-endian hosts the plain memmove path is always taken.
 * For the narrower units the copy runs forward when the destination
 * does not start inside the source range, and backward otherwise, so
 * overlapping regions are handled like memmove.
 */
static void swap_memmove(void *vd, void *vs, size_t n)
{
    uintptr_t dst = (uintptr_t)vd;
    uintptr_t src = (uintptr_t)vs;
    uintptr_t misalign = (dst | src | n) & 7;
    bool forward = dst < src || dst >= src + n;
    size_t k;

#ifndef HOST_WORDS_BIGENDIAN
    misalign = 0;
#endif
    switch (misalign) {
    case 0:
        memmove(vd, vs, n);
        break;

    case 4:
        if (forward) {
            for (k = 0; k < n; k += 4) {
                *(uint32_t *)H1_4(dst + k) = *(uint32_t *)H1_4(src + k);
            }
        } else {
            for (k = n; k > 0; ) {
                k -= 4;
                *(uint32_t *)H1_4(dst + k) = *(uint32_t *)H1_4(src + k);
            }
        }
        break;

    case 2:
    case 6:
        if (forward) {
            for (k = 0; k < n; k += 2) {
                *(uint16_t *)H1_2(dst + k) = *(uint16_t *)H1_2(src + k);
            }
        } else {
            for (k = n; k > 0; ) {
                k -= 2;
                *(uint16_t *)H1_2(dst + k) = *(uint16_t *)H1_2(src + k);
            }
        }
        break;

    default:
        if (forward) {
            for (k = 0; k < n; k++) {
                *(uint8_t *)H1(dst + k) = *(uint8_t *)H1(src + k);
            }
        } else {
            for (k = n; k > 0; ) {
                k -= 1;
                *(uint8_t *)H1(dst + k) = *(uint8_t *)H1(src + k);
            }
        }
        break;
    }
}
/* Similarly for memset of 0.
 *
 * The store width is chosen from the low three bits of (dest | length);
 * on little-endian hosts a plain memset is always used.
 */
static void swap_memzero(void *vd, size_t n)
{
    uintptr_t dst = (uintptr_t)vd;
    uintptr_t misalign = (dst | n) & 7;
    size_t k;

    /* Usually, the first bit of a predicate is set, so N is 0.  */
    if (likely(n == 0)) {
        return;
    }

#ifndef HOST_WORDS_BIGENDIAN
    misalign = 0;
#endif
    switch (misalign) {
    case 0:
        memset(vd, 0, n);
        break;

    case 4:
        for (k = 0; k < n; k += 4) {
            *(uint32_t *)H1_4(dst + k) = 0;
        }
        break;

    case 2:
    case 6:
        for (k = 0; k < n; k += 2) {
            *(uint16_t *)H1_2(dst + k) = 0;
        }
        break;

    default:
        for (k = 0; k < n; k++) {
            *(uint8_t *)H1(dst + k) = 0;
        }
        break;
    }
}
2992 void HELPER(sve_ext
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
2994 intptr_t opr_sz
= simd_oprsz(desc
);
2995 size_t n_ofs
= simd_data(desc
);
2996 size_t n_siz
= opr_sz
- n_ofs
;
2999 swap_memmove(vd
, vn
+ n_ofs
, n_siz
);
3000 swap_memmove(vd
+ n_siz
, vm
, n_ofs
);
3001 } else if (vd
!= vn
) {
3002 swap_memmove(vd
+ n_siz
, vd
, n_ofs
);
3003 swap_memmove(vd
, vn
+ n_ofs
, n_siz
);
3005 /* vd == vn == vm. Need temp space. */
3007 swap_memmove(&tmp
, vm
, n_ofs
);
3008 swap_memmove(vd
, vd
+ n_ofs
, n_siz
);
3009 memcpy(vd
+ n_siz
, &tmp
, n_ofs
);
3013 #define DO_INSR(NAME, TYPE, H) \
3014 void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
3016 intptr_t opr_sz = simd_oprsz(desc); \
3017 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
3018 *(TYPE *)(vd + H(0)) = val; \
3021 DO_INSR(sve_insr_b
, uint8_t, H1
)
3022 DO_INSR(sve_insr_h
, uint16_t, H1_2
)
3023 DO_INSR(sve_insr_s
, uint32_t, H1_4
)
3024 DO_INSR(sve_insr_d
, uint64_t, )
3028 void HELPER(sve_rev_b
)(void *vd
, void *vn
, uint32_t desc
)
3030 intptr_t i
, j
, opr_sz
= simd_oprsz(desc
);
3031 for (i
= 0, j
= opr_sz
- 8; i
< opr_sz
/ 2; i
+= 8, j
-= 8) {
3032 uint64_t f
= *(uint64_t *)(vn
+ i
);
3033 uint64_t b
= *(uint64_t *)(vn
+ j
);
3034 *(uint64_t *)(vd
+ i
) = bswap64(b
);
3035 *(uint64_t *)(vd
+ j
) = bswap64(f
);
3039 void HELPER(sve_rev_h
)(void *vd
, void *vn
, uint32_t desc
)
3041 intptr_t i
, j
, opr_sz
= simd_oprsz(desc
);
3042 for (i
= 0, j
= opr_sz
- 8; i
< opr_sz
/ 2; i
+= 8, j
-= 8) {
3043 uint64_t f
= *(uint64_t *)(vn
+ i
);
3044 uint64_t b
= *(uint64_t *)(vn
+ j
);
3045 *(uint64_t *)(vd
+ i
) = hswap64(b
);
3046 *(uint64_t *)(vd
+ j
) = hswap64(f
);
3050 void HELPER(sve_rev_s
)(void *vd
, void *vn
, uint32_t desc
)
3052 intptr_t i
, j
, opr_sz
= simd_oprsz(desc
);
3053 for (i
= 0, j
= opr_sz
- 8; i
< opr_sz
/ 2; i
+= 8, j
-= 8) {
3054 uint64_t f
= *(uint64_t *)(vn
+ i
);
3055 uint64_t b
= *(uint64_t *)(vn
+ j
);
3056 *(uint64_t *)(vd
+ i
) = rol64(b
, 32);
3057 *(uint64_t *)(vd
+ j
) = rol64(f
, 32);
3061 void HELPER(sve_rev_d
)(void *vd
, void *vn
, uint32_t desc
)
3063 intptr_t i
, j
, opr_sz
= simd_oprsz(desc
);
3064 for (i
= 0, j
= opr_sz
- 8; i
< opr_sz
/ 2; i
+= 8, j
-= 8) {
3065 uint64_t f
= *(uint64_t *)(vn
+ i
);
3066 uint64_t b
= *(uint64_t *)(vn
+ j
);
3067 *(uint64_t *)(vd
+ i
) = b
;
3068 *(uint64_t *)(vd
+ j
) = f
;
3072 typedef void tb_impl_fn(void *, void *, void *, void *, uintptr_t, bool);
3074 static inline void do_tbl1(void *vd
, void *vn
, void *vm
, uint32_t desc
,
3075 bool is_tbx
, tb_impl_fn
*fn
)
3077 ARMVectorReg scratch
;
3078 uintptr_t oprsz
= simd_oprsz(desc
);
3080 if (unlikely(vd
== vn
)) {
3081 vn
= memcpy(&scratch
, vn
, oprsz
);
3084 fn(vd
, vn
, NULL
, vm
, oprsz
, is_tbx
);
3087 static inline void do_tbl2(void *vd
, void *vn0
, void *vn1
, void *vm
,
3088 uint32_t desc
, bool is_tbx
, tb_impl_fn
*fn
)
3090 ARMVectorReg scratch
;
3091 uintptr_t oprsz
= simd_oprsz(desc
);
3093 if (unlikely(vd
== vn0
)) {
3094 vn0
= memcpy(&scratch
, vn0
, oprsz
);
3098 } else if (unlikely(vd
== vn1
)) {
3099 vn1
= memcpy(&scratch
, vn1
, oprsz
);
3102 fn(vd
, vn0
, vn1
, vm
, oprsz
, is_tbx
);
3105 #define DO_TB(SUFF, TYPE, H) \
3106 static inline void do_tb_##SUFF(void *vd, void *vt0, void *vt1, \
3107 void *vm, uintptr_t oprsz, bool is_tbx) \
3109 TYPE *d = vd, *tbl0 = vt0, *tbl1 = vt1, *indexes = vm; \
3110 uintptr_t i, nelem = oprsz / sizeof(TYPE); \
3111 for (i = 0; i < nelem; ++i) { \
3112 TYPE index = indexes[H1(i)], val = 0; \
3113 if (index < nelem) { \
3114 val = tbl0[H(index)]; \
3117 if (tbl1 && index < nelem) { \
3118 val = tbl1[H(index)]; \
3119 } else if (is_tbx) { \
3126 void HELPER(sve_tbl_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3128 do_tbl1(vd, vn, vm, desc, false, do_tb_##SUFF); \
3130 void HELPER(sve2_tbl_##SUFF)(void *vd, void *vn0, void *vn1, \
3131 void *vm, uint32_t desc) \
3133 do_tbl2(vd, vn0, vn1, vm, desc, false, do_tb_##SUFF); \
3135 void HELPER(sve2_tbx_##SUFF)(void *vd, void *vn, void *vm, uint32_t desc) \
3137 do_tbl1(vd, vn, vm, desc, true, do_tb_##SUFF); \
3140 DO_TB(b
, uint8_t, H1
)
3141 DO_TB(h
, uint16_t, H2
)
3142 DO_TB(s
, uint32_t, H4
)
3143 DO_TB(d
, uint64_t, )
3147 #define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
3148 void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
3150 intptr_t i, opr_sz = simd_oprsz(desc); \
3154 if (unlikely(vn - vd < opr_sz)) { \
3155 n = memcpy(&tmp, n, opr_sz / 2); \
3157 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
3158 d[HD(i)] = n[HS(i)]; \
3162 DO_UNPK(sve_sunpk_h
, int16_t, int8_t, H2
, H1
)
3163 DO_UNPK(sve_sunpk_s
, int32_t, int16_t, H4
, H2
)
3164 DO_UNPK(sve_sunpk_d
, int64_t, int32_t, , H4
)
3166 DO_UNPK(sve_uunpk_h
, uint16_t, uint8_t, H2
, H1
)
3167 DO_UNPK(sve_uunpk_s
, uint32_t, uint16_t, H4
, H2
)
3168 DO_UNPK(sve_uunpk_d
, uint64_t, uint32_t, , H4
)
/* Mask of bits included in the even numbered predicates of width esz.
 * We also use this for expand_bits/compress_bits, and so extend the
 * same pattern out to 16-bit units.
 *
 * Entry k selects the even-numbered groups of 2**k bits.
 */
static const uint64_t even_bit_esz_masks[5] = {
    [0] = 0x5555555555555555ull,
    [1] = 0x3333333333333333ull,
    [2] = 0x0f0f0f0f0f0f0f0full,
    [3] = 0x00ff00ff00ff00ffull,
    [4] = 0x0000ffff0000ffffull,
};
3184 /* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
3185 * For N==0, this corresponds to the operation that in qemu/bitops.h
3186 * we call half_shuffle64; this algorithm is from Hacker's Delight,
3187 * section 7-2 Shuffling Bits.
3189 static uint64_t expand_bits(uint64_t x
, int n
)
3194 for (i
= 4; i
>= n
; i
--) {
3196 x
= ((x
<< sh
) | x
) & even_bit_esz_masks
[i
];
3201 /* Compress units of 2**(N+1) bits to units of 2**N bits.
3202 * For N==0, this corresponds to the operation that in qemu/bitops.h
3203 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
3204 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
3206 static uint64_t compress_bits(uint64_t x
, int n
)
3210 for (i
= n
; i
<= 4; i
++) {
3212 x
&= even_bit_esz_masks
[i
];
3215 return x
& 0xffffffffu
;
3218 void HELPER(sve_zip_p
)(void *vd
, void *vn
, void *vm
, uint32_t pred_desc
)
3220 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
3221 int esz
= FIELD_EX32(pred_desc
, PREDDESC
, ESZ
);
3222 intptr_t high
= FIELD_EX32(pred_desc
, PREDDESC
, DATA
);
3223 int esize
= 1 << esz
;
3228 uint64_t nn
= *(uint64_t *)vn
;
3229 uint64_t mm
= *(uint64_t *)vm
;
3230 int half
= 4 * oprsz
;
3232 nn
= extract64(nn
, high
* half
, half
);
3233 mm
= extract64(mm
, high
* half
, half
);
3234 nn
= expand_bits(nn
, esz
);
3235 mm
= expand_bits(mm
, esz
);
3236 d
[0] = nn
| (mm
<< esize
);
3238 ARMPredicateReg tmp
;
3240 /* We produce output faster than we consume input.
3241 Therefore we must be mindful of possible overlap. */
3243 vn
= memcpy(&tmp
, vn
, oprsz
);
3247 } else if (vd
== vm
) {
3248 vm
= memcpy(&tmp
, vm
, oprsz
);
3254 if ((oprsz
& 7) == 0) {
3255 uint32_t *n
= vn
, *m
= vm
;
3258 for (i
= 0; i
< oprsz
/ 8; i
++) {
3259 uint64_t nn
= n
[H4(high
+ i
)];
3260 uint64_t mm
= m
[H4(high
+ i
)];
3262 nn
= expand_bits(nn
, esz
);
3263 mm
= expand_bits(mm
, esz
);
3264 d
[i
] = nn
| (mm
<< esize
);
3267 uint8_t *n
= vn
, *m
= vm
;
3270 for (i
= 0; i
< oprsz
/ 2; i
++) {
3271 uint16_t nn
= n
[H1(high
+ i
)];
3272 uint16_t mm
= m
[H1(high
+ i
)];
3274 nn
= expand_bits(nn
, esz
);
3275 mm
= expand_bits(mm
, esz
);
3276 d16
[H2(i
)] = nn
| (mm
<< esize
);
3282 void HELPER(sve_uzp_p
)(void *vd
, void *vn
, void *vm
, uint32_t pred_desc
)
3284 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
3285 int esz
= FIELD_EX32(pred_desc
, PREDDESC
, ESZ
);
3286 int odd
= FIELD_EX32(pred_desc
, PREDDESC
, DATA
) << esz
;
3287 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
3292 l
= compress_bits(n
[0] >> odd
, esz
);
3293 h
= compress_bits(m
[0] >> odd
, esz
);
3294 d
[0] = l
| (h
<< (4 * oprsz
));
3296 ARMPredicateReg tmp_m
;
3297 intptr_t oprsz_16
= oprsz
/ 16;
3299 if ((vm
- vd
) < (uintptr_t)oprsz
) {
3300 m
= memcpy(&tmp_m
, vm
, oprsz
);
3303 for (i
= 0; i
< oprsz_16
; i
++) {
3306 l
= compress_bits(l
>> odd
, esz
);
3307 h
= compress_bits(h
>> odd
, esz
);
3308 d
[i
] = l
| (h
<< 32);
3312 * For VL which is not a multiple of 512, the results from M do not
3313 * align nicely with the uint64_t for D. Put the aligned results
3314 * from M into TMP_M and then copy it into place afterward.
3317 int final_shift
= (oprsz
& 15) * 2;
3321 l
= compress_bits(l
>> odd
, esz
);
3322 h
= compress_bits(h
>> odd
, esz
);
3323 d
[i
] = l
| (h
<< final_shift
);
3325 for (i
= 0; i
< oprsz_16
; i
++) {
3328 l
= compress_bits(l
>> odd
, esz
);
3329 h
= compress_bits(h
>> odd
, esz
);
3330 tmp_m
.p
[i
] = l
| (h
<< 32);
3334 l
= compress_bits(l
>> odd
, esz
);
3335 h
= compress_bits(h
>> odd
, esz
);
3336 tmp_m
.p
[i
] = l
| (h
<< final_shift
);
3338 swap_memmove(vd
+ oprsz
/ 2, &tmp_m
, oprsz
/ 2);
3340 for (i
= 0; i
< oprsz_16
; i
++) {
3343 l
= compress_bits(l
>> odd
, esz
);
3344 h
= compress_bits(h
>> odd
, esz
);
3345 d
[oprsz_16
+ i
] = l
| (h
<< 32);
3351 void HELPER(sve_trn_p
)(void *vd
, void *vn
, void *vm
, uint32_t pred_desc
)
3353 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
3354 int esz
= FIELD_EX32(pred_desc
, PREDDESC
, ESZ
);
3355 int odd
= FIELD_EX32(pred_desc
, PREDDESC
, DATA
);
3356 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
3363 mask
= even_bit_esz_masks
[esz
];
3370 for (i
= 0; i
< DIV_ROUND_UP(oprsz
, 8); i
++) {
3371 uint64_t nn
= (n
[i
] & mask
) >> shr
;
3372 uint64_t mm
= (m
[i
] & mask
) << shl
;
3377 /* Reverse units of 2**N bits. */
3378 static uint64_t reverse_bits_64(uint64_t x
, int n
)
3383 for (i
= 2, sh
= 4; i
>= n
; i
--, sh
>>= 1) {
3384 uint64_t mask
= even_bit_esz_masks
[i
];
3385 x
= ((x
& mask
) << sh
) | ((x
>> sh
) & mask
);
/*
 * Reverse units of 2**N bits within a single byte: N == 0 reverses all
 * eight bits, N == 1 reverses the four bit pairs, N == 2 swaps the two
 * nibbles, and N >= 3 returns X unchanged.
 */
static uint8_t reverse_bits_8(uint8_t x, int n)
{
    static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
    int level = 2, sh = 4;

    while (level >= n) {
        uint8_t m = mask[level];

        x = ((x >> sh) & m) | ((x & m) << sh);
        level--;
        sh >>= 1;
    }
    return x;
}
3401 void HELPER(sve_rev_p
)(void *vd
, void *vn
, uint32_t pred_desc
)
3403 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
3404 int esz
= FIELD_EX32(pred_desc
, PREDDESC
, ESZ
);
3405 intptr_t i
, oprsz_2
= oprsz
/ 2;
3408 uint64_t l
= *(uint64_t *)vn
;
3409 l
= reverse_bits_64(l
<< (64 - 8 * oprsz
), esz
);
3410 *(uint64_t *)vd
= l
;
3411 } else if ((oprsz
& 15) == 0) {
3412 for (i
= 0; i
< oprsz_2
; i
+= 8) {
3413 intptr_t ih
= oprsz
- 8 - i
;
3414 uint64_t l
= reverse_bits_64(*(uint64_t *)(vn
+ i
), esz
);
3415 uint64_t h
= reverse_bits_64(*(uint64_t *)(vn
+ ih
), esz
);
3416 *(uint64_t *)(vd
+ i
) = h
;
3417 *(uint64_t *)(vd
+ ih
) = l
;
3420 for (i
= 0; i
< oprsz_2
; i
+= 1) {
3421 intptr_t il
= H1(i
);
3422 intptr_t ih
= H1(oprsz
- 1 - i
);
3423 uint8_t l
= reverse_bits_8(*(uint8_t *)(vn
+ il
), esz
);
3424 uint8_t h
= reverse_bits_8(*(uint8_t *)(vn
+ ih
), esz
);
3425 *(uint8_t *)(vd
+ il
) = h
;
3426 *(uint8_t *)(vd
+ ih
) = l
;
3431 void HELPER(sve_punpk_p
)(void *vd
, void *vn
, uint32_t pred_desc
)
3433 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
3434 intptr_t high
= FIELD_EX32(pred_desc
, PREDDESC
, DATA
);
3439 uint64_t nn
= *(uint64_t *)vn
;
3440 int half
= 4 * oprsz
;
3442 nn
= extract64(nn
, high
* half
, half
);
3443 nn
= expand_bits(nn
, 0);
3446 ARMPredicateReg tmp_n
;
3448 /* We produce output faster than we consume input.
3449 Therefore we must be mindful of possible overlap. */
3450 if ((vn
- vd
) < (uintptr_t)oprsz
) {
3451 vn
= memcpy(&tmp_n
, vn
, oprsz
);
3457 if ((oprsz
& 7) == 0) {
3461 for (i
= 0; i
< oprsz
/ 8; i
++) {
3462 uint64_t nn
= n
[H4(high
+ i
)];
3463 d
[i
] = expand_bits(nn
, 0);
3469 for (i
= 0; i
< oprsz
/ 2; i
++) {
3470 uint16_t nn
= n
[H1(high
+ i
)];
3471 d16
[H2(i
)] = expand_bits(nn
, 0);
3477 #define DO_ZIP(NAME, TYPE, H) \
3478 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3480 intptr_t oprsz = simd_oprsz(desc); \
3481 intptr_t i, oprsz_2 = oprsz / 2; \
3482 ARMVectorReg tmp_n, tmp_m; \
3483 /* We produce output faster than we consume input. \
3484 Therefore we must be mindful of possible overlap. */ \
3485 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
3486 vn = memcpy(&tmp_n, vn, oprsz_2); \
3488 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3489 vm = memcpy(&tmp_m, vm, oprsz_2); \
3491 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
3492 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
3493 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
3497 DO_ZIP(sve_zip_b
, uint8_t, H1
)
3498 DO_ZIP(sve_zip_h
, uint16_t, H1_2
)
3499 DO_ZIP(sve_zip_s
, uint32_t, H1_4
)
3500 DO_ZIP(sve_zip_d
, uint64_t, )
3502 #define DO_UZP(NAME, TYPE, H) \
3503 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3505 intptr_t oprsz = simd_oprsz(desc); \
3506 intptr_t oprsz_2 = oprsz / 2; \
3507 intptr_t odd_ofs = simd_data(desc); \
3509 ARMVectorReg tmp_m; \
3510 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
3511 vm = memcpy(&tmp_m, vm, oprsz); \
3513 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
3514 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs)); \
3516 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
3517 *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
3521 DO_UZP(sve_uzp_b
, uint8_t, H1
)
3522 DO_UZP(sve_uzp_h
, uint16_t, H1_2
)
3523 DO_UZP(sve_uzp_s
, uint32_t, H1_4
)
3524 DO_UZP(sve_uzp_d
, uint64_t, )
3526 #define DO_TRN(NAME, TYPE, H) \
3527 void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
3529 intptr_t oprsz = simd_oprsz(desc); \
3530 intptr_t odd_ofs = simd_data(desc); \
3532 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
3533 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
3534 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
3535 *(TYPE *)(vd + H(i + 0)) = ae; \
3536 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
3540 DO_TRN(sve_trn_b
, uint8_t, H1
)
3541 DO_TRN(sve_trn_h
, uint16_t, H1_2
)
3542 DO_TRN(sve_trn_s
, uint32_t, H1_4
)
3543 DO_TRN(sve_trn_d
, uint64_t, )
3549 void HELPER(sve_compact_s
)(void *vd
, void *vn
, void *vg
, uint32_t desc
)
3551 intptr_t i
, j
, opr_sz
= simd_oprsz(desc
) / 4;
3552 uint32_t *d
= vd
, *n
= vn
;
3555 for (i
= j
= 0; i
< opr_sz
; i
++) {
3556 if (pg
[H1(i
/ 2)] & (i
& 1 ? 0x10 : 0x01)) {
3557 d
[H4(j
)] = n
[H4(i
)];
3561 for (; j
< opr_sz
; j
++) {
3566 void HELPER(sve_compact_d
)(void *vd
, void *vn
, void *vg
, uint32_t desc
)
3568 intptr_t i
, j
, opr_sz
= simd_oprsz(desc
) / 8;
3569 uint64_t *d
= vd
, *n
= vn
;
3572 for (i
= j
= 0; i
< opr_sz
; i
++) {
3573 if (pg
[H1(i
)] & 1) {
3578 for (; j
< opr_sz
; j
++) {
3583 /* Similar to the ARM LastActiveElement pseudocode function, except the
3584 * result is multiplied by the element size. This includes the not found
3585 * indication; e.g. not found for esz=3 is -8.
3587 int32_t HELPER(sve_last_active_element
)(void *vg
, uint32_t pred_desc
)
3589 intptr_t words
= DIV_ROUND_UP(FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
), 8);
3590 intptr_t esz
= FIELD_EX32(pred_desc
, PREDDESC
, ESZ
);
3592 return last_active_element(vg
, words
, esz
);
3595 void HELPER(sve_splice
)(void *vd
, void *vn
, void *vm
, void *vg
, uint32_t desc
)
3597 intptr_t opr_sz
= simd_oprsz(desc
) / 8;
3598 int esz
= simd_data(desc
);
3599 uint64_t pg
, first_g
, last_g
, len
, mask
= pred_esz_masks
[esz
];
3600 intptr_t i
, first_i
, last_i
;
3603 first_i
= last_i
= 0;
3604 first_g
= last_g
= 0;
3606 /* Find the extent of the active elements within VG. */
3607 for (i
= QEMU_ALIGN_UP(opr_sz
, 8) - 8; i
>= 0; i
-= 8) {
3608 pg
= *(uint64_t *)(vg
+ i
) & mask
;
3621 first_i
= first_i
* 8 + ctz64(first_g
);
3622 last_i
= last_i
* 8 + 63 - clz64(last_g
);
3623 len
= last_i
- first_i
+ (1 << esz
);
3625 vm
= memcpy(&tmp
, vm
, opr_sz
* 8);
3627 swap_memmove(vd
, vn
+ first_i
, len
);
3629 swap_memmove(vd
+ len
, vm
, opr_sz
* 8 - len
);
3632 void HELPER(sve_sel_zpzz_b
)(void *vd
, void *vn
, void *vm
,
3633 void *vg
, uint32_t desc
)
3635 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
3636 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
3639 for (i
= 0; i
< opr_sz
; i
+= 1) {
3640 uint64_t nn
= n
[i
], mm
= m
[i
];
3641 uint64_t pp
= expand_pred_b(pg
[H1(i
)]);
3642 d
[i
] = (nn
& pp
) | (mm
& ~pp
);
3646 void HELPER(sve_sel_zpzz_h
)(void *vd
, void *vn
, void *vm
,
3647 void *vg
, uint32_t desc
)
3649 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
3650 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
3653 for (i
= 0; i
< opr_sz
; i
+= 1) {
3654 uint64_t nn
= n
[i
], mm
= m
[i
];
3655 uint64_t pp
= expand_pred_h(pg
[H1(i
)]);
3656 d
[i
] = (nn
& pp
) | (mm
& ~pp
);
3660 void HELPER(sve_sel_zpzz_s
)(void *vd
, void *vn
, void *vm
,
3661 void *vg
, uint32_t desc
)
3663 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
3664 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
3667 for (i
= 0; i
< opr_sz
; i
+= 1) {
3668 uint64_t nn
= n
[i
], mm
= m
[i
];
3669 uint64_t pp
= expand_pred_s(pg
[H1(i
)]);
3670 d
[i
] = (nn
& pp
) | (mm
& ~pp
);
3674 void HELPER(sve_sel_zpzz_d
)(void *vd
, void *vn
, void *vm
,
3675 void *vg
, uint32_t desc
)
3677 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
3678 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
3681 for (i
= 0; i
< opr_sz
; i
+= 1) {
3682 uint64_t nn
= n
[i
], mm
= m
[i
];
3683 d
[i
] = (pg
[H1(i
)] & 1 ? nn
: mm
);
3687 /* Two operand comparison controlled by a predicate.
3688 * ??? It is very tempting to want to be able to expand this inline
3689 * with x86 instructions, e.g.
3691 * vcmpeqw zm, zn, %ymm0
3692 * vpmovmskb %ymm0, %eax
3696 * or even aarch64, e.g.
3698 * // mask = 4000 1000 0400 0100 0040 0010 0004 0001
3699 * cmeq v0.8h, zn, zm
3700 * and v0.8h, v0.8h, mask
3704 * However, coming up with an abstraction that allows vector inputs and
3705 * a scalar output, and also handles the byte-ordering of sub-uint64_t
3706 * scalar outputs, is tricky.
3708 #define DO_CMP_PPZZ(NAME, TYPE, OP, H, MASK) \
3709 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3711 intptr_t opr_sz = simd_oprsz(desc); \
3712 uint32_t flags = PREDTEST_INIT; \
3713 intptr_t i = opr_sz; \
3715 uint64_t out = 0, pg; \
3717 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3718 TYPE nn = *(TYPE *)(vn + H(i)); \
3719 TYPE mm = *(TYPE *)(vm + H(i)); \
3722 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3724 *(uint64_t *)(vd + (i >> 3)) = out; \
3725 flags = iter_predtest_bwd(out, pg, flags); \
3730 #define DO_CMP_PPZZ_B(NAME, TYPE, OP) \
3731 DO_CMP_PPZZ(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3732 #define DO_CMP_PPZZ_H(NAME, TYPE, OP) \
3733 DO_CMP_PPZZ(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3734 #define DO_CMP_PPZZ_S(NAME, TYPE, OP) \
3735 DO_CMP_PPZZ(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3736 #define DO_CMP_PPZZ_D(NAME, TYPE, OP) \
3737 DO_CMP_PPZZ(NAME, TYPE, OP, , 0x0101010101010101ull)
3739 DO_CMP_PPZZ_B(sve_cmpeq_ppzz_b
, uint8_t, ==)
3740 DO_CMP_PPZZ_H(sve_cmpeq_ppzz_h
, uint16_t, ==)
3741 DO_CMP_PPZZ_S(sve_cmpeq_ppzz_s
, uint32_t, ==)
3742 DO_CMP_PPZZ_D(sve_cmpeq_ppzz_d
, uint64_t, ==)
3744 DO_CMP_PPZZ_B(sve_cmpne_ppzz_b
, uint8_t, !=)
3745 DO_CMP_PPZZ_H(sve_cmpne_ppzz_h
, uint16_t, !=)
3746 DO_CMP_PPZZ_S(sve_cmpne_ppzz_s
, uint32_t, !=)
3747 DO_CMP_PPZZ_D(sve_cmpne_ppzz_d
, uint64_t, !=)
3749 DO_CMP_PPZZ_B(sve_cmpgt_ppzz_b
, int8_t, >)
3750 DO_CMP_PPZZ_H(sve_cmpgt_ppzz_h
, int16_t, >)
3751 DO_CMP_PPZZ_S(sve_cmpgt_ppzz_s
, int32_t, >)
3752 DO_CMP_PPZZ_D(sve_cmpgt_ppzz_d
, int64_t, >)
3754 DO_CMP_PPZZ_B(sve_cmpge_ppzz_b
, int8_t, >=)
3755 DO_CMP_PPZZ_H(sve_cmpge_ppzz_h
, int16_t, >=)
3756 DO_CMP_PPZZ_S(sve_cmpge_ppzz_s
, int32_t, >=)
3757 DO_CMP_PPZZ_D(sve_cmpge_ppzz_d
, int64_t, >=)
3759 DO_CMP_PPZZ_B(sve_cmphi_ppzz_b
, uint8_t, >)
3760 DO_CMP_PPZZ_H(sve_cmphi_ppzz_h
, uint16_t, >)
3761 DO_CMP_PPZZ_S(sve_cmphi_ppzz_s
, uint32_t, >)
3762 DO_CMP_PPZZ_D(sve_cmphi_ppzz_d
, uint64_t, >)
3764 DO_CMP_PPZZ_B(sve_cmphs_ppzz_b
, uint8_t, >=)
3765 DO_CMP_PPZZ_H(sve_cmphs_ppzz_h
, uint16_t, >=)
3766 DO_CMP_PPZZ_S(sve_cmphs_ppzz_s
, uint32_t, >=)
3767 DO_CMP_PPZZ_D(sve_cmphs_ppzz_d
, uint64_t, >=)
3769 #undef DO_CMP_PPZZ_B
3770 #undef DO_CMP_PPZZ_H
3771 #undef DO_CMP_PPZZ_S
3772 #undef DO_CMP_PPZZ_D
3775 /* Similar, but the second source is "wide". */
3776 #define DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H, MASK) \
3777 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
3779 intptr_t opr_sz = simd_oprsz(desc); \
3780 uint32_t flags = PREDTEST_INIT; \
3781 intptr_t i = opr_sz; \
3783 uint64_t out = 0, pg; \
3785 TYPEW mm = *(TYPEW *)(vm + i - 8); \
3787 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3788 TYPE nn = *(TYPE *)(vn + H(i)); \
3792 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3794 *(uint64_t *)(vd + (i >> 3)) = out; \
3795 flags = iter_predtest_bwd(out, pg, flags); \
3800 #define DO_CMP_PPZW_B(NAME, TYPE, TYPEW, OP) \
3801 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1, 0xffffffffffffffffull)
3802 #define DO_CMP_PPZW_H(NAME, TYPE, TYPEW, OP) \
3803 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_2, 0x5555555555555555ull)
3804 #define DO_CMP_PPZW_S(NAME, TYPE, TYPEW, OP) \
3805 DO_CMP_PPZW(NAME, TYPE, TYPEW, OP, H1_4, 0x1111111111111111ull)
3807 DO_CMP_PPZW_B(sve_cmpeq_ppzw_b
, int8_t, uint64_t, ==)
3808 DO_CMP_PPZW_H(sve_cmpeq_ppzw_h
, int16_t, uint64_t, ==)
3809 DO_CMP_PPZW_S(sve_cmpeq_ppzw_s
, int32_t, uint64_t, ==)
3811 DO_CMP_PPZW_B(sve_cmpne_ppzw_b
, int8_t, uint64_t, !=)
3812 DO_CMP_PPZW_H(sve_cmpne_ppzw_h
, int16_t, uint64_t, !=)
3813 DO_CMP_PPZW_S(sve_cmpne_ppzw_s
, int32_t, uint64_t, !=)
3815 DO_CMP_PPZW_B(sve_cmpgt_ppzw_b
, int8_t, int64_t, >)
3816 DO_CMP_PPZW_H(sve_cmpgt_ppzw_h
, int16_t, int64_t, >)
3817 DO_CMP_PPZW_S(sve_cmpgt_ppzw_s
, int32_t, int64_t, >)
3819 DO_CMP_PPZW_B(sve_cmpge_ppzw_b
, int8_t, int64_t, >=)
3820 DO_CMP_PPZW_H(sve_cmpge_ppzw_h
, int16_t, int64_t, >=)
3821 DO_CMP_PPZW_S(sve_cmpge_ppzw_s
, int32_t, int64_t, >=)
3823 DO_CMP_PPZW_B(sve_cmphi_ppzw_b
, uint8_t, uint64_t, >)
3824 DO_CMP_PPZW_H(sve_cmphi_ppzw_h
, uint16_t, uint64_t, >)
3825 DO_CMP_PPZW_S(sve_cmphi_ppzw_s
, uint32_t, uint64_t, >)
3827 DO_CMP_PPZW_B(sve_cmphs_ppzw_b
, uint8_t, uint64_t, >=)
3828 DO_CMP_PPZW_H(sve_cmphs_ppzw_h
, uint16_t, uint64_t, >=)
3829 DO_CMP_PPZW_S(sve_cmphs_ppzw_s
, uint32_t, uint64_t, >=)
3831 DO_CMP_PPZW_B(sve_cmplt_ppzw_b
, int8_t, int64_t, <)
3832 DO_CMP_PPZW_H(sve_cmplt_ppzw_h
, int16_t, int64_t, <)
3833 DO_CMP_PPZW_S(sve_cmplt_ppzw_s
, int32_t, int64_t, <)
3835 DO_CMP_PPZW_B(sve_cmple_ppzw_b
, int8_t, int64_t, <=)
3836 DO_CMP_PPZW_H(sve_cmple_ppzw_h
, int16_t, int64_t, <=)
3837 DO_CMP_PPZW_S(sve_cmple_ppzw_s
, int32_t, int64_t, <=)
3839 DO_CMP_PPZW_B(sve_cmplo_ppzw_b
, uint8_t, uint64_t, <)
3840 DO_CMP_PPZW_H(sve_cmplo_ppzw_h
, uint16_t, uint64_t, <)
3841 DO_CMP_PPZW_S(sve_cmplo_ppzw_s
, uint32_t, uint64_t, <)
3843 DO_CMP_PPZW_B(sve_cmpls_ppzw_b
, uint8_t, uint64_t, <=)
3844 DO_CMP_PPZW_H(sve_cmpls_ppzw_h
, uint16_t, uint64_t, <=)
3845 DO_CMP_PPZW_S(sve_cmpls_ppzw_s
, uint32_t, uint64_t, <=)
3847 #undef DO_CMP_PPZW_B
3848 #undef DO_CMP_PPZW_H
3849 #undef DO_CMP_PPZW_S
3852 /* Similar, but the second source is immediate. */
3853 #define DO_CMP_PPZI(NAME, TYPE, OP, H, MASK) \
3854 uint32_t HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
3856 intptr_t opr_sz = simd_oprsz(desc); \
3857 uint32_t flags = PREDTEST_INIT; \
3858 TYPE mm = simd_data(desc); \
3859 intptr_t i = opr_sz; \
3861 uint64_t out = 0, pg; \
3863 i -= sizeof(TYPE), out <<= sizeof(TYPE); \
3864 TYPE nn = *(TYPE *)(vn + H(i)); \
3867 pg = *(uint64_t *)(vg + (i >> 3)) & MASK; \
3869 *(uint64_t *)(vd + (i >> 3)) = out; \
3870 flags = iter_predtest_bwd(out, pg, flags); \
3875 #define DO_CMP_PPZI_B(NAME, TYPE, OP) \
3876 DO_CMP_PPZI(NAME, TYPE, OP, H1, 0xffffffffffffffffull)
3877 #define DO_CMP_PPZI_H(NAME, TYPE, OP) \
3878 DO_CMP_PPZI(NAME, TYPE, OP, H1_2, 0x5555555555555555ull)
3879 #define DO_CMP_PPZI_S(NAME, TYPE, OP) \
3880 DO_CMP_PPZI(NAME, TYPE, OP, H1_4, 0x1111111111111111ull)
3881 #define DO_CMP_PPZI_D(NAME, TYPE, OP) \
3882 DO_CMP_PPZI(NAME, TYPE, OP, , 0x0101010101010101ull)
3884 DO_CMP_PPZI_B(sve_cmpeq_ppzi_b
, uint8_t, ==)
3885 DO_CMP_PPZI_H(sve_cmpeq_ppzi_h
, uint16_t, ==)
3886 DO_CMP_PPZI_S(sve_cmpeq_ppzi_s
, uint32_t, ==)
3887 DO_CMP_PPZI_D(sve_cmpeq_ppzi_d
, uint64_t, ==)
3889 DO_CMP_PPZI_B(sve_cmpne_ppzi_b
, uint8_t, !=)
3890 DO_CMP_PPZI_H(sve_cmpne_ppzi_h
, uint16_t, !=)
3891 DO_CMP_PPZI_S(sve_cmpne_ppzi_s
, uint32_t, !=)
3892 DO_CMP_PPZI_D(sve_cmpne_ppzi_d
, uint64_t, !=)
3894 DO_CMP_PPZI_B(sve_cmpgt_ppzi_b
, int8_t, >)
3895 DO_CMP_PPZI_H(sve_cmpgt_ppzi_h
, int16_t, >)
3896 DO_CMP_PPZI_S(sve_cmpgt_ppzi_s
, int32_t, >)
3897 DO_CMP_PPZI_D(sve_cmpgt_ppzi_d
, int64_t, >)
3899 DO_CMP_PPZI_B(sve_cmpge_ppzi_b
, int8_t, >=)
3900 DO_CMP_PPZI_H(sve_cmpge_ppzi_h
, int16_t, >=)
3901 DO_CMP_PPZI_S(sve_cmpge_ppzi_s
, int32_t, >=)
3902 DO_CMP_PPZI_D(sve_cmpge_ppzi_d
, int64_t, >=)
3904 DO_CMP_PPZI_B(sve_cmphi_ppzi_b
, uint8_t, >)
3905 DO_CMP_PPZI_H(sve_cmphi_ppzi_h
, uint16_t, >)
3906 DO_CMP_PPZI_S(sve_cmphi_ppzi_s
, uint32_t, >)
3907 DO_CMP_PPZI_D(sve_cmphi_ppzi_d
, uint64_t, >)
3909 DO_CMP_PPZI_B(sve_cmphs_ppzi_b
, uint8_t, >=)
3910 DO_CMP_PPZI_H(sve_cmphs_ppzi_h
, uint16_t, >=)
3911 DO_CMP_PPZI_S(sve_cmphs_ppzi_s
, uint32_t, >=)
3912 DO_CMP_PPZI_D(sve_cmphs_ppzi_d
, uint64_t, >=)
3914 DO_CMP_PPZI_B(sve_cmplt_ppzi_b
, int8_t, <)
3915 DO_CMP_PPZI_H(sve_cmplt_ppzi_h
, int16_t, <)
3916 DO_CMP_PPZI_S(sve_cmplt_ppzi_s
, int32_t, <)
3917 DO_CMP_PPZI_D(sve_cmplt_ppzi_d
, int64_t, <)
3919 DO_CMP_PPZI_B(sve_cmple_ppzi_b
, int8_t, <=)
3920 DO_CMP_PPZI_H(sve_cmple_ppzi_h
, int16_t, <=)
3921 DO_CMP_PPZI_S(sve_cmple_ppzi_s
, int32_t, <=)
3922 DO_CMP_PPZI_D(sve_cmple_ppzi_d
, int64_t, <=)
3924 DO_CMP_PPZI_B(sve_cmplo_ppzi_b
, uint8_t, <)
3925 DO_CMP_PPZI_H(sve_cmplo_ppzi_h
, uint16_t, <)
3926 DO_CMP_PPZI_S(sve_cmplo_ppzi_s
, uint32_t, <)
3927 DO_CMP_PPZI_D(sve_cmplo_ppzi_d
, uint64_t, <)
3929 DO_CMP_PPZI_B(sve_cmpls_ppzi_b
, uint8_t, <=)
3930 DO_CMP_PPZI_H(sve_cmpls_ppzi_h
, uint16_t, <=)
3931 DO_CMP_PPZI_S(sve_cmpls_ppzi_s
, uint32_t, <=)
3932 DO_CMP_PPZI_D(sve_cmpls_ppzi_d
, uint64_t, <=)
3934 #undef DO_CMP_PPZI_B
3935 #undef DO_CMP_PPZI_H
3936 #undef DO_CMP_PPZI_S
3937 #undef DO_CMP_PPZI_D
/* Similar to the ARM LastActive pseudocode function.
 *
 * Scan the governing predicate VG from the top word down.  At the
 * first non-zero word, report whether VD has the bit set that
 * corresponds to the highest set bit of that word.  Returns false if
 * VG has no bits set.
 */
static bool last_active_pred(void *vd, void *vg, intptr_t oprsz)
{
    intptr_t ofs = QEMU_ALIGN_UP(oprsz, 8) - 8;

    while (ofs >= 0) {
        uint64_t pg = *(uint64_t *)(vg + ofs);

        if (pg) {
            uint64_t top = pow2floor(pg);

            return (top & *(uint64_t *)(vd + ofs)) != 0;
        }
        ofs -= 8;
    }

    return 0;
}
/* Compute a mask into RETB that is true for all G, up to and including
 * (if after) or excluding (if !after) the first G & N.
 * Return true if BRK found.
 *
 * BRK carries the state from earlier words: once a break has been
 * seen, every later word yields an all-zero mask.
 */
static bool compute_brk(uint64_t *retb, uint64_t n, uint64_t g,
                        bool brk, bool after)
{
    uint64_t result;

    if (brk) {
        /* Break already found in an earlier word.  */
        result = 0;
    } else {
        uint64_t hits = g & n;          /* guard true, pred true */

        if (hits == 0) {
            /* For all G, no N are set; break not found.  */
            result = g;
        } else {
            /* Break somewhere in N.  Locate it.  */
            uint64_t first = hits & -hits;   /* first such */

            if (after) {
                result = first | (first - 1);    /* break after same */
            } else {
                result = first - 1;              /* break before same */
            }
            brk = true;
        }
    }

    *retb = result;
    return brk;
}
/* Compute a zeroing BRK.
 *
 * Every 64-bit word of D is written with the break mask restricted to
 * the active bits of G; inactive bits become zero.
 */
static void compute_brk_z(uint64_t *d, uint64_t *n, uint64_t *g,
                          intptr_t oprsz, bool after)
{
    intptr_t w, nwords = DIV_ROUND_UP(oprsz, 8);
    bool found = false;

    for (w = 0; w < nwords; ++w) {
        uint64_t guard = g[w];
        uint64_t mask;

        found = compute_brk(&mask, n[w], guard, found, after);
        d[w] = mask & guard;
    }
}
3999 /* Likewise, but also compute flags. */
4000 static uint32_t compute_brks_z(uint64_t *d
, uint64_t *n
, uint64_t *g
,
4001 intptr_t oprsz
, bool after
)
4003 uint32_t flags
= PREDTEST_INIT
;
4007 for (i
= 0; i
< DIV_ROUND_UP(oprsz
, 8); ++i
) {
4008 uint64_t this_b
, this_d
, this_g
= g
[i
];
4010 brk
= compute_brk(&this_b
, n
[i
], this_g
, brk
, after
);
4011 d
[i
] = this_d
= this_b
& this_g
;
4012 flags
= iter_predtest_fwd(this_d
, this_g
, flags
);
/* Compute a merging BRK.
 *
 * Active bits of D (per G) receive the break mask; inactive bits keep
 * their previous value.
 */
static void compute_brk_m(uint64_t *d, uint64_t *n, uint64_t *g,
                          intptr_t oprsz, bool after)
{
    intptr_t w, nwords = DIV_ROUND_UP(oprsz, 8);
    bool found = false;

    for (w = 0; w < nwords; ++w) {
        uint64_t guard = g[w];
        uint64_t mask;

        found = compute_brk(&mask, n[w], guard, found, after);
        d[w] = (d[w] & ~guard) | (mask & guard);
    }
}
4032 /* Likewise, but also compute flags. */
4033 static uint32_t compute_brks_m(uint64_t *d
, uint64_t *n
, uint64_t *g
,
4034 intptr_t oprsz
, bool after
)
4036 uint32_t flags
= PREDTEST_INIT
;
4040 for (i
= 0; i
< oprsz
/ 8; ++i
) {
4041 uint64_t this_b
, this_d
= d
[i
], this_g
= g
[i
];
4043 brk
= compute_brk(&this_b
, n
[i
], this_g
, brk
, after
);
4044 d
[i
] = this_d
= (this_b
& this_g
) | (this_d
& ~this_g
);
4045 flags
= iter_predtest_fwd(this_d
, this_g
, flags
);
4050 static uint32_t do_zero(ARMPredicateReg
*d
, intptr_t oprsz
)
4052 /* It is quicker to zero the whole predicate than loop on OPRSZ.
4053 * The compiler should turn this into 4 64-bit integer stores.
4055 memset(d
, 0, sizeof(ARMPredicateReg
));
4056 return PREDTEST_INIT
;
4059 void HELPER(sve_brkpa
)(void *vd
, void *vn
, void *vm
, void *vg
,
4062 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
4063 if (last_active_pred(vn
, vg
, oprsz
)) {
4064 compute_brk_z(vd
, vm
, vg
, oprsz
, true);
4070 uint32_t HELPER(sve_brkpas
)(void *vd
, void *vn
, void *vm
, void *vg
,
4073 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
4074 if (last_active_pred(vn
, vg
, oprsz
)) {
4075 return compute_brks_z(vd
, vm
, vg
, oprsz
, true);
4077 return do_zero(vd
, oprsz
);
4081 void HELPER(sve_brkpb
)(void *vd
, void *vn
, void *vm
, void *vg
,
4084 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
4085 if (last_active_pred(vn
, vg
, oprsz
)) {
4086 compute_brk_z(vd
, vm
, vg
, oprsz
, false);
4092 uint32_t HELPER(sve_brkpbs
)(void *vd
, void *vn
, void *vm
, void *vg
,
4095 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
4096 if (last_active_pred(vn
, vg
, oprsz
)) {
4097 return compute_brks_z(vd
, vm
, vg
, oprsz
, false);
4099 return do_zero(vd
, oprsz
);
4103 void HELPER(sve_brka_z
)(void *vd
, void *vn
, void *vg
, uint32_t pred_desc
)
4105 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
4106 compute_brk_z(vd
, vn
, vg
, oprsz
, true);
4109 uint32_t HELPER(sve_brkas_z
)(void *vd
, void *vn
, void *vg
, uint32_t pred_desc
)
4111 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
4112 return compute_brks_z(vd
, vn
, vg
, oprsz
, true);
4115 void HELPER(sve_brkb_z
)(void *vd
, void *vn
, void *vg
, uint32_t pred_desc
)
4117 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
4118 compute_brk_z(vd
, vn
, vg
, oprsz
, false);
4121 uint32_t HELPER(sve_brkbs_z
)(void *vd
, void *vn
, void *vg
, uint32_t pred_desc
)
4123 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
4124 return compute_brks_z(vd
, vn
, vg
, oprsz
, false);
4127 void HELPER(sve_brka_m
)(void *vd
, void *vn
, void *vg
, uint32_t pred_desc
)
4129 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
4130 compute_brk_m(vd
, vn
, vg
, oprsz
, true);
4133 uint32_t HELPER(sve_brkas_m
)(void *vd
, void *vn
, void *vg
, uint32_t pred_desc
)
4135 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
4136 return compute_brks_m(vd
, vn
, vg
, oprsz
, true);
4139 void HELPER(sve_brkb_m
)(void *vd
, void *vn
, void *vg
, uint32_t pred_desc
)
4141 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
4142 compute_brk_m(vd
, vn
, vg
, oprsz
, false);
4145 uint32_t HELPER(sve_brkbs_m
)(void *vd
, void *vn
, void *vg
, uint32_t pred_desc
)
4147 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
4148 return compute_brks_m(vd
, vn
, vg
, oprsz
, false);
4151 void HELPER(sve_brkn
)(void *vd
, void *vn
, void *vg
, uint32_t pred_desc
)
4153 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
4154 if (!last_active_pred(vn
, vg
, oprsz
)) {
4159 /* As if PredTest(Ones(PL), D, esz). */
4160 static uint32_t predtest_ones(ARMPredicateReg
*d
, intptr_t oprsz
,
4163 uint32_t flags
= PREDTEST_INIT
;
4166 for (i
= 0; i
< oprsz
/ 8; i
++) {
4167 flags
= iter_predtest_fwd(d
->p
[i
], esz_mask
, flags
);
4170 uint64_t mask
= ~(-1ULL << (8 * (oprsz
& 7)));
4171 flags
= iter_predtest_fwd(d
->p
[i
], esz_mask
& mask
, flags
);
4176 uint32_t HELPER(sve_brkns
)(void *vd
, void *vn
, void *vg
, uint32_t pred_desc
)
4178 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
4179 if (last_active_pred(vn
, vg
, oprsz
)) {
4180 return predtest_ones(vd
, oprsz
, -1);
4182 return do_zero(vd
, oprsz
);
4186 uint64_t HELPER(sve_cntp
)(void *vn
, void *vg
, uint32_t pred_desc
)
4188 intptr_t words
= DIV_ROUND_UP(FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
), 8);
4189 intptr_t esz
= FIELD_EX32(pred_desc
, PREDDESC
, ESZ
);
4190 uint64_t *n
= vn
, *g
= vg
, sum
= 0, mask
= pred_esz_masks
[esz
];
4193 for (i
= 0; i
< words
; ++i
) {
4194 uint64_t t
= n
[i
] & g
[i
] & mask
;
4200 uint32_t HELPER(sve_whilel
)(void *vd
, uint32_t count
, uint32_t pred_desc
)
4202 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
4203 intptr_t esz
= FIELD_EX32(pred_desc
, PREDDESC
, ESZ
);
4204 uint64_t esz_mask
= pred_esz_masks
[esz
];
4205 ARMPredicateReg
*d
= vd
;
4209 /* Begin with a zero predicate register. */
4210 flags
= do_zero(d
, oprsz
);
4215 /* Set all of the requested bits. */
4216 for (i
= 0; i
< count
/ 64; ++i
) {
4220 d
->p
[i
] = MAKE_64BIT_MASK(0, count
& 63) & esz_mask
;
4223 return predtest_ones(d
, oprsz
, esz_mask
);
4226 uint32_t HELPER(sve_whileg
)(void *vd
, uint32_t count
, uint32_t pred_desc
)
4228 intptr_t oprsz
= FIELD_EX32(pred_desc
, PREDDESC
, OPRSZ
);
4229 intptr_t esz
= FIELD_EX32(pred_desc
, PREDDESC
, ESZ
);
4230 uint64_t esz_mask
= pred_esz_masks
[esz
];
4231 ARMPredicateReg
*d
= vd
;
4232 intptr_t i
, invcount
, oprbits
;
4236 return do_zero(d
, oprsz
);
4239 oprbits
= oprsz
* 8;
4240 tcg_debug_assert(count
<= oprbits
);
4244 bits
&= MAKE_64BIT_MASK(0, oprbits
& 63);
4247 invcount
= oprbits
- count
;
4248 for (i
= (oprsz
- 1) / 8; i
> invcount
/ 64; --i
) {
4253 d
->p
[i
] = bits
& MAKE_64BIT_MASK(invcount
& 63, 64);
4259 return predtest_ones(d
, oprsz
, esz_mask
);
/*
 * Recursive reduction on a function;
 * C.f. the ARM ARM function ReducePredicated.
 *
 * While it would be possible to write this without the DATA temporary,
 * it is much simpler to process the predicate register this way.
 * The recursion is bounded to depth 7 (128 fp16 elements), so there's
 * little to gain with a more complex non-recursive form.
 *
 * The helper copies VN into DATA, substituting IDENT for every element
 * whose predicate bit is clear and for the padding between OPRSZ and
 * MAXSZ, then reduces DATA pairwise with TYPE##_##FUNC.
 */
#define DO_REDUCE(NAME, TYPE, H, FUNC, IDENT)                            \
static TYPE NAME##_reduce(TYPE *data, float_status *status, uintptr_t n) \
{                                                                        \
    uintptr_t half;                                                      \
    TYPE lo, hi;                                                         \
    if (n == 1) {                                                        \
        return *data;                                                    \
    }                                                                    \
    half = n / 2;                                                        \
    lo = NAME##_reduce(data, status, half);                              \
    hi = NAME##_reduce(data + half, status, half);                       \
    return TYPE##_##FUNC(lo, hi, status);                                \
}                                                                        \
uint64_t HELPER(NAME)(void *vn, void *vg, void *vs, uint32_t desc)       \
{                                                                        \
    uintptr_t i, oprsz = simd_oprsz(desc), maxsz = simd_data(desc);      \
    TYPE data[sizeof(ARMVectorReg) / sizeof(TYPE)];                      \
    for (i = 0; i < oprsz; ) {                                           \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));                  \
        do {                                                             \
            TYPE nn = *(TYPE *)(vn + H(i));                              \
            *(TYPE *)((void *)data + i) = (pg & 1 ? nn : IDENT);         \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);                      \
        } while (i & 15);                                                \
    }                                                                    \
    for (; i < maxsz; i += sizeof(TYPE)) {                               \
        *(TYPE *)((void *)data + i) = IDENT;                             \
    }                                                                    \
    return NAME##_reduce(data, vs, maxsz / sizeof(TYPE));                \
}
4300 DO_REDUCE(sve_faddv_h
, float16
, H1_2
, add
, float16_zero
)
4301 DO_REDUCE(sve_faddv_s
, float32
, H1_4
, add
, float32_zero
)
4302 DO_REDUCE(sve_faddv_d
, float64
, , add
, float64_zero
)
4304 /* Identity is floatN_default_nan, without the function call. */
4305 DO_REDUCE(sve_fminnmv_h
, float16
, H1_2
, minnum
, 0x7E00)
4306 DO_REDUCE(sve_fminnmv_s
, float32
, H1_4
, minnum
, 0x7FC00000)
4307 DO_REDUCE(sve_fminnmv_d
, float64
, , minnum
, 0x7FF8000000000000ULL
)
4309 DO_REDUCE(sve_fmaxnmv_h
, float16
, H1_2
, maxnum
, 0x7E00)
4310 DO_REDUCE(sve_fmaxnmv_s
, float32
, H1_4
, maxnum
, 0x7FC00000)
4311 DO_REDUCE(sve_fmaxnmv_d
, float64
, , maxnum
, 0x7FF8000000000000ULL
)
4313 DO_REDUCE(sve_fminv_h
, float16
, H1_2
, min
, float16_infinity
)
4314 DO_REDUCE(sve_fminv_s
, float32
, H1_4
, min
, float32_infinity
)
4315 DO_REDUCE(sve_fminv_d
, float64
, , min
, float64_infinity
)
4317 DO_REDUCE(sve_fmaxv_h
, float16
, H1_2
, max
, float16_chs(float16_infinity
))
4318 DO_REDUCE(sve_fmaxv_s
, float32
, H1_4
, max
, float32_chs(float32_infinity
))
4319 DO_REDUCE(sve_fmaxv_d
, float64
, , max
, float64_chs(float64_infinity
))
4323 uint64_t HELPER(sve_fadda_h
)(uint64_t nn
, void *vm
, void *vg
,
4324 void *status
, uint32_t desc
)
4326 intptr_t i
= 0, opr_sz
= simd_oprsz(desc
);
4327 float16 result
= nn
;
4330 uint16_t pg
= *(uint16_t *)(vg
+ H1_2(i
>> 3));
4333 float16 mm
= *(float16
*)(vm
+ H1_2(i
));
4334 result
= float16_add(result
, mm
, status
);
4336 i
+= sizeof(float16
), pg
>>= sizeof(float16
);
4338 } while (i
< opr_sz
);
4343 uint64_t HELPER(sve_fadda_s
)(uint64_t nn
, void *vm
, void *vg
,
4344 void *status
, uint32_t desc
)
4346 intptr_t i
= 0, opr_sz
= simd_oprsz(desc
);
4347 float32 result
= nn
;
4350 uint16_t pg
= *(uint16_t *)(vg
+ H1_2(i
>> 3));
4353 float32 mm
= *(float32
*)(vm
+ H1_2(i
));
4354 result
= float32_add(result
, mm
, status
);
4356 i
+= sizeof(float32
), pg
>>= sizeof(float32
);
4358 } while (i
< opr_sz
);
4363 uint64_t HELPER(sve_fadda_d
)(uint64_t nn
, void *vm
, void *vg
,
4364 void *status
, uint32_t desc
)
4366 intptr_t i
= 0, opr_sz
= simd_oprsz(desc
) / 8;
4370 for (i
= 0; i
< opr_sz
; i
++) {
4371 if (pg
[H1(i
)] & 1) {
4372 nn
= float64_add(nn
, m
[i
], status
);
/*
 * Fully general three-operand expander, controlled by a predicate,
 * with the extra float_status parameter.
 *
 * Elements are visited from the highest byte offset down to zero, one
 * 64-bit predicate word at a time; only elements whose predicate bit is
 * set are computed and written to VD.
 */
#define DO_ZPZZ_FP(NAME, TYPE, H, OP)                           \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,       \
                  void *status, uint32_t desc)                  \
{                                                               \
    intptr_t i = simd_oprsz(desc);                              \
    uint64_t *g = vg;                                           \
    do {                                                        \
        uint64_t pg = g[(i - 1) >> 6];                          \
        do {                                                    \
            i -= sizeof(TYPE);                                  \
            if (likely((pg >> (i & 63)) & 1)) {                 \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                TYPE mm = *(TYPE *)(vm + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn, mm, status);      \
            }                                                   \
        } while (i & 63);                                       \
    } while (i != 0);                                           \
}
4401 DO_ZPZZ_FP(sve_fadd_h
, uint16_t, H1_2
, float16_add
)
4402 DO_ZPZZ_FP(sve_fadd_s
, uint32_t, H1_4
, float32_add
)
4403 DO_ZPZZ_FP(sve_fadd_d
, uint64_t, , float64_add
)
4405 DO_ZPZZ_FP(sve_fsub_h
, uint16_t, H1_2
, float16_sub
)
4406 DO_ZPZZ_FP(sve_fsub_s
, uint32_t, H1_4
, float32_sub
)
4407 DO_ZPZZ_FP(sve_fsub_d
, uint64_t, , float64_sub
)
4409 DO_ZPZZ_FP(sve_fmul_h
, uint16_t, H1_2
, float16_mul
)
4410 DO_ZPZZ_FP(sve_fmul_s
, uint32_t, H1_4
, float32_mul
)
4411 DO_ZPZZ_FP(sve_fmul_d
, uint64_t, , float64_mul
)
4413 DO_ZPZZ_FP(sve_fdiv_h
, uint16_t, H1_2
, float16_div
)
4414 DO_ZPZZ_FP(sve_fdiv_s
, uint32_t, H1_4
, float32_div
)
4415 DO_ZPZZ_FP(sve_fdiv_d
, uint64_t, , float64_div
)
4417 DO_ZPZZ_FP(sve_fmin_h
, uint16_t, H1_2
, float16_min
)
4418 DO_ZPZZ_FP(sve_fmin_s
, uint32_t, H1_4
, float32_min
)
4419 DO_ZPZZ_FP(sve_fmin_d
, uint64_t, , float64_min
)
4421 DO_ZPZZ_FP(sve_fmax_h
, uint16_t, H1_2
, float16_max
)
4422 DO_ZPZZ_FP(sve_fmax_s
, uint32_t, H1_4
, float32_max
)
4423 DO_ZPZZ_FP(sve_fmax_d
, uint64_t, , float64_max
)
4425 DO_ZPZZ_FP(sve_fminnum_h
, uint16_t, H1_2
, float16_minnum
)
4426 DO_ZPZZ_FP(sve_fminnum_s
, uint32_t, H1_4
, float32_minnum
)
4427 DO_ZPZZ_FP(sve_fminnum_d
, uint64_t, , float64_minnum
)
4429 DO_ZPZZ_FP(sve_fmaxnum_h
, uint16_t, H1_2
, float16_maxnum
)
4430 DO_ZPZZ_FP(sve_fmaxnum_s
, uint32_t, H1_4
, float32_maxnum
)
4431 DO_ZPZZ_FP(sve_fmaxnum_d
, uint64_t, , float64_maxnum
)
4433 static inline float16
abd_h(float16 a
, float16 b
, float_status
*s
)
4435 return float16_abs(float16_sub(a
, b
, s
));
4438 static inline float32
abd_s(float32 a
, float32 b
, float_status
*s
)
4440 return float32_abs(float32_sub(a
, b
, s
));
4443 static inline float64
abd_d(float64 a
, float64 b
, float_status
*s
)
4445 return float64_abs(float64_sub(a
, b
, s
));
4448 DO_ZPZZ_FP(sve_fabd_h
, uint16_t, H1_2
, abd_h
)
4449 DO_ZPZZ_FP(sve_fabd_s
, uint32_t, H1_4
, abd_s
)
4450 DO_ZPZZ_FP(sve_fabd_d
, uint64_t, , abd_d
)
4452 static inline float64
scalbn_d(float64 a
, int64_t b
, float_status
*s
)
4454 int b_int
= MIN(MAX(b
, INT_MIN
), INT_MAX
);
4455 return float64_scalbn(a
, b_int
, s
);
4458 DO_ZPZZ_FP(sve_fscalbn_h
, int16_t, H1_2
, float16_scalbn
)
4459 DO_ZPZZ_FP(sve_fscalbn_s
, int32_t, H1_4
, float32_scalbn
)
4460 DO_ZPZZ_FP(sve_fscalbn_d
, int64_t, , scalbn_d
)
4462 DO_ZPZZ_FP(sve_fmulx_h
, uint16_t, H1_2
, helper_advsimd_mulxh
)
4463 DO_ZPZZ_FP(sve_fmulx_s
, uint32_t, H1_4
, helper_vfp_mulxs
)
4464 DO_ZPZZ_FP(sve_fmulx_d
, uint64_t, , helper_vfp_mulxd
)
/*
 * Three-operand expander, with one scalar operand, controlled by
 * a predicate, with the extra float_status parameter.
 *
 * SCALAR is converted to TYPE once and used as the second operand for
 * every active element; inactive elements of VD are not written.
 */
#define DO_ZPZS_FP(NAME, TYPE, H, OP)                             \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint64_t scalar,  \
                  void *status, uint32_t desc)                    \
{                                                                 \
    intptr_t i = simd_oprsz(desc);                                \
    uint64_t *g = vg;                                             \
    TYPE mm = scalar;                                             \
    do {                                                          \
        uint64_t pg = g[(i - 1) >> 6];                            \
        do {                                                      \
            i -= sizeof(TYPE);                                    \
            if (likely((pg >> (i & 63)) & 1)) {                   \
                TYPE nn = *(TYPE *)(vn + H(i));                   \
                *(TYPE *)(vd + H(i)) = OP(nn, mm, status);        \
            }                                                     \
        } while (i & 63);                                         \
    } while (i != 0);                                             \
}
4490 DO_ZPZS_FP(sve_fadds_h
, float16
, H1_2
, float16_add
)
4491 DO_ZPZS_FP(sve_fadds_s
, float32
, H1_4
, float32_add
)
4492 DO_ZPZS_FP(sve_fadds_d
, float64
, , float64_add
)
4494 DO_ZPZS_FP(sve_fsubs_h
, float16
, H1_2
, float16_sub
)
4495 DO_ZPZS_FP(sve_fsubs_s
, float32
, H1_4
, float32_sub
)
4496 DO_ZPZS_FP(sve_fsubs_d
, float64
, , float64_sub
)
4498 DO_ZPZS_FP(sve_fmuls_h
, float16
, H1_2
, float16_mul
)
4499 DO_ZPZS_FP(sve_fmuls_s
, float32
, H1_4
, float32_mul
)
4500 DO_ZPZS_FP(sve_fmuls_d
, float64
, , float64_mul
)
4502 static inline float16
subr_h(float16 a
, float16 b
, float_status
*s
)
4504 return float16_sub(b
, a
, s
);
4507 static inline float32
subr_s(float32 a
, float32 b
, float_status
*s
)
4509 return float32_sub(b
, a
, s
);
4512 static inline float64
subr_d(float64 a
, float64 b
, float_status
*s
)
4514 return float64_sub(b
, a
, s
);
4517 DO_ZPZS_FP(sve_fsubrs_h
, float16
, H1_2
, subr_h
)
4518 DO_ZPZS_FP(sve_fsubrs_s
, float32
, H1_4
, subr_s
)
4519 DO_ZPZS_FP(sve_fsubrs_d
, float64
, , subr_d
)
4521 DO_ZPZS_FP(sve_fmaxnms_h
, float16
, H1_2
, float16_maxnum
)
4522 DO_ZPZS_FP(sve_fmaxnms_s
, float32
, H1_4
, float32_maxnum
)
4523 DO_ZPZS_FP(sve_fmaxnms_d
, float64
, , float64_maxnum
)
4525 DO_ZPZS_FP(sve_fminnms_h
, float16
, H1_2
, float16_minnum
)
4526 DO_ZPZS_FP(sve_fminnms_s
, float32
, H1_4
, float32_minnum
)
4527 DO_ZPZS_FP(sve_fminnms_d
, float64
, , float64_minnum
)
4529 DO_ZPZS_FP(sve_fmaxs_h
, float16
, H1_2
, float16_max
)
4530 DO_ZPZS_FP(sve_fmaxs_s
, float32
, H1_4
, float32_max
)
4531 DO_ZPZS_FP(sve_fmaxs_d
, float64
, , float64_max
)
4533 DO_ZPZS_FP(sve_fmins_h
, float16
, H1_2
, float16_min
)
4534 DO_ZPZS_FP(sve_fmins_s
, float32
, H1_4
, float32_min
)
4535 DO_ZPZS_FP(sve_fmins_d
, float64
, , float64_min
)
/*
 * Fully general two-operand expander, controlled by a predicate,
 * with the extra float_status parameter.
 *
 * Applies OP to each active element of VN and stores the result to the
 * same element of VD; inactive elements of VD are not written.
 */
#define DO_ZPZ_FP(NAME, TYPE, H, OP)                                  \
void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
{                                                                     \
    intptr_t i = simd_oprsz(desc);                                    \
    uint64_t *g = vg;                                                 \
    do {                                                              \
        uint64_t pg = g[(i - 1) >> 6];                                \
        do {                                                          \
            i -= sizeof(TYPE);                                        \
            if (likely((pg >> (i & 63)) & 1)) {                       \
                TYPE nn = *(TYPE *)(vn + H(i));                       \
                *(TYPE *)(vd + H(i)) = OP(nn, status);                \
            }                                                         \
        } while (i & 63);                                             \
    } while (i != 0);                                                 \
}
4557 /* SVE fp16 conversions always use IEEE mode. Like AdvSIMD, they ignore
4558 * FZ16. When converting from fp16, this affects flushing input denormals;
4559 * when converting to fp16, this affects flushing output denormals.
4561 static inline float32
sve_f16_to_f32(float16 f
, float_status
*fpst
)
4563 bool save
= get_flush_inputs_to_zero(fpst
);
4566 set_flush_inputs_to_zero(false, fpst
);
4567 ret
= float16_to_float32(f
, true, fpst
);
4568 set_flush_inputs_to_zero(save
, fpst
);
4572 static inline float64
sve_f16_to_f64(float16 f
, float_status
*fpst
)
4574 bool save
= get_flush_inputs_to_zero(fpst
);
4577 set_flush_inputs_to_zero(false, fpst
);
4578 ret
= float16_to_float64(f
, true, fpst
);
4579 set_flush_inputs_to_zero(save
, fpst
);
4583 static inline float16
sve_f32_to_f16(float32 f
, float_status
*fpst
)
4585 bool save
= get_flush_to_zero(fpst
);
4588 set_flush_to_zero(false, fpst
);
4589 ret
= float32_to_float16(f
, true, fpst
);
4590 set_flush_to_zero(save
, fpst
);
4594 static inline float16
sve_f64_to_f16(float64 f
, float_status
*fpst
)
4596 bool save
= get_flush_to_zero(fpst
);
4599 set_flush_to_zero(false, fpst
);
4600 ret
= float64_to_float16(f
, true, fpst
);
4601 set_flush_to_zero(save
, fpst
);
4605 static inline int16_t vfp_float16_to_int16_rtz(float16 f
, float_status
*s
)
4607 if (float16_is_any_nan(f
)) {
4608 float_raise(float_flag_invalid
, s
);
4611 return float16_to_int16_round_to_zero(f
, s
);
4614 static inline int64_t vfp_float16_to_int64_rtz(float16 f
, float_status
*s
)
4616 if (float16_is_any_nan(f
)) {
4617 float_raise(float_flag_invalid
, s
);
4620 return float16_to_int64_round_to_zero(f
, s
);
4623 static inline int64_t vfp_float32_to_int64_rtz(float32 f
, float_status
*s
)
4625 if (float32_is_any_nan(f
)) {
4626 float_raise(float_flag_invalid
, s
);
4629 return float32_to_int64_round_to_zero(f
, s
);
4632 static inline int64_t vfp_float64_to_int64_rtz(float64 f
, float_status
*s
)
4634 if (float64_is_any_nan(f
)) {
4635 float_raise(float_flag_invalid
, s
);
4638 return float64_to_int64_round_to_zero(f
, s
);
4641 static inline uint16_t vfp_float16_to_uint16_rtz(float16 f
, float_status
*s
)
4643 if (float16_is_any_nan(f
)) {
4644 float_raise(float_flag_invalid
, s
);
4647 return float16_to_uint16_round_to_zero(f
, s
);
4650 static inline uint64_t vfp_float16_to_uint64_rtz(float16 f
, float_status
*s
)
4652 if (float16_is_any_nan(f
)) {
4653 float_raise(float_flag_invalid
, s
);
4656 return float16_to_uint64_round_to_zero(f
, s
);
4659 static inline uint64_t vfp_float32_to_uint64_rtz(float32 f
, float_status
*s
)
4661 if (float32_is_any_nan(f
)) {
4662 float_raise(float_flag_invalid
, s
);
4665 return float32_to_uint64_round_to_zero(f
, s
);
4668 static inline uint64_t vfp_float64_to_uint64_rtz(float64 f
, float_status
*s
)
4670 if (float64_is_any_nan(f
)) {
4671 float_raise(float_flag_invalid
, s
);
4674 return float64_to_uint64_round_to_zero(f
, s
);
4677 DO_ZPZ_FP(sve_fcvt_sh
, uint32_t, H1_4
, sve_f32_to_f16
)
4678 DO_ZPZ_FP(sve_fcvt_hs
, uint32_t, H1_4
, sve_f16_to_f32
)
4679 DO_ZPZ_FP(sve_fcvt_dh
, uint64_t, , sve_f64_to_f16
)
4680 DO_ZPZ_FP(sve_fcvt_hd
, uint64_t, , sve_f16_to_f64
)
4681 DO_ZPZ_FP(sve_fcvt_ds
, uint64_t, , float64_to_float32
)
4682 DO_ZPZ_FP(sve_fcvt_sd
, uint64_t, , float32_to_float64
)
4684 DO_ZPZ_FP(sve_fcvtzs_hh
, uint16_t, H1_2
, vfp_float16_to_int16_rtz
)
4685 DO_ZPZ_FP(sve_fcvtzs_hs
, uint32_t, H1_4
, helper_vfp_tosizh
)
4686 DO_ZPZ_FP(sve_fcvtzs_ss
, uint32_t, H1_4
, helper_vfp_tosizs
)
4687 DO_ZPZ_FP(sve_fcvtzs_hd
, uint64_t, , vfp_float16_to_int64_rtz
)
4688 DO_ZPZ_FP(sve_fcvtzs_sd
, uint64_t, , vfp_float32_to_int64_rtz
)
4689 DO_ZPZ_FP(sve_fcvtzs_ds
, uint64_t, , helper_vfp_tosizd
)
4690 DO_ZPZ_FP(sve_fcvtzs_dd
, uint64_t, , vfp_float64_to_int64_rtz
)
4692 DO_ZPZ_FP(sve_fcvtzu_hh
, uint16_t, H1_2
, vfp_float16_to_uint16_rtz
)
4693 DO_ZPZ_FP(sve_fcvtzu_hs
, uint32_t, H1_4
, helper_vfp_touizh
)
4694 DO_ZPZ_FP(sve_fcvtzu_ss
, uint32_t, H1_4
, helper_vfp_touizs
)
4695 DO_ZPZ_FP(sve_fcvtzu_hd
, uint64_t, , vfp_float16_to_uint64_rtz
)
4696 DO_ZPZ_FP(sve_fcvtzu_sd
, uint64_t, , vfp_float32_to_uint64_rtz
)
4697 DO_ZPZ_FP(sve_fcvtzu_ds
, uint64_t, , helper_vfp_touizd
)
4698 DO_ZPZ_FP(sve_fcvtzu_dd
, uint64_t, , vfp_float64_to_uint64_rtz
)
4700 DO_ZPZ_FP(sve_frint_h
, uint16_t, H1_2
, helper_advsimd_rinth
)
4701 DO_ZPZ_FP(sve_frint_s
, uint32_t, H1_4
, helper_rints
)
4702 DO_ZPZ_FP(sve_frint_d
, uint64_t, , helper_rintd
)
4704 DO_ZPZ_FP(sve_frintx_h
, uint16_t, H1_2
, float16_round_to_int
)
4705 DO_ZPZ_FP(sve_frintx_s
, uint32_t, H1_4
, float32_round_to_int
)
4706 DO_ZPZ_FP(sve_frintx_d
, uint64_t, , float64_round_to_int
)
4708 DO_ZPZ_FP(sve_frecpx_h
, uint16_t, H1_2
, helper_frecpx_f16
)
4709 DO_ZPZ_FP(sve_frecpx_s
, uint32_t, H1_4
, helper_frecpx_f32
)
4710 DO_ZPZ_FP(sve_frecpx_d
, uint64_t, , helper_frecpx_f64
)
4712 DO_ZPZ_FP(sve_fsqrt_h
, uint16_t, H1_2
, float16_sqrt
)
4713 DO_ZPZ_FP(sve_fsqrt_s
, uint32_t, H1_4
, float32_sqrt
)
4714 DO_ZPZ_FP(sve_fsqrt_d
, uint64_t, , float64_sqrt
)
4716 DO_ZPZ_FP(sve_scvt_hh
, uint16_t, H1_2
, int16_to_float16
)
4717 DO_ZPZ_FP(sve_scvt_sh
, uint32_t, H1_4
, int32_to_float16
)
4718 DO_ZPZ_FP(sve_scvt_ss
, uint32_t, H1_4
, int32_to_float32
)
4719 DO_ZPZ_FP(sve_scvt_sd
, uint64_t, , int32_to_float64
)
4720 DO_ZPZ_FP(sve_scvt_dh
, uint64_t, , int64_to_float16
)
4721 DO_ZPZ_FP(sve_scvt_ds
, uint64_t, , int64_to_float32
)
4722 DO_ZPZ_FP(sve_scvt_dd
, uint64_t, , int64_to_float64
)
4724 DO_ZPZ_FP(sve_ucvt_hh
, uint16_t, H1_2
, uint16_to_float16
)
4725 DO_ZPZ_FP(sve_ucvt_sh
, uint32_t, H1_4
, uint32_to_float16
)
4726 DO_ZPZ_FP(sve_ucvt_ss
, uint32_t, H1_4
, uint32_to_float32
)
4727 DO_ZPZ_FP(sve_ucvt_sd
, uint64_t, , uint32_to_float64
)
4728 DO_ZPZ_FP(sve_ucvt_dh
, uint64_t, , uint64_to_float16
)
4729 DO_ZPZ_FP(sve_ucvt_ds
, uint64_t, , uint64_to_float32
)
4730 DO_ZPZ_FP(sve_ucvt_dd
, uint64_t, , uint64_to_float64
)
4734 static void do_fmla_zpzzz_h(void *vd
, void *vn
, void *vm
, void *va
, void *vg
,
4735 float_status
*status
, uint32_t desc
,
4736 uint16_t neg1
, uint16_t neg3
)
4738 intptr_t i
= simd_oprsz(desc
);
4742 uint64_t pg
= g
[(i
- 1) >> 6];
4745 if (likely((pg
>> (i
& 63)) & 1)) {
4746 float16 e1
, e2
, e3
, r
;
4748 e1
= *(uint16_t *)(vn
+ H1_2(i
)) ^ neg1
;
4749 e2
= *(uint16_t *)(vm
+ H1_2(i
));
4750 e3
= *(uint16_t *)(va
+ H1_2(i
)) ^ neg3
;
4751 r
= float16_muladd(e1
, e2
, e3
, 0, status
);
4752 *(uint16_t *)(vd
+ H1_2(i
)) = r
;
4758 void HELPER(sve_fmla_zpzzz_h
)(void *vd
, void *vn
, void *vm
, void *va
,
4759 void *vg
, void *status
, uint32_t desc
)
4761 do_fmla_zpzzz_h(vd
, vn
, vm
, va
, vg
, status
, desc
, 0, 0);
4764 void HELPER(sve_fmls_zpzzz_h
)(void *vd
, void *vn
, void *vm
, void *va
,
4765 void *vg
, void *status
, uint32_t desc
)
4767 do_fmla_zpzzz_h(vd
, vn
, vm
, va
, vg
, status
, desc
, 0x8000, 0);
4770 void HELPER(sve_fnmla_zpzzz_h
)(void *vd
, void *vn
, void *vm
, void *va
,
4771 void *vg
, void *status
, uint32_t desc
)
4773 do_fmla_zpzzz_h(vd
, vn
, vm
, va
, vg
, status
, desc
, 0x8000, 0x8000);
4776 void HELPER(sve_fnmls_zpzzz_h
)(void *vd
, void *vn
, void *vm
, void *va
,
4777 void *vg
, void *status
, uint32_t desc
)
4779 do_fmla_zpzzz_h(vd
, vn
, vm
, va
, vg
, status
, desc
, 0, 0x8000);
4782 static void do_fmla_zpzzz_s(void *vd
, void *vn
, void *vm
, void *va
, void *vg
,
4783 float_status
*status
, uint32_t desc
,
4784 uint32_t neg1
, uint32_t neg3
)
4786 intptr_t i
= simd_oprsz(desc
);
4790 uint64_t pg
= g
[(i
- 1) >> 6];
4793 if (likely((pg
>> (i
& 63)) & 1)) {
4794 float32 e1
, e2
, e3
, r
;
4796 e1
= *(uint32_t *)(vn
+ H1_4(i
)) ^ neg1
;
4797 e2
= *(uint32_t *)(vm
+ H1_4(i
));
4798 e3
= *(uint32_t *)(va
+ H1_4(i
)) ^ neg3
;
4799 r
= float32_muladd(e1
, e2
, e3
, 0, status
);
4800 *(uint32_t *)(vd
+ H1_4(i
)) = r
;
4806 void HELPER(sve_fmla_zpzzz_s
)(void *vd
, void *vn
, void *vm
, void *va
,
4807 void *vg
, void *status
, uint32_t desc
)
4809 do_fmla_zpzzz_s(vd
, vn
, vm
, va
, vg
, status
, desc
, 0, 0);
4812 void HELPER(sve_fmls_zpzzz_s
)(void *vd
, void *vn
, void *vm
, void *va
,
4813 void *vg
, void *status
, uint32_t desc
)
4815 do_fmla_zpzzz_s(vd
, vn
, vm
, va
, vg
, status
, desc
, 0x80000000, 0);
4818 void HELPER(sve_fnmla_zpzzz_s
)(void *vd
, void *vn
, void *vm
, void *va
,
4819 void *vg
, void *status
, uint32_t desc
)
4821 do_fmla_zpzzz_s(vd
, vn
, vm
, va
, vg
, status
, desc
, 0x80000000, 0x80000000);
4824 void HELPER(sve_fnmls_zpzzz_s
)(void *vd
, void *vn
, void *vm
, void *va
,
4825 void *vg
, void *status
, uint32_t desc
)
4827 do_fmla_zpzzz_s(vd
, vn
, vm
, va
, vg
, status
, desc
, 0, 0x80000000);
4830 static void do_fmla_zpzzz_d(void *vd
, void *vn
, void *vm
, void *va
, void *vg
,
4831 float_status
*status
, uint32_t desc
,
4832 uint64_t neg1
, uint64_t neg3
)
4834 intptr_t i
= simd_oprsz(desc
);
4838 uint64_t pg
= g
[(i
- 1) >> 6];
4841 if (likely((pg
>> (i
& 63)) & 1)) {
4842 float64 e1
, e2
, e3
, r
;
4844 e1
= *(uint64_t *)(vn
+ i
) ^ neg1
;
4845 e2
= *(uint64_t *)(vm
+ i
);
4846 e3
= *(uint64_t *)(va
+ i
) ^ neg3
;
4847 r
= float64_muladd(e1
, e2
, e3
, 0, status
);
4848 *(uint64_t *)(vd
+ i
) = r
;
4854 void HELPER(sve_fmla_zpzzz_d
)(void *vd
, void *vn
, void *vm
, void *va
,
4855 void *vg
, void *status
, uint32_t desc
)
4857 do_fmla_zpzzz_d(vd
, vn
, vm
, va
, vg
, status
, desc
, 0, 0);
4860 void HELPER(sve_fmls_zpzzz_d
)(void *vd
, void *vn
, void *vm
, void *va
,
4861 void *vg
, void *status
, uint32_t desc
)
4863 do_fmla_zpzzz_d(vd
, vn
, vm
, va
, vg
, status
, desc
, INT64_MIN
, 0);
4866 void HELPER(sve_fnmla_zpzzz_d
)(void *vd
, void *vn
, void *vm
, void *va
,
4867 void *vg
, void *status
, uint32_t desc
)
4869 do_fmla_zpzzz_d(vd
, vn
, vm
, va
, vg
, status
, desc
, INT64_MIN
, INT64_MIN
);
4872 void HELPER(sve_fnmls_zpzzz_d
)(void *vd
, void *vn
, void *vm
, void *va
,
4873 void *vg
, void *status
, uint32_t desc
)
4875 do_fmla_zpzzz_d(vd
, vn
, vm
, va
, vg
, status
, desc
, 0, INT64_MIN
);
/*
 * Two operand floating-point comparison controlled by a predicate.
 * Unlike the integer version, we are not allowed to optimistically
 * compare operands, since the comparison may have side effects wrt
 * the floating-point status, so OP is evaluated for active elements only.
 *
 * The result bits for one 64-bit predicate word are accumulated in OUT
 * (shifted up by sizeof(TYPE) per element, highest element first) and
 * stored to D once the word is complete.
 */
#define DO_FPCMP_PPZZ(NAME, TYPE, H, OP)                                \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg,               \
                  void *status, uint32_t desc)                          \
{                                                                       \
    intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;                    \
    uint64_t *d = vd, *g = vg;                                          \
    do {                                                                \
        uint64_t out = 0, pg = g[j];                                    \
        do {                                                            \
            i -= sizeof(TYPE), out <<= sizeof(TYPE);                    \
            if (likely((pg >> (i & 63)) & 1)) {                         \
                TYPE nn = *(TYPE *)(vn + H(i));                         \
                TYPE mm = *(TYPE *)(vm + H(i));                         \
                out |= OP(TYPE, nn, mm, status);                        \
            }                                                           \
        } while (i & 63);                                               \
        d[j--] = out;                                                   \
    } while (i > 0);                                                    \
}
/* Per-size shorthands for DO_FPCMP_PPZZ, and one that emits all three. */
#define DO_FPCMP_PPZZ_H(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_h, float16, H1_2, OP)
#define DO_FPCMP_PPZZ_S(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_s, float32, H1_4, OP)
#define DO_FPCMP_PPZZ_D(NAME, OP) \
    DO_FPCMP_PPZZ(NAME##_d, float64,     , OP)

#define DO_FPCMP_PPZZ_ALL(NAME, OP) \
    DO_FPCMP_PPZZ_H(NAME, OP)   \
    DO_FPCMP_PPZZ_S(NAME, OP)   \
    DO_FPCMP_PPZZ_D(NAME, OP)
/*
 * Comparison predicates used as the OP argument of the DO_FPCMP_*
 * expanders.  Each evaluates to 0 or 1.
 *
 * GE/GT are expressed by swapping the operands of a signalling compare;
 * EQ/NE/UO use the quiet compare; the FAC* forms compare absolute values.
 *
 * Every expansion is fully parenthesized so that the macros remain
 * correct when used inside a larger expression (e.g. under '!', '*' or
 * '&&'), not just as the right-hand side of an assignment.
 */
#define DO_FCMGE(TYPE, X, Y, ST)  (TYPE##_compare(Y, X, ST) <= 0)
#define DO_FCMGT(TYPE, X, Y, ST)  (TYPE##_compare(Y, X, ST) < 0)
#define DO_FCMLE(TYPE, X, Y, ST)  (TYPE##_compare(X, Y, ST) <= 0)
#define DO_FCMLT(TYPE, X, Y, ST)  (TYPE##_compare(X, Y, ST) < 0)
#define DO_FCMEQ(TYPE, X, Y, ST)  (TYPE##_compare_quiet(X, Y, ST) == 0)
#define DO_FCMNE(TYPE, X, Y, ST)  (TYPE##_compare_quiet(X, Y, ST) != 0)
#define DO_FCMUO(TYPE, X, Y, ST)  \
    (TYPE##_compare_quiet(X, Y, ST) == float_relation_unordered)
#define DO_FACGE(TYPE, X, Y, ST)  \
    (TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) <= 0)
#define DO_FACGT(TYPE, X, Y, ST)  \
    (TYPE##_compare(TYPE##_abs(Y), TYPE##_abs(X), ST) < 0)
4928 DO_FPCMP_PPZZ_ALL(sve_fcmge
, DO_FCMGE
)
4929 DO_FPCMP_PPZZ_ALL(sve_fcmgt
, DO_FCMGT
)
4930 DO_FPCMP_PPZZ_ALL(sve_fcmeq
, DO_FCMEQ
)
4931 DO_FPCMP_PPZZ_ALL(sve_fcmne
, DO_FCMNE
)
4932 DO_FPCMP_PPZZ_ALL(sve_fcmuo
, DO_FCMUO
)
4933 DO_FPCMP_PPZZ_ALL(sve_facge
, DO_FACGE
)
4934 DO_FPCMP_PPZZ_ALL(sve_facgt
, DO_FACGT
)
4936 #undef DO_FPCMP_PPZZ_ALL
4937 #undef DO_FPCMP_PPZZ_D
4938 #undef DO_FPCMP_PPZZ_S
4939 #undef DO_FPCMP_PPZZ_H
4940 #undef DO_FPCMP_PPZZ
/*
 * One operand floating-point comparison against zero, controlled
 * by a predicate.
 *
 * Same structure as the two-operand form, with a literal 0 passed as the
 * second operand of OP.
 */
#define DO_FPCMP_PPZ0(NAME, TYPE, H, OP)                   \
void HELPER(NAME)(void *vd, void *vn, void *vg,            \
                  void *status, uint32_t desc)             \
{                                                          \
    intptr_t i = simd_oprsz(desc), j = (i - 1) >> 6;       \
    uint64_t *d = vd, *g = vg;                             \
    do {                                                   \
        uint64_t out = 0, pg = g[j];                       \
        do {                                               \
            i -= sizeof(TYPE), out <<= sizeof(TYPE);       \
            if ((pg >> (i & 63)) & 1) {                    \
                TYPE nn = *(TYPE *)(vn + H(i));            \
                out |= OP(TYPE, nn, 0, status);            \
            }                                              \
        } while (i & 63);                                  \
        d[j--] = out;                                      \
    } while (i > 0);                                       \
}
/* Per-size shorthands for DO_FPCMP_PPZ0, and one that emits all three. */
#define DO_FPCMP_PPZ0_H(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_h, float16, H1_2, OP)
#define DO_FPCMP_PPZ0_S(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_s, float32, H1_4, OP)
#define DO_FPCMP_PPZ0_D(NAME, OP) \
    DO_FPCMP_PPZ0(NAME##_d, float64,     , OP)

#define DO_FPCMP_PPZ0_ALL(NAME, OP) \
    DO_FPCMP_PPZ0_H(NAME, OP)   \
    DO_FPCMP_PPZ0_S(NAME, OP)   \
    DO_FPCMP_PPZ0_D(NAME, OP)
4976 DO_FPCMP_PPZ0_ALL(sve_fcmge0
, DO_FCMGE
)
4977 DO_FPCMP_PPZ0_ALL(sve_fcmgt0
, DO_FCMGT
)
4978 DO_FPCMP_PPZ0_ALL(sve_fcmle0
, DO_FCMLE
)
4979 DO_FPCMP_PPZ0_ALL(sve_fcmlt0
, DO_FCMLT
)
4980 DO_FPCMP_PPZ0_ALL(sve_fcmeq0
, DO_FCMEQ
)
4981 DO_FPCMP_PPZ0_ALL(sve_fcmne0
, DO_FCMNE
)
4983 /* FP Trig Multiply-Add. */
4985 void HELPER(sve_ftmad_h
)(void *vd
, void *vn
, void *vm
, void *vs
, uint32_t desc
)
4987 static const float16 coeff
[16] = {
4988 0x3c00, 0xb155, 0x2030, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4989 0x3c00, 0xb800, 0x293a, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000,
4991 intptr_t i
, opr_sz
= simd_oprsz(desc
) / sizeof(float16
);
4992 intptr_t x
= simd_data(desc
);
4993 float16
*d
= vd
, *n
= vn
, *m
= vm
;
4994 for (i
= 0; i
< opr_sz
; i
++) {
4997 if (float16_is_neg(mm
)) {
4998 mm
= float16_abs(mm
);
5001 d
[i
] = float16_muladd(n
[i
], mm
, coeff
[xx
], 0, vs
);
5005 void HELPER(sve_ftmad_s
)(void *vd
, void *vn
, void *vm
, void *vs
, uint32_t desc
)
5007 static const float32 coeff
[16] = {
5008 0x3f800000, 0xbe2aaaab, 0x3c088886, 0xb95008b9,
5009 0x36369d6d, 0x00000000, 0x00000000, 0x00000000,
5010 0x3f800000, 0xbf000000, 0x3d2aaaa6, 0xbab60705,
5011 0x37cd37cc, 0x00000000, 0x00000000, 0x00000000,
5013 intptr_t i
, opr_sz
= simd_oprsz(desc
) / sizeof(float32
);
5014 intptr_t x
= simd_data(desc
);
5015 float32
*d
= vd
, *n
= vn
, *m
= vm
;
5016 for (i
= 0; i
< opr_sz
; i
++) {
5019 if (float32_is_neg(mm
)) {
5020 mm
= float32_abs(mm
);
5023 d
[i
] = float32_muladd(n
[i
], mm
, coeff
[xx
], 0, vs
);
5027 void HELPER(sve_ftmad_d
)(void *vd
, void *vn
, void *vm
, void *vs
, uint32_t desc
)
5029 static const float64 coeff
[16] = {
5030 0x3ff0000000000000ull
, 0xbfc5555555555543ull
,
5031 0x3f8111111110f30cull
, 0xbf2a01a019b92fc6ull
,
5032 0x3ec71de351f3d22bull
, 0xbe5ae5e2b60f7b91ull
,
5033 0x3de5d8408868552full
, 0x0000000000000000ull
,
5034 0x3ff0000000000000ull
, 0xbfe0000000000000ull
,
5035 0x3fa5555555555536ull
, 0xbf56c16c16c13a0bull
,
5036 0x3efa01a019b1e8d8ull
, 0xbe927e4f7282f468ull
,
5037 0x3e21ee96d2641b13ull
, 0xbda8f76380fbb401ull
,
5039 intptr_t i
, opr_sz
= simd_oprsz(desc
) / sizeof(float64
);
5040 intptr_t x
= simd_data(desc
);
5041 float64
*d
= vd
, *n
= vn
, *m
= vm
;
5042 for (i
= 0; i
< opr_sz
; i
++) {
5045 if (float64_is_neg(mm
)) {
5046 mm
= float64_abs(mm
);
5049 d
[i
] = float64_muladd(n
[i
], mm
, coeff
[xx
], 0, vs
);
5057 void HELPER(sve_fcadd_h
)(void *vd
, void *vn
, void *vm
, void *vg
,
5058 void *vs
, uint32_t desc
)
5060 intptr_t j
, i
= simd_oprsz(desc
);
5062 float16 neg_imag
= float16_set_sign(0, simd_data(desc
));
5063 float16 neg_real
= float16_chs(neg_imag
);
5066 uint64_t pg
= g
[(i
- 1) >> 6];
5068 float16 e0
, e1
, e2
, e3
;
5070 /* I holds the real index; J holds the imag index. */
5071 j
= i
- sizeof(float16
);
5072 i
-= 2 * sizeof(float16
);
5074 e0
= *(float16
*)(vn
+ H1_2(i
));
5075 e1
= *(float16
*)(vm
+ H1_2(j
)) ^ neg_real
;
5076 e2
= *(float16
*)(vn
+ H1_2(j
));
5077 e3
= *(float16
*)(vm
+ H1_2(i
)) ^ neg_imag
;
5079 if (likely((pg
>> (i
& 63)) & 1)) {
5080 *(float16
*)(vd
+ H1_2(i
)) = float16_add(e0
, e1
, vs
);
5082 if (likely((pg
>> (j
& 63)) & 1)) {
5083 *(float16
*)(vd
+ H1_2(j
)) = float16_add(e2
, e3
, vs
);
5089 void HELPER(sve_fcadd_s
)(void *vd
, void *vn
, void *vm
, void *vg
,
5090 void *vs
, uint32_t desc
)
5092 intptr_t j
, i
= simd_oprsz(desc
);
5094 float32 neg_imag
= float32_set_sign(0, simd_data(desc
));
5095 float32 neg_real
= float32_chs(neg_imag
);
5098 uint64_t pg
= g
[(i
- 1) >> 6];
5100 float32 e0
, e1
, e2
, e3
;
5102 /* I holds the real index; J holds the imag index. */
5103 j
= i
- sizeof(float32
);
5104 i
-= 2 * sizeof(float32
);
5106 e0
= *(float32
*)(vn
+ H1_2(i
));
5107 e1
= *(float32
*)(vm
+ H1_2(j
)) ^ neg_real
;
5108 e2
= *(float32
*)(vn
+ H1_2(j
));
5109 e3
= *(float32
*)(vm
+ H1_2(i
)) ^ neg_imag
;
5111 if (likely((pg
>> (i
& 63)) & 1)) {
5112 *(float32
*)(vd
+ H1_2(i
)) = float32_add(e0
, e1
, vs
);
5114 if (likely((pg
>> (j
& 63)) & 1)) {
5115 *(float32
*)(vd
+ H1_2(j
)) = float32_add(e2
, e3
, vs
);
5121 void HELPER(sve_fcadd_d
)(void *vd
, void *vn
, void *vm
, void *vg
,
5122 void *vs
, uint32_t desc
)
5124 intptr_t j
, i
= simd_oprsz(desc
);
5126 float64 neg_imag
= float64_set_sign(0, simd_data(desc
));
5127 float64 neg_real
= float64_chs(neg_imag
);
5130 uint64_t pg
= g
[(i
- 1) >> 6];
5132 float64 e0
, e1
, e2
, e3
;
5134 /* I holds the real index; J holds the imag index. */
5135 j
= i
- sizeof(float64
);
5136 i
-= 2 * sizeof(float64
);
5138 e0
= *(float64
*)(vn
+ H1_2(i
));
5139 e1
= *(float64
*)(vm
+ H1_2(j
)) ^ neg_real
;
5140 e2
= *(float64
*)(vn
+ H1_2(j
));
5141 e3
= *(float64
*)(vm
+ H1_2(i
)) ^ neg_imag
;
5143 if (likely((pg
>> (i
& 63)) & 1)) {
5144 *(float64
*)(vd
+ H1_2(i
)) = float64_add(e0
, e1
, vs
);
5146 if (likely((pg
>> (j
& 63)) & 1)) {
5147 *(float64
*)(vd
+ H1_2(j
)) = float64_add(e2
, e3
, vs
);
5154 * FP Complex Multiply
5157 void HELPER(sve_fcmla_zpzzz_h
)(void *vd
, void *vn
, void *vm
, void *va
,
5158 void *vg
, void *status
, uint32_t desc
)
5160 intptr_t j
, i
= simd_oprsz(desc
);
5161 unsigned rot
= simd_data(desc
);
5162 bool flip
= rot
& 1;
5163 float16 neg_imag
, neg_real
;
5166 neg_imag
= float16_set_sign(0, (rot
& 2) != 0);
5167 neg_real
= float16_set_sign(0, rot
== 1 || rot
== 2);
5170 uint64_t pg
= g
[(i
- 1) >> 6];
5172 float16 e1
, e2
, e3
, e4
, nr
, ni
, mr
, mi
, d
;
5174 /* I holds the real index; J holds the imag index. */
5175 j
= i
- sizeof(float16
);
5176 i
-= 2 * sizeof(float16
);
5178 nr
= *(float16
*)(vn
+ H1_2(i
));
5179 ni
= *(float16
*)(vn
+ H1_2(j
));
5180 mr
= *(float16
*)(vm
+ H1_2(i
));
5181 mi
= *(float16
*)(vm
+ H1_2(j
));
5183 e2
= (flip
? ni
: nr
);
5184 e1
= (flip
? mi
: mr
) ^ neg_real
;
5186 e3
= (flip
? mr
: mi
) ^ neg_imag
;
5188 if (likely((pg
>> (i
& 63)) & 1)) {
5189 d
= *(float16
*)(va
+ H1_2(i
));
5190 d
= float16_muladd(e2
, e1
, d
, 0, status
);
5191 *(float16
*)(vd
+ H1_2(i
)) = d
;
5193 if (likely((pg
>> (j
& 63)) & 1)) {
5194 d
= *(float16
*)(va
+ H1_2(j
));
5195 d
= float16_muladd(e4
, e3
, d
, 0, status
);
5196 *(float16
*)(vd
+ H1_2(j
)) = d
;
5202 void HELPER(sve_fcmla_zpzzz_s
)(void *vd
, void *vn
, void *vm
, void *va
,
5203 void *vg
, void *status
, uint32_t desc
)
5205 intptr_t j
, i
= simd_oprsz(desc
);
5206 unsigned rot
= simd_data(desc
);
5207 bool flip
= rot
& 1;
5208 float32 neg_imag
, neg_real
;
5211 neg_imag
= float32_set_sign(0, (rot
& 2) != 0);
5212 neg_real
= float32_set_sign(0, rot
== 1 || rot
== 2);
5215 uint64_t pg
= g
[(i
- 1) >> 6];
5217 float32 e1
, e2
, e3
, e4
, nr
, ni
, mr
, mi
, d
;
5219 /* I holds the real index; J holds the imag index. */
5220 j
= i
- sizeof(float32
);
5221 i
-= 2 * sizeof(float32
);
5223 nr
= *(float32
*)(vn
+ H1_2(i
));
5224 ni
= *(float32
*)(vn
+ H1_2(j
));
5225 mr
= *(float32
*)(vm
+ H1_2(i
));
5226 mi
= *(float32
*)(vm
+ H1_2(j
));
5228 e2
= (flip
? ni
: nr
);
5229 e1
= (flip
? mi
: mr
) ^ neg_real
;
5231 e3
= (flip
? mr
: mi
) ^ neg_imag
;
5233 if (likely((pg
>> (i
& 63)) & 1)) {
5234 d
= *(float32
*)(va
+ H1_2(i
));
5235 d
= float32_muladd(e2
, e1
, d
, 0, status
);
5236 *(float32
*)(vd
+ H1_2(i
)) = d
;
5238 if (likely((pg
>> (j
& 63)) & 1)) {
5239 d
= *(float32
*)(va
+ H1_2(j
));
5240 d
= float32_muladd(e4
, e3
, d
, 0, status
);
5241 *(float32
*)(vd
+ H1_2(j
)) = d
;
5247 void HELPER(sve_fcmla_zpzzz_d
)(void *vd
, void *vn
, void *vm
, void *va
,
5248 void *vg
, void *status
, uint32_t desc
)
5250 intptr_t j
, i
= simd_oprsz(desc
);
5251 unsigned rot
= simd_data(desc
);
5252 bool flip
= rot
& 1;
5253 float64 neg_imag
, neg_real
;
5256 neg_imag
= float64_set_sign(0, (rot
& 2) != 0);
5257 neg_real
= float64_set_sign(0, rot
== 1 || rot
== 2);
5260 uint64_t pg
= g
[(i
- 1) >> 6];
5262 float64 e1
, e2
, e3
, e4
, nr
, ni
, mr
, mi
, d
;
5264 /* I holds the real index; J holds the imag index. */
5265 j
= i
- sizeof(float64
);
5266 i
-= 2 * sizeof(float64
);
5268 nr
= *(float64
*)(vn
+ H1_2(i
));
5269 ni
= *(float64
*)(vn
+ H1_2(j
));
5270 mr
= *(float64
*)(vm
+ H1_2(i
));
5271 mi
= *(float64
*)(vm
+ H1_2(j
));
5273 e2
= (flip
? ni
: nr
);
5274 e1
= (flip
? mi
: mr
) ^ neg_real
;
5276 e3
= (flip
? mr
: mi
) ^ neg_imag
;
5278 if (likely((pg
>> (i
& 63)) & 1)) {
5279 d
= *(float64
*)(va
+ H1_2(i
));
5280 d
= float64_muladd(e2
, e1
, d
, 0, status
);
5281 *(float64
*)(vd
+ H1_2(i
)) = d
;
5283 if (likely((pg
>> (j
& 63)) & 1)) {
5284 d
= *(float64
*)(va
+ H1_2(j
));
5285 d
= float64_muladd(e4
, e3
, d
, 0, status
);
5286 *(float64
*)(vd
+ H1_2(j
)) = d
;
5293 * Load contiguous data, protected by a governing predicate.
5297 * Load one element into @vd + @reg_off from @host.
5298 * The controlling predicate is known to be true.
/*
 * Function type of a single-element primitive that works on a host
 * pointer: @vd is the vector register base, @reg_off the byte offset of
 * the element inside it, @host the host address of the memory operand.
 * The store primitives generated by DO_ST_HOST have this same signature.
 */
5300 typedef void sve_ldst1_host_fn(void *vd
, intptr_t reg_off
, void *host
);
5303 * Load one element into @vd + @reg_off from (@env, @vaddr, @ra).
5304 * The controlling predicate is known to be true.
/*
 * Function type of a single-element primitive that goes through the
 * guest address @vaddr; @retaddr is forwarded to the underlying accessor.
 * The store primitives generated by DO_ST_TLB have this same signature.
 */
5306 typedef void sve_ldst1_tlb_fn(CPUARMState
*env
, void *vd
, intptr_t reg_off
,
5307 target_ulong vaddr
, uintptr_t retaddr
);
5310 * Generate the above primitives.
/*
 * Generators for the single-element primitives.  In all four macros:
 *   NAME  - pasted into the function name sve_<NAME>_host / sve_<NAME>_tlb;
 *   H     - host-endian fixup applied to reg_off (may be empty);
 *   TYPEE - type of the element as stored in the vector register;
 *   TYPEM - type of the value as held in memory;
 *   HOST / TLB - the accessor that performs the memory access.
 * DO_LD_HOST reads a TYPEM through HOST(host) and stores it into the
 * TYPEE slot at vd + H(reg_off); DO_ST_HOST passes the TYPEE slot, cast
 * to TYPEM, to HOST(host, value).  The _TLB variants do the same through
 * TLB(env, useronly_clean_ptr(addr), ..., ra).
 */
5313 #define DO_LD_HOST(NAME, H, TYPEE, TYPEM, HOST) \
5314 static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
5316 TYPEM val = HOST(host); \
5317 *(TYPEE *)(vd + H(reg_off)) = val; \
5320 #define DO_ST_HOST(NAME, H, TYPEE, TYPEM, HOST) \
5321 static void sve_##NAME##_host(void *vd, intptr_t reg_off, void *host) \
5322 { HOST(host, (TYPEM)*(TYPEE *)(vd + H(reg_off))); }
5324 #define DO_LD_TLB(NAME, H, TYPEE, TYPEM, TLB) \
5325 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
5326 target_ulong addr, uintptr_t ra) \
5328 *(TYPEE *)(vd + H(reg_off)) = \
5329 (TYPEM)TLB(env, useronly_clean_ptr(addr), ra); \
5332 #define DO_ST_TLB(NAME, H, TYPEE, TYPEM, TLB) \
5333 static void sve_##NAME##_tlb(CPUARMState *env, void *vd, intptr_t reg_off, \
5334 target_ulong addr, uintptr_t ra) \
5336 TLB(env, useronly_clean_ptr(addr), \
5337 (TYPEM)*(TYPEE *)(vd + H(reg_off)), ra); \
/*
 * Byte-sized memory accesses.  DO_LD_PRIM_1 emits both the _host and the
 * _tlb load primitive for NAME using ldub_p / cpu_ldub_data_ra;
 * DO_ST_PRIM_1 does the same for stores (name prefixed with "st1") using
 * stb_p / cpu_stb_data_ra.  TE is the in-register element type and TM
 * the in-memory type; a signed TM makes the load sign-extend when the
 * value is converted to TE.  The H argument is left empty for 64-bit
 * elements.
 */
5340 #define DO_LD_PRIM_1(NAME, H, TE, TM) \
5341 DO_LD_HOST(NAME, H, TE, TM, ldub_p) \
5342 DO_LD_TLB(NAME, H, TE, TM, cpu_ldub_data_ra)
5344 DO_LD_PRIM_1(ld1bb
, H1
, uint8_t, uint8_t)
5345 DO_LD_PRIM_1(ld1bhu
, H1_2
, uint16_t, uint8_t)
5346 DO_LD_PRIM_1(ld1bhs
, H1_2
, uint16_t, int8_t)
5347 DO_LD_PRIM_1(ld1bsu
, H1_4
, uint32_t, uint8_t)
5348 DO_LD_PRIM_1(ld1bss
, H1_4
, uint32_t, int8_t)
5349 DO_LD_PRIM_1(ld1bdu
, , uint64_t, uint8_t)
5350 DO_LD_PRIM_1(ld1bds
, , uint64_t, int8_t)
5352 #define DO_ST_PRIM_1(NAME, H, TE, TM) \
5353 DO_ST_HOST(st1##NAME, H, TE, TM, stb_p) \
5354 DO_ST_TLB(st1##NAME, H, TE, TM, cpu_stb_data_ra)
5356 DO_ST_PRIM_1(bb
, H1
, uint8_t, uint8_t)
5357 DO_ST_PRIM_1(bh
, H1_2
, uint16_t, uint8_t)
5358 DO_ST_PRIM_1(bs
, H1_4
, uint32_t, uint8_t)
5359 DO_ST_PRIM_1(bd
, , uint64_t, uint8_t)
/*
 * Multi-byte memory accesses.  Each DO_LD_PRIM_2 / DO_ST_PRIM_2 use
 * emits four primitives: big-endian and little-endian, each in a _host
 * and a _tlb flavour.  LD / ST is the accessor stem (lduw, ldl, ldq,
 * stw, stl, stq) that is pasted into <stem>_be_p / <stem>_le_p and
 * cpu_<stem>_be_data_ra / cpu_<stem>_le_data_ra.  TE is the in-register
 * element type and TM the in-memory type; a signed TM makes the load
 * sign-extend when the value is converted to TE.
 */
5361 #define DO_LD_PRIM_2(NAME, H, TE, TM, LD) \
5362 DO_LD_HOST(ld1##NAME##_be, H, TE, TM, LD##_be_p) \
5363 DO_LD_HOST(ld1##NAME##_le, H, TE, TM, LD##_le_p) \
5364 DO_LD_TLB(ld1##NAME##_be, H, TE, TM, cpu_##LD##_be_data_ra) \
5365 DO_LD_TLB(ld1##NAME##_le, H, TE, TM, cpu_##LD##_le_data_ra)
5367 #define DO_ST_PRIM_2(NAME, H, TE, TM, ST) \
5368 DO_ST_HOST(st1##NAME##_be, H, TE, TM, ST##_be_p) \
5369 DO_ST_HOST(st1##NAME##_le, H, TE, TM, ST##_le_p) \
5370 DO_ST_TLB(st1##NAME##_be, H, TE, TM, cpu_##ST##_be_data_ra) \
5371 DO_ST_TLB(st1##NAME##_le, H, TE, TM, cpu_##ST##_le_data_ra)
5373 DO_LD_PRIM_2(hh
, H1_2
, uint16_t, uint16_t, lduw
)
5374 DO_LD_PRIM_2(hsu
, H1_4
, uint32_t, uint16_t, lduw
)
5375 DO_LD_PRIM_2(hss
, H1_4
, uint32_t, int16_t, lduw
)
5376 DO_LD_PRIM_2(hdu
, , uint64_t, uint16_t, lduw
)
5377 DO_LD_PRIM_2(hds
, , uint64_t, int16_t, lduw
)
5379 DO_ST_PRIM_2(hh
, H1_2
, uint16_t, uint16_t, stw
)
5380 DO_ST_PRIM_2(hs
, H1_4
, uint32_t, uint16_t, stw
)
5381 DO_ST_PRIM_2(hd
, , uint64_t, uint16_t, stw
)
5383 DO_LD_PRIM_2(ss
, H1_4
, uint32_t, uint32_t, ldl
)
5384 DO_LD_PRIM_2(sdu
, , uint64_t, uint32_t, ldl
)
5385 DO_LD_PRIM_2(sds
, , uint64_t, int32_t, ldl
)
5387 DO_ST_PRIM_2(ss
, H1_4
, uint32_t, uint32_t, stl
)
5388 DO_ST_PRIM_2(sd
, , uint64_t, uint32_t, stl
)
5390 DO_LD_PRIM_2(dd
, , uint64_t, uint64_t, ldq
)
5391 DO_ST_PRIM_2(dd
, , uint64_t, uint64_t, stq
)
5402 * Skip through a sequence of inactive elements in the guarding predicate @vg,
5403 * beginning at @reg_off bounded by @reg_max. Return the offset of the active
5404 * element >= @reg_off, or @reg_max if there were no active elements at all.
5406 static intptr_t find_next_active(uint64_t *vg
, intptr_t reg_off
,
5407 intptr_t reg_max
, int esz
)
5409 uint64_t pg_mask
= pred_esz_masks
[esz
];
5410 uint64_t pg
= (vg
[reg_off
>> 6] & pg_mask
) >> (reg_off
& 63);
5412 /* In normal usage, the first element is active. */
5413 if (likely(pg
& 1)) {
5421 if (unlikely(reg_off
>= reg_max
)) {
5422 /* The entire predicate was false. */
5425 pg
= vg
[reg_off
>> 6] & pg_mask
;
5428 reg_off
+= ctz64(pg
);
5430 /* We should never see an out of range predicate bit set. */
5431 tcg_debug_assert(reg_off
< reg_max
);
5436 * Resolve the guest virtual address to info->host and info->flags.
5437 * If @nofault, return false if the page is invalid, otherwise
5438 * exit via page fault exception.
5447 static bool sve_probe_page(SVEHostPage
*info
, bool nofault
,
5448 CPUARMState
*env
, target_ulong addr
,
5449 int mem_off
, MMUAccessType access_type
,
5450 int mmu_idx
, uintptr_t retaddr
)
5457 * User-only currently always issues with TBI. See the comment
5458 * above useronly_clean_ptr. Usually we clean this top byte away
5459 * during translation, but we can't do that for e.g. vector + imm
5462 * We currently always enable TBI for user-only, and do not provide
5463 * a way to turn it off. So clean the pointer unconditionally here,
5464 * rather than look it up here, or pass it down from above.
5466 addr
= useronly_clean_ptr(addr
);
5468 flags
= probe_access_flags(env
, addr
, access_type
, mmu_idx
, nofault
,
5469 &info
->host
, retaddr
);
5470 info
->flags
= flags
;
5472 if (flags
& TLB_INVALID_MASK
) {
5477 /* Ensure that info->host[] is relative to addr, not addr + mem_off. */
5478 info
->host
-= mem_off
;
5480 #ifdef CONFIG_USER_ONLY
5481 memset(&info
->attrs
, 0, sizeof(info
->attrs
));
5484 * Find the iotlbentry for addr and return the transaction attributes.
5485 * This *must* be present in the TLB because we just found the mapping.
5488 uintptr_t index
= tlb_index(env
, mmu_idx
, addr
);
5490 # ifdef CONFIG_DEBUG_TCG
5491 CPUTLBEntry
*entry
= tlb_entry(env
, mmu_idx
, addr
);
5492 target_ulong comparator
= (access_type
== MMU_DATA_LOAD
5494 : tlb_addr_write(entry
));
5495 g_assert(tlb_hit(comparator
, addr
));
5498 CPUIOTLBEntry
*iotlbentry
= &env_tlb(env
)->d
[mmu_idx
].iotlb
[index
];
5499 info
->attrs
= iotlbentry
->attrs
;
5508 * Analyse contiguous data, protected by a governing predicate.
/*
 * Fields of the contiguous load/store descriptor.  In
 * sve_cont_ldst_elements every field that precedes page[] is preset to
 * -1 with memset (up to offsetof(SVEContLdSt, page)) and page[] is
 * zeroed, so "-1" consistently means "not present".
 */
5519 * First and last element wholly contained within the two pages.
5520 * mem_off_first[0] and reg_off_first[0] are always set >= 0.
5521 * reg_off_last[0] may be < 0 if the first element crosses pages.
5522 * All of mem_off_first[1], reg_off_first[1] and reg_off_last[1]
5523 * are set >= 0 only if there are complete elements on a second page.
5525 * The reg_off_* offsets are relative to the internal vector register.
5526 * The mem_off_first offset is relative to the memory address; the
5527 * two offsets are different when a load operation extends, a store
5528 * operation truncates, or for multi-register operations.
5530 int16_t mem_off_first
[2];
5531 int16_t reg_off_first
[2];
5532 int16_t reg_off_last
[2];
5535 * One element that is misaligned and spans both pages,
5536 * or -1 if there is no such active element.
5538 int16_t mem_off_split
;
5539 int16_t reg_off_split
;
5542 * The byte offset at which the entire operation crosses a page boundary.
5543 * Set >= 0 if and only if the entire operation spans two pages.
5547 /* TLB data for the two pages. */
5548 SVEHostPage page
[2];
5552 * Find first active element on each page, and a loose bound for the
5553 * final element on each page. Identify any single element that spans
5554 * the page boundary. Return true if there are any active elements.
5556 static bool sve_cont_ldst_elements(SVEContLdSt
*info
, target_ulong addr
,
5557 uint64_t *vg
, intptr_t reg_max
,
5560 const int esize
= 1 << esz
;
5561 const uint64_t pg_mask
= pred_esz_masks
[esz
];
5562 intptr_t reg_off_first
= -1, reg_off_last
= -1, reg_off_split
;
5563 intptr_t mem_off_last
, mem_off_split
;
5564 intptr_t page_split
, elt_split
;
5567 /* Set all of the element indices to -1, and the TLB data to 0. */
5568 memset(info
, -1, offsetof(SVEContLdSt
, page
));
5569 memset(info
->page
, 0, sizeof(info
->page
));
5571 /* Gross scan over the entire predicate to find bounds. */
5574 uint64_t pg
= vg
[i
] & pg_mask
;
5576 reg_off_last
= i
* 64 + 63 - clz64(pg
);
5577 if (reg_off_first
< 0) {
5578 reg_off_first
= i
* 64 + ctz64(pg
);
5581 } while (++i
* 64 < reg_max
);
5583 if (unlikely(reg_off_first
< 0)) {
5584 /* No active elements, no pages touched. */
5587 tcg_debug_assert(reg_off_last
>= 0 && reg_off_last
< reg_max
);
5589 info
->reg_off_first
[0] = reg_off_first
;
5590 info
->mem_off_first
[0] = (reg_off_first
>> esz
) * msize
;
5591 mem_off_last
= (reg_off_last
>> esz
) * msize
;
5593 page_split
= -(addr
| TARGET_PAGE_MASK
);
5594 if (likely(mem_off_last
+ msize
<= page_split
)) {
5595 /* The entire operation fits within a single page. */
5596 info
->reg_off_last
[0] = reg_off_last
;
5600 info
->page_split
= page_split
;
5601 elt_split
= page_split
/ msize
;
5602 reg_off_split
= elt_split
<< esz
;
5603 mem_off_split
= elt_split
* msize
;
5606 * This is the last full element on the first page, but it is not
5607 * necessarily active. If there is no full element, i.e. the first
5608 * active element is the one that's split, this value remains -1.
5609 * It is useful as iteration bounds.
5611 if (elt_split
!= 0) {
5612 info
->reg_off_last
[0] = reg_off_split
- esize
;
5615 /* Determine if an unaligned element spans the pages. */
5616 if (page_split
% msize
!= 0) {
5617 /* It is helpful to know if the split element is active. */
5618 if ((vg
[reg_off_split
>> 6] >> (reg_off_split
& 63)) & 1) {
5619 info
->reg_off_split
= reg_off_split
;
5620 info
->mem_off_split
= mem_off_split
;
5622 if (reg_off_split
== reg_off_last
) {
5623 /* The page crossing element is last. */
5627 reg_off_split
+= esize
;
5628 mem_off_split
+= msize
;
5632 * We do want the first active element on the second page, because
5633 * this may affect the address reported in an exception.
5635 reg_off_split
= find_next_active(vg
, reg_off_split
, reg_max
, esz
);
5636 tcg_debug_assert(reg_off_split
<= reg_off_last
);
5637 info
->reg_off_first
[1] = reg_off_split
;
5638 info
->mem_off_first
[1] = (reg_off_split
>> esz
) * msize
;
5639 info
->reg_off_last
[1] = reg_off_last
;
5644 * Resolve the guest virtual addresses to info->page[].
5645 * Control the generation of page faults with @fault. Return false if
5646 * there is no work to do, which can only happen with @fault == FAULT_NO.
5648 static bool sve_cont_ldst_pages(SVEContLdSt
*info
, SVEContFault fault
,
5649 CPUARMState
*env
, target_ulong addr
,
5650 MMUAccessType access_type
, uintptr_t retaddr
)
5652 int mmu_idx
= cpu_mmu_index(env
, false);
5653 int mem_off
= info
->mem_off_first
[0];
5654 bool nofault
= fault
== FAULT_NO
;
5655 bool have_work
= true;
5657 if (!sve_probe_page(&info
->page
[0], nofault
, env
, addr
, mem_off
,
5658 access_type
, mmu_idx
, retaddr
)) {
5659 /* No work to be done. */
5663 if (likely(info
->page_split
< 0)) {
5664 /* The entire operation was on the one page. */
5669 * If the second page is invalid, then we want the fault address to be
5670 * the first byte on that page which is accessed.
5672 if (info
->mem_off_split
>= 0) {
5674 * There is an element split across the pages. The fault address
5675 * should be the first byte of the second page.
5677 mem_off
= info
->page_split
;
5679 * If the split element is also the first active element
5680 * of the vector, then: For first-fault we should continue
5681 * to generate faults for the second page. For no-fault,
5682 * we have work only if the second page is valid.
5684 if (info
->mem_off_first
[0] < info
->mem_off_split
) {
5685 nofault
= FAULT_FIRST
;
5690 * There is no element split across the pages. The fault address
5691 * should be the first active element on the second page.
5693 mem_off
= info
->mem_off_first
[1];
5695 * There must have been one active element on the first page,
5696 * so we're out of first-fault territory.
5698 nofault
= fault
!= FAULT_ALL
;
5701 have_work
|= sve_probe_page(&info
->page
[1], nofault
, env
, addr
, mem_off
,
5702 access_type
, mmu_idx
, retaddr
);
5706 static void sve_cont_ldst_watchpoints(SVEContLdSt
*info
, CPUARMState
*env
,
5707 uint64_t *vg
, target_ulong addr
,
5708 int esize
, int msize
, int wp_access
,
5711 #ifndef CONFIG_USER_ONLY
5712 intptr_t mem_off
, reg_off
, reg_last
;
5713 int flags0
= info
->page
[0].flags
;
5714 int flags1
= info
->page
[1].flags
;
5716 if (likely(!((flags0
| flags1
) & TLB_WATCHPOINT
))) {
5720 /* Indicate that watchpoints are handled. */
5721 info
->page
[0].flags
= flags0
& ~TLB_WATCHPOINT
;
5722 info
->page
[1].flags
= flags1
& ~TLB_WATCHPOINT
;
5724 if (flags0
& TLB_WATCHPOINT
) {
5725 mem_off
= info
->mem_off_first
[0];
5726 reg_off
= info
->reg_off_first
[0];
5727 reg_last
= info
->reg_off_last
[0];
5729 while (reg_off
<= reg_last
) {
5730 uint64_t pg
= vg
[reg_off
>> 6];
5732 if ((pg
>> (reg_off
& 63)) & 1) {
5733 cpu_check_watchpoint(env_cpu(env
), addr
+ mem_off
,
5734 msize
, info
->page
[0].attrs
,
5735 wp_access
, retaddr
);
5739 } while (reg_off
<= reg_last
&& (reg_off
& 63));
5743 mem_off
= info
->mem_off_split
;
5745 cpu_check_watchpoint(env_cpu(env
), addr
+ mem_off
, msize
,
5746 info
->page
[0].attrs
, wp_access
, retaddr
);
5749 mem_off
= info
->mem_off_first
[1];
5750 if ((flags1
& TLB_WATCHPOINT
) && mem_off
>= 0) {
5751 reg_off
= info
->reg_off_first
[1];
5752 reg_last
= info
->reg_off_last
[1];
5755 uint64_t pg
= vg
[reg_off
>> 6];
5757 if ((pg
>> (reg_off
& 63)) & 1) {
5758 cpu_check_watchpoint(env_cpu(env
), addr
+ mem_off
,
5759 msize
, info
->page
[1].attrs
,
5760 wp_access
, retaddr
);
5764 } while (reg_off
& 63);
5765 } while (reg_off
<= reg_last
);
5770 static void sve_cont_ldst_mte_check(SVEContLdSt
*info
, CPUARMState
*env
,
5771 uint64_t *vg
, target_ulong addr
, int esize
,
5772 int msize
, uint32_t mtedesc
, uintptr_t ra
)
5774 intptr_t mem_off
, reg_off
, reg_last
;
5776 /* Process the page only if MemAttr == Tagged. */
5777 if (arm_tlb_mte_tagged(&info
->page
[0].attrs
)) {
5778 mem_off
= info
->mem_off_first
[0];
5779 reg_off
= info
->reg_off_first
[0];
5780 reg_last
= info
->reg_off_split
;
5782 reg_last
= info
->reg_off_last
[0];
5786 uint64_t pg
= vg
[reg_off
>> 6];
5788 if ((pg
>> (reg_off
& 63)) & 1) {
5789 mte_check(env
, mtedesc
, addr
, ra
);
5793 } while (reg_off
<= reg_last
&& (reg_off
& 63));
5794 } while (reg_off
<= reg_last
);
5797 mem_off
= info
->mem_off_first
[1];
5798 if (mem_off
>= 0 && arm_tlb_mte_tagged(&info
->page
[1].attrs
)) {
5799 reg_off
= info
->reg_off_first
[1];
5800 reg_last
= info
->reg_off_last
[1];
5803 uint64_t pg
= vg
[reg_off
>> 6];
5805 if ((pg
>> (reg_off
& 63)) & 1) {
5806 mte_check(env
, mtedesc
, addr
, ra
);
5810 } while (reg_off
& 63);
5811 } while (reg_off
<= reg_last
);
5816 * Common helper for all contiguous 1,2,3,4-register predicated loads.
5818 static inline QEMU_ALWAYS_INLINE
5819 void sve_ldN_r(CPUARMState
*env
, uint64_t *vg
, const target_ulong addr
,
5820 uint32_t desc
, const uintptr_t retaddr
,
5821 const int esz
, const int msz
, const int N
, uint32_t mtedesc
,
5822 sve_ldst1_host_fn
*host_fn
,
5823 sve_ldst1_tlb_fn
*tlb_fn
)
5825 const unsigned rd
= simd_data(desc
);
5826 const intptr_t reg_max
= simd_oprsz(desc
);
5827 intptr_t reg_off
, reg_last
, mem_off
;
5832 /* Find the active elements. */
5833 if (!sve_cont_ldst_elements(&info
, addr
, vg
, reg_max
, esz
, N
<< msz
)) {
5834 /* The entire predicate was false; no load occurs. */
5835 for (i
= 0; i
< N
; ++i
) {
5836 memset(&env
->vfp
.zregs
[(rd
+ i
) & 31], 0, reg_max
);
5841 /* Probe the page(s). Exit with exception for any invalid page. */
5842 sve_cont_ldst_pages(&info
, FAULT_ALL
, env
, addr
, MMU_DATA_LOAD
, retaddr
);
5844 /* Handle watchpoints for all active elements. */
5845 sve_cont_ldst_watchpoints(&info
, env
, vg
, addr
, 1 << esz
, N
<< msz
,
5846 BP_MEM_READ
, retaddr
);
5849 * Handle mte checks for all active elements.
5850 * Since TBI must be set for MTE, !mtedesc => !mte_active.
5853 sve_cont_ldst_mte_check(&info
, env
, vg
, addr
, 1 << esz
, N
<< msz
,
5857 flags
= info
.page
[0].flags
| info
.page
[1].flags
;
5858 if (unlikely(flags
!= 0)) {
5859 #ifdef CONFIG_USER_ONLY
5860 g_assert_not_reached();
5863 * At least one page includes MMIO.
5864 * Any bus operation can fail with cpu_transaction_failed,
5865 * which for ARM will raise SyncExternal. Perform the load
5866 * into scratch memory to preserve register state until the end.
5868 ARMVectorReg scratch
[4] = { };
5870 mem_off
= info
.mem_off_first
[0];
5871 reg_off
= info
.reg_off_first
[0];
5872 reg_last
= info
.reg_off_last
[1];
5874 reg_last
= info
.reg_off_split
;
5876 reg_last
= info
.reg_off_last
[0];
5881 uint64_t pg
= vg
[reg_off
>> 6];
5883 if ((pg
>> (reg_off
& 63)) & 1) {
5884 for (i
= 0; i
< N
; ++i
) {
5885 tlb_fn(env
, &scratch
[i
], reg_off
,
5886 addr
+ mem_off
+ (i
<< msz
), retaddr
);
5889 reg_off
+= 1 << esz
;
5890 mem_off
+= N
<< msz
;
5891 } while (reg_off
& 63);
5892 } while (reg_off
<= reg_last
);
5894 for (i
= 0; i
< N
; ++i
) {
5895 memcpy(&env
->vfp
.zregs
[(rd
+ i
) & 31], &scratch
[i
], reg_max
);
5901 /* The entire operation is in RAM, on valid pages. */
5903 for (i
= 0; i
< N
; ++i
) {
5904 memset(&env
->vfp
.zregs
[(rd
+ i
) & 31], 0, reg_max
);
5907 mem_off
= info
.mem_off_first
[0];
5908 reg_off
= info
.reg_off_first
[0];
5909 reg_last
= info
.reg_off_last
[0];
5910 host
= info
.page
[0].host
;
5912 while (reg_off
<= reg_last
) {
5913 uint64_t pg
= vg
[reg_off
>> 6];
5915 if ((pg
>> (reg_off
& 63)) & 1) {
5916 for (i
= 0; i
< N
; ++i
) {
5917 host_fn(&env
->vfp
.zregs
[(rd
+ i
) & 31], reg_off
,
5918 host
+ mem_off
+ (i
<< msz
));
5921 reg_off
+= 1 << esz
;
5922 mem_off
+= N
<< msz
;
5923 } while (reg_off
<= reg_last
&& (reg_off
& 63));
5927 * Use the slow path to manage the cross-page misalignment.
5928 * But we know this is RAM and cannot trap.
5930 mem_off
= info
.mem_off_split
;
5931 if (unlikely(mem_off
>= 0)) {
5932 reg_off
= info
.reg_off_split
;
5933 for (i
= 0; i
< N
; ++i
) {
5934 tlb_fn(env
, &env
->vfp
.zregs
[(rd
+ i
) & 31], reg_off
,
5935 addr
+ mem_off
+ (i
<< msz
), retaddr
);
5939 mem_off
= info
.mem_off_first
[1];
5940 if (unlikely(mem_off
>= 0)) {
5941 reg_off
= info
.reg_off_first
[1];
5942 reg_last
= info
.reg_off_last
[1];
5943 host
= info
.page
[1].host
;
5946 uint64_t pg
= vg
[reg_off
>> 6];
5948 if ((pg
>> (reg_off
& 63)) & 1) {
5949 for (i
= 0; i
< N
; ++i
) {
5950 host_fn(&env
->vfp
.zregs
[(rd
+ i
) & 31], reg_off
,
5951 host
+ mem_off
+ (i
<< msz
));
5954 reg_off
+= 1 << esz
;
5955 mem_off
+= N
<< msz
;
5956 } while (reg_off
& 63);
5957 } while (reg_off
<= reg_last
);
5961 static inline QEMU_ALWAYS_INLINE
5962 void sve_ldN_r_mte(CPUARMState
*env
, uint64_t *vg
, target_ulong addr
,
5963 uint32_t desc
, const uintptr_t ra
,
5964 const int esz
, const int msz
, const int N
,
5965 sve_ldst1_host_fn
*host_fn
,
5966 sve_ldst1_tlb_fn
*tlb_fn
)
5968 uint32_t mtedesc
= desc
>> (SIMD_DATA_SHIFT
+ SVE_MTEDESC_SHIFT
);
5969 int bit55
= extract64(addr
, 55, 1);
5971 /* Remove mtedesc from the normal sve descriptor. */
5972 desc
= extract32(desc
, 0, SIMD_DATA_SHIFT
+ SVE_MTEDESC_SHIFT
);
5974 /* Perform gross MTE suppression early. */
5975 if (!tbi_check(desc
, bit55
) ||
5976 tcma_check(desc
, bit55
, allocation_tag_from_addr(addr
))) {
5980 sve_ldN_r(env
, vg
, addr
, desc
, ra
, esz
, msz
, N
, mtedesc
, host_fn
, tlb_fn
);
/*
 * Single-register contiguous load helpers.
 *
 * DO_LD1_1(NAME, ESZ) is for byte-sized memory elements (MO_8): it
 * defines HELPER(sve_<NAME>_r), which calls sve_ldN_r with N = 1 and
 * mtedesc = 0, and HELPER(sve_<NAME>_r_mte), which calls sve_ldN_r_mte.
 * DO_LD1_2(NAME, ESZ, MSZ) does the same for wider memory elements and
 * emits both a little-endian (_le) and a big-endian (_be) pair.
 * ESZ / MSZ are the register / memory element sizes as MO_* values.
 */
5983 #define DO_LD1_1(NAME, ESZ) \
5984 void HELPER(sve_##NAME##_r)(CPUARMState *env, void *vg, \
5985 target_ulong addr, uint32_t desc) \
5987 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, 0, \
5988 sve_##NAME##_host, sve_##NAME##_tlb); \
5990 void HELPER(sve_##NAME##_r_mte)(CPUARMState *env, void *vg, \
5991 target_ulong addr, uint32_t desc) \
5993 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, 1, \
5994 sve_##NAME##_host, sve_##NAME##_tlb); \
5997 #define DO_LD1_2(NAME, ESZ, MSZ) \
5998 void HELPER(sve_##NAME##_le_r)(CPUARMState *env, void *vg, \
5999 target_ulong addr, uint32_t desc) \
6001 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
6002 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
6004 void HELPER(sve_##NAME##_be_r)(CPUARMState *env, void *vg, \
6005 target_ulong addr, uint32_t desc) \
6007 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, 0, \
6008 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
6010 void HELPER(sve_##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
6011 target_ulong addr, uint32_t desc) \
6013 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
6014 sve_##NAME##_le_host, sve_##NAME##_le_tlb); \
6016 void HELPER(sve_##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
6017 target_ulong addr, uint32_t desc) \
6019 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, 1, \
6020 sve_##NAME##_be_host, sve_##NAME##_be_tlb); \
6023 DO_LD1_1(ld1bb
, MO_8
)
6024 DO_LD1_1(ld1bhu
, MO_16
)
6025 DO_LD1_1(ld1bhs
, MO_16
)
6026 DO_LD1_1(ld1bsu
, MO_32
)
6027 DO_LD1_1(ld1bss
, MO_32
)
6028 DO_LD1_1(ld1bdu
, MO_64
)
6029 DO_LD1_1(ld1bds
, MO_64
)
6031 DO_LD1_2(ld1hh
, MO_16
, MO_16
)
6032 DO_LD1_2(ld1hsu
, MO_32
, MO_16
)
6033 DO_LD1_2(ld1hss
, MO_32
, MO_16
)
6034 DO_LD1_2(ld1hdu
, MO_64
, MO_16
)
6035 DO_LD1_2(ld1hds
, MO_64
, MO_16
)
6037 DO_LD1_2(ld1ss
, MO_32
, MO_32
)
6038 DO_LD1_2(ld1sdu
, MO_64
, MO_32
)
6039 DO_LD1_2(ld1sds
, MO_64
, MO_32
)
6041 DO_LD1_2(ld1dd
, MO_64
, MO_64
)
/*
 * Multi-register (interleaved) contiguous load helpers.
 *
 * DO_LDN_1(N) defines the byte-element helpers HELPER(sve_ld<N>bb_r)
 * and HELPER(sve_ld<N>bb_r_mte).  DO_LDN_2(N, SUFF, ESZ) defines the
 * _le / _be helpers and their _mte variants for wider elements.  In
 * both cases the register and memory element sizes are equal and the
 * single-element ld1 primitives are reused; N is passed through to
 * sve_ldN_r / sve_ldN_r_mte as the register count.
 */
6046 #define DO_LDN_1(N) \
6047 void HELPER(sve_ld##N##bb_r)(CPUARMState *env, void *vg, \
6048 target_ulong addr, uint32_t desc) \
6050 sve_ldN_r(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, 0, \
6051 sve_ld1bb_host, sve_ld1bb_tlb); \
6053 void HELPER(sve_ld##N##bb_r_mte)(CPUARMState *env, void *vg, \
6054 target_ulong addr, uint32_t desc) \
6056 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), MO_8, MO_8, N, \
6057 sve_ld1bb_host, sve_ld1bb_tlb); \
6060 #define DO_LDN_2(N, SUFF, ESZ) \
6061 void HELPER(sve_ld##N##SUFF##_le_r)(CPUARMState *env, void *vg, \
6062 target_ulong addr, uint32_t desc) \
6064 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
6065 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
6067 void HELPER(sve_ld##N##SUFF##_be_r)(CPUARMState *env, void *vg, \
6068 target_ulong addr, uint32_t desc) \
6070 sve_ldN_r(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, 0, \
6071 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
6073 void HELPER(sve_ld##N##SUFF##_le_r_mte)(CPUARMState *env, void *vg, \
6074 target_ulong addr, uint32_t desc) \
6076 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
6077 sve_ld1##SUFF##_le_host, sve_ld1##SUFF##_le_tlb); \
6079 void HELPER(sve_ld##N##SUFF##_be_r_mte)(CPUARMState *env, void *vg, \
6080 target_ulong addr, uint32_t desc) \
6082 sve_ldN_r_mte(env, vg, addr, desc, GETPC(), ESZ, ESZ, N, \
6083 sve_ld1##SUFF##_be_host, sve_ld1##SUFF##_be_tlb); \
6090 DO_LDN_2(2, hh
, MO_16
)
6091 DO_LDN_2(3, hh
, MO_16
)
6092 DO_LDN_2(4, hh
, MO_16
)
6094 DO_LDN_2(2, ss
, MO_32
)
6095 DO_LDN_2(3, ss
, MO_32
)
6096 DO_LDN_2(4, ss
, MO_32
)
6098 DO_LDN_2(2, dd
, MO_64
)
6099 DO_LDN_2(3, dd
, MO_64
)
6100 DO_LDN_2(4, dd
, MO_64
)
6106 * Load contiguous data, first-fault and no-fault.
6108 * For user-only, one could argue that we should hold the mmap_lock during
6109 * the operation so that there is no race between page_check_range and the
6110 * load operation. However, unmapping pages out from under a running thread
6111 * is extraordinarily unlikely. This theoretical race condition also affects
6112 * linux-user/ in its get_user/put_user macros.
6114 * TODO: Construct some helpers, written in assembly, that interact with
6115 * handle_cpu_signal to produce memory ops which can properly report errors
6119 /* Fault on byte I. All bits in FFR from I are cleared. The vector
6120 * result from I is CONSTRAINED UNPREDICTABLE; we choose the MERGE
6121 * option, which leaves subsequent data unchanged.
6123 static void record_fault(CPUARMState
*env
, uintptr_t i
, uintptr_t oprsz
)
6125 uint64_t *ffr
= env
->vfp
.pregs
[FFR_PRED_NUM
].p
;
6128 ffr
[i
/ 64] &= MAKE_64BIT_MASK(0, i
& 63);
6129 i
= ROUND_UP(i
, 64);
6131 for (; i
< oprsz
; i
+= 64) {
6137 * Common helper for all contiguous no-fault and first-fault loads.
6139 static inline QEMU_ALWAYS_INLINE
6140 void sve_ldnfff1_r(CPUARMState
*env
, void *vg
, const target_ulong addr
,
6141 uint32_t desc
, const uintptr_t retaddr
, uint32_t mtedesc
,
6142 const int esz
, const int msz
, const SVEContFault fault
,
6143 sve_ldst1_host_fn
*host_fn
,
6144 sve_ldst1_tlb_fn
*tlb_fn
)
6146 const unsigned rd
= simd_data(desc
);
6147 void *vd
= &env
->vfp
.zregs
[rd
];
6148 const intptr_t reg_max
= simd_oprsz(desc
);
6149 intptr_t reg_off
, mem_off
, reg_last
;
6154 /* Find the active elements. */
6155 if (!sve_cont_ldst_elements(&info
, addr
, vg
, reg_max
, esz
, 1 << msz
)) {
6156 /* The entire predicate was false; no load occurs. */
6157 memset(vd
, 0, reg_max
);
6160 reg_off
= info
.reg_off_first
[0];
6162 /* Probe the page(s). */
6163 if (!sve_cont_ldst_pages(&info
, fault
, env
, addr
, MMU_DATA_LOAD
, retaddr
)) {
6164 /* Fault on first element. */
6165 tcg_debug_assert(fault
== FAULT_NO
);
6166 memset(vd
, 0, reg_max
);
6170 mem_off
= info
.mem_off_first
[0];
6171 flags
= info
.page
[0].flags
;
6174 * Disable MTE checking if the Tagged bit is not set. Since TBI must
6175 * be set within MTEDESC for MTE, !mtedesc => !mte_active.
6177 if (arm_tlb_mte_tagged(&info
.page
[0].attrs
)) {
6181 if (fault
== FAULT_FIRST
) {
6182 /* Trapping mte check for the first-fault element. */
6184 mte_check(env
, mtedesc
, addr
+ mem_off
, retaddr
);
6188 * Special handling of the first active element,
6189 * if it crosses a page boundary or is MMIO.
6191 bool is_split
= mem_off
== info
.mem_off_split
;
6192 if (unlikely(flags
!= 0) || unlikely(is_split
)) {
6194 * Use the slow path for cross-page handling.
6195 * Might trap for MMIO or watchpoints.
6197 tlb_fn(env
, vd
, reg_off
, addr
+ mem_off
, retaddr
);
6199 /* After any fault, zero the other elements. */
6200 swap_memzero(vd
, reg_off
);
6201 reg_off
+= 1 << esz
;
6202 mem_off
+= 1 << msz
;
6203 swap_memzero(vd
+ reg_off
, reg_max
- reg_off
);
6209 memset(vd
, 0, reg_max
);
6212 memset(vd
, 0, reg_max
);
6213 if (unlikely(mem_off
== info
.mem_off_split
)) {
6214 /* The first active element crosses a page boundary. */
6215 flags
|= info
.page
[1].flags
;
6216 if (unlikely(flags
& TLB_MMIO
)) {
6217 /* Some page is MMIO, see below. */
6220 if (unlikely(flags
& TLB_WATCHPOINT
) &&
6221 (cpu_watchpoint_address_matches
6222 (env_cpu(env
), addr
+ mem_off
, 1 << msz
)
6224 /* Watchpoint hit, see below. */
6227 if (mtedesc
&& !mte_probe(env
, mtedesc
, addr
+ mem_off
)) {
6231 * Use the slow path for cross-page handling.
6232 * This is RAM, without a watchpoint, and will not trap.
6234 tlb_fn(env
, vd
, reg_off
, addr
+ mem_off
, retaddr
);
6240 * From this point on, all memory operations are MemSingleNF.
6242 * Per the MemSingleNF pseudocode, a no-fault load from Device memory
6243 * must not actually hit the bus -- it returns (UNKNOWN, FAULT) instead.
6245 * Unfortuately we do not have access to the memory attributes from the
6246 * PTE to tell Device memory from Normal memory. So we make a mostly
6247 * correct check, and indicate (UNKNOWN, FAULT) for any MMIO.
6248 * This gives the right answer for the common cases of "Normal memory,
6249 * backed by host RAM" and "Device memory, backed by MMIO".
6250 * The architecture allows us to suppress an NF load and return
6251 * (UNKNOWN, FAULT) for any reason, so our behaviour for the corner
6252 * case of "Normal memory, backed by MMIO" is permitted. The case we
6253 * get wrong is "Device memory, backed by host RAM", for which we
6254 * should return (UNKNOWN, FAULT) for but do not.
6256 * Similarly, CPU_BP breakpoints would raise exceptions, and so
6257 * return (UNKNOWN, FAULT). For simplicity, we consider gdb and
6258 * architectural breakpoints the same.
6260 if (unlikely(flags
& TLB_MMIO
)) {
6264 reg_last
= info
.reg_off_last
[0];
6265 host
= info
.page
[0].host
;
6268 uint64_t pg
= *(uint64_t *)(vg
+ (reg_off
>> 3));
6270 if ((pg
>> (reg_off
& 63)) & 1) {
6271 if (unlikely(flags
& TLB_WATCHPOINT
) &&
6272 (cpu_watchpoint_address_matches
6273 (env_cpu(env
), addr
+ mem_off
, 1 << msz
)
6277 if (mtedesc
&& !mte_probe(env
, mtedesc
, addr
+ mem_off
)) {
6280 host_fn(vd
, reg_off
, host
+ mem_off
);
6282 reg_off
+= 1 << esz
;
6283 mem_off
+= 1 << msz
;
6284 } while (reg_off
<= reg_last
&& (reg_off
& 63));
6285 } while (reg_off
<= reg_last
);
6288 * MemSingleNF is allowed to fail for any reason. We have special
6289 * code above to handle the first element crossing a page boundary.
6290 * As an implementation choice, decline to handle a cross-page element
6291 * in any other position.
6293 reg_off
= info
.reg_off_split
;
6299 reg_off
= info
.reg_off_first
[1];
6300 if (likely(reg_off
< 0)) {
6301 /* No active elements on the second page. All done. */
6306 * MemSingleNF is allowed to fail for any reason. As an implementation
6307 * choice, decline to handle elements on the second page. This should
6308 * be low frequency as the guest walks through memory -- the next
6309 * iteration of the guest's loop should be aligned on the page boundary,
6310 * and then all following iterations will stay aligned.
6314 record_fault(env
, reg_off
, reg_max
);
6317 static inline QEMU_ALWAYS_INLINE
6318 void sve_ldnfff1_r_mte(CPUARMState
*env
, void *vg
, target_ulong addr
,
6319 uint32_t desc
, const uintptr_t retaddr
,
6320 const int esz
, const int msz
, const SVEContFault fault
,
6321 sve_ldst1_host_fn
*host_fn
,
6322 sve_ldst1_tlb_fn
*tlb_fn
)
6324 uint32_t mtedesc
= desc
>> (SIMD_DATA_SHIFT
+ SVE_MTEDESC_SHIFT
);
6325 int bit55
= extract64(addr
, 55, 1);
6327 /* Remove mtedesc from the normal sve descriptor. */
6328 desc
= extract32(desc
, 0, SIMD_DATA_SHIFT
+ SVE_MTEDESC_SHIFT
);
6330 /* Perform gross MTE suppression early. */
6331 if (!tbi_check(desc
, bit55
) ||
6332 tcma_check(desc
, bit55
, allocation_tag_from_addr(addr
))) {
6336 sve_ldnfff1_r(env
, vg
, addr
, desc
, retaddr
, mtedesc
,
6337 esz
, msz
, fault
, host_fn
, tlb_fn
);
/*
 * Expand the four contiguous first-fault (ldff1) and no-fault (ldnf1)
 * load helpers for a byte-sized memory element (MO_8): the plain pair,
 * which passes a zero MTE descriptor, and the _mte pair, which goes
 * through sve_ldnfff1_r_mte().  ESZ is forwarded as the register
 * element size argument.
 */
#define DO_LDFF1_LDNF1_1(PART, ESZ) \
void HELPER(sve_ldff1##PART##_r)(CPUARMState *env, void *vg, \
                                 target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_FIRST, \
                  sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
} \
void HELPER(sve_ldnf1##PART##_r)(CPUARMState *env, void *vg, \
                                 target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MO_8, FAULT_NO, \
                  sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
} \
void HELPER(sve_ldff1##PART##_r_mte)(CPUARMState *env, void *vg, \
                                     target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_FIRST, \
                      sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
} \
void HELPER(sve_ldnf1##PART##_r_mte)(CPUARMState *env, void *vg, \
                                     target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, FAULT_NO, \
                      sve_ld1##PART##_host, sve_ld1##PART##_tlb); \
}
/*
 * Expand the eight contiguous first-fault (ldff1) and no-fault (ldnf1)
 * load helpers for a multi-byte memory element: little- and big-endian
 * variants, each without MTE (zero MTE descriptor) and with MTE (via
 * sve_ldnfff1_r_mte()).  ESZ and MSZ are forwarded as the register and
 * memory element size arguments.
 */
#define DO_LDFF1_LDNF1_2(PART, ESZ, MSZ) \
void HELPER(sve_ldff1##PART##_le_r)(CPUARMState *env, void *vg, \
                                    target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
                  sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
} \
void HELPER(sve_ldnf1##PART##_le_r)(CPUARMState *env, void *vg, \
                                    target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
                  sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
} \
void HELPER(sve_ldff1##PART##_be_r)(CPUARMState *env, void *vg, \
                                    target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_FIRST, \
                  sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
} \
void HELPER(sve_ldnf1##PART##_be_r)(CPUARMState *env, void *vg, \
                                    target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r(env, vg, addr, desc, GETPC(), 0, ESZ, MSZ, FAULT_NO, \
                  sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
} \
void HELPER(sve_ldff1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
                                        target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
                      sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
} \
void HELPER(sve_ldnf1##PART##_le_r_mte)(CPUARMState *env, void *vg, \
                                        target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
                      sve_ld1##PART##_le_host, sve_ld1##PART##_le_tlb); \
} \
void HELPER(sve_ldff1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
                                        target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_FIRST, \
                      sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
} \
void HELPER(sve_ldnf1##PART##_be_r_mte)(CPUARMState *env, void *vg, \
                                        target_ulong addr, uint32_t desc) \
{ \
    sve_ldnfff1_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, FAULT_NO, \
                      sve_ld1##PART##_be_host, sve_ld1##PART##_be_tlb); \
}
6416 DO_LDFF1_LDNF1_1(bb
, MO_8
)
6417 DO_LDFF1_LDNF1_1(bhu
, MO_16
)
6418 DO_LDFF1_LDNF1_1(bhs
, MO_16
)
6419 DO_LDFF1_LDNF1_1(bsu
, MO_32
)
6420 DO_LDFF1_LDNF1_1(bss
, MO_32
)
6421 DO_LDFF1_LDNF1_1(bdu
, MO_64
)
6422 DO_LDFF1_LDNF1_1(bds
, MO_64
)
6424 DO_LDFF1_LDNF1_2(hh
, MO_16
, MO_16
)
6425 DO_LDFF1_LDNF1_2(hsu
, MO_32
, MO_16
)
6426 DO_LDFF1_LDNF1_2(hss
, MO_32
, MO_16
)
6427 DO_LDFF1_LDNF1_2(hdu
, MO_64
, MO_16
)
6428 DO_LDFF1_LDNF1_2(hds
, MO_64
, MO_16
)
6430 DO_LDFF1_LDNF1_2(ss
, MO_32
, MO_32
)
6431 DO_LDFF1_LDNF1_2(sdu
, MO_64
, MO_32
)
6432 DO_LDFF1_LDNF1_2(sds
, MO_64
, MO_32
)
6434 DO_LDFF1_LDNF1_2(dd
, MO_64
, MO_64
)
6436 #undef DO_LDFF1_LDNF1_1
6437 #undef DO_LDFF1_LDNF1_2
6440 * Common helper for all contiguous 1,2,3,4-register predicated stores.
6443 static inline QEMU_ALWAYS_INLINE
6444 void sve_stN_r(CPUARMState
*env
, uint64_t *vg
, target_ulong addr
,
6445 uint32_t desc
, const uintptr_t retaddr
,
6446 const int esz
, const int msz
, const int N
, uint32_t mtedesc
,
6447 sve_ldst1_host_fn
*host_fn
,
6448 sve_ldst1_tlb_fn
*tlb_fn
)
6450 const unsigned rd
= simd_data(desc
);
6451 const intptr_t reg_max
= simd_oprsz(desc
);
6452 intptr_t reg_off
, reg_last
, mem_off
;
6457 /* Find the active elements. */
6458 if (!sve_cont_ldst_elements(&info
, addr
, vg
, reg_max
, esz
, N
<< msz
)) {
6459 /* The entire predicate was false; no store occurs. */
6463 /* Probe the page(s). Exit with exception for any invalid page. */
6464 sve_cont_ldst_pages(&info
, FAULT_ALL
, env
, addr
, MMU_DATA_STORE
, retaddr
);
6466 /* Handle watchpoints for all active elements. */
6467 sve_cont_ldst_watchpoints(&info
, env
, vg
, addr
, 1 << esz
, N
<< msz
,
6468 BP_MEM_WRITE
, retaddr
);
6471 * Handle mte checks for all active elements.
6472 * Since TBI must be set for MTE, !mtedesc => !mte_active.
6475 sve_cont_ldst_mte_check(&info
, env
, vg
, addr
, 1 << esz
, N
<< msz
,
6479 flags
= info
.page
[0].flags
| info
.page
[1].flags
;
6480 if (unlikely(flags
!= 0)) {
6481 #ifdef CONFIG_USER_ONLY
6482 g_assert_not_reached();
6485 * At least one page includes MMIO.
6486 * Any bus operation can fail with cpu_transaction_failed,
6487 * which for ARM will raise SyncExternal. We cannot avoid
6488 * this fault and will leave with the store incomplete.
6490 mem_off
= info
.mem_off_first
[0];
6491 reg_off
= info
.reg_off_first
[0];
6492 reg_last
= info
.reg_off_last
[1];
6494 reg_last
= info
.reg_off_split
;
6496 reg_last
= info
.reg_off_last
[0];
6501 uint64_t pg
= vg
[reg_off
>> 6];
6503 if ((pg
>> (reg_off
& 63)) & 1) {
6504 for (i
= 0; i
< N
; ++i
) {
6505 tlb_fn(env
, &env
->vfp
.zregs
[(rd
+ i
) & 31], reg_off
,
6506 addr
+ mem_off
+ (i
<< msz
), retaddr
);
6509 reg_off
+= 1 << esz
;
6510 mem_off
+= N
<< msz
;
6511 } while (reg_off
& 63);
6512 } while (reg_off
<= reg_last
);
6517 mem_off
= info
.mem_off_first
[0];
6518 reg_off
= info
.reg_off_first
[0];
6519 reg_last
= info
.reg_off_last
[0];
6520 host
= info
.page
[0].host
;
6522 while (reg_off
<= reg_last
) {
6523 uint64_t pg
= vg
[reg_off
>> 6];
6525 if ((pg
>> (reg_off
& 63)) & 1) {
6526 for (i
= 0; i
< N
; ++i
) {
6527 host_fn(&env
->vfp
.zregs
[(rd
+ i
) & 31], reg_off
,
6528 host
+ mem_off
+ (i
<< msz
));
6531 reg_off
+= 1 << esz
;
6532 mem_off
+= N
<< msz
;
6533 } while (reg_off
<= reg_last
&& (reg_off
& 63));
6537 * Use the slow path to manage the cross-page misalignment.
6538 * But we know this is RAM and cannot trap.
6540 mem_off
= info
.mem_off_split
;
6541 if (unlikely(mem_off
>= 0)) {
6542 reg_off
= info
.reg_off_split
;
6543 for (i
= 0; i
< N
; ++i
) {
6544 tlb_fn(env
, &env
->vfp
.zregs
[(rd
+ i
) & 31], reg_off
,
6545 addr
+ mem_off
+ (i
<< msz
), retaddr
);
6549 mem_off
= info
.mem_off_first
[1];
6550 if (unlikely(mem_off
>= 0)) {
6551 reg_off
= info
.reg_off_first
[1];
6552 reg_last
= info
.reg_off_last
[1];
6553 host
= info
.page
[1].host
;
6556 uint64_t pg
= vg
[reg_off
>> 6];
6558 if ((pg
>> (reg_off
& 63)) & 1) {
6559 for (i
= 0; i
< N
; ++i
) {
6560 host_fn(&env
->vfp
.zregs
[(rd
+ i
) & 31], reg_off
,
6561 host
+ mem_off
+ (i
<< msz
));
6564 reg_off
+= 1 << esz
;
6565 mem_off
+= N
<< msz
;
6566 } while (reg_off
& 63);
6567 } while (reg_off
<= reg_last
);
6571 static inline QEMU_ALWAYS_INLINE
6572 void sve_stN_r_mte(CPUARMState
*env
, uint64_t *vg
, target_ulong addr
,
6573 uint32_t desc
, const uintptr_t ra
,
6574 const int esz
, const int msz
, const int N
,
6575 sve_ldst1_host_fn
*host_fn
,
6576 sve_ldst1_tlb_fn
*tlb_fn
)
6578 uint32_t mtedesc
= desc
>> (SIMD_DATA_SHIFT
+ SVE_MTEDESC_SHIFT
);
6579 int bit55
= extract64(addr
, 55, 1);
6581 /* Remove mtedesc from the normal sve descriptor. */
6582 desc
= extract32(desc
, 0, SIMD_DATA_SHIFT
+ SVE_MTEDESC_SHIFT
);
6584 /* Perform gross MTE suppression early. */
6585 if (!tbi_check(desc
, bit55
) ||
6586 tcma_check(desc
, bit55
, allocation_tag_from_addr(addr
))) {
6590 sve_stN_r(env
, vg
, addr
, desc
, ra
, esz
, msz
, N
, mtedesc
, host_fn
, tlb_fn
);
/*
 * Expand the contiguous N-register store helper for a byte-sized memory
 * element (MO_8), without MTE (zero MTE descriptor) and with MTE (via
 * sve_stN_r_mte()).  ESZ is forwarded as the register element size
 * argument.
 */
#define DO_STN_1(N, NAME, ESZ) \
void HELPER(sve_st##N##NAME##_r)(CPUARMState *env, void *vg, \
                                 target_ulong addr, uint32_t desc) \
{ \
    sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, 0, \
              sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
} \
void HELPER(sve_st##N##NAME##_r_mte)(CPUARMState *env, void *vg, \
                                     target_ulong addr, uint32_t desc) \
{ \
    sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MO_8, N, \
                  sve_st1##NAME##_host, sve_st1##NAME##_tlb); \
}
/*
 * Expand the contiguous N-register store helpers for a multi-byte memory
 * element: little- and big-endian variants, each without MTE (zero MTE
 * descriptor) and with MTE (via sve_stN_r_mte()).  ESZ and MSZ are
 * forwarded as the register and memory element size arguments.
 */
#define DO_STN_2(N, NAME, ESZ, MSZ) \
void HELPER(sve_st##N##NAME##_le_r)(CPUARMState *env, void *vg, \
                                    target_ulong addr, uint32_t desc) \
{ \
    sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
              sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
} \
void HELPER(sve_st##N##NAME##_be_r)(CPUARMState *env, void *vg, \
                                    target_ulong addr, uint32_t desc) \
{ \
    sve_stN_r(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, 0, \
              sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
} \
void HELPER(sve_st##N##NAME##_le_r_mte)(CPUARMState *env, void *vg, \
                                        target_ulong addr, uint32_t desc) \
{ \
    sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
                  sve_st1##NAME##_le_host, sve_st1##NAME##_le_tlb); \
} \
void HELPER(sve_st##N##NAME##_be_r_mte)(CPUARMState *env, void *vg, \
                                        target_ulong addr, uint32_t desc) \
{ \
    sve_stN_r_mte(env, vg, addr, desc, GETPC(), ESZ, MSZ, N, \
                  sve_st1##NAME##_be_host, sve_st1##NAME##_be_tlb); \
}
6633 DO_STN_1(1, bb
, MO_8
)
6634 DO_STN_1(1, bh
, MO_16
)
6635 DO_STN_1(1, bs
, MO_32
)
6636 DO_STN_1(1, bd
, MO_64
)
6637 DO_STN_1(2, bb
, MO_8
)
6638 DO_STN_1(3, bb
, MO_8
)
6639 DO_STN_1(4, bb
, MO_8
)
6641 DO_STN_2(1, hh
, MO_16
, MO_16
)
6642 DO_STN_2(1, hs
, MO_32
, MO_16
)
6643 DO_STN_2(1, hd
, MO_64
, MO_16
)
6644 DO_STN_2(2, hh
, MO_16
, MO_16
)
6645 DO_STN_2(3, hh
, MO_16
, MO_16
)
6646 DO_STN_2(4, hh
, MO_16
, MO_16
)
6648 DO_STN_2(1, ss
, MO_32
, MO_32
)
6649 DO_STN_2(1, sd
, MO_64
, MO_32
)
6650 DO_STN_2(2, ss
, MO_32
, MO_32
)
6651 DO_STN_2(3, ss
, MO_32
, MO_32
)
6652 DO_STN_2(4, ss
, MO_32
, MO_32
)
6654 DO_STN_2(1, dd
, MO_64
, MO_64
)
6655 DO_STN_2(2, dd
, MO_64
, MO_64
)
6656 DO_STN_2(3, dd
, MO_64
, MO_64
)
6657 DO_STN_2(4, dd
, MO_64
, MO_64
)
6663 * Loads with a vector index.
6667 * Load the element at @reg + @reg_ofs, sign or zero-extend as needed.
6669 typedef target_ulong
zreg_off_fn(void *reg
, intptr_t reg_ofs
);
6671 static target_ulong
off_zsu_s(void *reg
, intptr_t reg_ofs
)
6673 return *(uint32_t *)(reg
+ H1_4(reg_ofs
));
6676 static target_ulong
off_zss_s(void *reg
, intptr_t reg_ofs
)
6678 return *(int32_t *)(reg
+ H1_4(reg_ofs
));
6681 static target_ulong
off_zsu_d(void *reg
, intptr_t reg_ofs
)
6683 return (uint32_t)*(uint64_t *)(reg
+ reg_ofs
);
6686 static target_ulong
off_zss_d(void *reg
, intptr_t reg_ofs
)
6688 return (int32_t)*(uint64_t *)(reg
+ reg_ofs
);
6691 static target_ulong
off_zd_d(void *reg
, intptr_t reg_ofs
)
6693 return *(uint64_t *)(reg
+ reg_ofs
);
6696 static inline QEMU_ALWAYS_INLINE
6697 void sve_ld1_z(CPUARMState
*env
, void *vd
, uint64_t *vg
, void *vm
,
6698 target_ulong base
, uint32_t desc
, uintptr_t retaddr
,
6699 uint32_t mtedesc
, int esize
, int msize
,
6700 zreg_off_fn
*off_fn
,
6701 sve_ldst1_host_fn
*host_fn
,
6702 sve_ldst1_tlb_fn
*tlb_fn
)
6704 const int mmu_idx
= cpu_mmu_index(env
, false);
6705 const intptr_t reg_max
= simd_oprsz(desc
);
6706 const int scale
= simd_data(desc
);
6707 ARMVectorReg scratch
;
6709 SVEHostPage info
, info2
;
6711 memset(&scratch
, 0, reg_max
);
6714 uint64_t pg
= vg
[reg_off
>> 6];
6716 if (likely(pg
& 1)) {
6717 target_ulong addr
= base
+ (off_fn(vm
, reg_off
) << scale
);
6718 target_ulong in_page
= -(addr
| TARGET_PAGE_MASK
);
6720 sve_probe_page(&info
, false, env
, addr
, 0, MMU_DATA_LOAD
,
6723 if (likely(in_page
>= msize
)) {
6724 if (unlikely(info
.flags
& TLB_WATCHPOINT
)) {
6725 cpu_check_watchpoint(env_cpu(env
), addr
, msize
,
6726 info
.attrs
, BP_MEM_READ
, retaddr
);
6728 if (mtedesc
&& arm_tlb_mte_tagged(&info
.attrs
)) {
6729 mte_check(env
, mtedesc
, addr
, retaddr
);
6731 host_fn(&scratch
, reg_off
, info
.host
);
6733 /* Element crosses the page boundary. */
6734 sve_probe_page(&info2
, false, env
, addr
+ in_page
, 0,
6735 MMU_DATA_LOAD
, mmu_idx
, retaddr
);
6736 if (unlikely((info
.flags
| info2
.flags
) & TLB_WATCHPOINT
)) {
6737 cpu_check_watchpoint(env_cpu(env
), addr
,
6739 BP_MEM_READ
, retaddr
);
6741 if (mtedesc
&& arm_tlb_mte_tagged(&info
.attrs
)) {
6742 mte_check(env
, mtedesc
, addr
, retaddr
);
6744 tlb_fn(env
, &scratch
, reg_off
, addr
, retaddr
);
6749 } while (reg_off
& 63);
6750 } while (reg_off
< reg_max
);
6752 /* Wait until all exceptions have been raised to write back. */
6753 memcpy(vd
, &scratch
, reg_max
);
6756 static inline QEMU_ALWAYS_INLINE
6757 void sve_ld1_z_mte(CPUARMState
*env
, void *vd
, uint64_t *vg
, void *vm
,
6758 target_ulong base
, uint32_t desc
, uintptr_t retaddr
,
6759 int esize
, int msize
, zreg_off_fn
*off_fn
,
6760 sve_ldst1_host_fn
*host_fn
,
6761 sve_ldst1_tlb_fn
*tlb_fn
)
6763 uint32_t mtedesc
= desc
>> (SIMD_DATA_SHIFT
+ SVE_MTEDESC_SHIFT
);
6764 /* Remove mtedesc from the normal sve descriptor. */
6765 desc
= extract32(desc
, 0, SIMD_DATA_SHIFT
+ SVE_MTEDESC_SHIFT
);
6768 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6769 * offset base entirely over the address space hole to change the
6770 * pointer tag, or change the bit55 selector. So we could here
6771 * examine TBI + TCMA like we do for sve_ldN_r_mte().
6773 sve_ld1_z(env
, vd
, vg
, vm
, base
, desc
, retaddr
, mtedesc
,
6774 esize
, msize
, off_fn
, host_fn
, tlb_fn
);
/*
 * Expand the gather-load helper pair (plain and _mte) that uses the
 * off_<OFS>_s offset extractor.  The element size argument is 4 bytes
 * and the memory access size is 1 << MSZ bytes.
 */
#define DO_LD1_ZPZ_S(MEM, OFS, MSZ) \
void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
                                 void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << (MSZ), \
              off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
} \
void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
    void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << (MSZ), \
                  off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}
/*
 * Expand the gather-load helper pair (plain and _mte) that uses the
 * off_<OFS>_d offset extractor.  The element size argument is 8 bytes
 * and the memory access size is 1 << MSZ bytes.
 */
#define DO_LD1_ZPZ_D(MEM, OFS, MSZ) \
void HELPER(sve_ld##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
                                 void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ld1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << (MSZ), \
              off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
} \
void HELPER(sve_ld##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
    void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ld1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << (MSZ), \
                  off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}
6805 DO_LD1_ZPZ_S(bsu
, zsu
, MO_8
)
6806 DO_LD1_ZPZ_S(bsu
, zss
, MO_8
)
6807 DO_LD1_ZPZ_D(bdu
, zsu
, MO_8
)
6808 DO_LD1_ZPZ_D(bdu
, zss
, MO_8
)
6809 DO_LD1_ZPZ_D(bdu
, zd
, MO_8
)
6811 DO_LD1_ZPZ_S(bss
, zsu
, MO_8
)
6812 DO_LD1_ZPZ_S(bss
, zss
, MO_8
)
6813 DO_LD1_ZPZ_D(bds
, zsu
, MO_8
)
6814 DO_LD1_ZPZ_D(bds
, zss
, MO_8
)
6815 DO_LD1_ZPZ_D(bds
, zd
, MO_8
)
6817 DO_LD1_ZPZ_S(hsu_le
, zsu
, MO_16
)
6818 DO_LD1_ZPZ_S(hsu_le
, zss
, MO_16
)
6819 DO_LD1_ZPZ_D(hdu_le
, zsu
, MO_16
)
6820 DO_LD1_ZPZ_D(hdu_le
, zss
, MO_16
)
6821 DO_LD1_ZPZ_D(hdu_le
, zd
, MO_16
)
6823 DO_LD1_ZPZ_S(hsu_be
, zsu
, MO_16
)
6824 DO_LD1_ZPZ_S(hsu_be
, zss
, MO_16
)
6825 DO_LD1_ZPZ_D(hdu_be
, zsu
, MO_16
)
6826 DO_LD1_ZPZ_D(hdu_be
, zss
, MO_16
)
6827 DO_LD1_ZPZ_D(hdu_be
, zd
, MO_16
)
6829 DO_LD1_ZPZ_S(hss_le
, zsu
, MO_16
)
6830 DO_LD1_ZPZ_S(hss_le
, zss
, MO_16
)
6831 DO_LD1_ZPZ_D(hds_le
, zsu
, MO_16
)
6832 DO_LD1_ZPZ_D(hds_le
, zss
, MO_16
)
6833 DO_LD1_ZPZ_D(hds_le
, zd
, MO_16
)
6835 DO_LD1_ZPZ_S(hss_be
, zsu
, MO_16
)
6836 DO_LD1_ZPZ_S(hss_be
, zss
, MO_16
)
6837 DO_LD1_ZPZ_D(hds_be
, zsu
, MO_16
)
6838 DO_LD1_ZPZ_D(hds_be
, zss
, MO_16
)
6839 DO_LD1_ZPZ_D(hds_be
, zd
, MO_16
)
6841 DO_LD1_ZPZ_S(ss_le
, zsu
, MO_32
)
6842 DO_LD1_ZPZ_S(ss_le
, zss
, MO_32
)
6843 DO_LD1_ZPZ_D(sdu_le
, zsu
, MO_32
)
6844 DO_LD1_ZPZ_D(sdu_le
, zss
, MO_32
)
6845 DO_LD1_ZPZ_D(sdu_le
, zd
, MO_32
)
6847 DO_LD1_ZPZ_S(ss_be
, zsu
, MO_32
)
6848 DO_LD1_ZPZ_S(ss_be
, zss
, MO_32
)
6849 DO_LD1_ZPZ_D(sdu_be
, zsu
, MO_32
)
6850 DO_LD1_ZPZ_D(sdu_be
, zss
, MO_32
)
6851 DO_LD1_ZPZ_D(sdu_be
, zd
, MO_32
)
6853 DO_LD1_ZPZ_D(sds_le
, zsu
, MO_32
)
6854 DO_LD1_ZPZ_D(sds_le
, zss
, MO_32
)
6855 DO_LD1_ZPZ_D(sds_le
, zd
, MO_32
)
6857 DO_LD1_ZPZ_D(sds_be
, zsu
, MO_32
)
6858 DO_LD1_ZPZ_D(sds_be
, zss
, MO_32
)
6859 DO_LD1_ZPZ_D(sds_be
, zd
, MO_32
)
6861 DO_LD1_ZPZ_D(dd_le
, zsu
, MO_64
)
6862 DO_LD1_ZPZ_D(dd_le
, zss
, MO_64
)
6863 DO_LD1_ZPZ_D(dd_le
, zd
, MO_64
)
6865 DO_LD1_ZPZ_D(dd_be
, zsu
, MO_64
)
6866 DO_LD1_ZPZ_D(dd_be
, zss
, MO_64
)
6867 DO_LD1_ZPZ_D(dd_be
, zd
, MO_64
)
6872 /* First fault loads with a vector index. */
6875 * Common helpers for all gather first-faulting loads.
6878 static inline QEMU_ALWAYS_INLINE
6879 void sve_ldff1_z(CPUARMState
*env
, void *vd
, uint64_t *vg
, void *vm
,
6880 target_ulong base
, uint32_t desc
, uintptr_t retaddr
,
6881 uint32_t mtedesc
, const int esz
, const int msz
,
6882 zreg_off_fn
*off_fn
,
6883 sve_ldst1_host_fn
*host_fn
,
6884 sve_ldst1_tlb_fn
*tlb_fn
)
6886 const int mmu_idx
= cpu_mmu_index(env
, false);
6887 const intptr_t reg_max
= simd_oprsz(desc
);
6888 const int scale
= simd_data(desc
);
6889 const int esize
= 1 << esz
;
6890 const int msize
= 1 << msz
;
6893 target_ulong addr
, in_page
;
6895 /* Skip to the first true predicate. */
6896 reg_off
= find_next_active(vg
, 0, reg_max
, esz
);
6897 if (unlikely(reg_off
>= reg_max
)) {
6898 /* The entire predicate was false; no load occurs. */
6899 memset(vd
, 0, reg_max
);
6904 * Probe the first element, allowing faults.
6906 addr
= base
+ (off_fn(vm
, reg_off
) << scale
);
6908 mte_check(env
, mtedesc
, addr
, retaddr
);
6910 tlb_fn(env
, vd
, reg_off
, addr
, retaddr
);
6912 /* After any fault, zero the other elements. */
6913 swap_memzero(vd
, reg_off
);
6915 swap_memzero(vd
+ reg_off
, reg_max
- reg_off
);
6918 * Probe the remaining elements, not allowing faults.
6920 while (reg_off
< reg_max
) {
6921 uint64_t pg
= vg
[reg_off
>> 6];
6923 if (likely((pg
>> (reg_off
& 63)) & 1)) {
6924 addr
= base
+ (off_fn(vm
, reg_off
) << scale
);
6925 in_page
= -(addr
| TARGET_PAGE_MASK
);
6927 if (unlikely(in_page
< msize
)) {
6928 /* Stop if the element crosses a page boundary. */
6932 sve_probe_page(&info
, true, env
, addr
, 0, MMU_DATA_LOAD
,
6934 if (unlikely(info
.flags
& (TLB_INVALID_MASK
| TLB_MMIO
))) {
6937 if (unlikely(info
.flags
& TLB_WATCHPOINT
) &&
6938 (cpu_watchpoint_address_matches
6939 (env_cpu(env
), addr
, msize
) & BP_MEM_READ
)) {
6943 arm_tlb_mte_tagged(&info
.attrs
) &&
6944 !mte_probe(env
, mtedesc
, addr
)) {
6948 host_fn(vd
, reg_off
, info
.host
);
6951 } while (reg_off
& 63);
6956 record_fault(env
, reg_off
, reg_max
);
6959 static inline QEMU_ALWAYS_INLINE
6960 void sve_ldff1_z_mte(CPUARMState
*env
, void *vd
, uint64_t *vg
, void *vm
,
6961 target_ulong base
, uint32_t desc
, uintptr_t retaddr
,
6962 const int esz
, const int msz
,
6963 zreg_off_fn
*off_fn
,
6964 sve_ldst1_host_fn
*host_fn
,
6965 sve_ldst1_tlb_fn
*tlb_fn
)
6967 uint32_t mtedesc
= desc
>> (SIMD_DATA_SHIFT
+ SVE_MTEDESC_SHIFT
);
6968 /* Remove mtedesc from the normal sve descriptor. */
6969 desc
= extract32(desc
, 0, SIMD_DATA_SHIFT
+ SVE_MTEDESC_SHIFT
);
6972 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
6973 * offset base entirely over the address space hole to change the
6974 * pointer tag, or change the bit55 selector. So we could here
6975 * examine TBI + TCMA like we do for sve_ldN_r_mte().
6977 sve_ldff1_z(env
, vd
, vg
, vm
, base
, desc
, retaddr
, mtedesc
,
6978 esz
, msz
, off_fn
, host_fn
, tlb_fn
);
/*
 * Expand the gather first-fault load helper pair (plain and _mte) that
 * uses the off_<OFS>_s offset extractor.  MO_32 is passed as the
 * register element size and MSZ as the memory element size.
 */
#define DO_LDFF1_ZPZ_S(MEM, OFS, MSZ) \
void HELPER(sve_ldff##MEM##_##OFS) \
    (CPUARMState *env, void *vd, void *vg, \
     void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_32, MSZ, \
                off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
} \
void HELPER(sve_ldff##MEM##_##OFS##_mte) \
    (CPUARMState *env, void *vd, void *vg, \
     void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_32, MSZ, \
                    off_##OFS##_s, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}
/*
 * Expand the gather first-fault load helper pair (plain and _mte) that
 * uses the off_<OFS>_d offset extractor.  MO_64 is passed as the
 * register element size and MSZ as the memory element size.
 */
#define DO_LDFF1_ZPZ_D(MEM, OFS, MSZ) \
void HELPER(sve_ldff##MEM##_##OFS) \
    (CPUARMState *env, void *vd, void *vg, \
     void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ldff1_z(env, vd, vg, vm, base, desc, GETPC(), 0, MO_64, MSZ, \
                off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
} \
void HELPER(sve_ldff##MEM##_##OFS##_mte) \
    (CPUARMState *env, void *vd, void *vg, \
     void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_ldff1_z_mte(env, vd, vg, vm, base, desc, GETPC(), MO_64, MSZ, \
                    off_##OFS##_d, sve_ld1##MEM##_host, sve_ld1##MEM##_tlb); \
}
7013 DO_LDFF1_ZPZ_S(bsu
, zsu
, MO_8
)
7014 DO_LDFF1_ZPZ_S(bsu
, zss
, MO_8
)
7015 DO_LDFF1_ZPZ_D(bdu
, zsu
, MO_8
)
7016 DO_LDFF1_ZPZ_D(bdu
, zss
, MO_8
)
7017 DO_LDFF1_ZPZ_D(bdu
, zd
, MO_8
)
7019 DO_LDFF1_ZPZ_S(bss
, zsu
, MO_8
)
7020 DO_LDFF1_ZPZ_S(bss
, zss
, MO_8
)
7021 DO_LDFF1_ZPZ_D(bds
, zsu
, MO_8
)
7022 DO_LDFF1_ZPZ_D(bds
, zss
, MO_8
)
7023 DO_LDFF1_ZPZ_D(bds
, zd
, MO_8
)
7025 DO_LDFF1_ZPZ_S(hsu_le
, zsu
, MO_16
)
7026 DO_LDFF1_ZPZ_S(hsu_le
, zss
, MO_16
)
7027 DO_LDFF1_ZPZ_D(hdu_le
, zsu
, MO_16
)
7028 DO_LDFF1_ZPZ_D(hdu_le
, zss
, MO_16
)
7029 DO_LDFF1_ZPZ_D(hdu_le
, zd
, MO_16
)
7031 DO_LDFF1_ZPZ_S(hsu_be
, zsu
, MO_16
)
7032 DO_LDFF1_ZPZ_S(hsu_be
, zss
, MO_16
)
7033 DO_LDFF1_ZPZ_D(hdu_be
, zsu
, MO_16
)
7034 DO_LDFF1_ZPZ_D(hdu_be
, zss
, MO_16
)
7035 DO_LDFF1_ZPZ_D(hdu_be
, zd
, MO_16
)
7037 DO_LDFF1_ZPZ_S(hss_le
, zsu
, MO_16
)
7038 DO_LDFF1_ZPZ_S(hss_le
, zss
, MO_16
)
7039 DO_LDFF1_ZPZ_D(hds_le
, zsu
, MO_16
)
7040 DO_LDFF1_ZPZ_D(hds_le
, zss
, MO_16
)
7041 DO_LDFF1_ZPZ_D(hds_le
, zd
, MO_16
)
7043 DO_LDFF1_ZPZ_S(hss_be
, zsu
, MO_16
)
7044 DO_LDFF1_ZPZ_S(hss_be
, zss
, MO_16
)
7045 DO_LDFF1_ZPZ_D(hds_be
, zsu
, MO_16
)
7046 DO_LDFF1_ZPZ_D(hds_be
, zss
, MO_16
)
7047 DO_LDFF1_ZPZ_D(hds_be
, zd
, MO_16
)
7049 DO_LDFF1_ZPZ_S(ss_le
, zsu
, MO_32
)
7050 DO_LDFF1_ZPZ_S(ss_le
, zss
, MO_32
)
7051 DO_LDFF1_ZPZ_D(sdu_le
, zsu
, MO_32
)
7052 DO_LDFF1_ZPZ_D(sdu_le
, zss
, MO_32
)
7053 DO_LDFF1_ZPZ_D(sdu_le
, zd
, MO_32
)
7055 DO_LDFF1_ZPZ_S(ss_be
, zsu
, MO_32
)
7056 DO_LDFF1_ZPZ_S(ss_be
, zss
, MO_32
)
7057 DO_LDFF1_ZPZ_D(sdu_be
, zsu
, MO_32
)
7058 DO_LDFF1_ZPZ_D(sdu_be
, zss
, MO_32
)
7059 DO_LDFF1_ZPZ_D(sdu_be
, zd
, MO_32
)
7061 DO_LDFF1_ZPZ_D(sds_le
, zsu
, MO_32
)
7062 DO_LDFF1_ZPZ_D(sds_le
, zss
, MO_32
)
7063 DO_LDFF1_ZPZ_D(sds_le
, zd
, MO_32
)
7065 DO_LDFF1_ZPZ_D(sds_be
, zsu
, MO_32
)
7066 DO_LDFF1_ZPZ_D(sds_be
, zss
, MO_32
)
7067 DO_LDFF1_ZPZ_D(sds_be
, zd
, MO_32
)
7069 DO_LDFF1_ZPZ_D(dd_le
, zsu
, MO_64
)
7070 DO_LDFF1_ZPZ_D(dd_le
, zss
, MO_64
)
7071 DO_LDFF1_ZPZ_D(dd_le
, zd
, MO_64
)
7073 DO_LDFF1_ZPZ_D(dd_be
, zsu
, MO_64
)
7074 DO_LDFF1_ZPZ_D(dd_be
, zss
, MO_64
)
7075 DO_LDFF1_ZPZ_D(dd_be
, zd
, MO_64
)
7077 /* Stores with a vector index. */
7079 static inline QEMU_ALWAYS_INLINE
7080 void sve_st1_z(CPUARMState
*env
, void *vd
, uint64_t *vg
, void *vm
,
7081 target_ulong base
, uint32_t desc
, uintptr_t retaddr
,
7082 uint32_t mtedesc
, int esize
, int msize
,
7083 zreg_off_fn
*off_fn
,
7084 sve_ldst1_host_fn
*host_fn
,
7085 sve_ldst1_tlb_fn
*tlb_fn
)
7087 const int mmu_idx
= cpu_mmu_index(env
, false);
7088 const intptr_t reg_max
= simd_oprsz(desc
);
7089 const int scale
= simd_data(desc
);
7090 void *host
[ARM_MAX_VQ
* 4];
7091 intptr_t reg_off
, i
;
7092 SVEHostPage info
, info2
;
7095 * Probe all of the elements for host addresses and flags.
7099 uint64_t pg
= vg
[reg_off
>> 6];
7101 target_ulong addr
= base
+ (off_fn(vm
, reg_off
) << scale
);
7102 target_ulong in_page
= -(addr
| TARGET_PAGE_MASK
);
7105 if (likely((pg
>> (reg_off
& 63)) & 1)) {
7106 if (likely(in_page
>= msize
)) {
7107 sve_probe_page(&info
, false, env
, addr
, 0, MMU_DATA_STORE
,
7109 host
[i
] = info
.host
;
7112 * Element crosses the page boundary.
7113 * Probe both pages, but do not record the host address,
7114 * so that we use the slow path.
7116 sve_probe_page(&info
, false, env
, addr
, 0,
7117 MMU_DATA_STORE
, mmu_idx
, retaddr
);
7118 sve_probe_page(&info2
, false, env
, addr
+ in_page
, 0,
7119 MMU_DATA_STORE
, mmu_idx
, retaddr
);
7120 info
.flags
|= info2
.flags
;
7123 if (unlikely(info
.flags
& TLB_WATCHPOINT
)) {
7124 cpu_check_watchpoint(env_cpu(env
), addr
, msize
,
7125 info
.attrs
, BP_MEM_WRITE
, retaddr
);
7128 if (mtedesc
&& arm_tlb_mte_tagged(&info
.attrs
)) {
7129 mte_check(env
, mtedesc
, addr
, retaddr
);
7134 } while (reg_off
& 63);
7135 } while (reg_off
< reg_max
);
7138 * Now that we have recognized all exceptions except SyncExternal
7139 * (from TLB_MMIO), which we cannot avoid, perform all of the stores.
7141 * Note for the common case of an element in RAM, not crossing a page
7142 * boundary, we have stored the host address in host[]. This doubles
7143 * as a first-level check against the predicate, since only enabled
7144 * elements have non-null host addresses.
7149 if (likely(h
!= NULL
)) {
7150 host_fn(vd
, reg_off
, h
);
7151 } else if ((vg
[reg_off
>> 6] >> (reg_off
& 63)) & 1) {
7152 target_ulong addr
= base
+ (off_fn(vm
, reg_off
) << scale
);
7153 tlb_fn(env
, vd
, reg_off
, addr
, retaddr
);
7157 } while (reg_off
< reg_max
);
7160 static inline QEMU_ALWAYS_INLINE
7161 void sve_st1_z_mte(CPUARMState
*env
, void *vd
, uint64_t *vg
, void *vm
,
7162 target_ulong base
, uint32_t desc
, uintptr_t retaddr
,
7163 int esize
, int msize
, zreg_off_fn
*off_fn
,
7164 sve_ldst1_host_fn
*host_fn
,
7165 sve_ldst1_tlb_fn
*tlb_fn
)
7167 uint32_t mtedesc
= desc
>> (SIMD_DATA_SHIFT
+ SVE_MTEDESC_SHIFT
);
7168 /* Remove mtedesc from the normal sve descriptor. */
7169 desc
= extract32(desc
, 0, SIMD_DATA_SHIFT
+ SVE_MTEDESC_SHIFT
);
7172 * ??? TODO: For the 32-bit offset extractions, base + ofs cannot
7173 * offset base entirely over the address space hole to change the
7174 * pointer tag, or change the bit55 selector. So we could here
7175 * examine TBI + TCMA like we do for sve_ldN_r_mte().
7177 sve_st1_z(env
, vd
, vg
, vm
, base
, desc
, retaddr
, mtedesc
,
7178 esize
, msize
, off_fn
, host_fn
, tlb_fn
);
/*
 * Expand the scatter-store helper pair (plain and _mte) that uses the
 * off_<OFS>_s offset extractor.  The element size argument is 4 bytes
 * and the memory access size is 1 << MSZ bytes.
 */
#define DO_ST1_ZPZ_S(MEM, OFS, MSZ) \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
                                 void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 4, 1 << (MSZ), \
              off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
} \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
    void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 4, 1 << (MSZ), \
                  off_##OFS##_s, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
}
/*
 * Expand the scatter-store helper pair (plain and _mte) that uses the
 * off_<OFS>_d offset extractor.  The element size argument is 8 bytes
 * and the memory access size is 1 << MSZ bytes.
 */
#define DO_ST1_ZPZ_D(MEM, OFS, MSZ) \
void HELPER(sve_st##MEM##_##OFS)(CPUARMState *env, void *vd, void *vg, \
                                 void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_st1_z(env, vd, vg, vm, base, desc, GETPC(), 0, 8, 1 << (MSZ), \
              off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
} \
void HELPER(sve_st##MEM##_##OFS##_mte)(CPUARMState *env, void *vd, void *vg, \
    void *vm, target_ulong base, uint32_t desc) \
{ \
    sve_st1_z_mte(env, vd, vg, vm, base, desc, GETPC(), 8, 1 << (MSZ), \
                  off_##OFS##_d, sve_st1##MEM##_host, sve_st1##MEM##_tlb); \
}
7209 DO_ST1_ZPZ_S(bs
, zsu
, MO_8
)
7210 DO_ST1_ZPZ_S(hs_le
, zsu
, MO_16
)
7211 DO_ST1_ZPZ_S(hs_be
, zsu
, MO_16
)
7212 DO_ST1_ZPZ_S(ss_le
, zsu
, MO_32
)
7213 DO_ST1_ZPZ_S(ss_be
, zsu
, MO_32
)
7215 DO_ST1_ZPZ_S(bs
, zss
, MO_8
)
7216 DO_ST1_ZPZ_S(hs_le
, zss
, MO_16
)
7217 DO_ST1_ZPZ_S(hs_be
, zss
, MO_16
)
7218 DO_ST1_ZPZ_S(ss_le
, zss
, MO_32
)
7219 DO_ST1_ZPZ_S(ss_be
, zss
, MO_32
)
7221 DO_ST1_ZPZ_D(bd
, zsu
, MO_8
)
7222 DO_ST1_ZPZ_D(hd_le
, zsu
, MO_16
)
7223 DO_ST1_ZPZ_D(hd_be
, zsu
, MO_16
)
7224 DO_ST1_ZPZ_D(sd_le
, zsu
, MO_32
)
7225 DO_ST1_ZPZ_D(sd_be
, zsu
, MO_32
)
7226 DO_ST1_ZPZ_D(dd_le
, zsu
, MO_64
)
7227 DO_ST1_ZPZ_D(dd_be
, zsu
, MO_64
)
7229 DO_ST1_ZPZ_D(bd
, zss
, MO_8
)
7230 DO_ST1_ZPZ_D(hd_le
, zss
, MO_16
)
7231 DO_ST1_ZPZ_D(hd_be
, zss
, MO_16
)
7232 DO_ST1_ZPZ_D(sd_le
, zss
, MO_32
)
7233 DO_ST1_ZPZ_D(sd_be
, zss
, MO_32
)
7234 DO_ST1_ZPZ_D(dd_le
, zss
, MO_64
)
7235 DO_ST1_ZPZ_D(dd_be
, zss
, MO_64
)
7237 DO_ST1_ZPZ_D(bd
, zd
, MO_8
)
7238 DO_ST1_ZPZ_D(hd_le
, zd
, MO_16
)
7239 DO_ST1_ZPZ_D(hd_be
, zd
, MO_16
)
7240 DO_ST1_ZPZ_D(sd_le
, zd
, MO_32
)
7241 DO_ST1_ZPZ_D(sd_be
, zd
, MO_32
)
7242 DO_ST1_ZPZ_D(dd_le
, zd
, MO_64
)
7243 DO_ST1_ZPZ_D(dd_be
, zd
, MO_64
)
7248 void HELPER(sve2_eor3
)(void *vd
, void *vn
, void *vm
, void *vk
, uint32_t desc
)
7250 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
7251 uint64_t *d
= vd
, *n
= vn
, *m
= vm
, *k
= vk
;
7253 for (i
= 0; i
< opr_sz
; ++i
) {
7254 d
[i
] = n
[i
] ^ m
[i
] ^ k
[i
];
7258 void HELPER(sve2_bcax
)(void *vd
, void *vn
, void *vm
, void *vk
, uint32_t desc
)
7260 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
7261 uint64_t *d
= vd
, *n
= vn
, *m
= vm
, *k
= vk
;
7263 for (i
= 0; i
< opr_sz
; ++i
) {
7264 d
[i
] = n
[i
] ^ (m
[i
] & ~k
[i
]);
7268 void HELPER(sve2_bsl1n
)(void *vd
, void *vn
, void *vm
, void *vk
, uint32_t desc
)
7270 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
7271 uint64_t *d
= vd
, *n
= vn
, *m
= vm
, *k
= vk
;
7273 for (i
= 0; i
< opr_sz
; ++i
) {
7274 d
[i
] = (~n
[i
] & k
[i
]) | (m
[i
] & ~k
[i
]);
7278 void HELPER(sve2_bsl2n
)(void *vd
, void *vn
, void *vm
, void *vk
, uint32_t desc
)
7280 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
7281 uint64_t *d
= vd
, *n
= vn
, *m
= vm
, *k
= vk
;
7283 for (i
= 0; i
< opr_sz
; ++i
) {
7284 d
[i
] = (n
[i
] & k
[i
]) | (~m
[i
] & ~k
[i
]);
7288 void HELPER(sve2_nbsl
)(void *vd
, void *vn
, void *vm
, void *vk
, uint32_t desc
)
7290 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
7291 uint64_t *d
= vd
, *n
= vn
, *m
= vm
, *k
= vk
;
7293 for (i
= 0; i
< opr_sz
; ++i
) {
7294 d
[i
] = ~((n
[i
] & k
[i
]) | (m
[i
] & ~k
[i
]));
/*
 * Returns true if m0 or m1 contains the low uint8_t/uint16_t in n.
 * See hasless(v,1) from
 *   https://graphics.stanford.edu/~seander/bithacks.html#ZeroInWord
 *
 * @n:   value whose low element (8 << esz bits wide) is searched for
 * @m0:  first 64-bit word of candidate elements
 * @m1:  second 64-bit word of candidate elements
 * @esz: log2 of the element size in bytes
 */
static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
{
    int bits = 8 << esz;
    uint64_t ones = dup_const(esz, 1);
    uint64_t signs = ones << (bits - 1);
    uint64_t cmp0, cmp1;

    /*
     * XOR the replicated element against each candidate word:
     * an element of the result is zero exactly where the candidate
     * equals the element being searched for.
     */
    cmp1 = dup_const(esz, n);
    cmp0 = cmp1 ^ m0;
    cmp1 = cmp1 ^ m1;

    /* Zero-element detection: (v - 1) & ~v, tested at each sign bit. */
    cmp0 = (cmp0 - ones) & ~cmp0;
    cmp1 = (cmp1 - ones) & ~cmp1;
    return (cmp0 | cmp1) & signs;
}
7318 static inline uint32_t do_match(void *vd
, void *vn
, void *vm
, void *vg
,
7319 uint32_t desc
, int esz
, bool nmatch
)
7321 uint16_t esz_mask
= pred_esz_masks
[esz
];
7322 intptr_t opr_sz
= simd_oprsz(desc
);
7323 uint32_t flags
= PREDTEST_INIT
;
7326 for (i
= 0; i
< opr_sz
; i
+= 16) {
7327 uint64_t m0
= *(uint64_t *)(vm
+ i
);
7328 uint64_t m1
= *(uint64_t *)(vm
+ i
+ 8);
7329 uint16_t pg
= *(uint16_t *)(vg
+ H1_2(i
>> 3)) & esz_mask
;
7332 for (j
= 0; j
< 16; j
+= 8) {
7333 uint64_t n
= *(uint64_t *)(vn
+ i
+ j
);
7335 for (k
= 0; k
< 8; k
+= 1 << esz
) {
7336 if (pg
& (1 << (j
+ k
))) {
7337 bool o
= do_match2(n
>> (k
* 8), m0
, m1
, esz
);
7338 out
|= (o
^ nmatch
) << (j
+ k
);
7342 *(uint16_t *)(vd
+ H1_2(i
>> 3)) = out
;
7343 flags
= iter_predtest_fwd(out
, pg
, flags
);
7348 #define DO_PPZZ_MATCH(NAME, ESZ, INV) \
7349 uint32_t HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
7351 return do_match(vd, vn, vm, vg, desc, ESZ, INV); \
7354 DO_PPZZ_MATCH(sve2_match_ppzz_b
, MO_8
, false)
7355 DO_PPZZ_MATCH(sve2_match_ppzz_h
, MO_16
, false)
7357 DO_PPZZ_MATCH(sve2_nmatch_ppzz_b
, MO_8
, true)
7358 DO_PPZZ_MATCH(sve2_nmatch_ppzz_h
, MO_16
, true)
7360 #undef DO_PPZZ_MATCH
7362 void HELPER(sve2_histcnt_s
)(void *vd
, void *vn
, void *vm
, void *vg
,
7365 ARMVectorReg scratch
;
7367 intptr_t opr_sz
= simd_oprsz(desc
);
7368 uint32_t *d
= vd
, *n
= vn
, *m
= vm
;
7372 n
= memcpy(&scratch
, n
, opr_sz
);
7376 } else if (d
== m
) {
7377 m
= memcpy(&scratch
, m
, opr_sz
);
7380 for (i
= 0; i
< opr_sz
; i
+= 4) {
7384 pred
= pg
[H1(i
>> 3)] >> (i
& 7);
7386 uint32_t nn
= n
[H4(i
>> 2)];
7388 for (j
= 0; j
<= i
; j
+= 4) {
7389 pred
= pg
[H1(j
>> 3)] >> (j
& 7);
7390 if ((pred
& 1) && nn
== m
[H4(j
>> 2)]) {
7395 d
[H4(i
>> 2)] = count
;
7399 void HELPER(sve2_histcnt_d
)(void *vd
, void *vn
, void *vm
, void *vg
,
7402 ARMVectorReg scratch
;
7404 intptr_t opr_sz
= simd_oprsz(desc
);
7405 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
7409 n
= memcpy(&scratch
, n
, opr_sz
);
7413 } else if (d
== m
) {
7414 m
= memcpy(&scratch
, m
, opr_sz
);
7417 for (i
= 0; i
< opr_sz
/ 8; ++i
) {
7419 if (pg
[H1(i
)] & 1) {
7421 for (j
= 0; j
<= i
; ++j
) {
7422 if ((pg
[H1(j
)] & 1) && nn
== m
[j
]) {
7432 * Returns the number of bytes in m0 and m1 that match n.
7433 * Unlike do_match2 we don't just need true/false, we need an exact count.
7434 * This requires two extra logical operations.
7436 static inline uint64_t do_histseg_cnt(uint8_t n
, uint64_t m0
, uint64_t m1
)
7438 const uint64_t mask
= dup_const(MO_8
, 0x7f);
7439 uint64_t cmp0
, cmp1
;
7441 cmp1
= dup_const(MO_8
, n
);
7446 * 1: clear msb of each byte to avoid carry to next byte (& mask)
7447 * 2: carry in to msb if byte != 0 (+ mask)
7448 * 3: set msb if cmp has msb set (| cmp)
7449 * 4: set ~msb to ignore them (| mask)
7450 * We now have 0xff for byte != 0 or 0x7f for byte == 0.
7451 * 5: invert, resulting in 0x80 if and only if byte == 0.
7453 cmp0
= ~(((cmp0
& mask
) + mask
) | cmp0
| mask
);
7454 cmp1
= ~(((cmp1
& mask
) + mask
) | cmp1
| mask
);
7457 * Combine the two compares in a way that the bits do
7458 * not overlap, and so preserves the count of set bits.
7459 * If the host has an efficient instruction for ctpop,
7460 * then ctpop(x) + ctpop(y) has the same number of
7461 * operations as ctpop(x | (y >> 1)). If the host does
7462 * not have an efficient ctpop, then we only want to
7465 return ctpop64(cmp0
| (cmp1
>> 1));
7468 void HELPER(sve2_histseg
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
7471 intptr_t opr_sz
= simd_oprsz(desc
);
7473 for (i
= 0; i
< opr_sz
; i
+= 16) {
7474 uint64_t n0
= *(uint64_t *)(vn
+ i
);
7475 uint64_t m0
= *(uint64_t *)(vm
+ i
);
7476 uint64_t n1
= *(uint64_t *)(vn
+ i
+ 8);
7477 uint64_t m1
= *(uint64_t *)(vm
+ i
+ 8);
7481 for (j
= 0; j
< 64; j
+= 8) {
7482 uint64_t cnt0
= do_histseg_cnt(n0
>> j
, m0
, m1
);
7483 uint64_t cnt1
= do_histseg_cnt(n1
>> j
, m0
, m1
);
7488 *(uint64_t *)(vd
+ i
) = out0
;
7489 *(uint64_t *)(vd
+ i
+ 8) = out1
;
7493 void HELPER(sve2_xar_b
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
7495 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
7496 int shr
= simd_data(desc
);
7498 uint64_t mask
= dup_const(MO_8
, 0xff >> shr
);
7499 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
7501 for (i
= 0; i
< opr_sz
; ++i
) {
7502 uint64_t t
= n
[i
] ^ m
[i
];
7503 d
[i
] = ((t
>> shr
) & mask
) | ((t
<< shl
) & ~mask
);
7507 void HELPER(sve2_xar_h
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
7509 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 8;
7510 int shr
= simd_data(desc
);
7512 uint64_t mask
= dup_const(MO_16
, 0xffff >> shr
);
7513 uint64_t *d
= vd
, *n
= vn
, *m
= vm
;
7515 for (i
= 0; i
< opr_sz
; ++i
) {
7516 uint64_t t
= n
[i
] ^ m
[i
];
7517 d
[i
] = ((t
>> shr
) & mask
) | ((t
<< shl
) & ~mask
);
7521 void HELPER(sve2_xar_s
)(void *vd
, void *vn
, void *vm
, uint32_t desc
)
7523 intptr_t i
, opr_sz
= simd_oprsz(desc
) / 4;
7524 int shr
= simd_data(desc
);
7525 uint32_t *d
= vd
, *n
= vn
, *m
= vm
;
7527 for (i
= 0; i
< opr_sz
; ++i
) {
7528 d
[i
] = ror32(n
[i
] ^ m
[i
], shr
);
7532 void HELPER(fmmla_s
)(void *vd
, void *vn
, void *vm
, void *va
,
7533 void *status
, uint32_t desc
)
7535 intptr_t s
, opr_sz
= simd_oprsz(desc
) / (sizeof(float32
) * 4);
7537 for (s
= 0; s
< opr_sz
; ++s
) {
7538 float32
*n
= vn
+ s
* sizeof(float32
) * 4;
7539 float32
*m
= vm
+ s
* sizeof(float32
) * 4;
7540 float32
*a
= va
+ s
* sizeof(float32
) * 4;
7541 float32
*d
= vd
+ s
* sizeof(float32
) * 4;
7542 float32 n00
= n
[H4(0)], n01
= n
[H4(1)];
7543 float32 n10
= n
[H4(2)], n11
= n
[H4(3)];
7544 float32 m00
= m
[H4(0)], m01
= m
[H4(1)];
7545 float32 m10
= m
[H4(2)], m11
= m
[H4(3)];
7549 p0
= float32_mul(n00
, m00
, status
);
7550 p1
= float32_mul(n01
, m01
, status
);
7551 d
[H4(0)] = float32_add(a
[H4(0)], float32_add(p0
, p1
, status
), status
);
7554 p0
= float32_mul(n00
, m10
, status
);
7555 p1
= float32_mul(n01
, m11
, status
);
7556 d
[H4(1)] = float32_add(a
[H4(1)], float32_add(p0
, p1
, status
), status
);
7559 p0
= float32_mul(n10
, m00
, status
);
7560 p1
= float32_mul(n11
, m01
, status
);
7561 d
[H4(2)] = float32_add(a
[H4(2)], float32_add(p0
, p1
, status
), status
);
7564 p0
= float32_mul(n10
, m10
, status
);
7565 p1
= float32_mul(n11
, m11
, status
);
7566 d
[H4(3)] = float32_add(a
[H4(3)], float32_add(p0
, p1
, status
), status
);
7570 void HELPER(fmmla_d
)(void *vd
, void *vn
, void *vm
, void *va
,
7571 void *status
, uint32_t desc
)
7573 intptr_t s
, opr_sz
= simd_oprsz(desc
) / (sizeof(float64
) * 4);
7575 for (s
= 0; s
< opr_sz
; ++s
) {
7576 float64
*n
= vn
+ s
* sizeof(float64
) * 4;
7577 float64
*m
= vm
+ s
* sizeof(float64
) * 4;
7578 float64
*a
= va
+ s
* sizeof(float64
) * 4;
7579 float64
*d
= vd
+ s
* sizeof(float64
) * 4;
7580 float64 n00
= n
[0], n01
= n
[1], n10
= n
[2], n11
= n
[3];
7581 float64 m00
= m
[0], m01
= m
[1], m10
= m
[2], m11
= m
[3];
7585 p0
= float64_mul(n00
, m00
, status
);
7586 p1
= float64_mul(n01
, m01
, status
);
7587 d
[0] = float64_add(a
[0], float64_add(p0
, p1
, status
), status
);
7590 p0
= float64_mul(n00
, m10
, status
);
7591 p1
= float64_mul(n01
, m11
, status
);
7592 d
[1] = float64_add(a
[1], float64_add(p0
, p1
, status
), status
);
7595 p0
= float64_mul(n10
, m00
, status
);
7596 p1
= float64_mul(n11
, m01
, status
);
7597 d
[2] = float64_add(a
[2], float64_add(p0
, p1
, status
), status
);
7600 p0
= float64_mul(n10
, m10
, status
);
7601 p1
= float64_mul(n11
, m11
, status
);
7602 d
[3] = float64_add(a
[3], float64_add(p0
, p1
, status
), status
);
7606 #define DO_FCVTNT(NAME, TYPEW, TYPEN, HW, HN, OP) \
7607 void HELPER(NAME)(void *vd, void *vn, void *vg, void *status, uint32_t desc) \
7609 intptr_t i = simd_oprsz(desc); \
7612 uint64_t pg = g[(i - 1) >> 6]; \
7614 i -= sizeof(TYPEW); \
7615 if (likely((pg >> (i & 63)) & 1)) { \
7616 TYPEW nn = *(TYPEW *)(vn + HW(i)); \
7617 *(TYPEN *)(vd + HN(i + sizeof(TYPEN))) = OP(nn, status); \
7623 DO_FCVTNT(sve2_fcvtnt_sh
, uint32_t, uint16_t, H1_4
, H1_2
, sve_f32_to_f16
)
7624 DO_FCVTNT(sve2_fcvtnt_ds
, uint64_t, uint32_t, , H1_4
, float64_to_float32
)