target/arm/sve_helper.c (blame at "target/arm: Implement SVE reverse within elements")
1/*
2 * ARM SVE Operations
3 *
4 * Copyright (c) 2018 Linaro, Ltd.
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18 */
19
20#include "qemu/osdep.h"
21#include "cpu.h"
22#include "exec/exec-all.h"
23#include "exec/cpu_ldst.h"
24#include "exec/helper-proto.h"
25#include "tcg/tcg-gvec-desc.h"
26#include "fpu/softfloat.h"
27
28
29/* Note that vector data is stored in host-endian 64-bit chunks,
30   so addressing units smaller than that need a host-endian fixup. */
31#ifdef HOST_WORDS_BIGENDIAN
32#define H1(x) ((x) ^ 7)
33#define H1_2(x) ((x) ^ 6)
34#define H1_4(x) ((x) ^ 4)
35#define H2(x) ((x) ^ 3)
36#define H4(x) ((x) ^ 1)
37#else
38#define H1(x) (x)
39#define H1_2(x) (x)
40#define H1_4(x) (x)
41#define H2(x) (x)
42#define H4(x) (x)
43#endif
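/* For example, on a big-endian host byte element 0 lives at offset
 * H1(0) == 7 within its 64-bit chunk, halfword element 0 at index
 * H2(0) == 3 of a uint16_t view, and word element 0 at index H4(0) == 1
 * of a uint32_t view, so that element 0 always sits at the least
 * significant end of the chunk, as the rest of the code expects.
 */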
44
45/* Return a value for NZCV as per the ARM PredTest pseudofunction.
46 *
47 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
48 * and bit 0 set if C is set. Compare the definitions of these variables
49 * within CPUARMState.
50 */
51
52/* For no G bits set, NZCV = C. */
53#define PREDTEST_INIT 1
54
55/* This is an iterative function, called for each Pd and Pg word
56 * moving forward.
57 */
58static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
59{
60 if (likely(g)) {
61 /* Compute N from first D & G.
62 Use bit 2 to signal first G bit seen. */
63 if (!(flags & 4)) {
64 flags |= ((d & (g & -g)) != 0) << 31;
65 flags |= 4;
66 }
67
68 /* Accumulate Z from each D & G. */
69 flags |= ((d & g) != 0) << 1;
70
71 /* Compute C from last !(D & G). Replace previous. */
72 flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
73 }
74 return flags;
75}
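/* Worked example: with G = 0b0111 and D = 0b0101 the first active bit
 * (bit 0) is set in D, so N is set; D & G is nonzero, so Z is clear and
 * bit 1 is set; the last active bit (bit 2) is also set in D, so C ends
 * up clear.  Starting from PREDTEST_INIT the result is 0x80000006,
 * where bit 2 is the internal "first G bit seen" marker.
 */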
76
77/* The same for a single word predicate. */
78uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
79{
80 return iter_predtest_fwd(d, g, PREDTEST_INIT);
81}
82
83/* The same for a multi-word predicate. */
84uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
85{
86 uint32_t flags = PREDTEST_INIT;
87 uint64_t *d = vd, *g = vg;
88 uintptr_t i = 0;
89
90 do {
91 flags = iter_predtest_fwd(d[i], g[i], flags);
92 } while (++i < words);
93
94 return flags;
95}
96
97/* Expand active predicate bits to bytes, for byte elements.
98 * for (i = 0; i < 256; ++i) {
99 * unsigned long m = 0;
100 * for (j = 0; j < 8; j++) {
101 * if ((i >> j) & 1) {
102 * m |= 0xfful << (j << 3);
103 * }
104 * }
105 * printf("0x%016lx,\n", m);
106 * }
107 */
108static inline uint64_t expand_pred_b(uint8_t byte)
109{
110 static const uint64_t word[256] = {
111 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
112 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
113 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
114 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
115 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
116 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
117 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
118 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
119 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
120 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
121 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
122 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
123 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
124 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
125 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
126 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
127 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
128 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
129 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
130 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
131 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
132 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
133 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
134 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
135 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
136 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
137 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
138 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
139 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
140 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
141 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
142 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
143 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
144 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
145 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
146 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
147 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
148 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
149 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
150 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
151 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
152 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
153 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
154 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
155 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
156 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
157 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
158 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
159 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
160 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
161 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
162 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
163 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
164 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
165 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
166 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
167 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
168 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
169 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
170 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
171 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
172 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
173 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
174 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
175 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
176 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
177 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
178 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
179 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
180 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
181 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
182 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
183 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
184 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
185 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
186 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
187 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
188 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
189 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
190 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
191 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
192 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
193 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
194 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
195 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
196 0xffffffffffffffff,
197 };
198 return word[byte];
199}
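/* For example, expand_pred_b(0x21) == 0x0000ff00000000ff: predicate
 * bits 0 and 5 become all-ones masks in byte lanes 0 and 5.
 */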
200
201/* Similarly for half-word elements.
202 * for (i = 0; i < 256; ++i) {
203 * unsigned long m = 0;
204 * if (i & 0xaa) {
205 * continue;
206 * }
207 * for (j = 0; j < 8; j += 2) {
208 * if ((i >> j) & 1) {
209 * m |= 0xfffful << (j << 3);
210 * }
211 * }
212 * printf("[0x%x] = 0x%016lx,\n", i, m);
213 * }
214 */
215static inline uint64_t expand_pred_h(uint8_t byte)
216{
217 static const uint64_t word[] = {
218 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
219 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
220 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
221 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
222 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
223 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
224 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
225 [0x55] = 0xffffffffffffffff,
226 };
227 return word[byte & 0x55];
228}
229
230/* Similarly for single word elements. */
231static inline uint64_t expand_pred_s(uint8_t byte)
232{
233 static const uint64_t word[] = {
234 [0x01] = 0x00000000ffffffffull,
235 [0x10] = 0xffffffff00000000ull,
236 [0x11] = 0xffffffffffffffffull,
237 };
238 return word[byte & 0x11];
239}
240
241/* Swap 16-bit words within a 32-bit word. */
242static inline uint32_t hswap32(uint32_t h)
243{
244 return rol32(h, 16);
245}
246
247/* Swap 16-bit words within a 64-bit word. */
248static inline uint64_t hswap64(uint64_t h)
249{
250 uint64_t m = 0x0000ffff0000ffffull;
251 h = rol64(h, 32);
252 return ((h & m) << 16) | ((h >> 16) & m);
253}
254
255/* Swap 32-bit words within a 64-bit word. */
256static inline uint64_t wswap64(uint64_t h)
257{
258 return rol64(h, 32);
259}
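/* For example, hswap64(0x0001000200030004ull) == 0x0004000300020001ull,
 * while wswap64 merely exchanges the two 32-bit halves of its operand.
 */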
260
261#define LOGICAL_PPPP(NAME, FUNC) \
262void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
263{ \
264 uintptr_t opr_sz = simd_oprsz(desc); \
265 uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
266 uintptr_t i; \
267 for (i = 0; i < opr_sz / 8; ++i) { \
268 d[i] = FUNC(n[i], m[i], g[i]); \
269 } \
270}
271
272#define DO_AND(N, M, G) (((N) & (M)) & (G))
273#define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
274#define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
275#define DO_ORR(N, M, G) (((N) | (M)) & (G))
276#define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
277#define DO_NOR(N, M, G) (~((N) | (M)) & (G))
278#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
279#define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))
280
281LOGICAL_PPPP(sve_and_pppp, DO_AND)
282LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
283LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
284LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
285LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
286LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
287LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
288LOGICAL_PPPP(sve_nand_pppp, DO_NAND)
289
290#undef DO_AND
291#undef DO_BIC
292#undef DO_EOR
293#undef DO_ORR
294#undef DO_ORN
295#undef DO_NOR
296#undef DO_NAND
297#undef DO_SEL
298#undef LOGICAL_PPPP
299
300/* Fully general three-operand expander, controlled by a predicate.
301 * This is complicated by the host-endian storage of the register file.
302 */
303/* ??? I don't expect the compiler could ever vectorize this itself.
304 * With some tables we can convert bit masks to byte masks, and with
305 * extra care wrt byte/word ordering we could use gcc generic vectors
306 * and do 16 bytes at a time.
307 */
308#define DO_ZPZZ(NAME, TYPE, H, OP) \
309void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
310{ \
311 intptr_t i, opr_sz = simd_oprsz(desc); \
312 for (i = 0; i < opr_sz; ) { \
313 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
314 do { \
315 if (pg & 1) { \
316 TYPE nn = *(TYPE *)(vn + H(i)); \
317 TYPE mm = *(TYPE *)(vm + H(i)); \
318 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
319 } \
320 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
321 } while (i & 15); \
322 } \
323}
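/* The governing predicate has one bit per byte of vector data, so the
 * uint16_t loaded above covers the 16 data bytes handled by one pass of
 * the inner loop; advancing I and shifting PG by sizeof(TYPE) keeps bit 0
 * of PG aligned with the (only significant) low bit of the current
 * element's predicate field.
 */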
324
325/* Similarly, specialized for 64-bit operands. */
326#define DO_ZPZZ_D(NAME, TYPE, OP) \
327void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
328{ \
329 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
330 TYPE *d = vd, *n = vn, *m = vm; \
331 uint8_t *pg = vg; \
332 for (i = 0; i < opr_sz; i += 1) { \
333 if (pg[H1(i)] & 1) { \
334 TYPE nn = n[i], mm = m[i]; \
335 d[i] = OP(nn, mm); \
336 } \
337 } \
338}
339
340#define DO_AND(N, M) (N & M)
341#define DO_EOR(N, M) (N ^ M)
342#define DO_ORR(N, M) (N | M)
343#define DO_BIC(N, M) (N & ~M)
344#define DO_ADD(N, M) (N + M)
345#define DO_SUB(N, M) (N - M)
346#define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
347#define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
348#define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
349#define DO_MUL(N, M) (N * M)
350#define DO_DIV(N, M) (M ? N / M : 0)
351
352DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
353DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
354DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
355DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)
356
357DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
358DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
359DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
360DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)
361
362DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
363DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
364DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
365DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)
366
367DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
368DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
369DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
370DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)
371
372DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
373DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
374DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
375DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)
376
377DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
378DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
379DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
380DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)
381
382DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
383DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
384DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
385DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)
386
387DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
388DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
389DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
390DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)
391
392DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
393DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
394DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
395DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)
396
397DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
398DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
399DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
400DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)
401
402DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
403DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
404DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
405DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)
406
407DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
408DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
409DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
410DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)
411
412/* Because the computation type is at least twice as large as required,
413 these work for both signed and unsigned source types. */
414static inline uint8_t do_mulh_b(int32_t n, int32_t m)
415{
416 return (n * m) >> 8;
417}
418
419static inline uint16_t do_mulh_h(int32_t n, int32_t m)
420{
421 return (n * m) >> 16;
422}
423
424static inline uint32_t do_mulh_s(int64_t n, int64_t m)
425{
426 return (n * m) >> 32;
427}
428
429static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
430{
431 uint64_t lo, hi;
432 muls64(&lo, &hi, n, m);
433 return hi;
434}
435
436static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
437{
438 uint64_t lo, hi;
439 mulu64(&lo, &hi, n, m);
440 return hi;
441}
442
443DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
444DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
445DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
446DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)
447
448DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
449DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
450DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
451DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)
452
453DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
454DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
455DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
456DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)
457
458DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_DIV)
459DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_DIV)
460
461DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_DIV)
462DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_DIV)
463
464/* Note that all bits of the shift are significant
465 and not modulo the element size. */
466#define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
467#define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
468#define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
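/* For example, an 8-bit LSR or LSL by a count of 8 or more produces 0,
 * while an 8-bit ASR clamps the count to 7 and so replicates the sign
 * bit, matching the architectural result for over-wide shift amounts.
 */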
469
470DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
471DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1, DO_LSR)
472DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1, DO_LSL)
473
474DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1_2, DO_ASR)
475DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
476DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_2, DO_LSL)
477
478DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1_4, DO_ASR)
479DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_4, DO_LSR)
480DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
481
482DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
483DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
484DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
485
486#undef DO_ZPZZ
487#undef DO_ZPZZ_D
488
489/* Three-operand expander, controlled by a predicate, in which the
490 * third operand is "wide". That is, for D = N op M, the same 64-bit
491 * value of M is used with all of the narrower values of N.
492 */
493#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
494void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
495{ \
496 intptr_t i, opr_sz = simd_oprsz(desc); \
497 for (i = 0; i < opr_sz; ) { \
498 uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
499 TYPEW mm = *(TYPEW *)(vm + i); \
500 do { \
501 if (pg & 1) { \
502 TYPE nn = *(TYPE *)(vn + H(i)); \
503 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
504 } \
505 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
506 } while (i & 7); \
507 } \
508}
509
510DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
511DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
512DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)
513
514DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
515DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
516DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
517
518DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
519DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
520DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
521
522#undef DO_ZPZW
523
524/* Fully general two-operand expander, controlled by a predicate.
525 */
526#define DO_ZPZ(NAME, TYPE, H, OP) \
527void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
528{ \
529 intptr_t i, opr_sz = simd_oprsz(desc); \
530 for (i = 0; i < opr_sz; ) { \
531 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
532 do { \
533 if (pg & 1) { \
534 TYPE nn = *(TYPE *)(vn + H(i)); \
535 *(TYPE *)(vd + H(i)) = OP(nn); \
536 } \
537 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
538 } while (i & 15); \
539 } \
540}
541
542/* Similarly, specialized for 64-bit operands. */
543#define DO_ZPZ_D(NAME, TYPE, OP) \
544void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
545{ \
546 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
547 TYPE *d = vd, *n = vn; \
548 uint8_t *pg = vg; \
549 for (i = 0; i < opr_sz; i += 1) { \
550 if (pg[H1(i)] & 1) { \
551 TYPE nn = n[i]; \
552 d[i] = OP(nn); \
553 } \
554 } \
555}
556
557#define DO_CLS_B(N) (clrsb32(N) - 24)
558#define DO_CLS_H(N) (clrsb32(N) - 16)
559
560DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
561DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
562DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
563DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)
564
565#define DO_CLZ_B(N) (clz32(N) - 24)
566#define DO_CLZ_H(N) (clz32(N) - 16)
567
568DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
569DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
570DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
571DO_ZPZ_D(sve_clz_d, uint64_t, clz64)
572
573DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
574DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
575DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
576DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)
577
578#define DO_CNOT(N) (N == 0)
579
580DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
581DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
582DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
583DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)
584
585#define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))
586
587DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
588DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
589DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)
590
591#define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))
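/* Both expansions work on the raw IEEE-754 bit pattern: ((__typeof(N))-1 >> 1)
 * is a mask of every bit except the sign bit, so DO_FABS clears the sign
 * and DO_FNEG flips it, with no need to involve softfloat.
 */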
592
593DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
594DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
595DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)
596
597#define DO_NOT(N) (~N)
598
599DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
600DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
601DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
602DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)
603
604#define DO_SXTB(N) ((int8_t)N)
605#define DO_SXTH(N) ((int16_t)N)
606#define DO_SXTS(N) ((int32_t)N)
607#define DO_UXTB(N) ((uint8_t)N)
608#define DO_UXTH(N) ((uint16_t)N)
609#define DO_UXTS(N) ((uint32_t)N)
610
611DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
612DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
613DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
614DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
615DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
616DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)
617
618DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
619DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
620DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
621DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
622DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
623DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)
624
625#define DO_ABS(N) (N < 0 ? -N : N)
626
627DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
628DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
629DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
630DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)
631
632#define DO_NEG(N) (-N)
633
634DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
635DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
636DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
637DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)
638
639DO_ZPZ(sve_revb_h, uint16_t, H1_2, bswap16)
640DO_ZPZ(sve_revb_s, uint32_t, H1_4, bswap32)
641DO_ZPZ_D(sve_revb_d, uint64_t, bswap64)
642
643DO_ZPZ(sve_revh_s, uint32_t, H1_4, hswap32)
644DO_ZPZ_D(sve_revh_d, uint64_t, hswap64)
645
646DO_ZPZ_D(sve_revw_d, uint64_t, wswap64)
647
648DO_ZPZ(sve_rbit_b, uint8_t, H1, revbit8)
649DO_ZPZ(sve_rbit_h, uint16_t, H1_2, revbit16)
650DO_ZPZ(sve_rbit_s, uint32_t, H1_4, revbit32)
651DO_ZPZ_D(sve_rbit_d, uint64_t, revbit64)
652
653/* Three-operand expander, unpredicated, in which the third operand is "wide".
654 */
655#define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
656void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
657{ \
658 intptr_t i, opr_sz = simd_oprsz(desc); \
659 for (i = 0; i < opr_sz; ) { \
660 TYPEW mm = *(TYPEW *)(vm + i); \
661 do { \
662 TYPE nn = *(TYPE *)(vn + H(i)); \
663 *(TYPE *)(vd + H(i)) = OP(nn, mm); \
664 i += sizeof(TYPE); \
665 } while (i & 7); \
666 } \
667}
668
669DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
670DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
671DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)
672
673DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
674DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
675DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)
676
677DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
678DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
679DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)
680
681#undef DO_ZZW
682
683#undef DO_CLS_B
684#undef DO_CLS_H
685#undef DO_CLZ_B
686#undef DO_CLZ_H
687#undef DO_CNOT
688#undef DO_FABS
689#undef DO_FNEG
690#undef DO_ABS
691#undef DO_NEG
692#undef DO_ZPZ
693#undef DO_ZPZ_D
694
695/* Two-operand reduction expander, controlled by a predicate.
696 * The difference between TYPERED and TYPERET has to do with
697 * sign-extension. E.g. for SMAX, TYPERED must be signed,
698 * but TYPERET must be unsigned so that e.g. a 32-bit value
699 * is not sign-extended to the ABI uint64_t return type.
700 */
701/* ??? If we were to vectorize this by hand the reduction ordering
702 * would change. For integer operands, this is perfectly fine.
703 */
704#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
705uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
706{ \
707 intptr_t i, opr_sz = simd_oprsz(desc); \
708 TYPERED ret = INIT; \
709 for (i = 0; i < opr_sz; ) { \
710 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
711 do { \
712 if (pg & 1) { \
713 TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
714 ret = OP(ret, nn); \
715 } \
716 i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
717 } while (i & 15); \
718 } \
719 return (TYPERET)ret; \
720}
721
722#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
723uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
724{ \
725 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
726 TYPEE *n = vn; \
727 uint8_t *pg = vg; \
728 TYPER ret = INIT; \
729 for (i = 0; i < opr_sz; i += 1) { \
730 if (pg[H1(i)] & 1) { \
731 TYPEE nn = n[i]; \
732 ret = OP(ret, nn); \
733 } \
734 } \
735 return ret; \
736}
737
738DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
739DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
740DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
741DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)
742
743DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
744DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
745DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
746DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)
747
748DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
749DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
750DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
751DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)
752
753DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
754DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
755DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
756
757DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
758DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
759DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
760DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)
761
762DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
763DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
764DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
765DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)
766
767DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
768DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
769DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
770DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)
771
772DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
773DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
774DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
775DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)
776
777DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
778DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
779DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
780DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)
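/* In each case INIT is the identity value of the operation (0 for ORR, EOR,
 * ADD and unsigned MAX; all-ones for AND and unsigned MIN; the most negative
 * or most positive representable value for signed MAX and MIN respectively),
 * so inactive elements cannot affect the result.
 */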
781
782#undef DO_VPZ
783#undef DO_VPZ_D
784
785#undef DO_AND
786#undef DO_ORR
787#undef DO_EOR
788#undef DO_BIC
789#undef DO_ADD
790#undef DO_SUB
791#undef DO_MAX
792#undef DO_MIN
793#undef DO_ABD
794#undef DO_MUL
795#undef DO_DIV
796#undef DO_ASR
797#undef DO_LSR
798#undef DO_LSL
799
800/* Similar to the ARM LastActiveElement pseudocode function, except the
801 result is multiplied by the element size. This includes the not found
802 indication; e.g. not found for esz=3 is -8. */
803static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
804{
805 uint64_t mask = pred_esz_masks[esz];
806 intptr_t i = words;
807
808 do {
809 uint64_t this_g = g[--i] & mask;
810 if (this_g) {
811 return i * 64 + (63 - clz64(this_g));
812 }
813 } while (i > 0);
814 return (intptr_t)-1 << esz;
815}
816
817uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t words)
818{
819 uint32_t flags = PREDTEST_INIT;
820 uint64_t *d = vd, *g = vg;
821 intptr_t i = 0;
822
823 do {
824 uint64_t this_d = d[i];
825 uint64_t this_g = g[i];
826
827 if (this_g) {
828 if (!(flags & 4)) {
829 /* Set in D the first bit of G. */
830 this_d |= this_g & -this_g;
831 d[i] = this_d;
832 }
833 flags = iter_predtest_fwd(this_d, this_g, flags);
834 }
835 } while (++i < words);
836
837 return flags;
838}
839
840uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
841{
842 intptr_t words = extract32(pred_desc, 0, SIMD_OPRSZ_BITS);
843 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
844 uint32_t flags = PREDTEST_INIT;
845 uint64_t *d = vd, *g = vg, esz_mask;
846 intptr_t i, next;
847
848 next = last_active_element(vd, words, esz) + (1 << esz);
849 esz_mask = pred_esz_masks[esz];
850
851 /* Similar to the pseudocode for pnext, but scaled by ESZ
852 so that we find the correct bit. */
853 if (next < words * 64) {
854 uint64_t mask = -1;
855
856 if (next & 63) {
857 mask = ~((1ull << (next & 63)) - 1);
858 next &= -64;
859 }
860 do {
861 uint64_t this_g = g[next / 64] & esz_mask & mask;
862 if (this_g != 0) {
863 next = (next & -64) + ctz64(this_g);
864 break;
865 }
866 next += 64;
867 mask = -1;
868 } while (next < words * 64);
869 }
870
871 i = 0;
872 do {
873 uint64_t this_d = 0;
874 if (i == next / 64) {
875 this_d = 1ull << (next & 63);
876 }
877 d[i] = this_d;
878 flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
879 } while (++i < words);
880
881 return flags;
882}
883
884/* Store zero into every active element of Zd. We will use this for two
885 * and three-operand predicated instructions for which logic dictates a
886 * zero result. In particular, logical shift by element size, which is
887 * otherwise undefined on the host.
888 *
889 * For element sizes smaller than uint64_t, we use tables to expand
890 * the N bits of the controlling predicate to a byte mask, and clear
891 * those bytes.
892 */
893void HELPER(sve_clr_b)(void *vd, void *vg, uint32_t desc)
894{
895 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
896 uint64_t *d = vd;
897 uint8_t *pg = vg;
898 for (i = 0; i < opr_sz; i += 1) {
899 d[i] &= ~expand_pred_b(pg[H1(i)]);
900 }
901}
902
903void HELPER(sve_clr_h)(void *vd, void *vg, uint32_t desc)
904{
905 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
906 uint64_t *d = vd;
907 uint8_t *pg = vg;
908 for (i = 0; i < opr_sz; i += 1) {
909 d[i] &= ~expand_pred_h(pg[H1(i)]);
910 }
911}
912
913void HELPER(sve_clr_s)(void *vd, void *vg, uint32_t desc)
914{
915 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
916 uint64_t *d = vd;
917 uint8_t *pg = vg;
918 for (i = 0; i < opr_sz; i += 1) {
919 d[i] &= ~expand_pred_s(pg[H1(i)]);
920 }
921}
922
923void HELPER(sve_clr_d)(void *vd, void *vg, uint32_t desc)
924{
925 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
926 uint64_t *d = vd;
927 uint8_t *pg = vg;
928 for (i = 0; i < opr_sz; i += 1) {
929 if (pg[H1(i)] & 1) {
930 d[i] = 0;
931 }
932 }
933}
934
935/* Three-operand expander, immediate operand, controlled by a predicate.
936 */
937#define DO_ZPZI(NAME, TYPE, H, OP) \
938void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
939{ \
940 intptr_t i, opr_sz = simd_oprsz(desc); \
941 TYPE imm = simd_data(desc); \
942 for (i = 0; i < opr_sz; ) { \
943 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
944 do { \
945 if (pg & 1) { \
946 TYPE nn = *(TYPE *)(vn + H(i)); \
947 *(TYPE *)(vd + H(i)) = OP(nn, imm); \
948 } \
949 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
950 } while (i & 15); \
951 } \
952}
953
954/* Similarly, specialized for 64-bit operands. */
955#define DO_ZPZI_D(NAME, TYPE, OP) \
956void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
957{ \
958 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
959 TYPE *d = vd, *n = vn; \
960 TYPE imm = simd_data(desc); \
961 uint8_t *pg = vg; \
962 for (i = 0; i < opr_sz; i += 1) { \
963 if (pg[H1(i)] & 1) { \
964 TYPE nn = n[i]; \
965 d[i] = OP(nn, imm); \
966 } \
967 } \
968}
969
970#define DO_SHR(N, M) (N >> M)
971#define DO_SHL(N, M) (N << M)
972
973/* Arithmetic shift right for division. This rounds negative numbers
974 toward zero as per signed division. Therefore before shifting,
975 when N is negative, add 2**M-1. */
976#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
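/* For example, DO_ASRD(-7, 2) adds 3 before shifting and yields -1, whereas
 * a plain arithmetic shift would yield -2; this matches the truncating
 * division -7 / 4 == -1.
 */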
977
978DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
979DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
980DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
981DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
982
983DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
984DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
985DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
986DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
987
988DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
989DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
990DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
991DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
992
993DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
994DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
995DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
996DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
997
998#undef DO_SHR
999#undef DO_SHL
1000#undef DO_ASRD
1001#undef DO_ZPZI
1002#undef DO_ZPZI_D
1003
1004/* Fully general four-operand expander, controlled by a predicate.
1005 */
1006#define DO_ZPZZZ(NAME, TYPE, H, OP) \
1007void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1008 void *vg, uint32_t desc) \
1009{ \
1010 intptr_t i, opr_sz = simd_oprsz(desc); \
1011 for (i = 0; i < opr_sz; ) { \
1012 uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
1013 do { \
1014 if (pg & 1) { \
1015 TYPE nn = *(TYPE *)(vn + H(i)); \
1016 TYPE mm = *(TYPE *)(vm + H(i)); \
1017 TYPE aa = *(TYPE *)(va + H(i)); \
1018 *(TYPE *)(vd + H(i)) = OP(aa, nn, mm); \
1019 } \
1020 i += sizeof(TYPE), pg >>= sizeof(TYPE); \
1021 } while (i & 15); \
1022 } \
1023}
1024
1025/* Similarly, specialized for 64-bit operands. */
1026#define DO_ZPZZZ_D(NAME, TYPE, OP) \
1027void HELPER(NAME)(void *vd, void *va, void *vn, void *vm, \
1028 void *vg, uint32_t desc) \
1029{ \
1030 intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
1031 TYPE *d = vd, *a = va, *n = vn, *m = vm; \
1032 uint8_t *pg = vg; \
1033 for (i = 0; i < opr_sz; i += 1) { \
1034 if (pg[H1(i)] & 1) { \
1035 TYPE aa = a[i], nn = n[i], mm = m[i]; \
1036 d[i] = OP(aa, nn, mm); \
1037 } \
1038 } \
1039}
1040
1041#define DO_MLA(A, N, M) (A + N * M)
1042#define DO_MLS(A, N, M) (A - N * M)
1043
1044DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
1045DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
1046
1047DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
1048DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
1049
1050DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
1051DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
1052
1053DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
1054DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
1055
1056#undef DO_MLA
1057#undef DO_MLS
1058#undef DO_ZPZZZ
1059#undef DO_ZPZZZ_D
1060
1061void HELPER(sve_index_b)(void *vd, uint32_t start,
1062 uint32_t incr, uint32_t desc)
1063{
1064 intptr_t i, opr_sz = simd_oprsz(desc);
1065 uint8_t *d = vd;
1066 for (i = 0; i < opr_sz; i += 1) {
1067 d[H1(i)] = start + i * incr;
1068 }
1069}
1070
1071void HELPER(sve_index_h)(void *vd, uint32_t start,
1072 uint32_t incr, uint32_t desc)
1073{
1074 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1075 uint16_t *d = vd;
1076 for (i = 0; i < opr_sz; i += 1) {
1077 d[H2(i)] = start + i * incr;
1078 }
1079}
1080
1081void HELPER(sve_index_s)(void *vd, uint32_t start,
1082 uint32_t incr, uint32_t desc)
1083{
1084 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1085 uint32_t *d = vd;
1086 for (i = 0; i < opr_sz; i += 1) {
1087 d[H4(i)] = start + i * incr;
1088 }
1089}
1090
1091void HELPER(sve_index_d)(void *vd, uint64_t start,
1092 uint64_t incr, uint32_t desc)
1093{
1094 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1095 uint64_t *d = vd;
1096 for (i = 0; i < opr_sz; i += 1) {
1097 d[i] = start + i * incr;
1098 }
1099}
1100
1101void HELPER(sve_adr_p32)(void *vd, void *vn, void *vm, uint32_t desc)
1102{
1103 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1104 uint32_t sh = simd_data(desc);
1105 uint32_t *d = vd, *n = vn, *m = vm;
1106 for (i = 0; i < opr_sz; i += 1) {
1107 d[i] = n[i] + (m[i] << sh);
1108 }
1109}
1110
1111void HELPER(sve_adr_p64)(void *vd, void *vn, void *vm, uint32_t desc)
1112{
1113 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1114 uint64_t sh = simd_data(desc);
1115 uint64_t *d = vd, *n = vn, *m = vm;
1116 for (i = 0; i < opr_sz; i += 1) {
1117 d[i] = n[i] + (m[i] << sh);
1118 }
1119}
1120
1121void HELPER(sve_adr_s32)(void *vd, void *vn, void *vm, uint32_t desc)
1122{
1123 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1124 uint64_t sh = simd_data(desc);
1125 uint64_t *d = vd, *n = vn, *m = vm;
1126 for (i = 0; i < opr_sz; i += 1) {
1127 d[i] = n[i] + ((uint64_t)(int32_t)m[i] << sh);
1128 }
1129}
1130
1131void HELPER(sve_adr_u32)(void *vd, void *vn, void *vm, uint32_t desc)
1132{
1133 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1134 uint64_t sh = simd_data(desc);
1135 uint64_t *d = vd, *n = vn, *m = vm;
1136 for (i = 0; i < opr_sz; i += 1) {
1137 d[i] = n[i] + ((uint64_t)(uint32_t)m[i] << sh);
1138 }
1139}
1140
1141void HELPER(sve_fexpa_h)(void *vd, void *vn, uint32_t desc)
1142{
1143 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1144 static const uint16_t coeff[] = {
1145 0x0000, 0x0016, 0x002d, 0x0045, 0x005d, 0x0075, 0x008e, 0x00a8,
1146 0x00c2, 0x00dc, 0x00f8, 0x0114, 0x0130, 0x014d, 0x016b, 0x0189,
1147 0x01a8, 0x01c8, 0x01e8, 0x0209, 0x022b, 0x024e, 0x0271, 0x0295,
1148 0x02ba, 0x02e0, 0x0306, 0x032e, 0x0356, 0x037f, 0x03a9, 0x03d4,
1149 };
1150 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1151 uint16_t *d = vd, *n = vn;
1152
1153 for (i = 0; i < opr_sz; i++) {
1154 uint16_t nn = n[i];
1155 intptr_t idx = extract32(nn, 0, 5);
1156 uint16_t exp = extract32(nn, 5, 5);
1157 d[i] = coeff[idx] | (exp << 10);
1158 }
1159}
1160
1161void HELPER(sve_fexpa_s)(void *vd, void *vn, uint32_t desc)
1162{
1163 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1164 static const uint32_t coeff[] = {
1165 0x000000, 0x0164d2, 0x02cd87, 0x043a29,
1166 0x05aac3, 0x071f62, 0x08980f, 0x0a14d5,
1167 0x0b95c2, 0x0d1adf, 0x0ea43a, 0x1031dc,
1168 0x11c3d3, 0x135a2b, 0x14f4f0, 0x16942d,
1169 0x1837f0, 0x19e046, 0x1b8d3a, 0x1d3eda,
1170 0x1ef532, 0x20b051, 0x227043, 0x243516,
1171 0x25fed7, 0x27cd94, 0x29a15b, 0x2b7a3a,
1172 0x2d583f, 0x2f3b79, 0x3123f6, 0x3311c4,
1173 0x3504f3, 0x36fd92, 0x38fbaf, 0x3aff5b,
1174 0x3d08a4, 0x3f179a, 0x412c4d, 0x4346cd,
1175 0x45672a, 0x478d75, 0x49b9be, 0x4bec15,
1176 0x4e248c, 0x506334, 0x52a81e, 0x54f35b,
1177 0x5744fd, 0x599d16, 0x5bfbb8, 0x5e60f5,
1178 0x60ccdf, 0x633f89, 0x65b907, 0x68396a,
1179 0x6ac0c7, 0x6d4f30, 0x6fe4ba, 0x728177,
1180 0x75257d, 0x77d0df, 0x7a83b3, 0x7d3e0c,
1181 };
1182 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1183 uint32_t *d = vd, *n = vn;
1184
1185 for (i = 0; i < opr_sz; i++) {
1186 uint32_t nn = n[i];
1187 intptr_t idx = extract32(nn, 0, 6);
1188 uint32_t exp = extract32(nn, 6, 8);
1189 d[i] = coeff[idx] | (exp << 23);
1190 }
1191}
1192
1193void HELPER(sve_fexpa_d)(void *vd, void *vn, uint32_t desc)
1194{
1195 /* These constants are cut-and-paste directly from the ARM pseudocode. */
1196 static const uint64_t coeff[] = {
1197 0x0000000000000ull, 0x02C9A3E778061ull, 0x059B0D3158574ull,
1198 0x0874518759BC8ull, 0x0B5586CF9890Full, 0x0E3EC32D3D1A2ull,
1199 0x11301D0125B51ull, 0x1429AAEA92DE0ull, 0x172B83C7D517Bull,
1200 0x1A35BEB6FCB75ull, 0x1D4873168B9AAull, 0x2063B88628CD6ull,
1201 0x2387A6E756238ull, 0x26B4565E27CDDull, 0x29E9DF51FDEE1ull,
1202 0x2D285A6E4030Bull, 0x306FE0A31B715ull, 0x33C08B26416FFull,
1203 0x371A7373AA9CBull, 0x3A7DB34E59FF7ull, 0x3DEA64C123422ull,
1204 0x4160A21F72E2Aull, 0x44E086061892Dull, 0x486A2B5C13CD0ull,
1205 0x4BFDAD5362A27ull, 0x4F9B2769D2CA7ull, 0x5342B569D4F82ull,
1206 0x56F4736B527DAull, 0x5AB07DD485429ull, 0x5E76F15AD2148ull,
1207 0x6247EB03A5585ull, 0x6623882552225ull, 0x6A09E667F3BCDull,
1208 0x6DFB23C651A2Full, 0x71F75E8EC5F74ull, 0x75FEB564267C9ull,
1209 0x7A11473EB0187ull, 0x7E2F336CF4E62ull, 0x82589994CCE13ull,
1210 0x868D99B4492EDull, 0x8ACE5422AA0DBull, 0x8F1AE99157736ull,
1211 0x93737B0CDC5E5ull, 0x97D829FDE4E50ull, 0x9C49182A3F090ull,
1212 0xA0C667B5DE565ull, 0xA5503B23E255Dull, 0xA9E6B5579FDBFull,
1213 0xAE89F995AD3ADull, 0xB33A2B84F15FBull, 0xB7F76F2FB5E47ull,
1214 0xBCC1E904BC1D2ull, 0xC199BDD85529Cull, 0xC67F12E57D14Bull,
1215 0xCB720DCEF9069ull, 0xD072D4A07897Cull, 0xD5818DCFBA487ull,
1216 0xDA9E603DB3285ull, 0xDFC97337B9B5Full, 0xE502EE78B3FF6ull,
1217 0xEA4AFA2A490DAull, 0xEFA1BEE615A27ull, 0xF50765B6E4540ull,
1218 0xFA7C1819E90D8ull,
1219 };
1220 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1221 uint64_t *d = vd, *n = vn;
1222
1223 for (i = 0; i < opr_sz; i++) {
1224 uint64_t nn = n[i];
1225 intptr_t idx = extract32(nn, 0, 6);
1226 uint64_t exp = extract32(nn, 6, 11);
1227 d[i] = coeff[idx] | (exp << 52);
1228 }
1229}
1230
1231void HELPER(sve_ftssel_h)(void *vd, void *vn, void *vm, uint32_t desc)
1232{
1233 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1234 uint16_t *d = vd, *n = vn, *m = vm;
1235 for (i = 0; i < opr_sz; i += 1) {
1236 uint16_t nn = n[i];
1237 uint16_t mm = m[i];
1238 if (mm & 1) {
1239 nn = float16_one;
1240 }
1241 d[i] = nn ^ (mm & 2) << 14;
1242 }
1243}
1244
1245void HELPER(sve_ftssel_s)(void *vd, void *vn, void *vm, uint32_t desc)
1246{
1247 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1248 uint32_t *d = vd, *n = vn, *m = vm;
1249 for (i = 0; i < opr_sz; i += 1) {
1250 uint32_t nn = n[i];
1251 uint32_t mm = m[i];
1252 if (mm & 1) {
1253 nn = float32_one;
1254 }
1255 d[i] = nn ^ (mm & 2) << 30;
1256 }
1257}
1258
1259void HELPER(sve_ftssel_d)(void *vd, void *vn, void *vm, uint32_t desc)
1260{
1261 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1262 uint64_t *d = vd, *n = vn, *m = vm;
1263 for (i = 0; i < opr_sz; i += 1) {
1264 uint64_t nn = n[i];
1265 uint64_t mm = m[i];
1266 if (mm & 1) {
1267 nn = float64_one;
1268 }
1269 d[i] = nn ^ (mm & 2) << 62;
1270 }
1271}
1272
1273/*
1274 * Signed saturating addition with scalar operand.
1275 */
1276
1277void HELPER(sve_sqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1278{
1279 intptr_t i, oprsz = simd_oprsz(desc);
1280
1281 for (i = 0; i < oprsz; i += sizeof(int8_t)) {
1282 int r = *(int8_t *)(a + i) + b;
1283 if (r > INT8_MAX) {
1284 r = INT8_MAX;
1285 } else if (r < INT8_MIN) {
1286 r = INT8_MIN;
1287 }
1288 *(int8_t *)(d + i) = r;
1289 }
1290}
1291
1292void HELPER(sve_sqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1293{
1294 intptr_t i, oprsz = simd_oprsz(desc);
1295
1296 for (i = 0; i < oprsz; i += sizeof(int16_t)) {
1297 int r = *(int16_t *)(a + i) + b;
1298 if (r > INT16_MAX) {
1299 r = INT16_MAX;
1300 } else if (r < INT16_MIN) {
1301 r = INT16_MIN;
1302 }
1303 *(int16_t *)(d + i) = r;
1304 }
1305}
1306
1307void HELPER(sve_sqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1308{
1309 intptr_t i, oprsz = simd_oprsz(desc);
1310
1311 for (i = 0; i < oprsz; i += sizeof(int32_t)) {
1312 int64_t r = *(int32_t *)(a + i) + b;
1313 if (r > INT32_MAX) {
1314 r = INT32_MAX;
1315 } else if (r < INT32_MIN) {
1316 r = INT32_MIN;
1317 }
1318 *(int32_t *)(d + i) = r;
1319 }
1320}
1321
1322void HELPER(sve_sqaddi_d)(void *d, void *a, int64_t b, uint32_t desc)
1323{
1324 intptr_t i, oprsz = simd_oprsz(desc);
1325
1326 for (i = 0; i < oprsz; i += sizeof(int64_t)) {
1327 int64_t ai = *(int64_t *)(a + i);
1328 int64_t r = ai + b;
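        /* Detect signed overflow without widening: the sum overflowed iff
         * ai and b have the same sign while r has the opposite sign, i.e.
         * the sign bit of (r ^ ai) & ~(ai ^ b) is set.
         */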
1329 if (((r ^ ai) & ~(ai ^ b)) < 0) {
1330 /* Signed overflow. */
1331 r = (r < 0 ? INT64_MAX : INT64_MIN);
1332 }
1333 *(int64_t *)(d + i) = r;
1334 }
1335}
1336
1337/*
1338 * Unsigned saturating addition with scalar operand.
1339 */
1340
1341void HELPER(sve_uqaddi_b)(void *d, void *a, int32_t b, uint32_t desc)
1342{
1343 intptr_t i, oprsz = simd_oprsz(desc);
1344
1345 for (i = 0; i < oprsz; i += sizeof(uint8_t)) {
1346 int r = *(uint8_t *)(a + i) + b;
1347 if (r > UINT8_MAX) {
1348 r = UINT8_MAX;
1349 } else if (r < 0) {
1350 r = 0;
1351 }
1352 *(uint8_t *)(d + i) = r;
1353 }
1354}
1355
1356void HELPER(sve_uqaddi_h)(void *d, void *a, int32_t b, uint32_t desc)
1357{
1358 intptr_t i, oprsz = simd_oprsz(desc);
1359
1360 for (i = 0; i < oprsz; i += sizeof(uint16_t)) {
1361 int r = *(uint16_t *)(a + i) + b;
1362 if (r > UINT16_MAX) {
1363 r = UINT16_MAX;
1364 } else if (r < 0) {
1365 r = 0;
1366 }
1367 *(uint16_t *)(d + i) = r;
1368 }
1369}
1370
1371void HELPER(sve_uqaddi_s)(void *d, void *a, int64_t b, uint32_t desc)
1372{
1373 intptr_t i, oprsz = simd_oprsz(desc);
1374
1375 for (i = 0; i < oprsz; i += sizeof(uint32_t)) {
1376 int64_t r = *(uint32_t *)(a + i) + b;
1377 if (r > UINT32_MAX) {
1378 r = UINT32_MAX;
1379 } else if (r < 0) {
1380 r = 0;
1381 }
1382 *(uint32_t *)(d + i) = r;
1383 }
1384}
1385
1386void HELPER(sve_uqaddi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1387{
1388 intptr_t i, oprsz = simd_oprsz(desc);
1389
1390 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1391 uint64_t r = *(uint64_t *)(a + i) + b;
1392 if (r < b) {
1393 r = UINT64_MAX;
1394 }
1395 *(uint64_t *)(d + i) = r;
1396 }
1397}
1398
1399void HELPER(sve_uqsubi_d)(void *d, void *a, uint64_t b, uint32_t desc)
1400{
1401 intptr_t i, oprsz = simd_oprsz(desc);
1402
1403 for (i = 0; i < oprsz; i += sizeof(uint64_t)) {
1404 uint64_t ai = *(uint64_t *)(a + i);
1405 *(uint64_t *)(d + i) = (ai < b ? 0 : ai - b);
1406 }
1407}
1408
1409/* Two operand predicated copy immediate with merge. All valid immediates
1410 * can fit within 17 signed bits in the simd_data field.
1411 */
1412void HELPER(sve_cpy_m_b)(void *vd, void *vn, void *vg,
1413 uint64_t mm, uint32_t desc)
1414{
1415 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1416 uint64_t *d = vd, *n = vn;
1417 uint8_t *pg = vg;
1418
1419 mm = dup_const(MO_8, mm);
1420 for (i = 0; i < opr_sz; i += 1) {
1421 uint64_t nn = n[i];
1422 uint64_t pp = expand_pred_b(pg[H1(i)]);
1423 d[i] = (mm & pp) | (nn & ~pp);
1424 }
1425}
1426
1427void HELPER(sve_cpy_m_h)(void *vd, void *vn, void *vg,
1428 uint64_t mm, uint32_t desc)
1429{
1430 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1431 uint64_t *d = vd, *n = vn;
1432 uint8_t *pg = vg;
1433
1434 mm = dup_const(MO_16, mm);
1435 for (i = 0; i < opr_sz; i += 1) {
1436 uint64_t nn = n[i];
1437 uint64_t pp = expand_pred_h(pg[H1(i)]);
1438 d[i] = (mm & pp) | (nn & ~pp);
1439 }
1440}
1441
1442void HELPER(sve_cpy_m_s)(void *vd, void *vn, void *vg,
1443 uint64_t mm, uint32_t desc)
1444{
1445 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1446 uint64_t *d = vd, *n = vn;
1447 uint8_t *pg = vg;
1448
1449 mm = dup_const(MO_32, mm);
1450 for (i = 0; i < opr_sz; i += 1) {
1451 uint64_t nn = n[i];
1452 uint64_t pp = expand_pred_s(pg[H1(i)]);
1453 d[i] = (mm & pp) | (nn & ~pp);
1454 }
1455}
1456
1457void HELPER(sve_cpy_m_d)(void *vd, void *vn, void *vg,
1458 uint64_t mm, uint32_t desc)
1459{
1460 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1461 uint64_t *d = vd, *n = vn;
1462 uint8_t *pg = vg;
1463
1464 for (i = 0; i < opr_sz; i += 1) {
1465 uint64_t nn = n[i];
1466 d[i] = (pg[H1(i)] & 1 ? mm : nn);
1467 }
1468}
1469
1470void HELPER(sve_cpy_z_b)(void *vd, void *vg, uint64_t val, uint32_t desc)
1471{
1472 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1473 uint64_t *d = vd;
1474 uint8_t *pg = vg;
1475
1476 val = dup_const(MO_8, val);
1477 for (i = 0; i < opr_sz; i += 1) {
1478 d[i] = val & expand_pred_b(pg[H1(i)]);
1479 }
1480}
1481
1482void HELPER(sve_cpy_z_h)(void *vd, void *vg, uint64_t val, uint32_t desc)
1483{
1484 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1485 uint64_t *d = vd;
1486 uint8_t *pg = vg;
1487
1488 val = dup_const(MO_16, val);
1489 for (i = 0; i < opr_sz; i += 1) {
1490 d[i] = val & expand_pred_h(pg[H1(i)]);
1491 }
1492}
1493
1494void HELPER(sve_cpy_z_s)(void *vd, void *vg, uint64_t val, uint32_t desc)
1495{
1496 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1497 uint64_t *d = vd;
1498 uint8_t *pg = vg;
1499
1500 val = dup_const(MO_32, val);
1501 for (i = 0; i < opr_sz; i += 1) {
1502 d[i] = val & expand_pred_s(pg[H1(i)]);
1503 }
1504}
1505
1506void HELPER(sve_cpy_z_d)(void *vd, void *vg, uint64_t val, uint32_t desc)
1507{
1508 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1509 uint64_t *d = vd;
1510 uint8_t *pg = vg;
1511
1512 for (i = 0; i < opr_sz; i += 1) {
1513 d[i] = (pg[H1(i)] & 1 ? val : 0);
1514 }
1515}
1516
1517/* Big-endian hosts need to frob the byte indices.  If the copy
1518 * happens to be 8-byte aligned, then no frobbing is necessary.
1519 */
1520static void swap_memmove(void *vd, void *vs, size_t n)
1521{
1522 uintptr_t d = (uintptr_t)vd;
1523 uintptr_t s = (uintptr_t)vs;
1524 uintptr_t o = (d | s | n) & 7;
1525 size_t i;
1526
1527#ifndef HOST_WORDS_BIGENDIAN
1528 o = 0;
1529#endif
1530 switch (o) {
1531 case 0:
1532 memmove(vd, vs, n);
1533 break;
1534
1535 case 4:
1536 if (d < s || d >= s + n) {
1537 for (i = 0; i < n; i += 4) {
1538 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1539 }
1540 } else {
1541 for (i = n; i > 0; ) {
1542 i -= 4;
1543 *(uint32_t *)H1_4(d + i) = *(uint32_t *)H1_4(s + i);
1544 }
1545 }
1546 break;
1547
1548 case 2:
1549 case 6:
1550 if (d < s || d >= s + n) {
1551 for (i = 0; i < n; i += 2) {
1552 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1553 }
1554 } else {
1555 for (i = n; i > 0; ) {
1556 i -= 2;
1557 *(uint16_t *)H1_2(d + i) = *(uint16_t *)H1_2(s + i);
1558 }
1559 }
1560 break;
1561
1562 default:
1563 if (d < s || d >= s + n) {
1564 for (i = 0; i < n; i++) {
1565 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1566 }
1567 } else {
1568 for (i = n; i > 0; ) {
1569 i -= 1;
1570 *(uint8_t *)H1(d + i) = *(uint8_t *)H1(s + i);
1571 }
1572 }
1573 break;
1574 }
1575}
1576
1577void HELPER(sve_ext)(void *vd, void *vn, void *vm, uint32_t desc)
1578{
1579 intptr_t opr_sz = simd_oprsz(desc);
1580 size_t n_ofs = simd_data(desc);
1581 size_t n_siz = opr_sz - n_ofs;
1582
1583 if (vd != vm) {
1584 swap_memmove(vd, vn + n_ofs, n_siz);
1585 swap_memmove(vd + n_siz, vm, n_ofs);
1586 } else if (vd != vn) {
1587 swap_memmove(vd + n_siz, vd, n_ofs);
1588 swap_memmove(vd, vn + n_ofs, n_siz);
1589 } else {
1590 /* vd == vn == vm. Need temp space. */
1591 ARMVectorReg tmp;
1592 swap_memmove(&tmp, vm, n_ofs);
1593 swap_memmove(vd, vd + n_ofs, n_siz);
1594 memcpy(vd + n_siz, &tmp, n_ofs);
1595 }
1596}
1597
1598#define DO_INSR(NAME, TYPE, H) \
1599void HELPER(NAME)(void *vd, void *vn, uint64_t val, uint32_t desc) \
1600{ \
1601 intptr_t opr_sz = simd_oprsz(desc); \
1602 swap_memmove(vd + sizeof(TYPE), vn, opr_sz - sizeof(TYPE)); \
1603 *(TYPE *)(vd + H(0)) = val; \
1604}
1605
1606DO_INSR(sve_insr_b, uint8_t, H1)
1607DO_INSR(sve_insr_h, uint16_t, H1_2)
1608DO_INSR(sve_insr_s, uint32_t, H1_4)
1609DO_INSR(sve_insr_d, uint64_t, )
1610
1611#undef DO_INSR
1612
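/* Reverse the order of elements within the vector.  This is done one 64-bit
 * chunk at a time: mirrored chunks are exchanged, and each chunk is then
 * reversed internally with the routine matching the element size (bswap64
 * for bytes, hswap64 for halfwords, a 32-bit rotate for words, and nothing
 * at all for doublewords).
 */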
1613void HELPER(sve_rev_b)(void *vd, void *vn, uint32_t desc)
1614{
1615 intptr_t i, j, opr_sz = simd_oprsz(desc);
1616 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1617 uint64_t f = *(uint64_t *)(vn + i);
1618 uint64_t b = *(uint64_t *)(vn + j);
1619 *(uint64_t *)(vd + i) = bswap64(b);
1620 *(uint64_t *)(vd + j) = bswap64(f);
1621 }
1622}
1623
1624void HELPER(sve_rev_h)(void *vd, void *vn, uint32_t desc)
1625{
1626 intptr_t i, j, opr_sz = simd_oprsz(desc);
1627 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1628 uint64_t f = *(uint64_t *)(vn + i);
1629 uint64_t b = *(uint64_t *)(vn + j);
1630 *(uint64_t *)(vd + i) = hswap64(b);
1631 *(uint64_t *)(vd + j) = hswap64(f);
1632 }
1633}
1634
1635void HELPER(sve_rev_s)(void *vd, void *vn, uint32_t desc)
1636{
1637 intptr_t i, j, opr_sz = simd_oprsz(desc);
1638 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1639 uint64_t f = *(uint64_t *)(vn + i);
1640 uint64_t b = *(uint64_t *)(vn + j);
1641 *(uint64_t *)(vd + i) = rol64(b, 32);
1642 *(uint64_t *)(vd + j) = rol64(f, 32);
1643 }
1644}
1645
1646void HELPER(sve_rev_d)(void *vd, void *vn, uint32_t desc)
1647{
1648 intptr_t i, j, opr_sz = simd_oprsz(desc);
1649 for (i = 0, j = opr_sz - 8; i < opr_sz / 2; i += 8, j -= 8) {
1650 uint64_t f = *(uint64_t *)(vn + i);
1651 uint64_t b = *(uint64_t *)(vn + j);
1652 *(uint64_t *)(vd + i) = b;
1653 *(uint64_t *)(vd + j) = f;
1654 }
1655}
1656
1657#define DO_TBL(NAME, TYPE, H) \
1658void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1659{ \
1660 intptr_t i, opr_sz = simd_oprsz(desc); \
1661 uintptr_t elem = opr_sz / sizeof(TYPE); \
1662 TYPE *d = vd, *n = vn, *m = vm; \
1663 ARMVectorReg tmp; \
1664 if (unlikely(vd == vn)) { \
1665 n = memcpy(&tmp, vn, opr_sz); \
1666 } \
1667 for (i = 0; i < elem; i++) { \
1668 TYPE j = m[H(i)]; \
1669 d[H(i)] = j < elem ? n[H(j)] : 0; \
1670 } \
1671}
1672
1673DO_TBL(sve_tbl_b, uint8_t, H1)
1674DO_TBL(sve_tbl_h, uint16_t, H2)
1675DO_TBL(sve_tbl_s, uint32_t, H4)
1676DO_TBL(sve_tbl_d, uint64_t, )
1677
1678#undef DO_TBL
1679
1680#define DO_UNPK(NAME, TYPED, TYPES, HD, HS) \
1681void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1682{ \
1683 intptr_t i, opr_sz = simd_oprsz(desc); \
1684 TYPED *d = vd; \
1685 TYPES *n = vn; \
1686 ARMVectorReg tmp; \
1687 if (unlikely(vn - vd < opr_sz)) { \
1688 n = memcpy(&tmp, n, opr_sz / 2); \
1689 } \
1690 for (i = 0; i < opr_sz / sizeof(TYPED); i++) { \
1691 d[HD(i)] = n[HS(i)]; \
1692 } \
1693}
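/* The destination is written at twice the rate the source is read
 * (opr_sz bytes produced from opr_sz / 2 bytes consumed), so the source
 * is copied to a temporary first whenever it overlaps the destination.
 */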
1694
1695DO_UNPK(sve_sunpk_h, int16_t, int8_t, H2, H1)
1696DO_UNPK(sve_sunpk_s, int32_t, int16_t, H4, H2)
1697DO_UNPK(sve_sunpk_d, int64_t, int32_t, , H4)
1698
1699DO_UNPK(sve_uunpk_h, uint16_t, uint8_t, H2, H1)
1700DO_UNPK(sve_uunpk_s, uint32_t, uint16_t, H4, H2)
1701DO_UNPK(sve_uunpk_d, uint64_t, uint32_t, , H4)
1702
1703#undef DO_UNPK
1704
1705/* Mask of bits included in the even numbered predicates of width esz.
1706 * We also use this for expand_bits/compress_bits, and so extend the
1707 * same pattern out to 16-bit units.
1708 */
1709static const uint64_t even_bit_esz_masks[5] = {
1710 0x5555555555555555ull,
1711 0x3333333333333333ull,
1712 0x0f0f0f0f0f0f0f0full,
1713 0x00ff00ff00ff00ffull,
1714 0x0000ffff0000ffffull,
1715};
1716
1717/* Zero-extend units of 2**N bits to units of 2**(N+1) bits.
1718 * For N==0, this corresponds to the operation that in qemu/bitops.h
1719 * we call half_shuffle64; this algorithm is from Hacker's Delight,
1720 * section 7-2 Shuffling Bits.
1721 */
1722static uint64_t expand_bits(uint64_t x, int n)
1723{
1724 int i;
1725
1726 x &= 0xffffffffu;
1727 for (i = 4; i >= n; i--) {
1728 int sh = 1 << i;
1729 x = ((x << sh) | x) & even_bit_esz_masks[i];
1730 }
1731 return x;
1732}
1733
1734/* Compress units of 2**(N+1) bits to units of 2**N bits.
1735 * For N==0, this corresponds to the operation that in qemu/bitops.h
1736 * we call half_unshuffle64; this algorithm is from Hacker's Delight,
1737 * section 7-2 Shuffling Bits, where it is called an inverse half shuffle.
1738 */
1739static uint64_t compress_bits(uint64_t x, int n)
1740{
1741 int i;
1742
1743 for (i = n; i <= 4; i++) {
1744 int sh = 1 << i;
1745 x &= even_bit_esz_masks[i];
1746 x = (x >> sh) | x;
1747 }
1748 return x & 0xffffffffu;
1749}
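/* Likewise for compress_bits(), the inverse direction (illustrative only):
 * the upper half of each 2**(N+1)-bit unit is discarded before the units
 * are packed together.
 */
#if 0
static void example_compress_bits(void)
{
    assert(compress_bits(0x45, 0) == 0xb);      /* undoes the first case */
    assert(compress_bits(0x23, 1) == 0xb);      /* undoes the second */
}
#endif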
1750
1751void HELPER(sve_zip_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1752{
1753 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1754 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1755 intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1756 uint64_t *d = vd;
1757 intptr_t i;
1758
1759 if (oprsz <= 8) {
1760 uint64_t nn = *(uint64_t *)vn;
1761 uint64_t mm = *(uint64_t *)vm;
1762 int half = 4 * oprsz;
1763
1764 nn = extract64(nn, high * half, half);
1765 mm = extract64(mm, high * half, half);
1766 nn = expand_bits(nn, esz);
1767 mm = expand_bits(mm, esz);
1768 d[0] = nn + (mm << (1 << esz));
1769 } else {
1770 ARMPredicateReg tmp_n, tmp_m;
1771
1772 /* We produce output faster than we consume input.
1773 Therefore we must be mindful of possible overlap. */
1774 if ((vn - vd) < (uintptr_t)oprsz) {
1775 vn = memcpy(&tmp_n, vn, oprsz);
1776 }
1777 if ((vm - vd) < (uintptr_t)oprsz) {
1778 vm = memcpy(&tmp_m, vm, oprsz);
1779 }
1780 if (high) {
1781 high = oprsz >> 1;
1782 }
1783
1784 if ((high & 3) == 0) {
1785 uint32_t *n = vn, *m = vm;
1786 high >>= 2;
1787
1788 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
1789 uint64_t nn = n[H4(high + i)];
1790 uint64_t mm = m[H4(high + i)];
1791
1792 nn = expand_bits(nn, esz);
1793 mm = expand_bits(mm, esz);
1794 d[i] = nn + (mm << (1 << esz));
1795 }
1796 } else {
1797 uint8_t *n = vn, *m = vm;
1798 uint16_t *d16 = vd;
1799
1800 for (i = 0; i < oprsz / 2; i++) {
1801 uint16_t nn = n[H1(high + i)];
1802 uint16_t mm = m[H1(high + i)];
1803
1804 nn = expand_bits(nn, esz);
1805 mm = expand_bits(mm, esz);
1806 d16[H2(i)] = nn + (mm << (1 << esz));
1807 }
1808 }
1809 }
1810}
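/* Worked example for sve_zip_p (illustrative, esz == 0 and oprsz == 2,
 * i.e. a 16-byte vector): ZIP1 takes the low 8 predicate bits of each
 * operand, expands them, and interleaves Pn into the even and Pm into
 * the odd bit positions.  With Pn = 0x0f and Pm = 0x00 the result is
 * expand_bits(0x0f, 0) == 0x55; with Pm = 0x0f as well it becomes
 * 0x55 + (0x55 << 1) == 0xff.
 */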
1811
1812void HELPER(sve_uzp_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1813{
1814 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1815 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1816 int odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1) << esz;
1817 uint64_t *d = vd, *n = vn, *m = vm;
1818 uint64_t l, h;
1819 intptr_t i;
1820
1821 if (oprsz <= 8) {
1822 l = compress_bits(n[0] >> odd, esz);
1823 h = compress_bits(m[0] >> odd, esz);
1824 d[0] = extract64(l + (h << (4 * oprsz)), 0, 8 * oprsz);
1825 } else {
1826 ARMPredicateReg tmp_m;
1827 intptr_t oprsz_16 = oprsz / 16;
1828
1829 if ((vm - vd) < (uintptr_t)oprsz) {
1830 m = memcpy(&tmp_m, vm, oprsz);
1831 }
1832
1833 for (i = 0; i < oprsz_16; i++) {
1834 l = n[2 * i + 0];
1835 h = n[2 * i + 1];
1836 l = compress_bits(l >> odd, esz);
1837 h = compress_bits(h >> odd, esz);
1838 d[i] = l + (h << 32);
1839 }
1840
1841 /* For a VL that is not a power of 2, the results from M do not
1842 line up with the uint64_t units of D.  Build the aligned results
1843 from M in TMP_M and then copy them into place afterward.  */
1844 if (oprsz & 15) {
1845 d[i] = compress_bits(n[2 * i] >> odd, esz);
1846
1847 for (i = 0; i < oprsz_16; i++) {
1848 l = m[2 * i + 0];
1849 h = m[2 * i + 1];
1850 l = compress_bits(l >> odd, esz);
1851 h = compress_bits(h >> odd, esz);
1852 tmp_m.p[i] = l + (h << 32);
1853 }
1854 tmp_m.p[i] = compress_bits(m[2 * i] >> odd, esz);
1855
1856 swap_memmove(vd + oprsz / 2, &tmp_m, oprsz / 2);
1857 } else {
1858 for (i = 0; i < oprsz_16; i++) {
1859 l = m[2 * i + 0];
1860 h = m[2 * i + 1];
1861 l = compress_bits(l >> odd, esz);
1862 h = compress_bits(h >> odd, esz);
1863 d[oprsz_16 + i] = l + (h << 32);
1864 }
1865 }
1866 }
1867}
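/* Worked example for sve_uzp_p (illustrative, esz == 0 and oprsz == 2):
 * UZP1 gathers the even-numbered bits of the Pm:Pn concatenation, so
 * Pn = 0x5555, Pm = 0x0000 gives compress_bits(0x5555, 0) == 0xff in the
 * low byte and zero in the high byte, i.e. 0x00ff; UZP2 (odd == 1) picks
 * the odd-numbered bits instead and gives 0.
 */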
1868
1869void HELPER(sve_trn_p)(void *vd, void *vn, void *vm, uint32_t pred_desc)
1870{
1871 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1872 uintptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1873 bool odd = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1874 uint64_t *d = vd, *n = vn, *m = vm;
1875 uint64_t mask;
1876 int shr, shl;
1877 intptr_t i;
1878
1879 shl = 1 << esz;
1880 shr = 0;
1881 mask = even_bit_esz_masks[esz];
1882 if (odd) {
1883 mask <<= shl;
1884 shr = shl;
1885 shl = 0;
1886 }
1887
1888 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
1889 uint64_t nn = (n[i] & mask) >> shr;
1890 uint64_t mm = (m[i] & mask) << shl;
1891 d[i] = nn + mm;
1892 }
1893}
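/* Worked example for sve_trn_p (illustrative, esz == 0): TRN1 keeps the
 * even-numbered bits of Pn in place and moves the even-numbered bits of
 * Pm up into the odd positions, so Pn = 0x0f, Pm = 0xf0 yields
 * (0x0f & 0x55) + ((0xf0 & 0x55) << 1) == 0x05 + 0xa0 == 0xa5.
 */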
1894
1895/* Reverse units of 2**N bits. */
1896static uint64_t reverse_bits_64(uint64_t x, int n)
1897{
1898 int i, sh;
1899
1900 x = bswap64(x);
1901 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
1902 uint64_t mask = even_bit_esz_masks[i];
1903 x = ((x & mask) << sh) | ((x >> sh) & mask);
1904 }
1905 return x;
1906}
1907
1908static uint8_t reverse_bits_8(uint8_t x, int n)
1909{
1910 static const uint8_t mask[3] = { 0x55, 0x33, 0x0f };
1911 int i, sh;
1912
1913 for (i = 2, sh = 4; i >= n; i--, sh >>= 1) {
1914 x = ((x & mask[i]) << sh) | ((x >> sh) & mask[i]);
1915 }
1916 return x;
1917}
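/* Illustrative check of the bit reversals, not part of the build: N
 * selects the width of the units whose order is reversed.
 */
#if 0
static void example_reverse_bits(void)
{
    assert(reverse_bits_8(0x12, 0) == 0x48);    /* reverse individual bits */
    assert(reverse_bits_8(0x12, 2) == 0x21);    /* reverse 4-bit units */
}
#endif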
1918
1919void HELPER(sve_rev_p)(void *vd, void *vn, uint32_t pred_desc)
1920{
1921 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1922 int esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
1923 intptr_t i, oprsz_2 = oprsz / 2;
1924
1925 if (oprsz <= 8) {
1926 uint64_t l = *(uint64_t *)vn;
1927 l = reverse_bits_64(l << (64 - 8 * oprsz), esz);
1928 *(uint64_t *)vd = l;
1929 } else if ((oprsz & 15) == 0) {
1930 for (i = 0; i < oprsz_2; i += 8) {
1931 intptr_t ih = oprsz - 8 - i;
1932 uint64_t l = reverse_bits_64(*(uint64_t *)(vn + i), esz);
1933 uint64_t h = reverse_bits_64(*(uint64_t *)(vn + ih), esz);
1934 *(uint64_t *)(vd + i) = h;
1935 *(uint64_t *)(vd + ih) = l;
1936 }
1937 } else {
1938 for (i = 0; i < oprsz_2; i += 1) {
1939 intptr_t il = H1(i);
1940 intptr_t ih = H1(oprsz - 1 - i);
1941 uint8_t l = reverse_bits_8(*(uint8_t *)(vn + il), esz);
1942 uint8_t h = reverse_bits_8(*(uint8_t *)(vn + ih), esz);
1943 *(uint8_t *)(vd + il) = h;
1944 *(uint8_t *)(vd + ih) = l;
1945 }
1946 }
1947}
1948
1949void HELPER(sve_punpk_p)(void *vd, void *vn, uint32_t pred_desc)
1950{
1951 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
1952 intptr_t high = extract32(pred_desc, SIMD_DATA_SHIFT + 2, 1);
1953 uint64_t *d = vd;
1954 intptr_t i;
1955
1956 if (oprsz <= 8) {
1957 uint64_t nn = *(uint64_t *)vn;
1958 int half = 4 * oprsz;
1959
1960 nn = extract64(nn, high * half, half);
1961 nn = expand_bits(nn, 0);
1962 d[0] = nn;
1963 } else {
1964 ARMPredicateReg tmp_n;
1965
1966 /* We produce output faster than we consume input.
1967 Therefore we must be mindful of possible overlap. */
1968 if ((vn - vd) < (uintptr_t)oprsz) {
1969 vn = memcpy(&tmp_n, vn, oprsz);
1970 }
1971 if (high) {
1972 high = oprsz >> 1;
1973 }
1974
1975 if ((high & 3) == 0) {
1976 uint32_t *n = vn;
1977 high >>= 2;
1978
1979 for (i = 0; i < DIV_ROUND_UP(oprsz, 8); i++) {
1980 uint64_t nn = n[H4(high + i)];
1981 d[i] = expand_bits(nn, 0);
1982 }
1983 } else {
1984 uint16_t *d16 = vd;
1985 uint8_t *n = vn;
1986
1987 for (i = 0; i < oprsz / 2; i++) {
1988 uint16_t nn = n[H1(high + i)];
1989 d16[H2(i)] = expand_bits(nn, 0);
1990 }
1991 }
1992 }
1993}
1994
1995#define DO_ZIP(NAME, TYPE, H) \
1996void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1997{ \
1998 intptr_t oprsz = simd_oprsz(desc); \
1999 intptr_t i, oprsz_2 = oprsz / 2; \
2000 ARMVectorReg tmp_n, tmp_m; \
2001 /* We produce output faster than we consume input. \
2002 Therefore we must be mindful of possible overlap. */ \
2003 if (unlikely((vn - vd) < (uintptr_t)oprsz)) { \
2004 vn = memcpy(&tmp_n, vn, oprsz_2); \
2005 } \
2006 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2007 vm = memcpy(&tmp_m, vm, oprsz_2); \
2008 } \
2009 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2010 *(TYPE *)(vd + H(2 * i + 0)) = *(TYPE *)(vn + H(i)); \
2011 *(TYPE *)(vd + H(2 * i + sizeof(TYPE))) = *(TYPE *)(vm + H(i)); \
2012 } \
2013}
2014
2015DO_ZIP(sve_zip_b, uint8_t, H1)
2016DO_ZIP(sve_zip_h, uint16_t, H1_2)
2017DO_ZIP(sve_zip_s, uint32_t, H1_4)
2018DO_ZIP(sve_zip_d, uint64_t, )
2019
2020#define DO_UZP(NAME, TYPE, H) \
2021void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2022{ \
2023 intptr_t oprsz = simd_oprsz(desc); \
2024 intptr_t oprsz_2 = oprsz / 2; \
2025 intptr_t odd_ofs = simd_data(desc); \
2026 intptr_t i; \
2027 ARMVectorReg tmp_m; \
2028 if (unlikely((vm - vd) < (uintptr_t)oprsz)) { \
2029 vm = memcpy(&tmp_m, vm, oprsz); \
2030 } \
2031 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2032 *(TYPE *)(vd + H(i)) = *(TYPE *)(vn + H(2 * i + odd_ofs)); \
2033 } \
2034 for (i = 0; i < oprsz_2; i += sizeof(TYPE)) { \
2035 *(TYPE *)(vd + H(oprsz_2 + i)) = *(TYPE *)(vm + H(2 * i + odd_ofs)); \
2036 } \
2037}
2038
2039DO_UZP(sve_uzp_b, uint8_t, H1)
2040DO_UZP(sve_uzp_h, uint16_t, H1_2)
2041DO_UZP(sve_uzp_s, uint32_t, H1_4)
2042DO_UZP(sve_uzp_d, uint64_t, )
2043
2044#define DO_TRN(NAME, TYPE, H) \
2045void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2046{ \
2047 intptr_t oprsz = simd_oprsz(desc); \
2048 intptr_t odd_ofs = simd_data(desc); \
2049 intptr_t i; \
2050 for (i = 0; i < oprsz; i += 2 * sizeof(TYPE)) { \
2051 TYPE ae = *(TYPE *)(vn + H(i + odd_ofs)); \
2052 TYPE be = *(TYPE *)(vm + H(i + odd_ofs)); \
2053 *(TYPE *)(vd + H(i + 0)) = ae; \
2054 *(TYPE *)(vd + H(i + sizeof(TYPE))) = be; \
2055 } \
2056}
2057
2058DO_TRN(sve_trn_b, uint8_t, H1)
2059DO_TRN(sve_trn_h, uint16_t, H1_2)
2060DO_TRN(sve_trn_s, uint32_t, H1_4)
2061DO_TRN(sve_trn_d, uint64_t, )
2062
2063#undef DO_ZIP
2064#undef DO_UZP
2065#undef DO_TRN
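/* Worked example for the vector forms above (byte elements, low halves
 * a0, a1, ... and b0, b1, ...): ZIP1 produces {a0, b0, a1, b1, ...};
 * UZP1 concatenates the even-numbered elements of Zn followed by those
 * of Zm; TRN1 gives {a0, b0, a2, b2, ...}.  The ZIP helper only ever
 * reads the low half of each source, so the "2" form is arranged by the
 * caller offsetting the source pointers; for UZP and TRN the 1/2 choice
 * arrives as odd_ofs via simd_data.
 */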
2066
2067void HELPER(sve_compact_s)(void *vd, void *vn, void *vg, uint32_t desc)
2068{
2069 intptr_t i, j, opr_sz = simd_oprsz(desc) / 4;
2070 uint32_t *d = vd, *n = vn;
2071 uint8_t *pg = vg;
2072
2073 for (i = j = 0; i < opr_sz; i++) {
2074 if (pg[H1(i / 2)] & (i & 1 ? 0x10 : 0x01)) {
2075 d[H4(j)] = n[H4(i)];
2076 j++;
2077 }
2078 }
2079 for (; j < opr_sz; j++) {
2080 d[H4(j)] = 0;
2081 }
2082}
2083
2084void HELPER(sve_compact_d)(void *vd, void *vn, void *vg, uint32_t desc)
2085{
2086 intptr_t i, j, opr_sz = simd_oprsz(desc) / 8;
2087 uint64_t *d = vd, *n = vn;
2088 uint8_t *pg = vg;
2089
2090 for (i = j = 0; i < opr_sz; i++) {
2091 if (pg[H1(i)] & 1) {
2092 d[j] = n[i];
2093 j++;
2094 }
2095 }
2096 for (; j < opr_sz; j++) {
2097 d[j] = 0;
2098 }
2099}
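/* Worked example for COMPACT (illustrative, 32-bit elements in a 16-byte
 * vector): with Zn = {10, 20, 30, 40} and only elements 1 and 3 active
 * in Pg, the result is Zd = {20, 40, 0, 0}: the active elements are
 * packed into the low-numbered lanes and the remainder is zeroed.
 */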
2100
2101/* Similar to the ARM LastActiveElement pseudocode function, except the
2102 * result is multiplied by the element size. This includes the not found
2103 * indication; e.g. not found for esz=3 is -8.
2104 */
2105int32_t HELPER(sve_last_active_element)(void *vg, uint32_t pred_desc)
2106{
2107 intptr_t oprsz = extract32(pred_desc, 0, SIMD_OPRSZ_BITS) + 2;
2108 intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
2109
2110 return last_active_element(vg, DIV_ROUND_UP(oprsz, 8), esz);
2111}
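/* Example (illustrative): for word elements (esz == 2) with only element
 * 1 of the predicate active, the return value is 1 * 4 == 4; with no
 * active elements it is -4.
 */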