/*
 * target/arm/sve_helper.c
 * (blame-view banner removed; see commit history for attribution)
 */
/*
 * ARM SVE Operations
 *
 * Copyright (c) 2018 Linaro, Ltd.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */
#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/exec-all.h"
#include "exec/cpu_ldst.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"

/* Note that vector data is stored in host-endian 64-bit chunks,
   so addressing units smaller than that needs a host-endian fixup.
   H1 adjusts a byte offset; H1_2 and H1_4 adjust the byte offset of
   2- and 4-byte aligned elements; H2 and H4 adjust indices into
   arrays of 2- and 4-byte elements.  The XOR swaps the position of
   an element within its 8-byte chunk on big-endian hosts. */
#ifdef HOST_WORDS_BIGENDIAN
#define H1(x) ((x) ^ 7)
#define H1_2(x) ((x) ^ 6)
#define H1_4(x) ((x) ^ 4)
#define H2(x) ((x) ^ 3)
#define H4(x) ((x) ^ 1)
#else
/* Little-endian hosts need no adjustment. */
#define H1(x) (x)
#define H1_2(x) (x)
#define H1_4(x) (x)
#define H2(x) (x)
#define H4(x) (x)
#endif

/* Return a value for NZCV as per the ARM PredTest pseudofunction.
 *
 * The return value has bit 31 set if N is set, bit 1 set if Z is clear,
 * and bit 0 set if C is set. Compare the definitions of these variables
 * within CPUARMState.
 */

/* For no G bits set, NZCV = C. */
#define PREDTEST_INIT 1

/* This is an iterative function, called for each Pd and Pg word
 * moving forward.  Internal flag bit 2 records that a G bit has
 * already been seen in an earlier word, so N is latched only once.
 */
static uint32_t iter_predtest_fwd(uint64_t d, uint64_t g, uint32_t flags)
{
    if (likely(g)) {
        /* Compute N from first D & G.
           Use bit 2 to signal first G bit seen.
           (g & -g) isolates the least significant bit set in G. */
        if (!(flags & 4)) {
            flags |= ((d & (g & -g)) != 0) << 31;
            flags |= 4;
        }

        /* Accumulate Z from each D & G. */
        flags |= ((d & g) != 0) << 1;

        /* Compute C from last !(D & G). Replace previous.
           pow2floor(g) isolates the most significant bit set in G. */
        flags = deposit32(flags, 0, 1, (d & pow2floor(g)) == 0);
    }
    return flags;
}

/* The same for a single word predicate.  Returns NZCV flags for D
 * tested against governing predicate G, starting from scratch. */
uint32_t HELPER(sve_predtest1)(uint64_t d, uint64_t g)
{
    return iter_predtest_fwd(d, g, PREDTEST_INIT);
}

82/* The same for a multi-word predicate. */
83uint32_t HELPER(sve_predtest)(void *vd, void *vg, uint32_t words)
84{
85 uint32_t flags = PREDTEST_INIT;
86 uint64_t *d = vd, *g = vg;
87 uintptr_t i = 0;
88
89 do {
90 flags = iter_predtest_fwd(d[i], g[i], flags);
91 } while (++i < words);
92
93 return flags;
94}
/* Expand active predicate bits to bytes, for byte elements.
 * One table entry per possible predicate byte; entry bit-pattern
 * generated by:
 * for (i = 0; i < 256; ++i) {
 *     unsigned long m = 0;
 *     for (j = 0; j < 8; j++) {
 *         if ((i >> j) & 1) {
 *             m |= 0xfful << (j << 3);
 *         }
 *     }
 *     printf("0x%016lx,\n", m);
 * }
 */
static inline uint64_t expand_pred_b(uint8_t byte)
{
    static const uint64_t word[256] = {
        0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
        0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
        0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
        0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
        0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
        0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
        0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
        0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
        0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
        0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
        0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
        0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
        0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
        0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
        0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
        0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
        0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
        0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
        0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
        0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
        0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
        0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
        0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
        0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
        0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
        0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
        0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
        0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
        0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
        0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
        0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
        0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
        0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
        0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
        0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
        0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
        0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
        0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
        0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
        0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
        0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
        0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
        0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
        0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
        0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
        0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
        0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
        0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
        0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
        0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
        0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
        0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
        0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
        0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
        0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
        0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
        0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
        0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
        0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
        0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
        0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
        0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
        0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
        0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
        0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
        0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
        0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
        0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
        0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
        0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
        0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
        0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
        0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
        0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
        0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
        0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
        0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
        0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
        0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
        0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
        0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
        0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
        0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
        0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
        0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
        0xffffffffffffffff,
    };
    return word[byte];
}

200/* Similarly for half-word elements.
201 * for (i = 0; i < 256; ++i) {
202 * unsigned long m = 0;
203 * if (i & 0xaa) {
204 * continue;
205 * }
206 * for (j = 0; j < 8; j += 2) {
207 * if ((i >> j) & 1) {
208 * m |= 0xfffful << (j << 3);
209 * }
210 * }
211 * printf("[0x%x] = 0x%016lx,\n", i, m);
212 * }
213 */
214static inline uint64_t expand_pred_h(uint8_t byte)
215{
216 static const uint64_t word[] = {
217 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
218 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
219 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
220 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
221 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
222 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
223 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
224 [0x55] = 0xffffffffffffffff,
225 };
226 return word[byte & 0x55];
227}
228
229/* Similarly for single word elements. */
230static inline uint64_t expand_pred_s(uint8_t byte)
231{
232 static const uint64_t word[] = {
233 [0x01] = 0x00000000ffffffffull,
234 [0x10] = 0xffffffff00000000ull,
235 [0x11] = 0xffffffffffffffffull,
236 };
237 return word[byte & 0x11];
238}
239
/* Expand LOGICAL_PPPP into one helper per predicate logical operation.
 * All operands are whole predicates, processed as arrays of 64-bit
 * words; the governing predicate G masks each result word (except for
 * SEL, where G selects between N and M).
 */
#define LOGICAL_PPPP(NAME, FUNC) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    uintptr_t opr_sz = simd_oprsz(desc); \
    uint64_t *d = vd, *n = vn, *m = vm, *g = vg; \
    uintptr_t i; \
    for (i = 0; i < opr_sz / 8; ++i) { \
        d[i] = FUNC(n[i], m[i], g[i]); \
    } \
}

#define DO_AND(N, M, G) (((N) & (M)) & (G))
#define DO_BIC(N, M, G) (((N) & ~(M)) & (G))
#define DO_EOR(N, M, G) (((N) ^ (M)) & (G))
#define DO_ORR(N, M, G) (((N) | (M)) & (G))
#define DO_ORN(N, M, G) (((N) | ~(M)) & (G))
#define DO_NOR(N, M, G) (~((N) | (M)) & (G))
#define DO_NAND(N, M, G) (~((N) & (M)) & (G))
#define DO_SEL(N, M, G) (((N) & (G)) | ((M) & ~(G)))

LOGICAL_PPPP(sve_and_pppp, DO_AND)
LOGICAL_PPPP(sve_bic_pppp, DO_BIC)
LOGICAL_PPPP(sve_eor_pppp, DO_EOR)
LOGICAL_PPPP(sve_sel_pppp, DO_SEL)
LOGICAL_PPPP(sve_orr_pppp, DO_ORR)
LOGICAL_PPPP(sve_orn_pppp, DO_ORN)
LOGICAL_PPPP(sve_nor_pppp, DO_NOR)
LOGICAL_PPPP(sve_nand_pppp, DO_NAND)

/* The three-argument forms are no longer needed.  */
#undef DO_AND
#undef DO_BIC
#undef DO_EOR
#undef DO_ORR
#undef DO_ORN
#undef DO_NOR
#undef DO_NAND
#undef DO_SEL
#undef LOGICAL_PPPP

/* Fully general three-operand expander, controlled by a predicate.
 * This is complicated by the host-endian storage of the register file.
 */
/* ??? I don't expect the compiler could ever vectorize this itself.
 * With some tables we can convert bit masks to byte masks, and with
 * extra care wrt byte/word ordering we could use gcc generic vectors
 * and do 16 bytes at a time.
 */
/* The predicate has one bit per byte of vector data, so 16 bits of
 * predicate govern one 16-byte chunk.  Shifting pg right by
 * sizeof(TYPE) per element keeps bit 0 aligned with the element at
 * byte offset i; H() applies the host-endian fixup for sub-64-bit
 * element accesses.
 */
#define DO_ZPZZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                TYPE mm = *(TYPE *)(vm + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, mm); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}

/* Similarly, specialized for 64-bit operands.  No host-endian fixup
 * is needed, and the predicate is read one byte per element. */
#define DO_ZPZZ_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *n = vn, *m = vm; \
    uint8_t *pg = vg; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPE nn = n[i], mm = m[i]; \
            d[i] = OP(nn, mm); \
        } \
    } \
}

/* Two-argument primitive operations for the predicated expanders. */
#define DO_AND(N, M) (N & M)
#define DO_EOR(N, M) (N ^ M)
#define DO_ORR(N, M) (N | M)
#define DO_BIC(N, M) (N & ~M)
#define DO_ADD(N, M) (N + M)
#define DO_SUB(N, M) (N - M)
#define DO_MAX(N, M) ((N) >= (M) ? (N) : (M))
#define DO_MIN(N, M) ((N) >= (M) ? (M) : (N))
#define DO_ABD(N, M) ((N) >= (M) ? (N) - (M) : (M) - (N))
#define DO_MUL(N, M) (N * M)
/* Division by zero yields zero here.
 * NOTE(review): for signed types INT_MIN / -1 still overflows in C;
 * QEMU is normally built with -fwrapv, but an explicit guard would be
 * more portable — confirm build flags before reusing this macro. */
#define DO_DIV(N, M) (M ? N / M : 0)

/* Predicated bitwise logical operations, per element size. */
DO_ZPZZ(sve_and_zpzz_b, uint8_t, H1, DO_AND)
DO_ZPZZ(sve_and_zpzz_h, uint16_t, H1_2, DO_AND)
DO_ZPZZ(sve_and_zpzz_s, uint32_t, H1_4, DO_AND)
DO_ZPZZ_D(sve_and_zpzz_d, uint64_t, DO_AND)

DO_ZPZZ(sve_orr_zpzz_b, uint8_t, H1, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_h, uint16_t, H1_2, DO_ORR)
DO_ZPZZ(sve_orr_zpzz_s, uint32_t, H1_4, DO_ORR)
DO_ZPZZ_D(sve_orr_zpzz_d, uint64_t, DO_ORR)

DO_ZPZZ(sve_eor_zpzz_b, uint8_t, H1, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_h, uint16_t, H1_2, DO_EOR)
DO_ZPZZ(sve_eor_zpzz_s, uint32_t, H1_4, DO_EOR)
DO_ZPZZ_D(sve_eor_zpzz_d, uint64_t, DO_EOR)

DO_ZPZZ(sve_bic_zpzz_b, uint8_t, H1, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_h, uint16_t, H1_2, DO_BIC)
DO_ZPZZ(sve_bic_zpzz_s, uint32_t, H1_4, DO_BIC)
DO_ZPZZ_D(sve_bic_zpzz_d, uint64_t, DO_BIC)

/* Predicated integer add/subtract. */
DO_ZPZZ(sve_add_zpzz_b, uint8_t, H1, DO_ADD)
DO_ZPZZ(sve_add_zpzz_h, uint16_t, H1_2, DO_ADD)
DO_ZPZZ(sve_add_zpzz_s, uint32_t, H1_4, DO_ADD)
DO_ZPZZ_D(sve_add_zpzz_d, uint64_t, DO_ADD)

DO_ZPZZ(sve_sub_zpzz_b, uint8_t, H1, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_h, uint16_t, H1_2, DO_SUB)
DO_ZPZZ(sve_sub_zpzz_s, uint32_t, H1_4, DO_SUB)
DO_ZPZZ_D(sve_sub_zpzz_d, uint64_t, DO_SUB)

/* Signed/unsigned max, min, and absolute difference; signedness is
 * selected by the element TYPE. */
DO_ZPZZ(sve_smax_zpzz_b, int8_t, H1, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_h, int16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_smax_zpzz_s, int32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_smax_zpzz_d, int64_t, DO_MAX)

DO_ZPZZ(sve_umax_zpzz_b, uint8_t, H1, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_h, uint16_t, H1_2, DO_MAX)
DO_ZPZZ(sve_umax_zpzz_s, uint32_t, H1_4, DO_MAX)
DO_ZPZZ_D(sve_umax_zpzz_d, uint64_t, DO_MAX)

DO_ZPZZ(sve_smin_zpzz_b, int8_t, H1, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_h, int16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_smin_zpzz_s, int32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_smin_zpzz_d, int64_t, DO_MIN)

DO_ZPZZ(sve_umin_zpzz_b, uint8_t, H1, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_h, uint16_t, H1_2, DO_MIN)
DO_ZPZZ(sve_umin_zpzz_s, uint32_t, H1_4, DO_MIN)
DO_ZPZZ_D(sve_umin_zpzz_d, uint64_t, DO_MIN)

DO_ZPZZ(sve_sabd_zpzz_b, int8_t, H1, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_h, int16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_sabd_zpzz_s, int32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_sabd_zpzz_d, int64_t, DO_ABD)

DO_ZPZZ(sve_uabd_zpzz_b, uint8_t, H1, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_h, uint16_t, H1_2, DO_ABD)
DO_ZPZZ(sve_uabd_zpzz_s, uint32_t, H1_4, DO_ABD)
DO_ZPZZ_D(sve_uabd_zpzz_d, uint64_t, DO_ABD)

391/* Because the computation type is at least twice as large as required,
392 these work for both signed and unsigned source types. */
393static inline uint8_t do_mulh_b(int32_t n, int32_t m)
394{
395 return (n * m) >> 8;
396}
397
398static inline uint16_t do_mulh_h(int32_t n, int32_t m)
399{
400 return (n * m) >> 16;
401}
402
403static inline uint32_t do_mulh_s(int64_t n, int64_t m)
404{
405 return (n * m) >> 32;
406}
407
/* High half of a 64x64 -> 128-bit signed multiply.  The operands are
 * passed as uint64_t bit patterns; muls64 interprets them as signed. */
static inline uint64_t do_smulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    muls64(&lo, &hi, n, m);
    return hi;
}

/* High half of a 64x64 -> 128-bit unsigned multiply. */
static inline uint64_t do_umulh_d(uint64_t n, uint64_t m)
{
    uint64_t lo, hi;
    mulu64(&lo, &hi, n, m);
    return hi;
}

/* Predicated multiply (low half) and signed/unsigned multiply-high. */
DO_ZPZZ(sve_mul_zpzz_b, uint8_t, H1, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_h, uint16_t, H1_2, DO_MUL)
DO_ZPZZ(sve_mul_zpzz_s, uint32_t, H1_4, DO_MUL)
DO_ZPZZ_D(sve_mul_zpzz_d, uint64_t, DO_MUL)

DO_ZPZZ(sve_smulh_zpzz_b, int8_t, H1, do_mulh_b)
DO_ZPZZ(sve_smulh_zpzz_h, int16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_smulh_zpzz_s, int32_t, H1_4, do_mulh_s)
/* The 64-bit signed case passes raw bit patterns; do_smulh_d applies
 * signed interpretation internally via muls64. */
DO_ZPZZ_D(sve_smulh_zpzz_d, uint64_t, do_smulh_d)

DO_ZPZZ(sve_umulh_zpzz_b, uint8_t, H1, do_mulh_b)
DO_ZPZZ(sve_umulh_zpzz_h, uint16_t, H1_2, do_mulh_h)
DO_ZPZZ(sve_umulh_zpzz_s, uint32_t, H1_4, do_mulh_s)
DO_ZPZZ_D(sve_umulh_zpzz_d, uint64_t, do_umulh_d)

437DO_ZPZZ(sve_sdiv_zpzz_s, int32_t, H1_4, DO_DIV)
438DO_ZPZZ_D(sve_sdiv_zpzz_d, int64_t, DO_DIV)
439
440DO_ZPZZ(sve_udiv_zpzz_s, uint32_t, H1_4, DO_DIV)
441DO_ZPZZ_D(sve_udiv_zpzz_d, uint64_t, DO_DIV)
442
27721dbb
RH
443/* Note that all bits of the shift are significant
444 and not modulo the element size. */
445#define DO_ASR(N, M) (N >> MIN(M, sizeof(N) * 8 - 1))
446#define DO_LSR(N, M) (M < sizeof(N) * 8 ? N >> M : 0)
447#define DO_LSL(N, M) (M < sizeof(N) * 8 ? N << M : 0)
448
449DO_ZPZZ(sve_asr_zpzz_b, int8_t, H1, DO_ASR)
450DO_ZPZZ(sve_lsr_zpzz_b, uint8_t, H1_2, DO_LSR)
451DO_ZPZZ(sve_lsl_zpzz_b, uint8_t, H1_4, DO_LSL)
452
453DO_ZPZZ(sve_asr_zpzz_h, int16_t, H1, DO_ASR)
454DO_ZPZZ(sve_lsr_zpzz_h, uint16_t, H1_2, DO_LSR)
455DO_ZPZZ(sve_lsl_zpzz_h, uint16_t, H1_4, DO_LSL)
456
457DO_ZPZZ(sve_asr_zpzz_s, int32_t, H1, DO_ASR)
458DO_ZPZZ(sve_lsr_zpzz_s, uint32_t, H1_2, DO_LSR)
459DO_ZPZZ(sve_lsl_zpzz_s, uint32_t, H1_4, DO_LSL)
460
461DO_ZPZZ_D(sve_asr_zpzz_d, int64_t, DO_ASR)
462DO_ZPZZ_D(sve_lsr_zpzz_d, uint64_t, DO_LSR)
463DO_ZPZZ_D(sve_lsl_zpzz_d, uint64_t, DO_LSL)
464
#undef DO_ZPZZ
#undef DO_ZPZZ_D

/* Three-operand expander, controlled by a predicate, in which the
 * third operand is "wide".  That is, for D = N op M, the same 64-bit
 * value of M is used with all of the narrower values of N.  Only one
 * 64-bit M (8 predicate bits) is consumed per inner loop, hence the
 * 8-byte stride (i & 7) rather than 16.
 */
#define DO_ZPZW(NAME, TYPE, TYPEW, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint8_t pg = *(uint8_t *)(vg + H1(i >> 3)); \
        TYPEW mm = *(TYPEW *)(vm + i); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn, mm); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 7); \
    } \
}

/* Predicated shift by wide (64-bit) shift amounts; there is no
 * 64-bit narrow form since it would be the ordinary shift. */
DO_ZPZW(sve_asr_zpzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZPZW(sve_asr_zpzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZPZW(sve_asr_zpzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZPZW(sve_lsr_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZPZW(sve_lsl_zpzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZPZW

/* Fully general two-operand expander, controlled by a predicate.
 * Same layout conventions as DO_ZPZZ above: 16 predicate bits govern
 * one 16-byte chunk, one bit per byte of element data.
 */
#define DO_ZPZ(NAME, TYPE, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPE nn = *(TYPE *)(vn + H(i)); \
                *(TYPE *)(vd + H(i)) = OP(nn); \
            } \
            i += sizeof(TYPE), pg >>= sizeof(TYPE); \
        } while (i & 15); \
    } \
}

/* Similarly, specialized for 64-bit operands. */
#define DO_ZPZ_D(NAME, TYPE, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPE *d = vd, *n = vn; \
    uint8_t *pg = vg; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPE nn = n[i]; \
            d[i] = OP(nn); \
        } \
    } \
}

/* Count leading sign bits: clrsb32 works on the 32-bit promotion, so
 * subtract the extra bit width for narrow elements. */
#define DO_CLS_B(N) (clrsb32(N) - 24)
#define DO_CLS_H(N) (clrsb32(N) - 16)

DO_ZPZ(sve_cls_b, int8_t, H1, DO_CLS_B)
DO_ZPZ(sve_cls_h, int16_t, H1_2, DO_CLS_H)
DO_ZPZ(sve_cls_s, int32_t, H1_4, clrsb32)
DO_ZPZ_D(sve_cls_d, int64_t, clrsb64)

/* Count leading zeros, likewise adjusted for the 32-bit promotion. */
#define DO_CLZ_B(N) (clz32(N) - 24)
#define DO_CLZ_H(N) (clz32(N) - 16)

DO_ZPZ(sve_clz_b, uint8_t, H1, DO_CLZ_B)
DO_ZPZ(sve_clz_h, uint16_t, H1_2, DO_CLZ_H)
DO_ZPZ(sve_clz_s, uint32_t, H1_4, clz32)
DO_ZPZ_D(sve_clz_d, uint64_t, clz64)

/* Population count per element. */
DO_ZPZ(sve_cnt_zpz_b, uint8_t, H1, ctpop8)
DO_ZPZ(sve_cnt_zpz_h, uint16_t, H1_2, ctpop16)
DO_ZPZ(sve_cnt_zpz_s, uint32_t, H1_4, ctpop32)
DO_ZPZ_D(sve_cnt_zpz_d, uint64_t, ctpop64)

/* Logical not: 1 if the element is zero, else 0. */
#define DO_CNOT(N) (N == 0)

DO_ZPZ(sve_cnot_b, uint8_t, H1, DO_CNOT)
DO_ZPZ(sve_cnot_h, uint16_t, H1_2, DO_CNOT)
DO_ZPZ(sve_cnot_s, uint32_t, H1_4, DO_CNOT)
DO_ZPZ_D(sve_cnot_d, uint64_t, DO_CNOT)

/* Float absolute value: clear the sign (top) bit of the raw pattern. */
#define DO_FABS(N) (N & ((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fabs_h, uint16_t, H1_2, DO_FABS)
DO_ZPZ(sve_fabs_s, uint32_t, H1_4, DO_FABS)
DO_ZPZ_D(sve_fabs_d, uint64_t, DO_FABS)

/* Float negate: flip the sign (top) bit of the raw pattern. */
#define DO_FNEG(N) (N ^ ~((__typeof(N))-1 >> 1))

DO_ZPZ(sve_fneg_h, uint16_t, H1_2, DO_FNEG)
DO_ZPZ(sve_fneg_s, uint32_t, H1_4, DO_FNEG)
DO_ZPZ_D(sve_fneg_d, uint64_t, DO_FNEG)

#define DO_NOT(N) (~N)

DO_ZPZ(sve_not_zpz_b, uint8_t, H1, DO_NOT)
DO_ZPZ(sve_not_zpz_h, uint16_t, H1_2, DO_NOT)
DO_ZPZ(sve_not_zpz_s, uint32_t, H1_4, DO_NOT)
DO_ZPZ_D(sve_not_zpz_d, uint64_t, DO_NOT)

/* Sign- and zero-extension of a narrow low part within each element:
 * cast down to the narrow type, then back up to the (unsigned)
 * element TYPE via the store in DO_ZPZ. */
#define DO_SXTB(N) ((int8_t)N)
#define DO_SXTH(N) ((int16_t)N)
#define DO_SXTS(N) ((int32_t)N)
#define DO_UXTB(N) ((uint8_t)N)
#define DO_UXTH(N) ((uint16_t)N)
#define DO_UXTS(N) ((uint32_t)N)

DO_ZPZ(sve_sxtb_h, uint16_t, H1_2, DO_SXTB)
DO_ZPZ(sve_sxtb_s, uint32_t, H1_4, DO_SXTB)
DO_ZPZ(sve_sxth_s, uint32_t, H1_4, DO_SXTH)
DO_ZPZ_D(sve_sxtb_d, uint64_t, DO_SXTB)
DO_ZPZ_D(sve_sxth_d, uint64_t, DO_SXTH)
DO_ZPZ_D(sve_sxtw_d, uint64_t, DO_SXTS)

DO_ZPZ(sve_uxtb_h, uint16_t, H1_2, DO_UXTB)
DO_ZPZ(sve_uxtb_s, uint32_t, H1_4, DO_UXTB)
DO_ZPZ(sve_uxth_s, uint32_t, H1_4, DO_UXTH)
DO_ZPZ_D(sve_uxtb_d, uint64_t, DO_UXTB)
DO_ZPZ_D(sve_uxth_d, uint64_t, DO_UXTH)
DO_ZPZ_D(sve_uxtw_d, uint64_t, DO_UXTS)

/* NOTE(review): -N on the minimum signed value relies on wraparound;
 * QEMU is built with -fwrapv — confirm before reusing elsewhere. */
#define DO_ABS(N) (N < 0 ? -N : N)

DO_ZPZ(sve_abs_b, int8_t, H1, DO_ABS)
DO_ZPZ(sve_abs_h, int16_t, H1_2, DO_ABS)
DO_ZPZ(sve_abs_s, int32_t, H1_4, DO_ABS)
DO_ZPZ_D(sve_abs_d, int64_t, DO_ABS)

/* Negation is applied to unsigned element types, so wraparound is
 * well defined. */
#define DO_NEG(N) (-N)

DO_ZPZ(sve_neg_b, uint8_t, H1, DO_NEG)
DO_ZPZ(sve_neg_h, uint16_t, H1_2, DO_NEG)
DO_ZPZ(sve_neg_s, uint32_t, H1_4, DO_NEG)
DO_ZPZ_D(sve_neg_d, uint64_t, DO_NEG)

/* Three-operand expander, unpredicated, in which the third operand is
 * "wide": one 64-bit M value is applied to each group of narrower N
 * elements sharing the same 8-byte chunk.
 */
#define DO_ZZW(NAME, TYPE, TYPEW, H, OP) \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    for (i = 0; i < opr_sz; ) { \
        TYPEW mm = *(TYPEW *)(vm + i); \
        do { \
            TYPE nn = *(TYPE *)(vn + H(i)); \
            *(TYPE *)(vd + H(i)) = OP(nn, mm); \
            i += sizeof(TYPE); \
        } while (i & 7); \
    } \
}

/* Unpredicated shift by wide (64-bit) shift amounts. */
DO_ZZW(sve_asr_zzw_b, int8_t, uint64_t, H1, DO_ASR)
DO_ZZW(sve_lsr_zzw_b, uint8_t, uint64_t, H1, DO_LSR)
DO_ZZW(sve_lsl_zzw_b, uint8_t, uint64_t, H1, DO_LSL)

DO_ZZW(sve_asr_zzw_h, int16_t, uint64_t, H1_2, DO_ASR)
DO_ZZW(sve_lsr_zzw_h, uint16_t, uint64_t, H1_2, DO_LSR)
DO_ZZW(sve_lsl_zzw_h, uint16_t, uint64_t, H1_2, DO_LSL)

DO_ZZW(sve_asr_zzw_s, int32_t, uint64_t, H1_4, DO_ASR)
DO_ZZW(sve_lsr_zzw_s, uint32_t, uint64_t, H1_4, DO_LSR)
DO_ZZW(sve_lsl_zzw_s, uint32_t, uint64_t, H1_4, DO_LSL)

#undef DO_ZZW

/* Clean up the unary-op helper macros. */
#undef DO_CLS_B
#undef DO_CLS_H
#undef DO_CLZ_B
#undef DO_CLZ_H
#undef DO_CNOT
#undef DO_FABS
#undef DO_FNEG
#undef DO_ABS
#undef DO_NEG
#undef DO_ZPZ
#undef DO_ZPZ_D

/* Two-operand reduction expander, controlled by a predicate.
 * The difference between TYPERED and TYPERET has to do with
 * sign-extension.  E.g. for SMAX, TYPERED must be signed,
 * but TYPERET must be unsigned so that e.g. a 32-bit value
 * is not sign-extended to the ABI uint64_t return type.
 */
/* ??? If we were to vectorize this by hand the reduction ordering
 * would change.  For integer operands, this is perfectly fine.
 */
#define DO_VPZ(NAME, TYPEELT, TYPERED, TYPERET, H, INIT, OP) \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc); \
    TYPERED ret = INIT; \
    for (i = 0; i < opr_sz; ) { \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3)); \
        do { \
            if (pg & 1) { \
                TYPEELT nn = *(TYPEELT *)(vn + H(i)); \
                ret = OP(ret, nn); \
            } \
            i += sizeof(TYPEELT), pg >>= sizeof(TYPEELT); \
        } while (i & 15); \
    } \
    return (TYPERET)ret; \
}

/* Similarly, specialized for 64-bit operands. */
#define DO_VPZ_D(NAME, TYPEE, TYPER, INIT, OP) \
uint64_t HELPER(NAME)(void *vn, void *vg, uint32_t desc) \
{ \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8; \
    TYPEE *n = vn; \
    uint8_t *pg = vg; \
    TYPER ret = INIT; \
    for (i = 0; i < opr_sz; i += 1) { \
        if (pg[H1(i)] & 1) { \
            TYPEE nn = n[i]; \
            ret = OP(ret, nn); \
        } \
    } \
    return ret; \
}

/* Reductions: INIT is the identity element for each operation. */
DO_VPZ(sve_orv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_ORR)
DO_VPZ(sve_orv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_ORR)
DO_VPZ(sve_orv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_ORR)
DO_VPZ_D(sve_orv_d, uint64_t, uint64_t, 0, DO_ORR)

DO_VPZ(sve_eorv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_EOR)
DO_VPZ(sve_eorv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_EOR)
DO_VPZ(sve_eorv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_EOR)
DO_VPZ_D(sve_eorv_d, uint64_t, uint64_t, 0, DO_EOR)

DO_VPZ(sve_andv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_AND)
DO_VPZ(sve_andv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_AND)
DO_VPZ(sve_andv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_AND)
DO_VPZ_D(sve_andv_d, uint64_t, uint64_t, -1, DO_AND)

/* Widening signed sums: elements are sign-extended, then accumulated
 * in 64 bits.  There is no saddv_d entry here — presumably a 64-bit
 * signed sum is bit-identical to the unsigned one below. */
DO_VPZ(sve_saddv_b, int8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
DO_VPZ(sve_saddv_h, int16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
DO_VPZ(sve_saddv_s, int32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)

DO_VPZ(sve_uaddv_b, uint8_t, uint64_t, uint64_t, H1, 0, DO_ADD)
DO_VPZ(sve_uaddv_h, uint16_t, uint64_t, uint64_t, H1_2, 0, DO_ADD)
DO_VPZ(sve_uaddv_s, uint32_t, uint64_t, uint64_t, H1_4, 0, DO_ADD)
DO_VPZ_D(sve_uaddv_d, uint64_t, uint64_t, 0, DO_ADD)

DO_VPZ(sve_smaxv_b, int8_t, int8_t, uint8_t, H1, INT8_MIN, DO_MAX)
DO_VPZ(sve_smaxv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MIN, DO_MAX)
DO_VPZ(sve_smaxv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MIN, DO_MAX)
DO_VPZ_D(sve_smaxv_d, int64_t, int64_t, INT64_MIN, DO_MAX)

DO_VPZ(sve_umaxv_b, uint8_t, uint8_t, uint8_t, H1, 0, DO_MAX)
DO_VPZ(sve_umaxv_h, uint16_t, uint16_t, uint16_t, H1_2, 0, DO_MAX)
DO_VPZ(sve_umaxv_s, uint32_t, uint32_t, uint32_t, H1_4, 0, DO_MAX)
DO_VPZ_D(sve_umaxv_d, uint64_t, uint64_t, 0, DO_MAX)

DO_VPZ(sve_sminv_b, int8_t, int8_t, uint8_t, H1, INT8_MAX, DO_MIN)
DO_VPZ(sve_sminv_h, int16_t, int16_t, uint16_t, H1_2, INT16_MAX, DO_MIN)
DO_VPZ(sve_sminv_s, int32_t, int32_t, uint32_t, H1_4, INT32_MAX, DO_MIN)
DO_VPZ_D(sve_sminv_d, int64_t, int64_t, INT64_MAX, DO_MIN)

DO_VPZ(sve_uminv_b, uint8_t, uint8_t, uint8_t, H1, -1, DO_MIN)
DO_VPZ(sve_uminv_h, uint16_t, uint16_t, uint16_t, H1_2, -1, DO_MIN)
DO_VPZ(sve_uminv_s, uint32_t, uint32_t, uint32_t, H1_4, -1, DO_MIN)
DO_VPZ_D(sve_uminv_d, uint64_t, uint64_t, -1, DO_MIN)

#undef DO_VPZ
#undef DO_VPZ_D

/* Clean up the primitive operator macros. */
#undef DO_AND
#undef DO_ORR
#undef DO_EOR
#undef DO_BIC
#undef DO_ADD
#undef DO_SUB
#undef DO_MAX
#undef DO_MIN
#undef DO_ABD
#undef DO_MUL
#undef DO_DIV
#undef DO_ASR
#undef DO_LSR
#undef DO_LSL

/* Similar to the ARM LastActiveElement pseudocode function, except the
 * result is multiplied by the element size.  This includes the not found
 * indication; e.g. not found for esz=3 is -8.
 */
static intptr_t last_active_element(uint64_t *g, intptr_t words, intptr_t esz)
{
    uint64_t mask = pred_esz_masks[esz];
    intptr_t i = words;

    /* Scan words from the top down; assumes words >= 1. */
    do {
        uint64_t this_g = g[--i] & mask;
        if (this_g) {
            /* Bit position of the highest active predicate bit. */
            return i * 64 + (63 - clz64(this_g));
        }
    } while (i > 0);
    /* Not found: -(1 << esz).
     * NOTE(review): left-shifting a negative value is formally UB in C;
     * this relies on the usual two's-complement behavior (QEMU builds
     * with -fwrapv) — confirm before reusing elsewhere. */
    return (intptr_t)-1 << esz;
}

/* PFIRST: if no element of D is yet active, set in D the first active
 * bit of the governing predicate G, then return the PredTest flags for
 * the updated D against G.  Flag bit 2 (set by iter_predtest_fwd once
 * a G bit has been seen) doubles as the "already found" marker here.
 */
uint32_t HELPER(sve_pfirst)(void *vd, void *vg, uint32_t words)
{
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg;
    intptr_t i = 0;

    do {
        uint64_t this_d = d[i];
        uint64_t this_g = g[i];

        if (this_g) {
            if (!(flags & 4)) {
                /* Set in D the first bit of G. */
                this_d |= this_g & -this_g;
                d[i] = this_d;
            }
            flags = iter_predtest_fwd(this_d, this_g, flags);
        }
    } while (++i < words);

    return flags;
}

/* PNEXT: find the next active bit of G after the last active element
 * of D, rewrite D as a single-bit (or empty) predicate at that
 * position, and return the PredTest flags.  The predicate descriptor
 * packs the word count and log2 element size.
 */
uint32_t HELPER(sve_pnext)(void *vd, void *vg, uint32_t pred_desc)
{
    intptr_t words = extract32(pred_desc, 0, SIMD_OPRSZ_BITS);
    intptr_t esz = extract32(pred_desc, SIMD_DATA_SHIFT, 2);
    uint32_t flags = PREDTEST_INIT;
    uint64_t *d = vd, *g = vg, esz_mask;
    intptr_t i, next;

    /* Start one element past the last active element of D;
       the "not found" return of -(1 << esz) yields next == 0. */
    next = last_active_element(vd, words, esz) + (1 << esz);
    esz_mask = pred_esz_masks[esz];

    /* Similar to the pseudocode for pnext, but scaled by ESZ
       so that we find the correct bit. */
    if (next < words * 64) {
        uint64_t mask = -1;

        /* Mask off bits below 'next' within its word. */
        if (next & 63) {
            mask = ~((1ull << (next & 63)) - 1);
            next &= -64;
        }
        do {
            uint64_t this_g = g[next / 64] & esz_mask & mask;
            if (this_g != 0) {
                next = (next & -64) + ctz64(this_g);
                break;
            }
            next += 64;
            mask = -1;
        } while (next < words * 64);
    }

    /* Write D as a single bit at 'next' (or all zeros when no active
       bit was found, since then next/64 >= words), accumulating the
       PredTest flags against G as we go. */
    i = 0;
    do {
        uint64_t this_d = 0;
        if (i == next / 64) {
            this_d = 1ull << (next & 63);
        }
        d[i] = this_d;
        flags = iter_predtest_fwd(this_d, g[i] & esz_mask, flags);
    } while (++i < words);

    return flags;
}

ccd841c3
RH
848
849/* Store zero into every active element of Zd. We will use this for two
850 * and three-operand predicated instructions for which logic dictates a
851 * zero result. In particular, logical shift by element size, which is
852 * otherwise undefined on the host.
853 *
854 * For element sizes smaller than uint64_t, we use tables to expand
855 * the N bits of the controlling predicate to a byte mask, and clear
856 * those bytes.
857 */
858void HELPER(sve_clr_b)(void *vd, void *vg, uint32_t desc)
859{
860 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
861 uint64_t *d = vd;
862 uint8_t *pg = vg;
863 for (i = 0; i < opr_sz; i += 1) {
864 d[i] &= ~expand_pred_b(pg[H1(i)]);
865 }
866}
867
868void HELPER(sve_clr_h)(void *vd, void *vg, uint32_t desc)
869{
870 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
871 uint64_t *d = vd;
872 uint8_t *pg = vg;
873 for (i = 0; i < opr_sz; i += 1) {
874 d[i] &= ~expand_pred_h(pg[H1(i)]);
875 }
876}
877
878void HELPER(sve_clr_s)(void *vd, void *vg, uint32_t desc)
879{
880 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
881 uint64_t *d = vd;
882 uint8_t *pg = vg;
883 for (i = 0; i < opr_sz; i += 1) {
884 d[i] &= ~expand_pred_s(pg[H1(i)]);
885 }
886}
887
888void HELPER(sve_clr_d)(void *vd, void *vg, uint32_t desc)
889{
890 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
891 uint64_t *d = vd;
892 uint8_t *pg = vg;
893 for (i = 0; i < opr_sz; i += 1) {
894 if (pg[H1(i)] & 1) {
895 d[i] = 0;
896 }
897 }
898}
899
/* Three-operand expander, immediate operand, controlled by a predicate.
 *
 * NAME is the helper function to define, TYPE the element type, H the
 * host-endian address adjustment for the element size, and OP(nn, imm)
 * the per-element operation.  The immediate is taken from the simd_data
 * field of DESC.  The predicate is consumed 16 bits at a time (one bit
 * per vector byte); only the bit corresponding to the first byte of
 * each element is tested.
 */
#define DO_ZPZI(NAME, TYPE, H, OP)                              \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    TYPE imm = simd_data(desc);                                 \
    for (i = 0; i < opr_sz; ) {                                 \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));         \
        do {                                                    \
            if (pg & 1) {                                       \
                TYPE nn = *(TYPE *)(vn + H(i));                 \
                *(TYPE *)(vd + H(i)) = OP(nn, imm);             \
            }                                                   \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);             \
        } while (i & 15);                                       \
    }                                                           \
}
918
/* Similarly, specialized for 64-bit operands.  The predicate is read
 * one byte per 64-bit element; only bit 0 of each byte is significant,
 * so no mask expansion is required.
 */
#define DO_ZPZI_D(NAME, TYPE, OP)                               \
void HELPER(NAME)(void *vd, void *vn, void *vg, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                  \
    TYPE *d = vd, *n = vn;                                      \
    TYPE imm = simd_data(desc);                                 \
    uint8_t *pg = vg;                                           \
    for (i = 0; i < opr_sz; i += 1) {                           \
        if (pg[H1(i)] & 1) {                                    \
            TYPE nn = n[i];                                     \
            d[i] = OP(nn, imm);                                 \
        }                                                       \
    }                                                           \
}
934
935#define DO_SHR(N, M) (N >> M)
936#define DO_SHL(N, M) (N << M)
937
938/* Arithmetic shift right for division. This rounds negative numbers
939 toward zero as per signed division. Therefore before shifting,
940 when N is negative, add 2**M-1. */
941#define DO_ASRD(N, M) ((N + (N < 0 ? ((__typeof(N))1 << M) - 1 : 0)) >> M)
942
943DO_ZPZI(sve_asr_zpzi_b, int8_t, H1, DO_SHR)
944DO_ZPZI(sve_asr_zpzi_h, int16_t, H1_2, DO_SHR)
945DO_ZPZI(sve_asr_zpzi_s, int32_t, H1_4, DO_SHR)
946DO_ZPZI_D(sve_asr_zpzi_d, int64_t, DO_SHR)
947
948DO_ZPZI(sve_lsr_zpzi_b, uint8_t, H1, DO_SHR)
949DO_ZPZI(sve_lsr_zpzi_h, uint16_t, H1_2, DO_SHR)
950DO_ZPZI(sve_lsr_zpzi_s, uint32_t, H1_4, DO_SHR)
951DO_ZPZI_D(sve_lsr_zpzi_d, uint64_t, DO_SHR)
952
953DO_ZPZI(sve_lsl_zpzi_b, uint8_t, H1, DO_SHL)
954DO_ZPZI(sve_lsl_zpzi_h, uint16_t, H1_2, DO_SHL)
955DO_ZPZI(sve_lsl_zpzi_s, uint32_t, H1_4, DO_SHL)
956DO_ZPZI_D(sve_lsl_zpzi_d, uint64_t, DO_SHL)
957
958DO_ZPZI(sve_asrd_b, int8_t, H1, DO_ASRD)
959DO_ZPZI(sve_asrd_h, int16_t, H1_2, DO_ASRD)
960DO_ZPZI(sve_asrd_s, int32_t, H1_4, DO_ASRD)
961DO_ZPZI_D(sve_asrd_d, int64_t, DO_ASRD)
962
963#undef DO_SHR
964#undef DO_SHL
965#undef DO_ASRD
966#undef DO_ZPZI
967#undef DO_ZPZI_D
96a36e4a
RH
968
/* Fully general four-operand expander, controlled by a predicate.
 *
 * NAME is the helper function to define, TYPE the element type, H the
 * host-endian address adjustment for the element size, and
 * OP(aa, nn, mm) combines the accumulator with the two multiplicands.
 * The predicate is consumed 16 bits at a time (one bit per vector
 * byte); only the bit corresponding to the first byte of each element
 * is tested.
 */
#define DO_ZPZZZ(NAME, TYPE, H, OP)                           \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
                  void *vg, uint32_t desc)                    \
{                                                             \
    intptr_t i, opr_sz = simd_oprsz(desc);                    \
    for (i = 0; i < opr_sz; ) {                               \
        uint16_t pg = *(uint16_t *)(vg + H1_2(i >> 3));       \
        do {                                                  \
            if (pg & 1) {                                     \
                TYPE nn = *(TYPE *)(vn + H(i));               \
                TYPE mm = *(TYPE *)(vm + H(i));               \
                TYPE aa = *(TYPE *)(va + H(i));               \
                *(TYPE *)(vd + H(i)) = OP(aa, nn, mm);        \
            }                                                 \
            i += sizeof(TYPE), pg >>= sizeof(TYPE);           \
        } while (i & 15);                                     \
    }                                                         \
}
989
/* Similarly, specialized for 64-bit operands.  The predicate is read
 * one byte per 64-bit element; only bit 0 of each byte is significant.
 */
#define DO_ZPZZZ_D(NAME, TYPE, OP)                            \
void HELPER(NAME)(void *vd, void *va, void *vn, void *vm,     \
                  void *vg, uint32_t desc)                    \
{                                                             \
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;                \
    TYPE *d = vd, *a = va, *n = vn, *m = vm;                  \
    uint8_t *pg = vg;                                         \
    for (i = 0; i < opr_sz; i += 1) {                         \
        if (pg[H1(i)] & 1) {                                  \
            TYPE aa = a[i], nn = n[i], mm = m[i];             \
            d[i] = OP(aa, nn, mm);                            \
        }                                                     \
    }                                                         \
}
1005
1006#define DO_MLA(A, N, M) (A + N * M)
1007#define DO_MLS(A, N, M) (A - N * M)
1008
1009DO_ZPZZZ(sve_mla_b, uint8_t, H1, DO_MLA)
1010DO_ZPZZZ(sve_mls_b, uint8_t, H1, DO_MLS)
1011
1012DO_ZPZZZ(sve_mla_h, uint16_t, H1_2, DO_MLA)
1013DO_ZPZZZ(sve_mls_h, uint16_t, H1_2, DO_MLS)
1014
1015DO_ZPZZZ(sve_mla_s, uint32_t, H1_4, DO_MLA)
1016DO_ZPZZZ(sve_mls_s, uint32_t, H1_4, DO_MLS)
1017
1018DO_ZPZZZ_D(sve_mla_d, uint64_t, DO_MLA)
1019DO_ZPZZZ_D(sve_mls_d, uint64_t, DO_MLS)
1020
1021#undef DO_MLA
1022#undef DO_MLS
1023#undef DO_ZPZZZ
1024#undef DO_ZPZZZ_D
9a56c9c3
RH
1025
1026void HELPER(sve_index_b)(void *vd, uint32_t start,
1027 uint32_t incr, uint32_t desc)
1028{
1029 intptr_t i, opr_sz = simd_oprsz(desc);
1030 uint8_t *d = vd;
1031 for (i = 0; i < opr_sz; i += 1) {
1032 d[H1(i)] = start + i * incr;
1033 }
1034}
1035
1036void HELPER(sve_index_h)(void *vd, uint32_t start,
1037 uint32_t incr, uint32_t desc)
1038{
1039 intptr_t i, opr_sz = simd_oprsz(desc) / 2;
1040 uint16_t *d = vd;
1041 for (i = 0; i < opr_sz; i += 1) {
1042 d[H2(i)] = start + i * incr;
1043 }
1044}
1045
1046void HELPER(sve_index_s)(void *vd, uint32_t start,
1047 uint32_t incr, uint32_t desc)
1048{
1049 intptr_t i, opr_sz = simd_oprsz(desc) / 4;
1050 uint32_t *d = vd;
1051 for (i = 0; i < opr_sz; i += 1) {
1052 d[H4(i)] = start + i * incr;
1053 }
1054}
1055
1056void HELPER(sve_index_d)(void *vd, uint64_t start,
1057 uint64_t incr, uint32_t desc)
1058{
1059 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
1060 uint64_t *d = vd;
1061 for (i = 0; i < opr_sz; i += 1) {
1062 d[i] = start + i * incr;
1063 }
1064}