1/*
2 * ARM AdvSIMD / SVE Vector Operations
3 *
4 * Copyright (c) 2018 Linaro
5 *
6 * This library is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
10 *
11 * This library is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
15 *
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
18 */
19
20#include "qemu/osdep.h"
21#include "cpu.h"
22#include "exec/helper-proto.h"
23#include "tcg/tcg-gvec-desc.h"
24#include "fpu/softfloat.h"
25#include "qemu/int128.h"
26#include "crypto/clmul.h"
27#include "vec_internal.h"
28
29/*
30 * Data for expanding active predicate bits to bytes, for byte elements.
31 *
32 * for (i = 0; i < 256; ++i) {
33 * unsigned long m = 0;
34 * for (j = 0; j < 8; j++) {
35 * if ((i >> j) & 1) {
36 * m |= 0xfful << (j << 3);
37 * }
38 * }
39 * printf("0x%016lx,\n", m);
40 * }
41 */
42const uint64_t expand_pred_b_data[256] = {
43 0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
44 0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
45 0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
46 0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
47 0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
48 0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
49 0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
50 0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
51 0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
52 0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
53 0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
54 0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
55 0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
56 0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
57 0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
58 0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
59 0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
60 0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
61 0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
62 0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
63 0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
64 0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
65 0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
66 0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
67 0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
68 0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
69 0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
70 0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
71 0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
72 0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
73 0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
74 0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
75 0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
76 0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
77 0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
78 0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
79 0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
80 0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
81 0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
82 0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
83 0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
84 0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
85 0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
86 0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
87 0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
88 0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
89 0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
90 0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
91 0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
92 0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
93 0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
94 0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
95 0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
96 0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
97 0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
98 0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
99 0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
100 0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
101 0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
102 0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
103 0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
104 0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
105 0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
106 0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
107 0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
108 0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
109 0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
110 0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
111 0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
112 0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
113 0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
114 0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
115 0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
116 0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
117 0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
118 0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
119 0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
120 0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
121 0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
122 0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
123 0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
124 0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
125 0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
126 0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
127 0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
128 0xffffffffffffffff,
129};
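/*
 * Illustrative note (added for exposition, not part of the upstream file):
 * each entry maps an 8-bit predicate to a mask with one all-ones byte per
 * set predicate bit.  For example, bits 0 and 2 set gives
 *   expand_pred_b_data[0x05] == 0x0000000000ff00ff.
 */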
130
131/*
132 * Similarly for half-word elements.
133 * for (i = 0; i < 256; ++i) {
134 * unsigned long m = 0;
135 * if (i & 0xaa) {
136 * continue;
137 * }
138 * for (j = 0; j < 8; j += 2) {
139 * if ((i >> j) & 1) {
140 * m |= 0xfffful << (j << 3);
141 * }
142 * }
143 * printf("[0x%x] = 0x%016lx,\n", i, m);
144 * }
145 */
146const uint64_t expand_pred_h_data[0x55 + 1] = {
147 [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
148 [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
149 [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
150 [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
151 [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
152 [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
153 [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
154 [0x55] = 0xffffffffffffffff,
155};
156
157/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
158int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
159 bool neg, bool round)
160{
161 /*
162 * Simplify:
163 * = ((a3 << 8) + ((e1 * e2) << 1) + (round << 7)) >> 8
164 * = ((a3 << 7) + (e1 * e2) + (round << 6)) >> 7
165 */
166 int32_t ret = (int32_t)src1 * src2;
167 if (neg) {
168 ret = -ret;
169 }
170 ret += ((int32_t)src3 << 7) + (round << 6);
171 ret >>= 7;
172
173 if (ret != (int8_t)ret) {
174 ret = (ret < 0 ? INT8_MIN : INT8_MAX);
175 }
176 return ret;
177}
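/*
 * Sketch added for exposition (not upstream): the "Simplify" comment above
 * uses the identity ((a << 8) + 2*p + (round << 7)) >> 8
 *                 == ((a << 7) +   p + (round << 6)) >> 7   for p = e1 * e2,
 * since the first form is just the second with every term doubled.
 * A brute-force check over all 8-bit operands, assuming a host with
 * arithmetic right shifts:
 *
 *   for (int n = -128; n < 128; n++) {
 *       for (int m = -128; m < 128; m++) {
 *           for (int a = -128; a < 128; a++) {
 *               int32_t arch = ((a << 8) + 2 * n * m + (1 << 7)) >> 8;
 *               int32_t simp = ((a << 7) + n * m + (1 << 6)) >> 7;
 *               assert(arch == simp);
 *           }
 *       }
 *   }
 */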
178
179void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
180 void *va, uint32_t desc)
181{
182 intptr_t i, opr_sz = simd_oprsz(desc);
183 int8_t *d = vd, *n = vn, *m = vm, *a = va;
184
185 for (i = 0; i < opr_sz; ++i) {
186 d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
187 }
188}
189
190void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
191 void *va, uint32_t desc)
192{
193 intptr_t i, opr_sz = simd_oprsz(desc);
194 int8_t *d = vd, *n = vn, *m = vm, *a = va;
195
196 for (i = 0; i < opr_sz; ++i) {
197 d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
198 }
199}
200
201void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
202{
203 intptr_t i, opr_sz = simd_oprsz(desc);
204 int8_t *d = vd, *n = vn, *m = vm;
205
206 for (i = 0; i < opr_sz; ++i) {
207 d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
208 }
209}
210
211void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
212{
213 intptr_t i, opr_sz = simd_oprsz(desc);
214 int8_t *d = vd, *n = vn, *m = vm;
215
216 for (i = 0; i < opr_sz; ++i) {
217 d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
218 }
219}
220
221/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
222int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
223 bool neg, bool round, uint32_t *sat)
224{
225 /* Simplify similarly to do_sqrdmlah_b above. */
226 int32_t ret = (int32_t)src1 * src2;
227 if (neg) {
228 ret = -ret;
229 }
230 ret += ((int32_t)src3 << 15) + (round << 14);
231 ret >>= 15;
232
233 if (ret != (int16_t)ret) {
234 *sat = 1;
235 ret = (ret < 0 ? INT16_MIN : INT16_MAX);
236 }
237 return ret;
238}
239
240uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
241 uint32_t src2, uint32_t src3)
242{
243 uint32_t *sat = &env->vfp.qc[0];
244 uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
245 uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
246 false, true, sat);
247 return deposit32(e1, 16, 16, e2);
248}
249
250void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
251 void *vq, uint32_t desc)
252{
253 uintptr_t opr_sz = simd_oprsz(desc);
254 int16_t *d = vd;
255 int16_t *n = vn;
256 int16_t *m = vm;
257 uintptr_t i;
258
259 for (i = 0; i < opr_sz / 2; ++i) {
260 d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
261 }
262 clear_tail(d, opr_sz, simd_maxsz(desc));
263}
264
265uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
266 uint32_t src2, uint32_t src3)
267{
268 uint32_t *sat = &env->vfp.qc[0];
269 uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
270 uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
271 true, true, sat);
272 return deposit32(e1, 16, 16, e2);
273}
274
275void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
276 void *vq, uint32_t desc)
277{
278 uintptr_t opr_sz = simd_oprsz(desc);
279 int16_t *d = vd;
280 int16_t *n = vn;
281 int16_t *m = vm;
282 uintptr_t i;
283
284 for (i = 0; i < opr_sz / 2; ++i) {
285 d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
286 }
287 clear_tail(d, opr_sz, simd_maxsz(desc));
288}
289
290void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
291 void *vq, uint32_t desc)
292{
293 intptr_t i, opr_sz = simd_oprsz(desc);
294 int16_t *d = vd, *n = vn, *m = vm;
295
296 for (i = 0; i < opr_sz / 2; ++i) {
297 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
298 }
299 clear_tail(d, opr_sz, simd_maxsz(desc));
300}
301
302void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
303 void *vq, uint32_t desc)
304{
305 intptr_t i, opr_sz = simd_oprsz(desc);
306 int16_t *d = vd, *n = vn, *m = vm;
307
308 for (i = 0; i < opr_sz / 2; ++i) {
309 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
310 }
311 clear_tail(d, opr_sz, simd_maxsz(desc));
312}
313
314void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
315 void *va, uint32_t desc)
316{
317 intptr_t i, opr_sz = simd_oprsz(desc);
318 int16_t *d = vd, *n = vn, *m = vm, *a = va;
319 uint32_t discard;
320
321 for (i = 0; i < opr_sz / 2; ++i) {
322 d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
323 }
324}
325
326void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
327 void *va, uint32_t desc)
328{
329 intptr_t i, opr_sz = simd_oprsz(desc);
330 int16_t *d = vd, *n = vn, *m = vm, *a = va;
331 uint32_t discard;
332
333 for (i = 0; i < opr_sz / 2; ++i) {
334 d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
335 }
336}
337
338void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
339{
340 intptr_t i, opr_sz = simd_oprsz(desc);
341 int16_t *d = vd, *n = vn, *m = vm;
342 uint32_t discard;
343
344 for (i = 0; i < opr_sz / 2; ++i) {
345 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
346 }
347}
348
349void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
350{
351 intptr_t i, opr_sz = simd_oprsz(desc);
352 int16_t *d = vd, *n = vn, *m = vm;
353 uint32_t discard;
354
355 for (i = 0; i < opr_sz / 2; ++i) {
356 d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
357 }
358}
359
360void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
361{
362 intptr_t i, j, opr_sz = simd_oprsz(desc);
363 int idx = simd_data(desc);
364 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
365 uint32_t discard;
366
367 for (i = 0; i < opr_sz / 2; i += 16 / 2) {
368 int16_t mm = m[i];
369 for (j = 0; j < 16 / 2; ++j) {
370 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
371 }
372 }
373}
374
375void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
376{
377 intptr_t i, j, opr_sz = simd_oprsz(desc);
378 int idx = simd_data(desc);
379 int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
380 uint32_t discard;
381
382 for (i = 0; i < opr_sz / 2; i += 16 / 2) {
383 int16_t mm = m[i];
384 for (j = 0; j < 16 / 2; ++j) {
385 d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
386 }
387 }
388}
389
390/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
391int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
392 bool neg, bool round, uint32_t *sat)
393{
394 /* Simplify similarly to do_sqrdmlah_b above. */
395 int64_t ret = (int64_t)src1 * src2;
396 if (neg) {
397 ret = -ret;
398 }
399 ret += ((int64_t)src3 << 31) + (round << 30);
400 ret >>= 31;
401
402 if (ret != (int32_t)ret) {
403 *sat = 1;
404 ret = (ret < 0 ? INT32_MIN : INT32_MAX);
405 }
406 return ret;
407}
408
409uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
410 int32_t src2, int32_t src3)
411{
412 uint32_t *sat = &env->vfp.qc[0];
413 return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
414}
415
416void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
417 void *vq, uint32_t desc)
418{
419 uintptr_t opr_sz = simd_oprsz(desc);
420 int32_t *d = vd;
421 int32_t *n = vn;
422 int32_t *m = vm;
423 uintptr_t i;
424
425 for (i = 0; i < opr_sz / 4; ++i) {
426 d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
427 }
428 clear_tail(d, opr_sz, simd_maxsz(desc));
429}
430
431uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
432 int32_t src2, int32_t src3)
433{
434 uint32_t *sat = &env->vfp.qc[0];
435 return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
436}
437
438void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
439 void *vq, uint32_t desc)
440{
441 uintptr_t opr_sz = simd_oprsz(desc);
442 int32_t *d = vd;
443 int32_t *n = vn;
444 int32_t *m = vm;
445 uintptr_t i;
446
447 for (i = 0; i < opr_sz / 4; ++i) {
448 d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
449 }
450 clear_tail(d, opr_sz, simd_maxsz(desc));
451}
452
453void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
454 void *vq, uint32_t desc)
455{
456 intptr_t i, opr_sz = simd_oprsz(desc);
457 int32_t *d = vd, *n = vn, *m = vm;
458
459 for (i = 0; i < opr_sz / 4; ++i) {
460 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
461 }
462 clear_tail(d, opr_sz, simd_maxsz(desc));
463}
464
465void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
466 void *vq, uint32_t desc)
467{
468 intptr_t i, opr_sz = simd_oprsz(desc);
469 int32_t *d = vd, *n = vn, *m = vm;
470
471 for (i = 0; i < opr_sz / 4; ++i) {
472 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
473 }
474 clear_tail(d, opr_sz, simd_maxsz(desc));
475}
476
477void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
478 void *va, uint32_t desc)
479{
480 intptr_t i, opr_sz = simd_oprsz(desc);
481 int32_t *d = vd, *n = vn, *m = vm, *a = va;
482 uint32_t discard;
483
484 for (i = 0; i < opr_sz / 4; ++i) {
485 d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
486 }
487}
488
489void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
490 void *va, uint32_t desc)
491{
492 intptr_t i, opr_sz = simd_oprsz(desc);
493 int32_t *d = vd, *n = vn, *m = vm, *a = va;
494 uint32_t discard;
495
496 for (i = 0; i < opr_sz / 4; ++i) {
497 d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
498 }
499}
500
501void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
502{
503 intptr_t i, opr_sz = simd_oprsz(desc);
504 int32_t *d = vd, *n = vn, *m = vm;
505 uint32_t discard;
506
507 for (i = 0; i < opr_sz / 4; ++i) {
508 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
509 }
510}
511
512void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
513{
514 intptr_t i, opr_sz = simd_oprsz(desc);
515 int32_t *d = vd, *n = vn, *m = vm;
516 uint32_t discard;
517
518 for (i = 0; i < opr_sz / 4; ++i) {
519 d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
520 }
521}
522
523void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
524{
525 intptr_t i, j, opr_sz = simd_oprsz(desc);
526 int idx = simd_data(desc);
527 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
528 uint32_t discard;
529
530 for (i = 0; i < opr_sz / 4; i += 16 / 4) {
531 int32_t mm = m[i];
532 for (j = 0; j < 16 / 4; ++j) {
533 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
534 }
535 }
536}
537
538void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
539{
540 intptr_t i, j, opr_sz = simd_oprsz(desc);
541 int idx = simd_data(desc);
542 int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
543 uint32_t discard;
544
545 for (i = 0; i < opr_sz / 4; i += 16 / 4) {
546 int32_t mm = m[i];
547 for (j = 0; j < 16 / 4; ++j) {
548 d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
549 }
550 }
551}
552
553/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
554static int64_t do_sat128_d(Int128 r)
555{
556 int64_t ls = int128_getlo(r);
557 int64_t hs = int128_gethi(r);
558
559 if (unlikely(hs != (ls >> 63))) {
560 return hs < 0 ? INT64_MIN : INT64_MAX;
561 }
562 return ls;
563}
564
565int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
566{
567 uint64_t l, h;
568 Int128 r, t;
569
570 /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
571 muls64(&l, &h, m, n);
572 r = int128_make128(l, h);
573 if (neg) {
574 r = int128_neg(r);
575 }
576 if (a) {
577 t = int128_exts64(a);
578 t = int128_lshift(t, 63);
579 r = int128_add(r, t);
580 }
581 if (round) {
582 t = int128_exts64(1ll << 62);
583 r = int128_add(r, t);
584 }
585 r = int128_rshift(r, 63);
586
587 return do_sat128_d(r);
588}
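/*
 * Example added for illustration (not upstream): 64-bit operands need the
 * full 128-bit intermediate.  E.g. do_sqrdmlah_d(INT64_MIN, INT64_MIN, 0,
 * false, true) forms the product 2^126, which no int64_t can hold; after
 * adding the rounding constant 2^62 and shifting right by 63 the value
 * still exceeds INT64_MAX, so do_sat128_d() saturates to INT64_MAX.
 */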
589
590void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
591 void *va, uint32_t desc)
592{
593 intptr_t i, opr_sz = simd_oprsz(desc);
594 int64_t *d = vd, *n = vn, *m = vm, *a = va;
595
596 for (i = 0; i < opr_sz / 8; ++i) {
597 d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
598 }
599}
600
601void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
602 void *va, uint32_t desc)
603{
604 intptr_t i, opr_sz = simd_oprsz(desc);
605 int64_t *d = vd, *n = vn, *m = vm, *a = va;
606
607 for (i = 0; i < opr_sz / 8; ++i) {
608 d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
609 }
610}
611
612void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
613{
614 intptr_t i, opr_sz = simd_oprsz(desc);
615 int64_t *d = vd, *n = vn, *m = vm;
616
617 for (i = 0; i < opr_sz / 8; ++i) {
618 d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
619 }
620}
621
622void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
623{
624 intptr_t i, opr_sz = simd_oprsz(desc);
625 int64_t *d = vd, *n = vn, *m = vm;
626
627 for (i = 0; i < opr_sz / 8; ++i) {
628 d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
629 }
630}
631
632void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
633{
634 intptr_t i, j, opr_sz = simd_oprsz(desc);
635 int idx = simd_data(desc);
636 int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
637
638 for (i = 0; i < opr_sz / 8; i += 16 / 8) {
639 int64_t mm = m[i];
640 for (j = 0; j < 16 / 8; ++j) {
641 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
642 }
643 }
644}
645
646void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
647{
648 intptr_t i, j, opr_sz = simd_oprsz(desc);
649 int idx = simd_data(desc);
650 int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;
651
652 for (i = 0; i < opr_sz / 8; i += 16 / 8) {
653 int64_t mm = m[i];
654 for (j = 0; j < 16 / 8; ++j) {
655 d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
656 }
657 }
658}
659
660/* Integer 8 and 16-bit dot-product.
661 *
662 * Note that for the loops herein, host endianness does not matter
663 * with respect to the ordering of data within the quad-width lanes.
664 * All elements are treated equally, no matter where they are.
665 */
666
667#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
668void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
669{ \
670 intptr_t i, opr_sz = simd_oprsz(desc); \
671 TYPED *d = vd, *a = va; \
672 TYPEN *n = vn; \
673 TYPEM *m = vm; \
674 for (i = 0; i < opr_sz / sizeof(TYPED); ++i) { \
675 d[i] = (a[i] + \
676 (TYPED)n[i * 4 + 0] * m[i * 4 + 0] + \
677 (TYPED)n[i * 4 + 1] * m[i * 4 + 1] + \
678 (TYPED)n[i * 4 + 2] * m[i * 4 + 2] + \
679 (TYPED)n[i * 4 + 3] * m[i * 4 + 3]); \
680 } \
681 clear_tail(d, opr_sz, simd_maxsz(desc)); \
682}
683
684DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
685DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
686DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
687DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
688DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
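/*
 * Worked example (added for exposition): for one 32-bit lane of gvec_sdot_b
 * with n = {1, -2, 3, -4}, m = {5, 6, 7, 8} and accumulator a = 10, the
 * helper computes 10 + 1*5 + (-2)*6 + 3*7 + (-4)*8 = -8.
 */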
689
690#define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \
691void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
692{ \
693 intptr_t i = 0, opr_sz = simd_oprsz(desc); \
694 intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \
695 intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \
696 intptr_t index = simd_data(desc); \
697 TYPED *d = vd, *a = va; \
698 TYPEN *n = vn; \
699 TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4; \
700 do { \
701 TYPED m0 = m_indexed[i * 4 + 0]; \
702 TYPED m1 = m_indexed[i * 4 + 1]; \
703 TYPED m2 = m_indexed[i * 4 + 2]; \
704 TYPED m3 = m_indexed[i * 4 + 3]; \
705 do { \
706 d[i] = (a[i] + \
707 n[i * 4 + 0] * m0 + \
708 n[i * 4 + 1] * m1 + \
709 n[i * 4 + 2] * m2 + \
710 n[i * 4 + 3] * m3); \
711 } while (++i < segend); \
712 segend = i + 4; \
713 } while (i < opr_sz_n); \
714 clear_tail(d, opr_sz, simd_maxsz(desc)); \
715}
716
717DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4)
718DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4)
719DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4)
720DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4)
721DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8)
722DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8)
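/*
 * Note added for exposition: in the indexed forms the index selects one
 * group of four TYPEM elements per 128-bit segment.  E.g. gvec_sdot_idx_b
 * on a 256-bit operand with index 2 uses m bytes 8..11 for output lanes
 * 0..3 and m bytes 24..27 (the same offset within the second segment) for
 * lanes 4..7.
 */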
723
724void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
725 void *vfpst, uint32_t desc)
726{
727 uintptr_t opr_sz = simd_oprsz(desc);
728 float16 *d = vd;
729 float16 *n = vn;
730 float16 *m = vm;
731 float_status *fpst = vfpst;
732 uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
733 uint32_t neg_imag = neg_real ^ 1;
734 uintptr_t i;
735
736 /* Shift boolean to the sign bit so we can xor to negate. */
737 neg_real <<= 15;
738 neg_imag <<= 15;
739
740 for (i = 0; i < opr_sz / 2; i += 2) {
741 float16 e0 = n[H2(i)];
742 float16 e1 = m[H2(i + 1)] ^ neg_imag;
743 float16 e2 = n[H2(i + 1)];
744 float16 e3 = m[H2(i)] ^ neg_real;
745
746 d[H2(i)] = float16_add(e0, e1, fpst);
747 d[H2(i + 1)] = float16_add(e2, e3, fpst);
748 }
749 clear_tail(d, opr_sz, simd_maxsz(desc));
750}
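/*
 * Note added for exposition: with neg_real = 0 and neg_imag = 1 the loop
 * above computes d.real = n.real - m.imag and d.imag = n.imag + m.real,
 * i.e. FCADD with a 90 degree rotation; neg_real = 1 / neg_imag = 0 gives
 * the 270 degree form, d.real = n.real + m.imag, d.imag = n.imag - m.real.
 */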
751
752void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
753 void *vfpst, uint32_t desc)
754{
755 uintptr_t opr_sz = simd_oprsz(desc);
756 float32 *d = vd;
757 float32 *n = vn;
758 float32 *m = vm;
759 float_status *fpst = vfpst;
760 uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
761 uint32_t neg_imag = neg_real ^ 1;
762 uintptr_t i;
763
764 /* Shift boolean to the sign bit so we can xor to negate. */
765 neg_real <<= 31;
766 neg_imag <<= 31;
767
768 for (i = 0; i < opr_sz / 4; i += 2) {
769 float32 e0 = n[H4(i)];
770 float32 e1 = m[H4(i + 1)] ^ neg_imag;
771 float32 e2 = n[H4(i + 1)];
772 float32 e3 = m[H4(i)] ^ neg_real;
773
774 d[H4(i)] = float32_add(e0, e1, fpst);
775 d[H4(i + 1)] = float32_add(e2, e3, fpst);
776 }
777 clear_tail(d, opr_sz, simd_maxsz(desc));
778}
779
780void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
781 void *vfpst, uint32_t desc)
782{
783 uintptr_t opr_sz = simd_oprsz(desc);
784 float64 *d = vd;
785 float64 *n = vn;
786 float64 *m = vm;
787 float_status *fpst = vfpst;
788 uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
789 uint64_t neg_imag = neg_real ^ 1;
790 uintptr_t i;
791
792 /* Shift boolean to the sign bit so we can xor to negate. */
793 neg_real <<= 63;
794 neg_imag <<= 63;
795
796 for (i = 0; i < opr_sz / 8; i += 2) {
797 float64 e0 = n[i];
798 float64 e1 = m[i + 1] ^ neg_imag;
799 float64 e2 = n[i + 1];
800 float64 e3 = m[i] ^ neg_real;
801
802 d[i] = float64_add(e0, e1, fpst);
803 d[i + 1] = float64_add(e2, e3, fpst);
804 }
805 clear_tail(d, opr_sz, simd_maxsz(desc));
806}
807
808void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va,
809 void *vfpst, uint32_t desc)
810{
811 uintptr_t opr_sz = simd_oprsz(desc);
812 float16 *d = vd, *n = vn, *m = vm, *a = va;
813 float_status *fpst = vfpst;
814 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
815 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
816 uint32_t neg_real = flip ^ neg_imag;
817 uintptr_t i;
818
819 /* Shift boolean to the sign bit so we can xor to negate. */
820 neg_real <<= 15;
821 neg_imag <<= 15;
822
823 for (i = 0; i < opr_sz / 2; i += 2) {
824 float16 e2 = n[H2(i + flip)];
825 float16 e1 = m[H2(i + flip)] ^ neg_real;
826 float16 e4 = e2;
827 float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag;
828
829 d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst);
830 d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst);
831 }
832 clear_tail(d, opr_sz, simd_maxsz(desc));
833}
834
835void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va,
836 void *vfpst, uint32_t desc)
837{
838 uintptr_t opr_sz = simd_oprsz(desc);
839 float16 *d = vd, *n = vn, *m = vm, *a = va;
840 float_status *fpst = vfpst;
841 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
842 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
843 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
844 uint32_t neg_real = flip ^ neg_imag;
845 intptr_t elements = opr_sz / sizeof(float16);
846 intptr_t eltspersegment = 16 / sizeof(float16);
847 intptr_t i, j;
848
849 /* Shift boolean to the sign bit so we can xor to negate. */
850 neg_real <<= 15;
851 neg_imag <<= 15;
852
853 for (i = 0; i < elements; i += eltspersegment) {
854 float16 mr = m[H2(i + 2 * index + 0)];
855 float16 mi = m[H2(i + 2 * index + 1)];
856 float16 e1 = neg_real ^ (flip ? mi : mr);
857 float16 e3 = neg_imag ^ (flip ? mr : mi);
858
859 for (j = i; j < i + eltspersegment; j += 2) {
860 float16 e2 = n[H2(j + flip)];
861 float16 e4 = e2;
862
863 d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst);
864 d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst);
865 }
866 }
867 clear_tail(d, opr_sz, simd_maxsz(desc));
868}
869
870void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va,
871 void *vfpst, uint32_t desc)
872{
873 uintptr_t opr_sz = simd_oprsz(desc);
874 float32 *d = vd, *n = vn, *m = vm, *a = va;
875 float_status *fpst = vfpst;
876 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
877 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
878 uint32_t neg_real = flip ^ neg_imag;
879 uintptr_t i;
880
881 /* Shift boolean to the sign bit so we can xor to negate. */
882 neg_real <<= 31;
883 neg_imag <<= 31;
884
885 for (i = 0; i < opr_sz / 4; i += 2) {
886 float32 e2 = n[H4(i + flip)];
887 float32 e1 = m[H4(i + flip)] ^ neg_real;
888 float32 e4 = e2;
889 float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag;
890
891 d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst);
892 d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst);
893 }
894 clear_tail(d, opr_sz, simd_maxsz(desc));
895}
896
897void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va,
898 void *vfpst, uint32_t desc)
899{
900 uintptr_t opr_sz = simd_oprsz(desc);
901 float32 *d = vd, *n = vn, *m = vm, *a = va;
902 float_status *fpst = vfpst;
903 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
904 uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
905 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2);
906 uint32_t neg_real = flip ^ neg_imag;
907 intptr_t elements = opr_sz / sizeof(float32);
908 intptr_t eltspersegment = 16 / sizeof(float32);
909 intptr_t i, j;
910
911 /* Shift boolean to the sign bit so we can xor to negate. */
912 neg_real <<= 31;
913 neg_imag <<= 31;
914
915 for (i = 0; i < elements; i += eltspersegment) {
916 float32 mr = m[H4(i + 2 * index + 0)];
917 float32 mi = m[H4(i + 2 * index + 1)];
918 float32 e1 = neg_real ^ (flip ? mi : mr);
919 float32 e3 = neg_imag ^ (flip ? mr : mi);
920
921 for (j = i; j < i + eltspersegment; j += 2) {
922 float32 e2 = n[H4(j + flip)];
923 float32 e4 = e2;
924
925 d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst);
926 d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst);
927 }
928 }
929 clear_tail(d, opr_sz, simd_maxsz(desc));
930}
931
932void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va,
933 void *vfpst, uint32_t desc)
934{
935 uintptr_t opr_sz = simd_oprsz(desc);
936 float64 *d = vd, *n = vn, *m = vm, *a = va;
937 float_status *fpst = vfpst;
938 intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1);
939 uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
940 uint64_t neg_real = flip ^ neg_imag;
941 uintptr_t i;
942
943 /* Shift boolean to the sign bit so we can xor to negate. */
944 neg_real <<= 63;
945 neg_imag <<= 63;
946
947 for (i = 0; i < opr_sz / 8; i += 2) {
948 float64 e2 = n[i + flip];
949 float64 e1 = m[i + flip] ^ neg_real;
950 float64 e4 = e2;
951 float64 e3 = m[i + 1 - flip] ^ neg_imag;
952
953 d[i] = float64_muladd(e2, e1, a[i], 0, fpst);
954 d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst);
955 }
956 clear_tail(d, opr_sz, simd_maxsz(desc));
957}
958
959/*
960 * Floating point comparisons producing an integer result (all 1s or all 0s).
961 * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do.
962 * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires.
963 */
964static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat)
965{
966 return -float16_eq_quiet(op1, op2, stat);
967}
968
969static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat)
970{
971 return -float32_eq_quiet(op1, op2, stat);
972}
973
974static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat)
975{
976 return -float16_le(op2, op1, stat);
977}
978
979static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat)
980{
981 return -float32_le(op2, op1, stat);
982}
983
984static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat)
985{
986 return -float16_lt(op2, op1, stat);
987}
988
989static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat)
990{
991 return -float32_lt(op2, op1, stat);
992}
993
994static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat)
995{
996 return -float16_le(float16_abs(op2), float16_abs(op1), stat);
997}
998
999static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat)
1000{
1001 return -float32_le(float32_abs(op2), float32_abs(op1), stat);
1002}
1003
1004static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat)
1005{
1006 return -float16_lt(float16_abs(op2), float16_abs(op1), stat);
1007}
1008
1009static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat)
1010{
1011 return -float32_lt(float32_abs(op2), float32_abs(op1), stat);
1012}
1013
1014static int16_t vfp_tosszh(float16 x, void *fpstp)
1015{
1016 float_status *fpst = fpstp;
1017 if (float16_is_any_nan(x)) {
1018 float_raise(float_flag_invalid, fpst);
1019 return 0;
1020 }
1021 return float16_to_int16_round_to_zero(x, fpst);
1022}
1023
1024static uint16_t vfp_touszh(float16 x, void *fpstp)
1025{
1026 float_status *fpst = fpstp;
1027 if (float16_is_any_nan(x)) {
1028 float_raise(float_flag_invalid, fpst);
1029 return 0;
1030 }
1031 return float16_to_uint16_round_to_zero(x, fpst);
1032}
1033
1034#define DO_2OP(NAME, FUNC, TYPE) \
1035void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
1036{ \
1037 intptr_t i, oprsz = simd_oprsz(desc); \
1038 TYPE *d = vd, *n = vn; \
1039 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1040 d[i] = FUNC(n[i], stat); \
1041 } \
1042 clear_tail(d, oprsz, simd_maxsz(desc)); \
1043}
1044
1045DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16)
1046DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32)
1047DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64)
1048
1049DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16)
1050DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32)
1051DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64)
1052
1053DO_2OP(gvec_vrintx_h, float16_round_to_int, float16)
1054DO_2OP(gvec_vrintx_s, float32_round_to_int, float32)
1055
1056DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t)
1057DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t)
1058DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32)
1059DO_2OP(gvec_touizs, helper_vfp_touizs, float32)
1060DO_2OP(gvec_sstoh, int16_to_float16, int16_t)
1061DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t)
1062DO_2OP(gvec_tosszh, vfp_tosszh, float16)
1063DO_2OP(gvec_touszh, vfp_touszh, float16)
1064
1065#define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \
1066 static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
1067 { \
1068 return TYPE##_##CMPOP(op, TYPE##_zero, stat); \
1069 }
1070
1071#define WRAP_CMP0_REV(FN, CMPOP, TYPE) \
1072 static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \
1073 { \
1074 return TYPE##_##CMPOP(TYPE##_zero, op, stat); \
1075 }
1076
1077#define DO_2OP_CMP0(FN, CMPOP, DIRN) \
1078 WRAP_CMP0_##DIRN(FN, CMPOP, float16) \
1079 WRAP_CMP0_##DIRN(FN, CMPOP, float32) \
1080 DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \
1081 DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32)
1082
1083DO_2OP_CMP0(cgt, cgt, FWD)
1084DO_2OP_CMP0(cge, cge, FWD)
1085DO_2OP_CMP0(ceq, ceq, FWD)
1086DO_2OP_CMP0(clt, cgt, REV)
1087DO_2OP_CMP0(cle, cge, REV)
1088
1089#undef DO_2OP
1090#undef DO_2OP_CMP0
1091
1092/* Floating-point trigonometric starting value.
1093 * See the ARM ARM pseudocode function FPTrigSMul.
1094 */
1095static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat)
1096{
1097 float16 result = float16_mul(op1, op1, stat);
1098 if (!float16_is_any_nan(result)) {
1099 result = float16_set_sign(result, op2 & 1);
1100 }
1101 return result;
1102}
1103
1104static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat)
1105{
1106 float32 result = float32_mul(op1, op1, stat);
1107 if (!float32_is_any_nan(result)) {
1108 result = float32_set_sign(result, op2 & 1);
1109 }
1110 return result;
1111}
1112
1113static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat)
1114{
1115 float64 result = float64_mul(op1, op1, stat);
1116 if (!float64_is_any_nan(result)) {
1117 result = float64_set_sign(result, op2 & 1);
1118 }
1119 return result;
1120}
1121
1122static float16 float16_abd(float16 op1, float16 op2, float_status *stat)
1123{
1124 return float16_abs(float16_sub(op1, op2, stat));
1125}
1126
1127static float32 float32_abd(float32 op1, float32 op2, float_status *stat)
1128{
1129 return float32_abs(float32_sub(op1, op2, stat));
1130}
1131
1132/*
1133 * Reciprocal step. These are the AArch32 version which uses a
1134 * non-fused multiply-and-subtract.
1135 */
1136static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat)
1137{
1138 op1 = float16_squash_input_denormal(op1, stat);
1139 op2 = float16_squash_input_denormal(op2, stat);
1140
1141 if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1142 (float16_is_infinity(op2) && float16_is_zero(op1))) {
1143 return float16_two;
1144 }
1145 return float16_sub(float16_two, float16_mul(op1, op2, stat), stat);
1146}
1147
1148static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat)
1149{
1150 op1 = float32_squash_input_denormal(op1, stat);
1151 op2 = float32_squash_input_denormal(op2, stat);
1152
1153 if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1154 (float32_is_infinity(op2) && float32_is_zero(op1))) {
1155 return float32_two;
1156 }
1157 return float32_sub(float32_two, float32_mul(op1, op2, stat), stat);
1158}
1159
1160/* Reciprocal square-root step. AArch32 non-fused semantics. */
1161static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat)
1162{
1163 op1 = float16_squash_input_denormal(op1, stat);
1164 op2 = float16_squash_input_denormal(op2, stat);
1165
1166 if ((float16_is_infinity(op1) && float16_is_zero(op2)) ||
1167 (float16_is_infinity(op2) && float16_is_zero(op1))) {
1168 return float16_one_point_five;
1169 }
1170 op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat);
1171 return float16_div(op1, float16_two, stat);
1172}
1173
1174static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat)
1175{
1176 op1 = float32_squash_input_denormal(op1, stat);
1177 op2 = float32_squash_input_denormal(op2, stat);
1178
1179 if ((float32_is_infinity(op1) && float32_is_zero(op2)) ||
1180 (float32_is_infinity(op2) && float32_is_zero(op1))) {
1181 return float32_one_point_five;
1182 }
1183 op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat);
1184 return float32_div(op1, float32_two, stat);
1185}
1186
1187#define DO_3OP(NAME, FUNC, TYPE) \
1188void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1189{ \
1190 intptr_t i, oprsz = simd_oprsz(desc); \
1191 TYPE *d = vd, *n = vn, *m = vm; \
1192 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1193 d[i] = FUNC(n[i], m[i], stat); \
1194 } \
1195 clear_tail(d, oprsz, simd_maxsz(desc)); \
1196}
1197
1198DO_3OP(gvec_fadd_h, float16_add, float16)
1199DO_3OP(gvec_fadd_s, float32_add, float32)
1200DO_3OP(gvec_fadd_d, float64_add, float64)
1201
1202DO_3OP(gvec_fsub_h, float16_sub, float16)
1203DO_3OP(gvec_fsub_s, float32_sub, float32)
1204DO_3OP(gvec_fsub_d, float64_sub, float64)
1205
1206DO_3OP(gvec_fmul_h, float16_mul, float16)
1207DO_3OP(gvec_fmul_s, float32_mul, float32)
1208DO_3OP(gvec_fmul_d, float64_mul, float64)
1209
1210DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16)
1211DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32)
1212DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64)
1213
1214DO_3OP(gvec_fabd_h, float16_abd, float16)
1215DO_3OP(gvec_fabd_s, float32_abd, float32)
1216
1217DO_3OP(gvec_fceq_h, float16_ceq, float16)
1218DO_3OP(gvec_fceq_s, float32_ceq, float32)
1219
1220DO_3OP(gvec_fcge_h, float16_cge, float16)
1221DO_3OP(gvec_fcge_s, float32_cge, float32)
1222
1223DO_3OP(gvec_fcgt_h, float16_cgt, float16)
1224DO_3OP(gvec_fcgt_s, float32_cgt, float32)
1225
1226DO_3OP(gvec_facge_h, float16_acge, float16)
1227DO_3OP(gvec_facge_s, float32_acge, float32)
1228
1229DO_3OP(gvec_facgt_h, float16_acgt, float16)
1230DO_3OP(gvec_facgt_s, float32_acgt, float32)
1231
1232DO_3OP(gvec_fmax_h, float16_max, float16)
1233DO_3OP(gvec_fmax_s, float32_max, float32)
1234
1235DO_3OP(gvec_fmin_h, float16_min, float16)
1236DO_3OP(gvec_fmin_s, float32_min, float32)
1237
1238DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16)
1239DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32)
1240
1241DO_3OP(gvec_fminnum_h, float16_minnum, float16)
1242DO_3OP(gvec_fminnum_s, float32_minnum, float32)
1243
1244DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16)
1245DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32)
1246
1247DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16)
1248DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32)
1249
1250#ifdef TARGET_AARCH64
1251
1252DO_3OP(gvec_recps_h, helper_recpsf_f16, float16)
1253DO_3OP(gvec_recps_s, helper_recpsf_f32, float32)
1254DO_3OP(gvec_recps_d, helper_recpsf_f64, float64)
1255
1256DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16)
1257DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32)
1258DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64)
1259
1260#endif
1261#undef DO_3OP
1262
1263/* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */
1264static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2,
1265 float_status *stat)
1266{
1267 return float16_add(dest, float16_mul(op1, op2, stat), stat);
1268}
1269
1270static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2,
1271 float_status *stat)
1272{
1273 return float32_add(dest, float32_mul(op1, op2, stat), stat);
1274}
1275
1276static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2,
1277 float_status *stat)
1278{
1279 return float16_sub(dest, float16_mul(op1, op2, stat), stat);
1280}
1281
1282static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2,
1283 float_status *stat)
1284{
1285 return float32_sub(dest, float32_mul(op1, op2, stat), stat);
1286}
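/*
 * Note added for exposition: these _nf helpers compute
 * round(d +/- round(n * m)), i.e. the product is rounded before the
 * accumulate, whereas the fused helpers below compute round(d +/- n * m)
 * in a single step; the two can differ by one ulp.  The fused forms are
 * what Neon VFMA/VFMS require, the non-fused ones serve VMLA/VMLS.
 */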
1287
1288/* Fused versions; these have the semantics Neon VFMA/VFMS want */
1289static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2,
1290 float_status *stat)
1291{
1292 return float16_muladd(op1, op2, dest, 0, stat);
1293}
1294
1295static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2,
1296 float_status *stat)
1297{
1298 return float32_muladd(op1, op2, dest, 0, stat);
1299}
1300
1301static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2,
1302 float_status *stat)
1303{
1304 return float16_muladd(float16_chs(op1), op2, dest, 0, stat);
1305}
1306
1307static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2,
1308 float_status *stat)
1309{
1310 return float32_muladd(float32_chs(op1), op2, dest, 0, stat);
1311}
1312
1313#define DO_MULADD(NAME, FUNC, TYPE) \
1314void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1315{ \
1316 intptr_t i, oprsz = simd_oprsz(desc); \
1317 TYPE *d = vd, *n = vn, *m = vm; \
1318 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1319 d[i] = FUNC(d[i], n[i], m[i], stat); \
1320 } \
1321 clear_tail(d, oprsz, simd_maxsz(desc)); \
1322}
1323
1324DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16)
1325DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32)
1326
1327DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16)
1328DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32)
1329
1330DO_MULADD(gvec_vfma_h, float16_muladd_f, float16)
1331DO_MULADD(gvec_vfma_s, float32_muladd_f, float32)
1332
1333DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16)
1334DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32)
1335
1336/* For the indexed ops, SVE applies the index per 128-bit vector segment.
1337 * For AdvSIMD, there is of course only one such vector segment.
1338 */
1339
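/*
 * Example added for illustration: with 32-byte (256-bit) SVE operands and
 * idx = 1, gvec_mul_idx_s multiplies lanes 0..3 by m[1] and lanes 4..7 by
 * m[5], i.e. element 1 of each 128-bit segment; a 16-byte AdvSIMD operand
 * has only one segment, so every lane uses m[1].
 */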
1340#define DO_MUL_IDX(NAME, TYPE, H) \
1341void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
1342{ \
1343 intptr_t i, j, oprsz = simd_oprsz(desc); \
1344 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
1345 intptr_t idx = simd_data(desc); \
1346 TYPE *d = vd, *n = vn, *m = vm; \
1347 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1348 TYPE mm = m[H(i + idx)]; \
1349 for (j = 0; j < segment; j++) { \
1350 d[i + j] = n[i + j] * mm; \
1351 } \
1352 } \
1353 clear_tail(d, oprsz, simd_maxsz(desc)); \
1354}
1355
1356DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2)
1357DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4)
1358DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8)
1359
1360#undef DO_MUL_IDX
1361
1362#define DO_MLA_IDX(NAME, TYPE, OP, H) \
1363void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
1364{ \
1365 intptr_t i, j, oprsz = simd_oprsz(desc); \
1366 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
1367 intptr_t idx = simd_data(desc); \
1368 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1369 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1370 TYPE mm = m[H(i + idx)]; \
1371 for (j = 0; j < segment; j++) { \
1372 d[i + j] = a[i + j] OP n[i + j] * mm; \
1373 } \
1374 } \
1375 clear_tail(d, oprsz, simd_maxsz(desc)); \
1376}
1377
1378DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2)
1379DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4)
1380DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8)
1381
1382DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2)
1383DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4)
1384DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8)
1385
1386#undef DO_MLA_IDX
1387
1388#define DO_FMUL_IDX(NAME, ADD, TYPE, H) \
1389void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \
1390{ \
1391 intptr_t i, j, oprsz = simd_oprsz(desc); \
1392 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
1393 intptr_t idx = simd_data(desc); \
1394 TYPE *d = vd, *n = vn, *m = vm; \
1395 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1396 TYPE mm = m[H(i + idx)]; \
1397 for (j = 0; j < segment; j++) { \
1398 d[i + j] = TYPE##_##ADD(d[i + j], \
1399 TYPE##_mul(n[i + j], mm, stat), stat); \
1400 } \
1401 } \
1402 clear_tail(d, oprsz, simd_maxsz(desc)); \
1403}
1404
1405#define float16_nop(N, M, S) (M)
1406#define float32_nop(N, M, S) (M)
1407#define float64_nop(N, M, S) (M)
1408
1409DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16, H2)
1410DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32, H4)
1411DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64, H8)
1412
1413/*
1414 * Non-fused multiply-accumulate operations, for Neon. NB that unlike
1415 * the fused ops below they assume accumulate both from and into Vd.
1416 */
1417DO_FMUL_IDX(gvec_fmla_nf_idx_h, add, float16, H2)
1418DO_FMUL_IDX(gvec_fmla_nf_idx_s, add, float32, H4)
1419DO_FMUL_IDX(gvec_fmls_nf_idx_h, sub, float16, H2)
1420DO_FMUL_IDX(gvec_fmls_nf_idx_s, sub, float32, H4)
1421
1422#undef float16_nop
1423#undef float32_nop
1424#undef float64_nop
1425#undef DO_FMUL_IDX
1426
1427#define DO_FMLA_IDX(NAME, TYPE, H) \
1428void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \
1429 void *stat, uint32_t desc) \
1430{ \
1431 intptr_t i, j, oprsz = simd_oprsz(desc); \
1432 intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \
1433 TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1); \
1434 intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1); \
1435 TYPE *d = vd, *n = vn, *m = vm, *a = va; \
1436 op1_neg <<= (8 * sizeof(TYPE) - 1); \
1437 for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \
1438 TYPE mm = m[H(i + idx)]; \
1439 for (j = 0; j < segment; j++) { \
1440 d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg, \
1441 mm, a[i + j], 0, stat); \
1442 } \
1443 } \
1444 clear_tail(d, oprsz, simd_maxsz(desc)); \
1445}
1446
1447DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2)
1448DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4)
1449DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8)
1450
1451#undef DO_FMLA_IDX
1452
1453#define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \
1454void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \
1455{ \
1456 intptr_t i, oprsz = simd_oprsz(desc); \
1457 TYPEN *d = vd, *n = vn; TYPEM *m = vm; \
1458 bool q = false; \
1459 for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \
1460 WTYPE dd = (WTYPE)n[i] OP m[i]; \
1461 if (dd < MIN) { \
1462 dd = MIN; \
1463 q = true; \
1464 } else if (dd > MAX) { \
1465 dd = MAX; \
1466 q = true; \
1467 } \
1468 d[i] = dd; \
1469 } \
1470 if (q) { \
1471 uint32_t *qc = vq; \
1472 qc[0] = 1; \
1473 } \
1474 clear_tail(d, oprsz, simd_maxsz(desc)); \
1475}
1476
1477DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX)
1478DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX)
1479DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX)
1480
1481DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX)
1482DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX)
1483DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX)
1484
1485DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX)
1486DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX)
1487DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX)
1488
1489DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX)
1490DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX)
1491DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX)
1492
1493#undef DO_SAT
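/*
 * Example added for illustration: gvec_uqadd_b widens each pair to int, so
 * 200 + 100 produces 300, is clamped to UINT8_MAX = 255, and sets the
 * sticky QC bit; 100 + 100 = 200 stays in range and leaves QC untouched.
 */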
1494
1495void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn,
1496 void *vm, uint32_t desc)
1497{
1498 intptr_t i, oprsz = simd_oprsz(desc);
1499 uint64_t *d = vd, *n = vn, *m = vm;
1500 bool q = false;
1501
1502 for (i = 0; i < oprsz / 8; i++) {
1503 uint64_t nn = n[i], mm = m[i], dd = nn + mm;
1504 if (dd < nn) {
1505 dd = UINT64_MAX;
1506 q = true;
1507 }
1508 d[i] = dd;
1509 }
1510 if (q) {
1511 uint32_t *qc = vq;
1512 qc[0] = 1;
1513 }
1514 clear_tail(d, oprsz, simd_maxsz(desc));
1515}
1516
1517void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn,
1518 void *vm, uint32_t desc)
1519{
1520 intptr_t i, oprsz = simd_oprsz(desc);
1521 uint64_t *d = vd, *n = vn, *m = vm;
1522 bool q = false;
1523
1524 for (i = 0; i < oprsz / 8; i++) {
1525 uint64_t nn = n[i], mm = m[i], dd = nn - mm;
1526 if (nn < mm) {
1527 dd = 0;
1528 q = true;
1529 }
1530 d[i] = dd;
1531 }
1532 if (q) {
1533 uint32_t *qc = vq;
1534 qc[0] = 1;
1535 }
1536 clear_tail(d, oprsz, simd_maxsz(desc));
1537}
1538
1539void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn,
1540 void *vm, uint32_t desc)
1541{
1542 intptr_t i, oprsz = simd_oprsz(desc);
1543 int64_t *d = vd, *n = vn, *m = vm;
1544 bool q = false;
1545
1546 for (i = 0; i < oprsz / 8; i++) {
1547 int64_t nn = n[i], mm = m[i], dd = nn + mm;
1548 if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) {
1549 dd = (nn >> 63) ^ ~INT64_MIN;
1550 q = true;
1551 }
1552 d[i] = dd;
1553 }
1554 if (q) {
1555 uint32_t *qc = vq;
1556 qc[0] = 1;
1557 }
1558 clear_tail(d, oprsz, simd_maxsz(desc));
1559}
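/*
 * Note added for exposition: ((dd ^ nn) & ~(nn ^ mm)) has its sign bit set
 * exactly when nn and mm share a sign but dd does not, i.e. on signed
 * overflow.  (nn >> 63) ^ ~INT64_MIN then evaluates to INT64_MAX for
 * nn >= 0 and INT64_MIN for nn < 0, the required saturated result.
 */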
1560
1561void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn,
1562 void *vm, uint32_t desc)
1563{
1564 intptr_t i, oprsz = simd_oprsz(desc);
1565 int64_t *d = vd, *n = vn, *m = vm;
1566 bool q = false;
1567
1568 for (i = 0; i < oprsz / 8; i++) {
1569 int64_t nn = n[i], mm = m[i], dd = nn - mm;
1570 if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) {
1571 dd = (nn >> 63) ^ ~INT64_MIN;
1572 q = true;
1573 }
1574 d[i] = dd;
1575 }
1576 if (q) {
1577 uint32_t *qc = vq;
1578 qc[0] = 1;
1579 }
1580 clear_tail(d, oprsz, simd_maxsz(desc));
1581}
1582
1583
1584#define DO_SRA(NAME, TYPE) \
1585void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1586{ \
1587 intptr_t i, oprsz = simd_oprsz(desc); \
1588 int shift = simd_data(desc); \
1589 TYPE *d = vd, *n = vn; \
1590 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1591 d[i] += n[i] >> shift; \
1592 } \
1593 clear_tail(d, oprsz, simd_maxsz(desc)); \
1594}
1595
1596DO_SRA(gvec_ssra_b, int8_t)
1597DO_SRA(gvec_ssra_h, int16_t)
1598DO_SRA(gvec_ssra_s, int32_t)
1599DO_SRA(gvec_ssra_d, int64_t)
1600
1601DO_SRA(gvec_usra_b, uint8_t)
1602DO_SRA(gvec_usra_h, uint16_t)
1603DO_SRA(gvec_usra_s, uint32_t)
1604DO_SRA(gvec_usra_d, uint64_t)
1605
1606#undef DO_SRA
1607
1608#define DO_RSHR(NAME, TYPE) \
1609void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1610{ \
1611 intptr_t i, oprsz = simd_oprsz(desc); \
1612 int shift = simd_data(desc); \
1613 TYPE *d = vd, *n = vn; \
1614 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1615 TYPE tmp = n[i] >> (shift - 1); \
1616 d[i] = (tmp >> 1) + (tmp & 1); \
1617 } \
1618 clear_tail(d, oprsz, simd_maxsz(desc)); \
1619}
1620
1621DO_RSHR(gvec_srshr_b, int8_t)
1622DO_RSHR(gvec_srshr_h, int16_t)
1623DO_RSHR(gvec_srshr_s, int32_t)
1624DO_RSHR(gvec_srshr_d, int64_t)
1625
1626DO_RSHR(gvec_urshr_b, uint8_t)
1627DO_RSHR(gvec_urshr_h, uint16_t)
1628DO_RSHR(gvec_urshr_s, uint32_t)
1629DO_RSHR(gvec_urshr_d, uint64_t)
1630
1631#undef DO_RSHR
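/*
 * Example added for illustration: the two-step form computes
 * (n + (1 << (shift - 1))) >> shift without forming the addition, which
 * could overflow the element type.  E.g. srshr of 7 by 2: tmp = 7 >> 1 = 3,
 * so the result is (3 >> 1) + (3 & 1) = 2, matching (7 + 2) >> 2.
 */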
1632
1633#define DO_RSRA(NAME, TYPE) \
1634void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1635{ \
1636 intptr_t i, oprsz = simd_oprsz(desc); \
1637 int shift = simd_data(desc); \
1638 TYPE *d = vd, *n = vn; \
1639 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1640 TYPE tmp = n[i] >> (shift - 1); \
1641 d[i] += (tmp >> 1) + (tmp & 1); \
1642 } \
1643 clear_tail(d, oprsz, simd_maxsz(desc)); \
1644}
1645
1646DO_RSRA(gvec_srsra_b, int8_t)
1647DO_RSRA(gvec_srsra_h, int16_t)
1648DO_RSRA(gvec_srsra_s, int32_t)
1649DO_RSRA(gvec_srsra_d, int64_t)
1650
1651DO_RSRA(gvec_ursra_b, uint8_t)
1652DO_RSRA(gvec_ursra_h, uint16_t)
1653DO_RSRA(gvec_ursra_s, uint32_t)
1654DO_RSRA(gvec_ursra_d, uint64_t)
1655
1656#undef DO_RSRA
1657
1658#define DO_SRI(NAME, TYPE) \
1659void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1660{ \
1661 intptr_t i, oprsz = simd_oprsz(desc); \
1662 int shift = simd_data(desc); \
1663 TYPE *d = vd, *n = vn; \
1664 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1665 d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
1666 } \
1667 clear_tail(d, oprsz, simd_maxsz(desc)); \
1668}
1669
1670DO_SRI(gvec_sri_b, uint8_t)
1671DO_SRI(gvec_sri_h, uint16_t)
1672DO_SRI(gvec_sri_s, uint32_t)
1673DO_SRI(gvec_sri_d, uint64_t)
1674
1675#undef DO_SRI
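/*
 * Example added for illustration: for byte elements with shift = 4,
 * gvec_sri_b turns d = 0xAB, n = 0xCD into 0xAC (d's top four bits kept,
 * n >> 4 inserted below them), while gvec_sli_b below turns the same
 * inputs into 0xDB (d's low four bits kept, n's low bits inserted above).
 */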
1676
1677#define DO_SLI(NAME, TYPE) \
1678void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
1679{ \
1680 intptr_t i, oprsz = simd_oprsz(desc); \
1681 int shift = simd_data(desc); \
1682 TYPE *d = vd, *n = vn; \
1683 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
1684 d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
1685 } \
1686 clear_tail(d, oprsz, simd_maxsz(desc)); \
1687}
1688
1689DO_SLI(gvec_sli_b, uint8_t)
1690DO_SLI(gvec_sli_h, uint16_t)
1691DO_SLI(gvec_sli_s, uint32_t)
1692DO_SLI(gvec_sli_d, uint64_t)
1693
1694#undef DO_SLI
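/*
 * Worked example (illustrative), for byte elements with shift = 4:
 * SRI computes d = (d & 0xf0) | (n >> 4), keeping the top 4 bits of d,
 * while SLI computes d = (d & 0x0f) | ((n << 4) & 0xf0), keeping the
 * low 4 bits of d; both are expressed above via deposit64().
 */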
1695
1696/*
1697 * Convert float16 to float32, raising no exceptions and
1698 * preserving exceptional values, including SNaN.
1699 * This is effectively an unpack+repack operation.
1700 */
1701static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
1702{
1703 const int f16_bias = 15;
1704 const int f32_bias = 127;
1705 uint32_t sign = extract32(f16, 15, 1);
1706 uint32_t exp = extract32(f16, 10, 5);
1707 uint32_t frac = extract32(f16, 0, 10);
1708
1709 if (exp == 0x1f) {
1710 /* Inf or NaN */
1711 exp = 0xff;
1712 } else if (exp == 0) {
1713 /* Zero or denormal. */
1714 if (frac != 0) {
1715 if (fz16) {
1716 frac = 0;
1717 } else {
1718 /*
1719 * Denormal; these are all normal float32.
1720 * Shift the fraction so that the msb is at bit 11,
1721 * then remove bit 11 as the implicit bit of the
1722 * normalized float32. Note that we still go through
1723 * the shift for normal numbers below, to put the
1724 * float32 fraction at the right place.
1725 */
1726 int shift = clz32(frac) - 21;
1727 frac = (frac << shift) & 0x3ff;
1728 exp = f32_bias - f16_bias - shift + 1;
1729 }
1730 }
1731 } else {
1732 /* Normal number; adjust the bias. */
1733 exp += f32_bias - f16_bias;
1734 }
1735 sign <<= 31;
1736 exp <<= 23;
1737 frac <<= 23 - 10;
1738
1739 return sign | exp | frac;
1740}
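/*
 * Worked examples (illustrative):
 *   f16 0x3c00 (1.0)   -> f32 0x3f800000
 *   f16 0x0001 (2^-24) -> f32 0x33800000 when fz16 is false, or
 *                         f32 0x00000000 when fz16 flushes subnormals.
 */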
1741
1742static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
1743{
1744 /*
1745 * Branchless load of u32[0], u64[0], u32[1], or u64[1].
1746 * Load the 2nd qword iff is_q & is_2.
1747 * Shift to the 2nd dword iff !is_q & is_2.
1748 * For !is_q & !is_2, the upper bits of the result are garbage.
1749 */
1750 return ptr[is_q & is_2] >> ((is_2 & ~is_q) << 5);
1751}
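/*
 * Case table (illustrative) for load4_f16(ptr, is_q, is_2):
 *   is_q=0, is_2=0 -> ptr[0]        (low 32 bits valid, rest garbage)
 *   is_q=0, is_2=1 -> ptr[0] >> 32  (the 2nd dword)
 *   is_q=1, is_2=0 -> ptr[0]        (full 64 bits)
 *   is_q=1, is_2=1 -> ptr[1]        (the 2nd qword)
 */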
1752
1753/*
1754 * Note that FMLAL requires oprsz == 8 or oprsz == 16,
1755 * as there are not yet SVE versions that might use blocking.
1756 */
1757
1758static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
1759 uint32_t desc, bool fz16)
1760{
1761 intptr_t i, oprsz = simd_oprsz(desc);
1762 int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
1763 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1764 int is_q = oprsz == 16;
1765 uint64_t n_4, m_4;
1766
1767 /* Pre-load all of the f16 data, avoiding overlap issues. */
1768 n_4 = load4_f16(vn, is_q, is_2);
1769 m_4 = load4_f16(vm, is_q, is_2);
1770
1771 /* Negate all inputs for FMLSL at once. */
1772 if (is_s) {
1773 n_4 ^= 0x8000800080008000ull;
1774 }
1775
1776 for (i = 0; i < oprsz / 4; i++) {
1777 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
1778 float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
1779 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
1780 }
1781 clear_tail(d, oprsz, simd_maxsz(desc));
1782}
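/*
 * Note (illustrative): XORing n_4 with 0x8000800080008000 flips the
 * sign bit of each of the four packed float16 inputs, which is how the
 * FMLSL "subtract" form is folded into the same multiply-add loop.
 */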
1783
1784void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
1785 void *venv, uint32_t desc)
1786{
1787 CPUARMState *env = venv;
1788 do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
1789 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1790}
1791
1792void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
1793 void *venv, uint32_t desc)
1794{
1795 CPUARMState *env = venv;
1796 do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
1797 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1798}
1799
1800void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
1801 void *venv, uint32_t desc)
1802{
1803 intptr_t i, oprsz = simd_oprsz(desc);
1804 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
1805 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
1806 CPUARMState *env = venv;
1807 float_status *status = &env->vfp.fp_status;
1808 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
1809
1810 for (i = 0; i < oprsz; i += sizeof(float32)) {
1811 float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
1812 float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
1813 float32 nn = float16_to_float32_by_bits(nn_16, fz16);
1814 float32 mm = float16_to_float32_by_bits(mm_16, fz16);
1815 float32 aa = *(float32 *)(va + H1_4(i));
1816
1817 *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
1818 }
1819}
1820
1821static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
1822 uint32_t desc, bool fz16)
1823{
1824 intptr_t i, oprsz = simd_oprsz(desc);
1825 int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
1826 int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
1827 int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
1828 int is_q = oprsz == 16;
1829 uint64_t n_4;
1830 float32 m_1;
1831
1832 /* Pre-load all of the f16 data, avoiding overlap issues. */
1833 n_4 = load4_f16(vn, is_q, is_2);
1834
1835 /* Negate all inputs for FMLSL at once. */
1836 if (is_s) {
1837 n_4 ^= 0x8000800080008000ull;
1838 }
1839
1840 m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);
1841
1842 for (i = 0; i < oprsz / 4; i++) {
1843 float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
1844 d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
1845 }
1846 clear_tail(d, oprsz, simd_maxsz(desc));
1847}
1848
1849void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
1850 void *venv, uint32_t desc)
1851{
1852 CPUARMState *env = venv;
1853 do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
1854 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1855}
1856
1857void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
1858 void *venv, uint32_t desc)
1859{
1860 CPUARMState *env = venv;
1861 do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
1862 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
1863}
1864
1865void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
1866 void *venv, uint32_t desc)
1867{
1868 intptr_t i, j, oprsz = simd_oprsz(desc);
1869 uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
1870 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
1871 intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
1872 CPUARMState *env = venv;
1873 float_status *status = &env->vfp.fp_status;
1874 bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);
1875
1876 for (i = 0; i < oprsz; i += 16) {
1877 float16 mm_16 = *(float16 *)(vm + i + idx);
1878 float32 mm = float16_to_float32_by_bits(mm_16, fz16);
1879
1880 for (j = 0; j < 16; j += sizeof(float32)) {
1881 float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
1882 float32 nn = float16_to_float32_by_bits(nn_16, fz16);
1883 float32 aa = *(float32 *)(va + H1_4(i + j));
1884
1885 *(float32 *)(vd + H1_4(i + j)) =
1886 float32_muladd(nn, mm, aa, 0, status);
1887 }
1888 }
1889}
1890
1891void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc)
1892{
1893 intptr_t i, opr_sz = simd_oprsz(desc);
1894 int8_t *d = vd, *n = vn, *m = vm;
1895
1896 for (i = 0; i < opr_sz; ++i) {
1897 int8_t mm = m[i];
1898 int8_t nn = n[i];
1899 int8_t res = 0;
1900 if (mm >= 0) {
1901 if (mm < 8) {
1902 res = nn << mm;
1903 }
1904 } else {
1905 res = nn >> (mm > -8 ? -mm : 7);
1906 }
1907 d[i] = res;
1908 }
1909 clear_tail(d, opr_sz, simd_maxsz(desc));
1910}
1911
1912void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
1913{
1914 intptr_t i, opr_sz = simd_oprsz(desc);
1915 int16_t *d = vd, *n = vn, *m = vm;
1916
1917 for (i = 0; i < opr_sz / 2; ++i) {
1918 int8_t mm = m[i]; /* only 8 bits of shift are significant */
1919 int16_t nn = n[i];
1920 int16_t res = 0;
1921 if (mm >= 0) {
1922 if (mm < 16) {
1923 res = nn << mm;
1924 }
1925 } else {
1926 res = nn >> (mm > -16 ? -mm : 15);
1927 }
1928 d[i] = res;
1929 }
1930 clear_tail(d, opr_sz, simd_maxsz(desc));
1931}
1932
1933void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc)
1934{
1935 intptr_t i, opr_sz = simd_oprsz(desc);
1936 uint8_t *d = vd, *n = vn, *m = vm;
1937
1938 for (i = 0; i < opr_sz; ++i) {
1939 int8_t mm = m[i];
1940 uint8_t nn = n[i];
1941 uint8_t res = 0;
1942 if (mm >= 0) {
1943 if (mm < 8) {
1944 res = nn << mm;
1945 }
1946 } else {
1947 if (mm > -8) {
1948 res = nn >> -mm;
1949 }
1950 }
1951 d[i] = res;
1952 }
1953 clear_tail(d, opr_sz, simd_maxsz(desc));
1954}
1955
1956void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
1957{
1958 intptr_t i, opr_sz = simd_oprsz(desc);
1959 uint16_t *d = vd, *n = vn, *m = vm;
1960
1961 for (i = 0; i < opr_sz / 2; ++i) {
1962 int8_t mm = m[i]; /* only 8 bits of shift are significant */
1963 uint16_t nn = n[i];
1964 uint16_t res = 0;
1965 if (mm >= 0) {
1966 if (mm < 16) {
1967 res = nn << mm;
1968 }
1969 } else {
1970 if (mm > -16) {
1971 res = nn >> -mm;
1972 }
1973 }
1974 d[i] = res;
1975 }
1976 clear_tail(d, opr_sz, simd_maxsz(desc));
1977}
1978
1979/*
1980 * 8x8->8 polynomial multiply.
1981 *
1982 * Polynomial multiplication is like integer multiplication except the
1983 * partial products are XORed, not added.
1984 *
1985 * TODO: expose this as a generic vector operation, as it is a common
1986 * crypto building block.
1987 */
1988void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
1989{
1990    intptr_t i, opr_sz = simd_oprsz(desc);
1991 uint64_t *d = vd, *n = vn, *m = vm;
1992
1993 for (i = 0; i < opr_sz / 8; ++i) {
1994        d[i] = clmul_8x8_low(n[i], m[i]);
1995 }
1996 clear_tail(d, opr_sz, simd_maxsz(desc));
1997}
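/*
 * Reference sketch (not used by the helper above): one 8x8->8 carryless
 * multiply written as a plain loop; clmul_8x8_low() applies the same
 * idea to eight byte lanes at once.  The function name here is purely
 * illustrative.
 */
static inline uint8_t clmul8_low_ref(uint8_t a, uint8_t b)
{
    uint8_t r = 0;
    for (int i = 0; i < 8; ++i) {
        if (b & (1u << i)) {
            r ^= a << i;    /* partial products are XORed, not added */
        }
    }
    return r;
}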
1998
1999/*
2000 * 64x64->128 polynomial multiply.
2001 * Because the lanes are not accessed in strict columns,
2002 * this probably cannot be turned into a generic helper.
2003 */
2004void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
2005{
2006 intptr_t i, j, opr_sz = simd_oprsz(desc);
2007 intptr_t hi = simd_data(desc);
2008 uint64_t *d = vd, *n = vn, *m = vm;
2009
2010 for (i = 0; i < opr_sz / 8; i += 2) {
2011 uint64_t nn = n[i + hi];
2012 uint64_t mm = m[i + hi];
2013 uint64_t rhi = 0;
2014 uint64_t rlo = 0;
2015
2016 /* Bit 0 can only influence the low 64-bit result. */
2017 if (nn & 1) {
2018 rlo = mm;
2019 }
2020
2021 for (j = 1; j < 64; ++j) {
2022 uint64_t mask = -((nn >> j) & 1);
2023 rlo ^= (mm << j) & mask;
2024 rhi ^= (mm >> (64 - j)) & mask;
2025 }
2026 d[i] = rlo;
2027 d[i + 1] = rhi;
2028 }
2029 clear_tail(d, opr_sz, simd_maxsz(desc));
2030}
2031
2032void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2033{
2034 int hi = simd_data(desc);
2035 uint64_t *d = vd, *n = vn, *m = vm;
2036 uint64_t nn = n[hi], mm = m[hi];
2037
2038    d[0] = clmul_8x4_packed(nn, mm);
2039 nn >>= 32;
2040 mm >>= 32;
2041    d[1] = clmul_8x4_packed(nn, mm);
2042
2043 clear_tail(d, 16, simd_maxsz(desc));
2044}
2045
2046#ifdef TARGET_AARCH64
2047void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
2048{
2049 int shift = simd_data(desc) * 8;
2050 intptr_t i, opr_sz = simd_oprsz(desc);
2051 uint64_t *d = vd, *n = vn, *m = vm;
2052
2053 for (i = 0; i < opr_sz / 8; ++i) {
2054        d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
2055 }
2056}
2057
2058static uint64_t pmull_d(uint64_t op1, uint64_t op2)
2059{
2060 uint64_t result = 0;
2061 int i;
2062
2063 for (i = 0; i < 32; ++i) {
2064 uint64_t mask = -((op1 >> i) & 1);
2065 result ^= (op2 << i) & mask;
2066 }
2067 return result;
2068}
2069
2070void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
2071{
2072 intptr_t sel = H4(simd_data(desc));
2073 intptr_t i, opr_sz = simd_oprsz(desc);
2074 uint32_t *n = vn, *m = vm;
2075 uint64_t *d = vd;
2076
2077 for (i = 0; i < opr_sz / 8; ++i) {
2078 d[i] = pmull_d(n[2 * i + sel], m[2 * i + sel]);
2079 }
2080}
2081#endif
2082
2083#define DO_CMP0(NAME, TYPE, OP) \
2084void HELPER(NAME)(void *vd, void *vn, uint32_t desc) \
2085{ \
2086 intptr_t i, opr_sz = simd_oprsz(desc); \
2087 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
2088 TYPE nn = *(TYPE *)(vn + i); \
2089 *(TYPE *)(vd + i) = -(nn OP 0); \
2090 } \
2091 clear_tail(vd, opr_sz, simd_maxsz(desc)); \
2092}
2093
2094DO_CMP0(gvec_ceq0_b, int8_t, ==)
2095DO_CMP0(gvec_clt0_b, int8_t, <)
2096DO_CMP0(gvec_cle0_b, int8_t, <=)
2097DO_CMP0(gvec_cgt0_b, int8_t, >)
2098DO_CMP0(gvec_cge0_b, int8_t, >=)
2099
2100DO_CMP0(gvec_ceq0_h, int16_t, ==)
2101DO_CMP0(gvec_clt0_h, int16_t, <)
2102DO_CMP0(gvec_cle0_h, int16_t, <=)
2103DO_CMP0(gvec_cgt0_h, int16_t, >)
2104DO_CMP0(gvec_cge0_h, int16_t, >=)
2105
2106#undef DO_CMP0
2107
2108#define DO_ABD(NAME, TYPE) \
2109void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2110{ \
2111 intptr_t i, opr_sz = simd_oprsz(desc); \
2112 TYPE *d = vd, *n = vn, *m = vm; \
2113 \
2114 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \
2115 d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \
2116 } \
2117 clear_tail(d, opr_sz, simd_maxsz(desc)); \
2118}
2119
2120DO_ABD(gvec_sabd_b, int8_t)
2121DO_ABD(gvec_sabd_h, int16_t)
2122DO_ABD(gvec_sabd_s, int32_t)
2123DO_ABD(gvec_sabd_d, int64_t)
2124
2125DO_ABD(gvec_uabd_b, uint8_t)
2126DO_ABD(gvec_uabd_h, uint16_t)
2127DO_ABD(gvec_uabd_s, uint32_t)
2128DO_ABD(gvec_uabd_d, uint64_t)
2129
2130#undef DO_ABD
2131
2132#define DO_ABA(NAME, TYPE) \
2133void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \
2134{ \
2135 intptr_t i, opr_sz = simd_oprsz(desc); \
2136 TYPE *d = vd, *n = vn, *m = vm; \
2137 \
2138 for (i = 0; i < opr_sz / sizeof(TYPE); ++i) { \
2139 d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i]; \
2140 } \
2141 clear_tail(d, opr_sz, simd_maxsz(desc)); \
2142}
2143
2144DO_ABA(gvec_saba_b, int8_t)
2145DO_ABA(gvec_saba_h, int16_t)
2146DO_ABA(gvec_saba_s, int32_t)
2147DO_ABA(gvec_saba_d, int64_t)
2148
2149DO_ABA(gvec_uaba_b, uint8_t)
2150DO_ABA(gvec_uaba_h, uint16_t)
2151DO_ABA(gvec_uaba_s, uint32_t)
2152DO_ABA(gvec_uaba_d, uint64_t)
2153
2154#undef DO_ABA
2155
2156#define DO_NEON_PAIRWISE(NAME, OP) \
2157 void HELPER(NAME##s)(void *vd, void *vn, void *vm, \
2158 void *stat, uint32_t oprsz) \
2159 { \
2160 float_status *fpst = stat; \
2161 float32 *d = vd; \
2162 float32 *n = vn; \
2163 float32 *m = vm; \
2164 float32 r0, r1; \
2165 \
2166 /* Read all inputs before writing outputs in case vm == vd */ \
2167 r0 = float32_##OP(n[H4(0)], n[H4(1)], fpst); \
2168 r1 = float32_##OP(m[H4(0)], m[H4(1)], fpst); \
2169 \
2170 d[H4(0)] = r0; \
2171 d[H4(1)] = r1; \
2172 } \
2173 \
2174 void HELPER(NAME##h)(void *vd, void *vn, void *vm, \
2175 void *stat, uint32_t oprsz) \
2176 { \
2177 float_status *fpst = stat; \
2178 float16 *d = vd; \
2179 float16 *n = vn; \
2180 float16 *m = vm; \
2181 float16 r0, r1, r2, r3; \
2182 \
2183 /* Read all inputs before writing outputs in case vm == vd */ \
2184 r0 = float16_##OP(n[H2(0)], n[H2(1)], fpst); \
2185 r1 = float16_##OP(n[H2(2)], n[H2(3)], fpst); \
2186 r2 = float16_##OP(m[H2(0)], m[H2(1)], fpst); \
2187 r3 = float16_##OP(m[H2(2)], m[H2(3)], fpst); \
2188 \
2189 d[H2(0)] = r0; \
2190 d[H2(1)] = r1; \
2191 d[H2(2)] = r2; \
2192 d[H2(3)] = r3; \
2193 }
2194
2195DO_NEON_PAIRWISE(neon_padd, add)
2196DO_NEON_PAIRWISE(neon_pmax, max)
2197DO_NEON_PAIRWISE(neon_pmin, min)
2198
2199#undef DO_NEON_PAIRWISE
2200
2201#define DO_VCVT_FIXED(NAME, FUNC, TYPE) \
2202 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
2203 { \
2204 intptr_t i, oprsz = simd_oprsz(desc); \
2205 int shift = simd_data(desc); \
2206 TYPE *d = vd, *n = vn; \
2207 float_status *fpst = stat; \
2208 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
2209 d[i] = FUNC(n[i], shift, fpst); \
2210 } \
2211 clear_tail(d, oprsz, simd_maxsz(desc)); \
2212 }
2213
2214DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
2215DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
2216DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t)
2217DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t)
2218DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
2219DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
2220DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t)
2221DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t)
2222
2223#undef DO_VCVT_FIXED
2224
2225#define DO_VCVT_RMODE(NAME, FUNC, TYPE) \
2226 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
2227 { \
2228 float_status *fpst = stat; \
2229 intptr_t i, oprsz = simd_oprsz(desc); \
2230 uint32_t rmode = simd_data(desc); \
2231 uint32_t prev_rmode = get_float_rounding_mode(fpst); \
2232 TYPE *d = vd, *n = vn; \
2233 set_float_rounding_mode(rmode, fpst); \
2234 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
2235 d[i] = FUNC(n[i], 0, fpst); \
2236 } \
2237 set_float_rounding_mode(prev_rmode, fpst); \
2238 clear_tail(d, oprsz, simd_maxsz(desc)); \
2239 }
2240
2241DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
2242DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
2243DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
2244DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)
2245
2246#undef DO_VCVT_RMODE
2247
2248#define DO_VRINT_RMODE(NAME, FUNC, TYPE) \
2249 void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \
2250 { \
2251 float_status *fpst = stat; \
2252 intptr_t i, oprsz = simd_oprsz(desc); \
2253 uint32_t rmode = simd_data(desc); \
2254 uint32_t prev_rmode = get_float_rounding_mode(fpst); \
2255 TYPE *d = vd, *n = vn; \
2256 set_float_rounding_mode(rmode, fpst); \
2257 for (i = 0; i < oprsz / sizeof(TYPE); i++) { \
2258 d[i] = FUNC(n[i], fpst); \
2259 } \
2260 set_float_rounding_mode(prev_rmode, fpst); \
2261 clear_tail(d, oprsz, simd_maxsz(desc)); \
2262 }
2263
2264DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
2265DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)
2266
2267#undef DO_VRINT_RMODE
2268
2269#ifdef TARGET_AARCH64
2270void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc)
2271{
2272 const uint8_t *indices = vm;
2273 CPUARMState *env = venv;
2274 size_t oprsz = simd_oprsz(desc);
2275 uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
2276 bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
2277 uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
2278 union {
2279 uint8_t b[16];
2280 uint64_t d[2];
2281 } result;
2282
2283 /*
2284 * We must construct the final result in a temp, lest the output
2285 * overlaps the input table. For TBL, begin with zero; for TBX,
2286 * begin with the original register contents. Note that we always
2287 * copy 16 bytes here to avoid an extra branch; clearing the high
2288 * bits of the register for oprsz == 8 is handled below.
2289 */
2290 if (is_tbx) {
2291 memcpy(&result, vd, 16);
2292 } else {
2293 memset(&result, 0, 16);
2294 }
2295
2296 for (size_t i = 0; i < oprsz; ++i) {
2297 uint32_t index = indices[H1(i)];
2298
2299 if (index < table_len) {
2300 /*
2301 * Convert index (a byte offset into the virtual table
2302 * which is a series of 128-bit vectors concatenated)
2303 * into the correct register element, bearing in mind
2304 * that the table can wrap around from V31 to V0.
2305 */
2306 const uint8_t *table = (const uint8_t *)
2307 aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
2308 result.b[H1(i)] = table[H1(index % 16)];
2309 }
2310 }
2311
2312 memcpy(vd, &result, 16);
2313 clear_tail(vd, oprsz, simd_maxsz(desc));
2314}
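/*
 * Worked example (illustrative) for the index lookup above: with
 * rn = 30, a three-register table (table_len = 48) and index = 37,
 * the byte comes from register (30 + (37 >> 4)) % 32 = 0 (wrapping
 * past V31), byte 37 % 16 = 5; an index >= 48 leaves the zero/TBX
 * default in place.
 */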
2315#endif
2316
2317/*
2318 * NxN -> N highpart multiply
2319 *
2320 * TODO: expose this as a generic vector operation.
2321 */
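/*
 * Worked example (illustrative), signed byte case: 127 * 127 = 16129
 * = 0x3f01, so the returned high part is 0x3f; the 64-bit forms use
 * muls64/mulu64 since no wider intermediate type is available.
 */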
2322
2323void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2324{
2325 intptr_t i, opr_sz = simd_oprsz(desc);
2326 int8_t *d = vd, *n = vn, *m = vm;
2327
2328 for (i = 0; i < opr_sz; ++i) {
2329 d[i] = ((int32_t)n[i] * m[i]) >> 8;
2330 }
2331 clear_tail(d, opr_sz, simd_maxsz(desc));
2332}
2333
2334void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2335{
2336 intptr_t i, opr_sz = simd_oprsz(desc);
2337 int16_t *d = vd, *n = vn, *m = vm;
2338
2339 for (i = 0; i < opr_sz / 2; ++i) {
2340 d[i] = ((int32_t)n[i] * m[i]) >> 16;
2341 }
2342 clear_tail(d, opr_sz, simd_maxsz(desc));
2343}
2344
2345void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2346{
2347 intptr_t i, opr_sz = simd_oprsz(desc);
2348 int32_t *d = vd, *n = vn, *m = vm;
2349
2350 for (i = 0; i < opr_sz / 4; ++i) {
2351 d[i] = ((int64_t)n[i] * m[i]) >> 32;
2352 }
2353 clear_tail(d, opr_sz, simd_maxsz(desc));
2354}
2355
2356void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2357{
2358 intptr_t i, opr_sz = simd_oprsz(desc);
2359 uint64_t *d = vd, *n = vn, *m = vm;
2360 uint64_t discard;
2361
2362 for (i = 0; i < opr_sz / 8; ++i) {
2363 muls64(&discard, &d[i], n[i], m[i]);
2364 }
2365 clear_tail(d, opr_sz, simd_maxsz(desc));
2366}
2367
2368void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
2369{
2370 intptr_t i, opr_sz = simd_oprsz(desc);
2371 uint8_t *d = vd, *n = vn, *m = vm;
2372
2373 for (i = 0; i < opr_sz; ++i) {
2374 d[i] = ((uint32_t)n[i] * m[i]) >> 8;
2375 }
2376 clear_tail(d, opr_sz, simd_maxsz(desc));
2377}
2378
2379void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
2380{
2381 intptr_t i, opr_sz = simd_oprsz(desc);
2382 uint16_t *d = vd, *n = vn, *m = vm;
2383
2384 for (i = 0; i < opr_sz / 2; ++i) {
2385 d[i] = ((uint32_t)n[i] * m[i]) >> 16;
2386 }
2387 clear_tail(d, opr_sz, simd_maxsz(desc));
2388}
2389
2390void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
2391{
2392 intptr_t i, opr_sz = simd_oprsz(desc);
2393 uint32_t *d = vd, *n = vn, *m = vm;
2394
2395 for (i = 0; i < opr_sz / 4; ++i) {
2396 d[i] = ((uint64_t)n[i] * m[i]) >> 32;
2397 }
2398 clear_tail(d, opr_sz, simd_maxsz(desc));
2399}
2400
2401void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
2402{
2403 intptr_t i, opr_sz = simd_oprsz(desc);
2404 uint64_t *d = vd, *n = vn, *m = vm;
2405 uint64_t discard;
2406
2407 for (i = 0; i < opr_sz / 8; ++i) {
2408 mulu64(&discard, &d[i], n[i], m[i]);
2409 }
2410 clear_tail(d, opr_sz, simd_maxsz(desc));
2411}
2412
2413void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
2414{
2415 intptr_t i, opr_sz = simd_oprsz(desc) / 8;
2416 int shr = simd_data(desc);
2417 uint64_t *d = vd, *n = vn, *m = vm;
2418
2419 for (i = 0; i < opr_sz; ++i) {
2420 d[i] = ror64(n[i] ^ m[i], shr);
2421 }
2422 clear_tail(d, opr_sz * 8, simd_maxsz(desc));
2423}
2424
2425/*
2426 * Integer matrix-multiply accumulate
2427 */
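/*
 * Shape note (illustrative): each 16-byte segment of n and m is viewed
 * as a 2x8 matrix of 8-bit elements, and each helper below accumulates
 *     d[i][j] += sum_{k=0..7} n[i][k] * m[j][k]
 * i.e. N * transpose(M) added into a 2x2 matrix of 32-bit sums.
 */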
2428
2429static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
2430{
2431 int8_t *n = vn, *m = vm;
2432
2433 for (intptr_t k = 0; k < 8; ++k) {
2434 sum += n[H1(k)] * m[H1(k)];
2435 }
2436 return sum;
2437}
2438
2439static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
2440{
2441 uint8_t *n = vn, *m = vm;
2442
2443 for (intptr_t k = 0; k < 8; ++k) {
2444 sum += n[H1(k)] * m[H1(k)];
2445 }
2446 return sum;
2447}
2448
2449static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
2450{
2451 uint8_t *n = vn;
2452 int8_t *m = vm;
2453
2454 for (intptr_t k = 0; k < 8; ++k) {
2455 sum += n[H1(k)] * m[H1(k)];
2456 }
2457 return sum;
2458}
2459
2460static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
2461 uint32_t (*inner_loop)(uint32_t, void *, void *))
2462{
2463 intptr_t seg, opr_sz = simd_oprsz(desc);
2464
2465 for (seg = 0; seg < opr_sz; seg += 16) {
2466 uint32_t *d = vd + seg;
2467 uint32_t *a = va + seg;
2468 uint32_t sum0, sum1, sum2, sum3;
2469
2470 /*
2471 * Process the entire segment at once, writing back the
2472 * results only after we've consumed all of the inputs.
2473 *
2474         * Key to indices by column:
2475 * i j i j
2476 */
2477 sum0 = a[H4(0 + 0)];
2478 sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
2479 sum1 = a[H4(0 + 1)];
2480 sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
2481 sum2 = a[H4(2 + 0)];
2482 sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
2483 sum3 = a[H4(2 + 1)];
2484 sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);
2485
2486 d[H4(0)] = sum0;
2487 d[H4(1)] = sum1;
2488 d[H4(2)] = sum2;
2489 d[H4(3)] = sum3;
2490 }
2491 clear_tail(vd, opr_sz, simd_maxsz(desc));
2492}
2493
2494#define DO_MMLA_B(NAME, INNER) \
2495 void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
2496 { do_mmla_b(vd, vn, vm, va, desc, INNER); }
2497
2498DO_MMLA_B(gvec_smmla_b, do_smmla_b)
2499DO_MMLA_B(gvec_ummla_b, do_ummla_b)
2500DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
2501
2502/*
2503 * BFloat16 Dot Product
2504 */
2505
2506float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2)
2507{
2508 /* FPCR is ignored for BFDOT and BFMMLA. */
2509 float_status bf_status = {
2510 .tininess_before_rounding = float_tininess_before_rounding,
2511 .float_rounding_mode = float_round_to_odd_inf,
2512 .flush_to_zero = true,
2513 .flush_inputs_to_zero = true,
2514 .default_nan_mode = true,
2515 };
2516 float32 t1, t2;
2517
2518 /*
2519 * Extract each BFloat16 from the element pair, and shift
2520 * them such that they become float32.
2521 */
2522 t1 = float32_mul(e1 << 16, e2 << 16, &bf_status);
2523 t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status);
2524 t1 = float32_add(t1, t2, &bf_status);
2525 t1 = float32_add(sum, t1, &bf_status);
2526
2527 return t1;
2528}
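/*
 * Worked example (illustrative): a bfloat16 value is the top half of a
 * float32, so bfloat16 0x3f80 (1.0) << 16 == float32 0x3f800000 (1.0);
 * masking with 0xffff0000u selects the high element of the pair instead.
 */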
2529
2530void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
2531{
2532 intptr_t i, opr_sz = simd_oprsz(desc);
2533 float32 *d = vd, *a = va;
2534 uint32_t *n = vn, *m = vm;
2535
2536 for (i = 0; i < opr_sz / 4; ++i) {
2537 d[i] = bfdotadd(a[i], n[i], m[i]);
2538 }
2539 clear_tail(d, opr_sz, simd_maxsz(desc));
2540}
2541
2542void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm,
2543 void *va, uint32_t desc)
2544{
2545 intptr_t i, j, opr_sz = simd_oprsz(desc);
2546 intptr_t index = simd_data(desc);
2547 intptr_t elements = opr_sz / 4;
2548 intptr_t eltspersegment = MIN(16 / 4, elements);
2549 float32 *d = vd, *a = va;
2550 uint32_t *n = vn, *m = vm;
2551
2552 for (i = 0; i < elements; i += eltspersegment) {
2553 uint32_t m_idx = m[i + H4(index)];
2554
2555 for (j = i; j < i + eltspersegment; j++) {
2556 d[j] = bfdotadd(a[j], n[j], m_idx);
2557 }
2558 }
2559 clear_tail(d, opr_sz, simd_maxsz(desc));
2560}
2561
2562void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
2563{
2564 intptr_t s, opr_sz = simd_oprsz(desc);
2565 float32 *d = vd, *a = va;
2566 uint32_t *n = vn, *m = vm;
2567
2568 for (s = 0; s < opr_sz / 4; s += 4) {
2569 float32 sum00, sum01, sum10, sum11;
2570
2571 /*
2572 * Process the entire segment at once, writing back the
2573 * results only after we've consumed all of the inputs.
2574 *
2575         * Key to indices by column:
2576 * i j i k j k
2577 */
2578 sum00 = a[s + H4(0 + 0)];
2579 sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]);
2580 sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]);
2581
2582 sum01 = a[s + H4(0 + 1)];
2583 sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]);
2584 sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]);
2585
2586 sum10 = a[s + H4(2 + 0)];
2587 sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]);
2588 sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]);
2589
2590 sum11 = a[s + H4(2 + 1)];
2591 sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]);
2592 sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]);
2593
2594 d[s + H4(0 + 0)] = sum00;
2595 d[s + H4(0 + 1)] = sum01;
2596 d[s + H4(2 + 0)] = sum10;
2597 d[s + H4(2 + 1)] = sum11;
2598 }
2599 clear_tail(d, opr_sz, simd_maxsz(desc));
2600}
2601
2602void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va,
2603 void *stat, uint32_t desc)
2604{
2605 intptr_t i, opr_sz = simd_oprsz(desc);
2606 intptr_t sel = simd_data(desc);
2607 float32 *d = vd, *a = va;
2608 bfloat16 *n = vn, *m = vm;
2609
2610 for (i = 0; i < opr_sz / 4; ++i) {
2611 float32 nn = n[H2(i * 2 + sel)] << 16;
2612 float32 mm = m[H2(i * 2 + sel)] << 16;
2613 d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat);
2614 }
2615 clear_tail(d, opr_sz, simd_maxsz(desc));
2616}
2617
2618void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm,
2619 void *va, void *stat, uint32_t desc)
2620{
2621 intptr_t i, j, opr_sz = simd_oprsz(desc);
2622 intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1);
2623 intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3);
2624 intptr_t elements = opr_sz / 4;
2625 intptr_t eltspersegment = MIN(16 / 4, elements);
2626 float32 *d = vd, *a = va;
2627 bfloat16 *n = vn, *m = vm;
2628
2629 for (i = 0; i < elements; i += eltspersegment) {
2630 float32 m_idx = m[H2(2 * i + index)] << 16;
2631
2632 for (j = i; j < i + eltspersegment; j++) {
2633 float32 n_j = n[H2(2 * j + sel)] << 16;
2634 d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat);
2635 }
2636 }
2637 clear_tail(d, opr_sz, simd_maxsz(desc));
2638}
2639
2640#define DO_CLAMP(NAME, TYPE) \
2641void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc) \
2642{ \
2643 intptr_t i, opr_sz = simd_oprsz(desc); \
2644 for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \
2645 TYPE aa = *(TYPE *)(a + i); \
2646 TYPE nn = *(TYPE *)(n + i); \
2647 TYPE mm = *(TYPE *)(m + i); \
2648 TYPE dd = MIN(MAX(aa, nn), mm); \
2649 *(TYPE *)(d + i) = dd; \
2650 } \
2651 clear_tail(d, opr_sz, simd_maxsz(desc)); \
2652}
2653
2654DO_CLAMP(gvec_sclamp_b, int8_t)
2655DO_CLAMP(gvec_sclamp_h, int16_t)
2656DO_CLAMP(gvec_sclamp_s, int32_t)
2657DO_CLAMP(gvec_sclamp_d, int64_t)
2658
2659DO_CLAMP(gvec_uclamp_b, uint8_t)
2660DO_CLAMP(gvec_uclamp_h, uint16_t)
2661DO_CLAMP(gvec_uclamp_s, uint32_t)
2662DO_CLAMP(gvec_uclamp_d, uint64_t)