/*
 * ARM AdvSIMD / SVE Vector Operations
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "cpu.h"
#include "exec/helper-proto.h"
#include "tcg/tcg-gvec-desc.h"
#include "fpu/softfloat.h"
#include "qemu/int128.h"
#include "crypto/clmul.h"
#include "vec_internal.h"

/*
 * Data for expanding active predicate bits to bytes, for byte elements.
 * Entry [i] has byte j set to 0xff iff bit j of i is set, i.e. the table
 * was generated by:
 *
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      for (j = 0; j < 8; j++) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfful << (j << 3);
 *          }
 *      }
 *      printf("0x%016lx,\n", m);
 *  }
 */
const uint64_t expand_pred_b_data[256] = {
    0x0000000000000000, 0x00000000000000ff, 0x000000000000ff00,
    0x000000000000ffff, 0x0000000000ff0000, 0x0000000000ff00ff,
    0x0000000000ffff00, 0x0000000000ffffff, 0x00000000ff000000,
    0x00000000ff0000ff, 0x00000000ff00ff00, 0x00000000ff00ffff,
    0x00000000ffff0000, 0x00000000ffff00ff, 0x00000000ffffff00,
    0x00000000ffffffff, 0x000000ff00000000, 0x000000ff000000ff,
    0x000000ff0000ff00, 0x000000ff0000ffff, 0x000000ff00ff0000,
    0x000000ff00ff00ff, 0x000000ff00ffff00, 0x000000ff00ffffff,
    0x000000ffff000000, 0x000000ffff0000ff, 0x000000ffff00ff00,
    0x000000ffff00ffff, 0x000000ffffff0000, 0x000000ffffff00ff,
    0x000000ffffffff00, 0x000000ffffffffff, 0x0000ff0000000000,
    0x0000ff00000000ff, 0x0000ff000000ff00, 0x0000ff000000ffff,
    0x0000ff0000ff0000, 0x0000ff0000ff00ff, 0x0000ff0000ffff00,
    0x0000ff0000ffffff, 0x0000ff00ff000000, 0x0000ff00ff0000ff,
    0x0000ff00ff00ff00, 0x0000ff00ff00ffff, 0x0000ff00ffff0000,
    0x0000ff00ffff00ff, 0x0000ff00ffffff00, 0x0000ff00ffffffff,
    0x0000ffff00000000, 0x0000ffff000000ff, 0x0000ffff0000ff00,
    0x0000ffff0000ffff, 0x0000ffff00ff0000, 0x0000ffff00ff00ff,
    0x0000ffff00ffff00, 0x0000ffff00ffffff, 0x0000ffffff000000,
    0x0000ffffff0000ff, 0x0000ffffff00ff00, 0x0000ffffff00ffff,
    0x0000ffffffff0000, 0x0000ffffffff00ff, 0x0000ffffffffff00,
    0x0000ffffffffffff, 0x00ff000000000000, 0x00ff0000000000ff,
    0x00ff00000000ff00, 0x00ff00000000ffff, 0x00ff000000ff0000,
    0x00ff000000ff00ff, 0x00ff000000ffff00, 0x00ff000000ffffff,
    0x00ff0000ff000000, 0x00ff0000ff0000ff, 0x00ff0000ff00ff00,
    0x00ff0000ff00ffff, 0x00ff0000ffff0000, 0x00ff0000ffff00ff,
    0x00ff0000ffffff00, 0x00ff0000ffffffff, 0x00ff00ff00000000,
    0x00ff00ff000000ff, 0x00ff00ff0000ff00, 0x00ff00ff0000ffff,
    0x00ff00ff00ff0000, 0x00ff00ff00ff00ff, 0x00ff00ff00ffff00,
    0x00ff00ff00ffffff, 0x00ff00ffff000000, 0x00ff00ffff0000ff,
    0x00ff00ffff00ff00, 0x00ff00ffff00ffff, 0x00ff00ffffff0000,
    0x00ff00ffffff00ff, 0x00ff00ffffffff00, 0x00ff00ffffffffff,
    0x00ffff0000000000, 0x00ffff00000000ff, 0x00ffff000000ff00,
    0x00ffff000000ffff, 0x00ffff0000ff0000, 0x00ffff0000ff00ff,
    0x00ffff0000ffff00, 0x00ffff0000ffffff, 0x00ffff00ff000000,
    0x00ffff00ff0000ff, 0x00ffff00ff00ff00, 0x00ffff00ff00ffff,
    0x00ffff00ffff0000, 0x00ffff00ffff00ff, 0x00ffff00ffffff00,
    0x00ffff00ffffffff, 0x00ffffff00000000, 0x00ffffff000000ff,
    0x00ffffff0000ff00, 0x00ffffff0000ffff, 0x00ffffff00ff0000,
    0x00ffffff00ff00ff, 0x00ffffff00ffff00, 0x00ffffff00ffffff,
    0x00ffffffff000000, 0x00ffffffff0000ff, 0x00ffffffff00ff00,
    0x00ffffffff00ffff, 0x00ffffffffff0000, 0x00ffffffffff00ff,
    0x00ffffffffffff00, 0x00ffffffffffffff, 0xff00000000000000,
    0xff000000000000ff, 0xff0000000000ff00, 0xff0000000000ffff,
    0xff00000000ff0000, 0xff00000000ff00ff, 0xff00000000ffff00,
    0xff00000000ffffff, 0xff000000ff000000, 0xff000000ff0000ff,
    0xff000000ff00ff00, 0xff000000ff00ffff, 0xff000000ffff0000,
    0xff000000ffff00ff, 0xff000000ffffff00, 0xff000000ffffffff,
    0xff0000ff00000000, 0xff0000ff000000ff, 0xff0000ff0000ff00,
    0xff0000ff0000ffff, 0xff0000ff00ff0000, 0xff0000ff00ff00ff,
    0xff0000ff00ffff00, 0xff0000ff00ffffff, 0xff0000ffff000000,
    0xff0000ffff0000ff, 0xff0000ffff00ff00, 0xff0000ffff00ffff,
    0xff0000ffffff0000, 0xff0000ffffff00ff, 0xff0000ffffffff00,
    0xff0000ffffffffff, 0xff00ff0000000000, 0xff00ff00000000ff,
    0xff00ff000000ff00, 0xff00ff000000ffff, 0xff00ff0000ff0000,
    0xff00ff0000ff00ff, 0xff00ff0000ffff00, 0xff00ff0000ffffff,
    0xff00ff00ff000000, 0xff00ff00ff0000ff, 0xff00ff00ff00ff00,
    0xff00ff00ff00ffff, 0xff00ff00ffff0000, 0xff00ff00ffff00ff,
    0xff00ff00ffffff00, 0xff00ff00ffffffff, 0xff00ffff00000000,
    0xff00ffff000000ff, 0xff00ffff0000ff00, 0xff00ffff0000ffff,
    0xff00ffff00ff0000, 0xff00ffff00ff00ff, 0xff00ffff00ffff00,
    0xff00ffff00ffffff, 0xff00ffffff000000, 0xff00ffffff0000ff,
    0xff00ffffff00ff00, 0xff00ffffff00ffff, 0xff00ffffffff0000,
    0xff00ffffffff00ff, 0xff00ffffffffff00, 0xff00ffffffffffff,
    0xffff000000000000, 0xffff0000000000ff, 0xffff00000000ff00,
    0xffff00000000ffff, 0xffff000000ff0000, 0xffff000000ff00ff,
    0xffff000000ffff00, 0xffff000000ffffff, 0xffff0000ff000000,
    0xffff0000ff0000ff, 0xffff0000ff00ff00, 0xffff0000ff00ffff,
    0xffff0000ffff0000, 0xffff0000ffff00ff, 0xffff0000ffffff00,
    0xffff0000ffffffff, 0xffff00ff00000000, 0xffff00ff000000ff,
    0xffff00ff0000ff00, 0xffff00ff0000ffff, 0xffff00ff00ff0000,
    0xffff00ff00ff00ff, 0xffff00ff00ffff00, 0xffff00ff00ffffff,
    0xffff00ffff000000, 0xffff00ffff0000ff, 0xffff00ffff00ff00,
    0xffff00ffff00ffff, 0xffff00ffffff0000, 0xffff00ffffff00ff,
    0xffff00ffffffff00, 0xffff00ffffffffff, 0xffffff0000000000,
    0xffffff00000000ff, 0xffffff000000ff00, 0xffffff000000ffff,
    0xffffff0000ff0000, 0xffffff0000ff00ff, 0xffffff0000ffff00,
    0xffffff0000ffffff, 0xffffff00ff000000, 0xffffff00ff0000ff,
    0xffffff00ff00ff00, 0xffffff00ff00ffff, 0xffffff00ffff0000,
    0xffffff00ffff00ff, 0xffffff00ffffff00, 0xffffff00ffffffff,
    0xffffffff00000000, 0xffffffff000000ff, 0xffffffff0000ff00,
    0xffffffff0000ffff, 0xffffffff00ff0000, 0xffffffff00ff00ff,
    0xffffffff00ffff00, 0xffffffff00ffffff, 0xffffffffff000000,
    0xffffffffff0000ff, 0xffffffffff00ff00, 0xffffffffff00ffff,
    0xffffffffffff0000, 0xffffffffffff00ff, 0xffffffffffffff00,
    0xffffffffffffffff,
};
130 | ||
/*
 * Similarly for half-word elements.  Only the even predicate bits are
 * meaningful for 16-bit elements, so entries with any odd bit set are
 * skipped (left zero):
 *
 *  for (i = 0; i < 256; ++i) {
 *      unsigned long m = 0;
 *      if (i & 0xaa) {
 *          continue;
 *      }
 *      for (j = 0; j < 8; j += 2) {
 *          if ((i >> j) & 1) {
 *              m |= 0xfffful << (j << 3);
 *          }
 *      }
 *      printf("[0x%x] = 0x%016lx,\n", i, m);
 *  }
 */
const uint64_t expand_pred_h_data[0x55 + 1] = {
    [0x01] = 0x000000000000ffff, [0x04] = 0x00000000ffff0000,
    [0x05] = 0x00000000ffffffff, [0x10] = 0x0000ffff00000000,
    [0x11] = 0x0000ffff0000ffff, [0x14] = 0x0000ffffffff0000,
    [0x15] = 0x0000ffffffffffff, [0x40] = 0xffff000000000000,
    [0x41] = 0xffff00000000ffff, [0x44] = 0xffff0000ffff0000,
    [0x45] = 0xffff0000ffffffff, [0x50] = 0xffffffff00000000,
    [0x51] = 0xffffffff0000ffff, [0x54] = 0xffffffffffff0000,
    [0x55] = 0xffffffffffffffff,
};
156 | ||
/* Signed saturating rounding doubling multiply-accumulate high half, 8-bit */
int8_t do_sqrdmlah_b(int8_t src1, int8_t src2, int8_t src3,
                     bool neg, bool round)
{
    /*
     * The architectural operation is
     *   ((src3 << 8) + 2 * (src1 * src2) + (round << 7)) >> 8,
     * saturated to int8_t.  Halving every term works in one fewer bit:
     *   ((src3 << 7) + (src1 * src2) + (round << 6)) >> 7.
     */
    int32_t acc = (int32_t)src1 * src2;

    if (neg) {
        acc = -acc;
    }
    acc += (int32_t)src3 << 7;
    if (round) {
        acc += 1 << 6;
    }
    acc >>= 7;

    /* Saturate to the signed 8-bit range. */
    if (acc > INT8_MAX) {
        return INT8_MAX;
    }
    if (acc < INT8_MIN) {
        return INT8_MIN;
    }
    return acc;
}
178 | ||
/* SVE2 SQRDMLAH, byte elements: d[i] = sat(a[i] + rounded doubled n[i]*m[i]). */
void HELPER(sve2_sqrdmlah_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], false, true);
    }
}

/* SVE2 SQRDMLSH, byte elements: as above but the product is negated. */
void HELPER(sve2_sqrdmlsh_b)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], a[i], true, true);
    }
}

/* SVE2 SQDMULH, byte elements: no accumulator, no rounding. */
void HELPER(sve2_sqdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, false);
    }
}

/* SVE2 SQRDMULH, byte elements: no accumulator, with rounding. */
void HELPER(sve2_sqrdmulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = do_sqrdmlah_b(n[i], m[i], 0, false, true);
    }
}
220 | ||
/* Signed saturating rounding doubling multiply-accumulate high half, 16-bit */
int16_t do_sqrdmlah_h(int16_t src1, int16_t src2, int16_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /*
     * Same simplification as do_sqrdmlah_b: compute
     * ((src3 << 15) +/- src1*src2 + rounding) >> 15 in 32 bits.
     * On overflow *sat is set nonzero and the result is clamped.
     */
    int32_t acc = (int32_t)src1 * src2;

    if (neg) {
        acc = -acc;
    }
    acc += (int32_t)src3 << 15;
    if (round) {
        acc += 1 << 14;
    }
    acc >>= 15;

    if (acc != (int16_t)acc) {
        *sat = 1;
        acc = acc < 0 ? INT16_MIN : INT16_MAX;
    }
    return acc;
}
239 | ||
/*
 * NEON SQRDMLAH, two 16-bit lanes packed in each 32-bit operand.
 * Saturation is recorded in the QC flag via env->vfp.qc[0].
 */
uint32_t HELPER(neon_qrdmlah_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, false, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                false, true, sat);
    return deposit32(e1, 16, 16, e2);
}

/* Gvec SQRDMLAH, 16-bit elements; vq receives the saturation flag. */
void HELPER(gvec_qrdmlah_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* NEON SQRDMLSH: as neon_qrdmlah_s16 but the product is negated. */
uint32_t HELPER(neon_qrdmlsh_s16)(CPUARMState *env, uint32_t src1,
                                  uint32_t src2, uint32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    uint16_t e1 = do_sqrdmlah_h(src1, src2, src3, true, true, sat);
    uint16_t e2 = do_sqrdmlah_h(src1 >> 16, src2 >> 16, src3 >> 16,
                                true, true, sat);
    return deposit32(e1, 16, 16, e2);
}

/* Gvec SQRDMLSH, 16-bit elements. */
void HELPER(gvec_qrdmlsh_s16)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int16_t *d = vd;
    int16_t *n = vn;
    int16_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* NEON SQDMULH, 16-bit elements: no accumulator, no rounding. */
void HELPER(neon_sqdmulh_h)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* NEON SQRDMULH, 16-bit elements: no accumulator, with rounding. */
void HELPER(neon_sqrdmulh_h)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
313 | ||
/* SVE2 SQRDMLAH, halfword elements; SVE2 does not set QC, so sat is discarded. */
void HELPER(sve2_sqrdmlah_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], false, true, &discard);
    }
}

/* SVE2 SQRDMLSH, halfword elements. */
void HELPER(sve2_sqrdmlsh_h)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], a[i], true, true, &discard);
    }
}

/* SVE2 SQDMULH, halfword elements. */
void HELPER(sve2_sqdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, false, &discard);
    }
}

/* SVE2 SQRDMULH, halfword elements. */
void HELPER(sve2_sqrdmulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = do_sqrdmlah_h(n[i], m[i], 0, false, true, &discard);
    }
}

/*
 * Indexed SQDMULH: one multiplicand is taken from lane 'idx' of each
 * 16-byte segment of vm (16 / 2 = 8 halfwords per segment).
 */
void HELPER(sve2_sqdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

/* Indexed SQRDMULH, halfword elements. */
void HELPER(sve2_sqrdmulh_idx_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int16_t *d = vd, *n = vn, *m = (int16_t *)vm + H2(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 2; i += 16 / 2) {
        int16_t mm = m[i];
        for (j = 0; j < 16 / 2; ++j) {
            d[i + j] = do_sqrdmlah_h(n[i + j], mm, 0, false, true, &discard);
        }
    }
}
389 | ||
/* Signed saturating rounding doubling multiply-accumulate high half, 32-bit */
int32_t do_sqrdmlah_s(int32_t src1, int32_t src2, int32_t src3,
                      bool neg, bool round, uint32_t *sat)
{
    /*
     * Same simplification as do_sqrdmlah_b, using a 64-bit
     * intermediate: ((src3 << 31) +/- src1*src2 + rounding) >> 31.
     * On overflow *sat is set nonzero and the result is clamped.
     */
    int64_t acc = (int64_t)src1 * src2;

    if (neg) {
        acc = -acc;
    }
    acc += (int64_t)src3 << 31;
    if (round) {
        acc += INT64_C(1) << 30;
    }
    acc >>= 31;

    if (acc != (int32_t)acc) {
        *sat = 1;
        acc = acc < 0 ? INT32_MIN : INT32_MAX;
    }
    return acc;
}
408 | ||
/* NEON SQRDMLAH, one 32-bit lane; QC is recorded in env->vfp.qc[0]. */
uint32_t HELPER(neon_qrdmlah_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, false, true, sat);
}

/* Gvec SQRDMLAH, 32-bit elements; vq receives the saturation flag. */
void HELPER(gvec_qrdmlah_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* NEON SQRDMLSH: as neon_qrdmlah_s32 but the product is negated. */
uint32_t HELPER(neon_qrdmlsh_s32)(CPUARMState *env, int32_t src1,
                                  int32_t src2, int32_t src3)
{
    uint32_t *sat = &env->vfp.qc[0];
    return do_sqrdmlah_s(src1, src2, src3, true, true, sat);
}

/* Gvec SQRDMLSH, 32-bit elements. */
void HELPER(gvec_qrdmlsh_s32)(void *vd, void *vn, void *vm,
                              void *vq, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    int32_t *d = vd;
    int32_t *n = vn;
    int32_t *m = vm;
    uintptr_t i;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], d[i], true, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* NEON SQDMULH, 32-bit elements: no accumulator, no rounding. */
void HELPER(neon_sqdmulh_s)(void *vd, void *vn, void *vm,
                            void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* NEON SQRDMULH, 32-bit elements: no accumulator, with rounding. */
void HELPER(neon_sqrdmulh_s)(void *vd, void *vn, void *vm,
                             void *vq, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, vq);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
476 | ||
/* SVE2 SQRDMLAH, word elements; SVE2 does not set QC, so sat is discarded. */
void HELPER(sve2_sqrdmlah_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], false, true, &discard);
    }
}

/* SVE2 SQRDMLSH, word elements. */
void HELPER(sve2_sqrdmlsh_s)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm, *a = va;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], a[i], true, true, &discard);
    }
}

/* SVE2 SQDMULH, word elements. */
void HELPER(sve2_sqdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, false, &discard);
    }
}

/* SVE2 SQRDMULH, word elements. */
void HELPER(sve2_sqrdmulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = do_sqrdmlah_s(n[i], m[i], 0, false, true, &discard);
    }
}

/*
 * Indexed SQDMULH: one multiplicand is taken from lane 'idx' of each
 * 16-byte segment of vm (16 / 4 = 4 words per segment).
 */
void HELPER(sve2_sqdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, false, &discard);
        }
    }
}

/* Indexed SQRDMULH, word elements. */
void HELPER(sve2_sqrdmulh_idx_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int32_t *d = vd, *n = vn, *m = (int32_t *)vm + H4(idx);
    uint32_t discard;

    for (i = 0; i < opr_sz / 4; i += 16 / 4) {
        int32_t mm = m[i];
        for (j = 0; j < 16 / 4; ++j) {
            d[i + j] = do_sqrdmlah_s(n[i + j], mm, 0, false, true, &discard);
        }
    }
}
552 | ||
/* Signed saturating rounding doubling multiply-accumulate high half, 64-bit */
static int64_t do_sat128_d(Int128 r)
{
    int64_t ls = int128_getlo(r);
    int64_t hs = int128_gethi(r);

    /*
     * The 128-bit value fits in 64 bits iff the high half is exactly
     * the sign extension of the low half; otherwise clamp.
     */
    if (unlikely(hs != (ls >> 63))) {
        return hs < 0 ? INT64_MIN : INT64_MAX;
    }
    return ls;
}

int64_t do_sqrdmlah_d(int64_t n, int64_t m, int64_t a, bool neg, bool round)
{
    uint64_t l, h;
    Int128 r, t;

    /* As in do_sqrdmlah_b, but with 128-bit arithmetic. */
    muls64(&l, &h, m, n);
    r = int128_make128(l, h);
    if (neg) {
        r = int128_neg(r);
    }
    if (a) {
        /* Accumulate (a << 63) into the 128-bit product. */
        t = int128_exts64(a);
        t = int128_lshift(t, 63);
        r = int128_add(r, t);
    }
    if (round) {
        /* Rounding constant: half of the final >> 63. */
        t = int128_exts64(1ll << 62);
        r = int128_add(r, t);
    }
    r = int128_rshift(r, 63);

    return do_sat128_d(r);
}
589 | ||
/* SVE2 SQRDMLAH, doubleword elements. */
void HELPER(sve2_sqrdmlah_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], false, true);
    }
}

/* SVE2 SQRDMLSH, doubleword elements. */
void HELPER(sve2_sqrdmlsh_d)(void *vd, void *vn, void *vm,
                             void *va, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm, *a = va;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], a[i], true, true);
    }
}

/* SVE2 SQDMULH, doubleword elements. */
void HELPER(sve2_sqdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, false);
    }
}

/* SVE2 SQRDMULH, doubleword elements. */
void HELPER(sve2_sqrdmulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = do_sqrdmlah_d(n[i], m[i], 0, false, true);
    }
}

/*
 * Indexed SQDMULH: one multiplicand is taken from lane 'idx' of each
 * 16-byte segment of vm (16 / 8 = 2 doublewords per segment).
 * No H macro is needed for 64-bit elements.
 */
void HELPER(sve2_sqdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, false);
        }
    }
}

/* Indexed SQRDMULH, doubleword elements. */
void HELPER(sve2_sqrdmulh_idx_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    int idx = simd_data(desc);
    int64_t *d = vd, *n = vn, *m = (int64_t *)vm + idx;

    for (i = 0; i < opr_sz / 8; i += 16 / 8) {
        int64_t mm = m[i];
        for (j = 0; j < 16 / 8; ++j) {
            d[i + j] = do_sqrdmlah_d(n[i + j], mm, 0, false, true);
        }
    }
}
659 | ||
/* Integer 8 and 16-bit dot-product.
 *
 * Note that for the loops herein, host endianness does not matter
 * with respect to the ordering of data within the quad-width lanes.
 * All elements are treated equally, no matter where they are.
 */

/*
 * Each TYPED destination element accumulates the four products of the
 * corresponding quad of narrow TYPEN/TYPEM elements.
 */
#define DO_DOT(NAME, TYPED, TYPEN, TYPEM) \
void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc)  \
{                                                                         \
    intptr_t i, opr_sz = simd_oprsz(desc);                                \
    TYPED *d = vd, *a = va;                                               \
    TYPEN *n = vn;                                                        \
    TYPEM *m = vm;                                                        \
    for (i = 0; i < opr_sz / sizeof(TYPED); ++i) {                        \
        d[i] = (a[i] +                                                    \
                (TYPED)n[i * 4 + 0] * m[i * 4 + 0] +                      \
                (TYPED)n[i * 4 + 1] * m[i * 4 + 1] +                      \
                (TYPED)n[i * 4 + 2] * m[i * 4 + 2] +                      \
                (TYPED)n[i * 4 + 3] * m[i * 4 + 3]);                      \
    }                                                                     \
    clear_tail(d, opr_sz, simd_maxsz(desc));                              \
}

DO_DOT(gvec_sdot_b, int32_t, int8_t, int8_t)
DO_DOT(gvec_udot_b, uint32_t, uint8_t, uint8_t)
DO_DOT(gvec_usdot_b, uint32_t, uint8_t, int8_t)
DO_DOT(gvec_sdot_h, int64_t, int16_t, int16_t)
DO_DOT(gvec_udot_h, uint64_t, uint16_t, uint16_t)
d730ecaa | 689 | |
7020ffd6 RH |
690 | #define DO_DOT_IDX(NAME, TYPED, TYPEN, TYPEM, HD) \ |
691 | void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ | |
692 | { \ | |
693 | intptr_t i = 0, opr_sz = simd_oprsz(desc); \ | |
694 | intptr_t opr_sz_n = opr_sz / sizeof(TYPED); \ | |
695 | intptr_t segend = MIN(16 / sizeof(TYPED), opr_sz_n); \ | |
696 | intptr_t index = simd_data(desc); \ | |
697 | TYPED *d = vd, *a = va; \ | |
698 | TYPEN *n = vn; \ | |
699 | TYPEM *m_indexed = (TYPEM *)vm + HD(index) * 4; \ | |
700 | do { \ | |
701 | TYPED m0 = m_indexed[i * 4 + 0]; \ | |
702 | TYPED m1 = m_indexed[i * 4 + 1]; \ | |
703 | TYPED m2 = m_indexed[i * 4 + 2]; \ | |
704 | TYPED m3 = m_indexed[i * 4 + 3]; \ | |
705 | do { \ | |
706 | d[i] = (a[i] + \ | |
707 | n[i * 4 + 0] * m0 + \ | |
708 | n[i * 4 + 1] * m1 + \ | |
709 | n[i * 4 + 2] * m2 + \ | |
710 | n[i * 4 + 3] * m3); \ | |
711 | } while (++i < segend); \ | |
712 | segend = i + 4; \ | |
713 | } while (i < opr_sz_n); \ | |
714 | clear_tail(d, opr_sz, simd_maxsz(desc)); \ | |
16fcfdc7 RH |
715 | } |
716 | ||
7020ffd6 RH |
717 | DO_DOT_IDX(gvec_sdot_idx_b, int32_t, int8_t, int8_t, H4) |
718 | DO_DOT_IDX(gvec_udot_idx_b, uint32_t, uint8_t, uint8_t, H4) | |
2867039a RH |
719 | DO_DOT_IDX(gvec_sudot_idx_b, int32_t, int8_t, uint8_t, H4) |
720 | DO_DOT_IDX(gvec_usdot_idx_b, int32_t, uint8_t, int8_t, H4) | |
6e802db3 PM |
721 | DO_DOT_IDX(gvec_sdot_idx_h, int64_t, int16_t, int16_t, H8) |
722 | DO_DOT_IDX(gvec_udot_idx_h, uint64_t, uint16_t, uint16_t, H8) | |
16fcfdc7 | 723 | |
/*
 * FCADD, half-precision: complex add with rotate on pairs of adjacent
 * elements.  The desc bit selects whether the real or the imaginary
 * summand is negated (rotate 90 vs 270).
 */
void HELPER(gvec_fcaddh)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float16 *d = vd;
    float16 *n = vn;
    float16 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 15;
    neg_imag <<= 15;

    for (i = 0; i < opr_sz / 2; i += 2) {
        float16 e0 = n[H2(i)];
        float16 e1 = m[H2(i + 1)] ^ neg_imag;
        float16 e2 = n[H2(i + 1)];
        float16 e3 = m[H2(i)] ^ neg_real;

        d[H2(i)] = float16_add(e0, e1, fpst);
        d[H2(i + 1)] = float16_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* FCADD, single-precision; see gvec_fcaddh. */
void HELPER(gvec_fcadds)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float32 *d = vd;
    float32 *n = vn;
    float32 *m = vm;
    float_status *fpst = vfpst;
    uint32_t neg_real = extract32(desc, SIMD_DATA_SHIFT, 1);
    uint32_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 31;
    neg_imag <<= 31;

    for (i = 0; i < opr_sz / 4; i += 2) {
        float32 e0 = n[H4(i)];
        float32 e1 = m[H4(i + 1)] ^ neg_imag;
        float32 e2 = n[H4(i + 1)];
        float32 e3 = m[H4(i)] ^ neg_real;

        d[H4(i)] = float32_add(e0, e1, fpst);
        d[H4(i + 1)] = float32_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* FCADD, double-precision; no H macro is needed for 64-bit elements. */
void HELPER(gvec_fcaddd)(void *vd, void *vn, void *vm,
                         void *vfpst, uint32_t desc)
{
    uintptr_t opr_sz = simd_oprsz(desc);
    float64 *d = vd;
    float64 *n = vn;
    float64 *m = vm;
    float_status *fpst = vfpst;
    uint64_t neg_real = extract64(desc, SIMD_DATA_SHIFT, 1);
    uint64_t neg_imag = neg_real ^ 1;
    uintptr_t i;

    /* Shift boolean to the sign bit so we can xor to negate. */
    neg_real <<= 63;
    neg_imag <<= 63;

    for (i = 0; i < opr_sz / 8; i += 2) {
        float64 e0 = n[i];
        float64 e1 = m[i + 1] ^ neg_imag;
        float64 e2 = n[i + 1];
        float64 e3 = m[i] ^ neg_real;

        d[i] = float64_add(e0, e1, fpst);
        d[i + 1] = float64_add(e2, e3, fpst);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
d17b7cdc | 807 | |
636ddeb1 | 808 | void HELPER(gvec_fcmlah)(void *vd, void *vn, void *vm, void *va, |
d17b7cdc RH |
809 | void *vfpst, uint32_t desc) |
810 | { | |
811 | uintptr_t opr_sz = simd_oprsz(desc); | |
636ddeb1 | 812 | float16 *d = vd, *n = vn, *m = vm, *a = va; |
d17b7cdc RH |
813 | float_status *fpst = vfpst; |
814 | intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); | |
815 | uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); | |
816 | uint32_t neg_real = flip ^ neg_imag; | |
817 | uintptr_t i; | |
818 | ||
819 | /* Shift boolean to the sign bit so we can xor to negate. */ | |
820 | neg_real <<= 15; | |
821 | neg_imag <<= 15; | |
822 | ||
823 | for (i = 0; i < opr_sz / 2; i += 2) { | |
824 | float16 e2 = n[H2(i + flip)]; | |
825 | float16 e1 = m[H2(i + flip)] ^ neg_real; | |
826 | float16 e4 = e2; | |
827 | float16 e3 = m[H2(i + 1 - flip)] ^ neg_imag; | |
828 | ||
636ddeb1 RH |
829 | d[H2(i)] = float16_muladd(e2, e1, a[H2(i)], 0, fpst); |
830 | d[H2(i + 1)] = float16_muladd(e4, e3, a[H2(i + 1)], 0, fpst); | |
d17b7cdc RH |
831 | } |
832 | clear_tail(d, opr_sz, simd_maxsz(desc)); | |
833 | } | |
834 | ||
636ddeb1 | 835 | void HELPER(gvec_fcmlah_idx)(void *vd, void *vn, void *vm, void *va, |
d17b7cdc RH |
836 | void *vfpst, uint32_t desc) |
837 | { | |
838 | uintptr_t opr_sz = simd_oprsz(desc); | |
636ddeb1 | 839 | float16 *d = vd, *n = vn, *m = vm, *a = va; |
d17b7cdc RH |
840 | float_status *fpst = vfpst; |
841 | intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); | |
842 | uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); | |
2cc99919 | 843 | intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); |
d17b7cdc | 844 | uint32_t neg_real = flip ^ neg_imag; |
18fc2405 RH |
845 | intptr_t elements = opr_sz / sizeof(float16); |
846 | intptr_t eltspersegment = 16 / sizeof(float16); | |
847 | intptr_t i, j; | |
d17b7cdc RH |
848 | |
849 | /* Shift boolean to the sign bit so we can xor to negate. */ | |
850 | neg_real <<= 15; | |
851 | neg_imag <<= 15; | |
d17b7cdc | 852 | |
18fc2405 RH |
853 | for (i = 0; i < elements; i += eltspersegment) { |
854 | float16 mr = m[H2(i + 2 * index + 0)]; | |
855 | float16 mi = m[H2(i + 2 * index + 1)]; | |
856 | float16 e1 = neg_real ^ (flip ? mi : mr); | |
857 | float16 e3 = neg_imag ^ (flip ? mr : mi); | |
d17b7cdc | 858 | |
18fc2405 RH |
859 | for (j = i; j < i + eltspersegment; j += 2) { |
860 | float16 e2 = n[H2(j + flip)]; | |
861 | float16 e4 = e2; | |
862 | ||
636ddeb1 RH |
863 | d[H2(j)] = float16_muladd(e2, e1, a[H2(j)], 0, fpst); |
864 | d[H2(j + 1)] = float16_muladd(e4, e3, a[H2(j + 1)], 0, fpst); | |
18fc2405 | 865 | } |
d17b7cdc RH |
866 | } |
867 | clear_tail(d, opr_sz, simd_maxsz(desc)); | |
868 | } | |
869 | ||
636ddeb1 | 870 | void HELPER(gvec_fcmlas)(void *vd, void *vn, void *vm, void *va, |
d17b7cdc RH |
871 | void *vfpst, uint32_t desc) |
872 | { | |
873 | uintptr_t opr_sz = simd_oprsz(desc); | |
636ddeb1 | 874 | float32 *d = vd, *n = vn, *m = vm, *a = va; |
d17b7cdc RH |
875 | float_status *fpst = vfpst; |
876 | intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); | |
877 | uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); | |
878 | uint32_t neg_real = flip ^ neg_imag; | |
879 | uintptr_t i; | |
880 | ||
881 | /* Shift boolean to the sign bit so we can xor to negate. */ | |
882 | neg_real <<= 31; | |
883 | neg_imag <<= 31; | |
884 | ||
885 | for (i = 0; i < opr_sz / 4; i += 2) { | |
886 | float32 e2 = n[H4(i + flip)]; | |
887 | float32 e1 = m[H4(i + flip)] ^ neg_real; | |
888 | float32 e4 = e2; | |
889 | float32 e3 = m[H4(i + 1 - flip)] ^ neg_imag; | |
890 | ||
636ddeb1 RH |
891 | d[H4(i)] = float32_muladd(e2, e1, a[H4(i)], 0, fpst); |
892 | d[H4(i + 1)] = float32_muladd(e4, e3, a[H4(i + 1)], 0, fpst); | |
d17b7cdc RH |
893 | } |
894 | clear_tail(d, opr_sz, simd_maxsz(desc)); | |
895 | } | |
896 | ||
636ddeb1 | 897 | void HELPER(gvec_fcmlas_idx)(void *vd, void *vn, void *vm, void *va, |
d17b7cdc RH |
898 | void *vfpst, uint32_t desc) |
899 | { | |
900 | uintptr_t opr_sz = simd_oprsz(desc); | |
636ddeb1 | 901 | float32 *d = vd, *n = vn, *m = vm, *a = va; |
d17b7cdc RH |
902 | float_status *fpst = vfpst; |
903 | intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); | |
904 | uint32_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); | |
2cc99919 | 905 | intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 2, 2); |
d17b7cdc | 906 | uint32_t neg_real = flip ^ neg_imag; |
18fc2405 RH |
907 | intptr_t elements = opr_sz / sizeof(float32); |
908 | intptr_t eltspersegment = 16 / sizeof(float32); | |
909 | intptr_t i, j; | |
d17b7cdc RH |
910 | |
911 | /* Shift boolean to the sign bit so we can xor to negate. */ | |
912 | neg_real <<= 31; | |
913 | neg_imag <<= 31; | |
d17b7cdc | 914 | |
18fc2405 RH |
915 | for (i = 0; i < elements; i += eltspersegment) { |
916 | float32 mr = m[H4(i + 2 * index + 0)]; | |
917 | float32 mi = m[H4(i + 2 * index + 1)]; | |
918 | float32 e1 = neg_real ^ (flip ? mi : mr); | |
919 | float32 e3 = neg_imag ^ (flip ? mr : mi); | |
d17b7cdc | 920 | |
18fc2405 RH |
921 | for (j = i; j < i + eltspersegment; j += 2) { |
922 | float32 e2 = n[H4(j + flip)]; | |
923 | float32 e4 = e2; | |
924 | ||
636ddeb1 RH |
925 | d[H4(j)] = float32_muladd(e2, e1, a[H4(j)], 0, fpst); |
926 | d[H4(j + 1)] = float32_muladd(e4, e3, a[H4(j + 1)], 0, fpst); | |
18fc2405 | 927 | } |
d17b7cdc RH |
928 | } |
929 | clear_tail(d, opr_sz, simd_maxsz(desc)); | |
930 | } | |
931 | ||
636ddeb1 | 932 | void HELPER(gvec_fcmlad)(void *vd, void *vn, void *vm, void *va, |
d17b7cdc RH |
933 | void *vfpst, uint32_t desc) |
934 | { | |
935 | uintptr_t opr_sz = simd_oprsz(desc); | |
636ddeb1 | 936 | float64 *d = vd, *n = vn, *m = vm, *a = va; |
d17b7cdc RH |
937 | float_status *fpst = vfpst; |
938 | intptr_t flip = extract32(desc, SIMD_DATA_SHIFT, 1); | |
939 | uint64_t neg_imag = extract32(desc, SIMD_DATA_SHIFT + 1, 1); | |
940 | uint64_t neg_real = flip ^ neg_imag; | |
941 | uintptr_t i; | |
942 | ||
943 | /* Shift boolean to the sign bit so we can xor to negate. */ | |
944 | neg_real <<= 63; | |
945 | neg_imag <<= 63; | |
946 | ||
947 | for (i = 0; i < opr_sz / 8; i += 2) { | |
948 | float64 e2 = n[i + flip]; | |
949 | float64 e1 = m[i + flip] ^ neg_real; | |
950 | float64 e4 = e2; | |
951 | float64 e3 = m[i + 1 - flip] ^ neg_imag; | |
952 | ||
636ddeb1 RH |
953 | d[i] = float64_muladd(e2, e1, a[i], 0, fpst); |
954 | d[i + 1] = float64_muladd(e4, e3, a[i + 1], 0, fpst); | |
d17b7cdc RH |
955 | } |
956 | clear_tail(d, opr_sz, simd_maxsz(desc)); | |
957 | } | |
29b80469 | 958 | |
ad505db2 PM |
959 | /* |
960 | * Floating point comparisons producing an integer result (all 1s or all 0s). | |
961 | * Note that EQ doesn't signal InvalidOp for QNaNs but GE and GT do. | |
962 | * Softfloat routines return 0/1, which we convert to the 0/-1 Neon requires. | |
963 | */ | |
964 | static uint16_t float16_ceq(float16 op1, float16 op2, float_status *stat) | |
965 | { | |
966 | return -float16_eq_quiet(op1, op2, stat); | |
967 | } | |
968 | ||
969 | static uint32_t float32_ceq(float32 op1, float32 op2, float_status *stat) | |
970 | { | |
971 | return -float32_eq_quiet(op1, op2, stat); | |
972 | } | |
973 | ||
974 | static uint16_t float16_cge(float16 op1, float16 op2, float_status *stat) | |
975 | { | |
976 | return -float16_le(op2, op1, stat); | |
977 | } | |
978 | ||
979 | static uint32_t float32_cge(float32 op1, float32 op2, float_status *stat) | |
980 | { | |
981 | return -float32_le(op2, op1, stat); | |
982 | } | |
983 | ||
984 | static uint16_t float16_cgt(float16 op1, float16 op2, float_status *stat) | |
985 | { | |
986 | return -float16_lt(op2, op1, stat); | |
987 | } | |
988 | ||
989 | static uint32_t float32_cgt(float32 op1, float32 op2, float_status *stat) | |
990 | { | |
991 | return -float32_lt(op2, op1, stat); | |
992 | } | |
993 | ||
bb2741da PM |
994 | static uint16_t float16_acge(float16 op1, float16 op2, float_status *stat) |
995 | { | |
996 | return -float16_le(float16_abs(op2), float16_abs(op1), stat); | |
997 | } | |
998 | ||
999 | static uint32_t float32_acge(float32 op1, float32 op2, float_status *stat) | |
1000 | { | |
1001 | return -float32_le(float32_abs(op2), float32_abs(op1), stat); | |
1002 | } | |
1003 | ||
1004 | static uint16_t float16_acgt(float16 op1, float16 op2, float_status *stat) | |
1005 | { | |
1006 | return -float16_lt(float16_abs(op2), float16_abs(op1), stat); | |
1007 | } | |
1008 | ||
1009 | static uint32_t float32_acgt(float32 op1, float32 op2, float_status *stat) | |
1010 | { | |
1011 | return -float32_lt(float32_abs(op2), float32_abs(op1), stat); | |
1012 | } | |
1013 | ||
7782a9af PM |
1014 | static int16_t vfp_tosszh(float16 x, void *fpstp) |
1015 | { | |
1016 | float_status *fpst = fpstp; | |
1017 | if (float16_is_any_nan(x)) { | |
1018 | float_raise(float_flag_invalid, fpst); | |
1019 | return 0; | |
1020 | } | |
1021 | return float16_to_int16_round_to_zero(x, fpst); | |
1022 | } | |
1023 | ||
1024 | static uint16_t vfp_touszh(float16 x, void *fpstp) | |
1025 | { | |
1026 | float_status *fpst = fpstp; | |
1027 | if (float16_is_any_nan(x)) { | |
1028 | float_raise(float_flag_invalid, fpst); | |
1029 | return 0; | |
1030 | } | |
1031 | return float16_to_uint16_round_to_zero(x, fpst); | |
1032 | } | |
1033 | ||
3887c038 RH |
1034 | #define DO_2OP(NAME, FUNC, TYPE) \ |
1035 | void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc) \ | |
1036 | { \ | |
1037 | intptr_t i, oprsz = simd_oprsz(desc); \ | |
1038 | TYPE *d = vd, *n = vn; \ | |
1039 | for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ | |
1040 | d[i] = FUNC(n[i], stat); \ | |
1041 | } \ | |
d8efe78e | 1042 | clear_tail(d, oprsz, simd_maxsz(desc)); \ |
3887c038 RH |
1043 | } |
1044 | ||
1045 | DO_2OP(gvec_frecpe_h, helper_recpe_f16, float16) | |
1046 | DO_2OP(gvec_frecpe_s, helper_recpe_f32, float32) | |
1047 | DO_2OP(gvec_frecpe_d, helper_recpe_f64, float64) | |
1048 | ||
1049 | DO_2OP(gvec_frsqrte_h, helper_rsqrte_f16, float16) | |
1050 | DO_2OP(gvec_frsqrte_s, helper_rsqrte_f32, float32) | |
1051 | DO_2OP(gvec_frsqrte_d, helper_rsqrte_f64, float64) | |
1052 | ||
23afcdd2 PM |
1053 | DO_2OP(gvec_vrintx_h, float16_round_to_int, float16) |
1054 | DO_2OP(gvec_vrintx_s, float32_round_to_int, float32) | |
1055 | ||
7782a9af PM |
1056 | DO_2OP(gvec_sitos, helper_vfp_sitos, int32_t) |
1057 | DO_2OP(gvec_uitos, helper_vfp_uitos, uint32_t) | |
1058 | DO_2OP(gvec_tosizs, helper_vfp_tosizs, float32) | |
1059 | DO_2OP(gvec_touizs, helper_vfp_touizs, float32) | |
1060 | DO_2OP(gvec_sstoh, int16_to_float16, int16_t) | |
1061 | DO_2OP(gvec_ustoh, uint16_to_float16, uint16_t) | |
1062 | DO_2OP(gvec_tosszh, vfp_tosszh, float16) | |
1063 | DO_2OP(gvec_touszh, vfp_touszh, float16) | |
1064 | ||
635187aa PM |
1065 | #define WRAP_CMP0_FWD(FN, CMPOP, TYPE) \ |
1066 | static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \ | |
1067 | { \ | |
1068 | return TYPE##_##CMPOP(op, TYPE##_zero, stat); \ | |
1069 | } | |
1070 | ||
1071 | #define WRAP_CMP0_REV(FN, CMPOP, TYPE) \ | |
1072 | static TYPE TYPE##_##FN##0(TYPE op, float_status *stat) \ | |
1073 | { \ | |
1074 | return TYPE##_##CMPOP(TYPE##_zero, op, stat); \ | |
1075 | } | |
1076 | ||
1077 | #define DO_2OP_CMP0(FN, CMPOP, DIRN) \ | |
1078 | WRAP_CMP0_##DIRN(FN, CMPOP, float16) \ | |
1079 | WRAP_CMP0_##DIRN(FN, CMPOP, float32) \ | |
1080 | DO_2OP(gvec_f##FN##0_h, float16_##FN##0, float16) \ | |
1081 | DO_2OP(gvec_f##FN##0_s, float32_##FN##0, float32) | |
1082 | ||
1083 | DO_2OP_CMP0(cgt, cgt, FWD) | |
1084 | DO_2OP_CMP0(cge, cge, FWD) | |
1085 | DO_2OP_CMP0(ceq, ceq, FWD) | |
1086 | DO_2OP_CMP0(clt, cgt, REV) | |
1087 | DO_2OP_CMP0(cle, cge, REV) | |
1088 | ||
3887c038 | 1089 | #undef DO_2OP |
635187aa | 1090 | #undef DO_2OP_CMP0 |
3887c038 | 1091 | |
29b80469 RH |
1092 | /* Floating-point trigonometric starting value. |
1093 | * See the ARM ARM pseudocode function FPTrigSMul. | |
1094 | */ | |
1095 | static float16 float16_ftsmul(float16 op1, uint16_t op2, float_status *stat) | |
1096 | { | |
1097 | float16 result = float16_mul(op1, op1, stat); | |
1098 | if (!float16_is_any_nan(result)) { | |
1099 | result = float16_set_sign(result, op2 & 1); | |
1100 | } | |
1101 | return result; | |
1102 | } | |
1103 | ||
1104 | static float32 float32_ftsmul(float32 op1, uint32_t op2, float_status *stat) | |
1105 | { | |
1106 | float32 result = float32_mul(op1, op1, stat); | |
1107 | if (!float32_is_any_nan(result)) { | |
1108 | result = float32_set_sign(result, op2 & 1); | |
1109 | } | |
1110 | return result; | |
1111 | } | |
1112 | ||
1113 | static float64 float64_ftsmul(float64 op1, uint64_t op2, float_status *stat) | |
1114 | { | |
1115 | float64 result = float64_mul(op1, op1, stat); | |
1116 | if (!float64_is_any_nan(result)) { | |
1117 | result = float64_set_sign(result, op2 & 1); | |
1118 | } | |
1119 | return result; | |
1120 | } | |
1121 | ||
e4a6d4a6 PM |
1122 | static float16 float16_abd(float16 op1, float16 op2, float_status *stat) |
1123 | { | |
1124 | return float16_abs(float16_sub(op1, op2, stat)); | |
1125 | } | |
1126 | ||
a26a352b PM |
1127 | static float32 float32_abd(float32 op1, float32 op2, float_status *stat) |
1128 | { | |
1129 | return float32_abs(float32_sub(op1, op2, stat)); | |
1130 | } | |
1131 | ||
ac8c62c4 PM |
1132 | /* |
1133 | * Reciprocal step. These are the AArch32 version which uses a | |
1134 | * non-fused multiply-and-subtract. | |
1135 | */ | |
1136 | static float16 float16_recps_nf(float16 op1, float16 op2, float_status *stat) | |
1137 | { | |
1138 | op1 = float16_squash_input_denormal(op1, stat); | |
1139 | op2 = float16_squash_input_denormal(op2, stat); | |
1140 | ||
1141 | if ((float16_is_infinity(op1) && float16_is_zero(op2)) || | |
1142 | (float16_is_infinity(op2) && float16_is_zero(op1))) { | |
1143 | return float16_two; | |
1144 | } | |
1145 | return float16_sub(float16_two, float16_mul(op1, op2, stat), stat); | |
1146 | } | |
1147 | ||
1148 | static float32 float32_recps_nf(float32 op1, float32 op2, float_status *stat) | |
1149 | { | |
1150 | op1 = float32_squash_input_denormal(op1, stat); | |
1151 | op2 = float32_squash_input_denormal(op2, stat); | |
1152 | ||
1153 | if ((float32_is_infinity(op1) && float32_is_zero(op2)) || | |
1154 | (float32_is_infinity(op2) && float32_is_zero(op1))) { | |
1155 | return float32_two; | |
1156 | } | |
1157 | return float32_sub(float32_two, float32_mul(op1, op2, stat), stat); | |
1158 | } | |
1159 | ||
40fde72d PM |
1160 | /* Reciprocal square-root step. AArch32 non-fused semantics. */ |
1161 | static float16 float16_rsqrts_nf(float16 op1, float16 op2, float_status *stat) | |
1162 | { | |
1163 | op1 = float16_squash_input_denormal(op1, stat); | |
1164 | op2 = float16_squash_input_denormal(op2, stat); | |
1165 | ||
1166 | if ((float16_is_infinity(op1) && float16_is_zero(op2)) || | |
1167 | (float16_is_infinity(op2) && float16_is_zero(op1))) { | |
1168 | return float16_one_point_five; | |
1169 | } | |
1170 | op1 = float16_sub(float16_three, float16_mul(op1, op2, stat), stat); | |
1171 | return float16_div(op1, float16_two, stat); | |
1172 | } | |
1173 | ||
1174 | static float32 float32_rsqrts_nf(float32 op1, float32 op2, float_status *stat) | |
1175 | { | |
1176 | op1 = float32_squash_input_denormal(op1, stat); | |
1177 | op2 = float32_squash_input_denormal(op2, stat); | |
1178 | ||
1179 | if ((float32_is_infinity(op1) && float32_is_zero(op2)) || | |
1180 | (float32_is_infinity(op2) && float32_is_zero(op1))) { | |
1181 | return float32_one_point_five; | |
1182 | } | |
1183 | op1 = float32_sub(float32_three, float32_mul(op1, op2, stat), stat); | |
1184 | return float32_div(op1, float32_two, stat); | |
1185 | } | |
1186 | ||
29b80469 RH |
1187 | #define DO_3OP(NAME, FUNC, TYPE) \ |
1188 | void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ | |
1189 | { \ | |
1190 | intptr_t i, oprsz = simd_oprsz(desc); \ | |
1191 | TYPE *d = vd, *n = vn, *m = vm; \ | |
1192 | for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ | |
1193 | d[i] = FUNC(n[i], m[i], stat); \ | |
1194 | } \ | |
d8efe78e | 1195 | clear_tail(d, oprsz, simd_maxsz(desc)); \ |
29b80469 RH |
1196 | } |
1197 | ||
1198 | DO_3OP(gvec_fadd_h, float16_add, float16) | |
1199 | DO_3OP(gvec_fadd_s, float32_add, float32) | |
1200 | DO_3OP(gvec_fadd_d, float64_add, float64) | |
1201 | ||
1202 | DO_3OP(gvec_fsub_h, float16_sub, float16) | |
1203 | DO_3OP(gvec_fsub_s, float32_sub, float32) | |
1204 | DO_3OP(gvec_fsub_d, float64_sub, float64) | |
1205 | ||
1206 | DO_3OP(gvec_fmul_h, float16_mul, float16) | |
1207 | DO_3OP(gvec_fmul_s, float32_mul, float32) | |
1208 | DO_3OP(gvec_fmul_d, float64_mul, float64) | |
1209 | ||
1210 | DO_3OP(gvec_ftsmul_h, float16_ftsmul, float16) | |
1211 | DO_3OP(gvec_ftsmul_s, float32_ftsmul, float32) | |
1212 | DO_3OP(gvec_ftsmul_d, float64_ftsmul, float64) | |
1213 | ||
e4a6d4a6 | 1214 | DO_3OP(gvec_fabd_h, float16_abd, float16) |
a26a352b PM |
1215 | DO_3OP(gvec_fabd_s, float32_abd, float32) |
1216 | ||
ad505db2 PM |
1217 | DO_3OP(gvec_fceq_h, float16_ceq, float16) |
1218 | DO_3OP(gvec_fceq_s, float32_ceq, float32) | |
1219 | ||
1220 | DO_3OP(gvec_fcge_h, float16_cge, float16) | |
1221 | DO_3OP(gvec_fcge_s, float32_cge, float32) | |
1222 | ||
1223 | DO_3OP(gvec_fcgt_h, float16_cgt, float16) | |
1224 | DO_3OP(gvec_fcgt_s, float32_cgt, float32) | |
1225 | ||
bb2741da PM |
1226 | DO_3OP(gvec_facge_h, float16_acge, float16) |
1227 | DO_3OP(gvec_facge_s, float32_acge, float32) | |
1228 | ||
1229 | DO_3OP(gvec_facgt_h, float16_acgt, float16) | |
1230 | DO_3OP(gvec_facgt_s, float32_acgt, float32) | |
1231 | ||
e43268c5 PM |
1232 | DO_3OP(gvec_fmax_h, float16_max, float16) |
1233 | DO_3OP(gvec_fmax_s, float32_max, float32) | |
1234 | ||
1235 | DO_3OP(gvec_fmin_h, float16_min, float16) | |
1236 | DO_3OP(gvec_fmin_s, float32_min, float32) | |
1237 | ||
e22705bb PM |
1238 | DO_3OP(gvec_fmaxnum_h, float16_maxnum, float16) |
1239 | DO_3OP(gvec_fmaxnum_s, float32_maxnum, float32) | |
1240 | ||
1241 | DO_3OP(gvec_fminnum_h, float16_minnum, float16) | |
1242 | DO_3OP(gvec_fminnum_s, float32_minnum, float32) | |
1243 | ||
ac8c62c4 PM |
1244 | DO_3OP(gvec_recps_nf_h, float16_recps_nf, float16) |
1245 | DO_3OP(gvec_recps_nf_s, float32_recps_nf, float32) | |
1246 | ||
40fde72d PM |
1247 | DO_3OP(gvec_rsqrts_nf_h, float16_rsqrts_nf, float16) |
1248 | DO_3OP(gvec_rsqrts_nf_s, float32_rsqrts_nf, float32) | |
1249 | ||
29b80469 RH |
1250 | #ifdef TARGET_AARCH64 |
1251 | ||
1252 | DO_3OP(gvec_recps_h, helper_recpsf_f16, float16) | |
1253 | DO_3OP(gvec_recps_s, helper_recpsf_f32, float32) | |
1254 | DO_3OP(gvec_recps_d, helper_recpsf_f64, float64) | |
1255 | ||
1256 | DO_3OP(gvec_rsqrts_h, helper_rsqrtsf_f16, float16) | |
1257 | DO_3OP(gvec_rsqrts_s, helper_rsqrtsf_f32, float32) | |
1258 | DO_3OP(gvec_rsqrts_d, helper_rsqrtsf_f64, float64) | |
1259 | ||
1260 | #endif | |
1261 | #undef DO_3OP | |
ca40a6e6 | 1262 | |
e5adc706 PM |
1263 | /* Non-fused multiply-add (unlike float16_muladd etc, which are fused) */ |
1264 | static float16 float16_muladd_nf(float16 dest, float16 op1, float16 op2, | |
1265 | float_status *stat) | |
1266 | { | |
1267 | return float16_add(dest, float16_mul(op1, op2, stat), stat); | |
1268 | } | |
1269 | ||
1270 | static float32 float32_muladd_nf(float32 dest, float32 op1, float32 op2, | |
1271 | float_status *stat) | |
1272 | { | |
1273 | return float32_add(dest, float32_mul(op1, op2, stat), stat); | |
1274 | } | |
1275 | ||
1276 | static float16 float16_mulsub_nf(float16 dest, float16 op1, float16 op2, | |
1277 | float_status *stat) | |
1278 | { | |
1279 | return float16_sub(dest, float16_mul(op1, op2, stat), stat); | |
1280 | } | |
1281 | ||
1282 | static float32 float32_mulsub_nf(float32 dest, float32 op1, float32 op2, | |
1283 | float_status *stat) | |
1284 | { | |
1285 | return float32_sub(dest, float32_mul(op1, op2, stat), stat); | |
1286 | } | |
1287 | ||
cf722d75 PM |
1288 | /* Fused versions; these have the semantics Neon VFMA/VFMS want */ |
1289 | static float16 float16_muladd_f(float16 dest, float16 op1, float16 op2, | |
1290 | float_status *stat) | |
1291 | { | |
1292 | return float16_muladd(op1, op2, dest, 0, stat); | |
1293 | } | |
1294 | ||
1295 | static float32 float32_muladd_f(float32 dest, float32 op1, float32 op2, | |
1296 | float_status *stat) | |
1297 | { | |
1298 | return float32_muladd(op1, op2, dest, 0, stat); | |
1299 | } | |
1300 | ||
1301 | static float16 float16_mulsub_f(float16 dest, float16 op1, float16 op2, | |
1302 | float_status *stat) | |
1303 | { | |
1304 | return float16_muladd(float16_chs(op1), op2, dest, 0, stat); | |
1305 | } | |
1306 | ||
1307 | static float32 float32_mulsub_f(float32 dest, float32 op1, float32 op2, | |
1308 | float_status *stat) | |
1309 | { | |
1310 | return float32_muladd(float32_chs(op1), op2, dest, 0, stat); | |
1311 | } | |
1312 | ||
1313 | #define DO_MULADD(NAME, FUNC, TYPE) \ | |
e5adc706 PM |
1314 | void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ |
1315 | { \ | |
1316 | intptr_t i, oprsz = simd_oprsz(desc); \ | |
1317 | TYPE *d = vd, *n = vn, *m = vm; \ | |
1318 | for (i = 0; i < oprsz / sizeof(TYPE); i++) { \ | |
1319 | d[i] = FUNC(d[i], n[i], m[i], stat); \ | |
1320 | } \ | |
1321 | clear_tail(d, oprsz, simd_maxsz(desc)); \ | |
1322 | } | |
1323 | ||
1324 | DO_MULADD(gvec_fmla_h, float16_muladd_nf, float16) | |
1325 | DO_MULADD(gvec_fmla_s, float32_muladd_nf, float32) | |
1326 | ||
1327 | DO_MULADD(gvec_fmls_h, float16_mulsub_nf, float16) | |
1328 | DO_MULADD(gvec_fmls_s, float32_mulsub_nf, float32) | |
1329 | ||
cf722d75 PM |
1330 | DO_MULADD(gvec_vfma_h, float16_muladd_f, float16) |
1331 | DO_MULADD(gvec_vfma_s, float32_muladd_f, float32) | |
1332 | ||
1333 | DO_MULADD(gvec_vfms_h, float16_mulsub_f, float16) | |
1334 | DO_MULADD(gvec_vfms_s, float32_mulsub_f, float32) | |
1335 | ||
ca40a6e6 RH |
1336 | /* For the indexed ops, SVE applies the index per 128-bit vector segment. |
1337 | * For AdvSIMD, there is of course only one such vector segment. | |
1338 | */ | |
1339 | ||
1340 | #define DO_MUL_IDX(NAME, TYPE, H) \ | |
2e5a265e RH |
1341 | void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc) \ |
1342 | { \ | |
d7ce81e5 PM |
1343 | intptr_t i, j, oprsz = simd_oprsz(desc); \ |
1344 | intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ | |
2e5a265e RH |
1345 | intptr_t idx = simd_data(desc); \ |
1346 | TYPE *d = vd, *n = vn, *m = vm; \ | |
1347 | for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ | |
1348 | TYPE mm = m[H(i + idx)]; \ | |
1349 | for (j = 0; j < segment; j++) { \ | |
1350 | d[i + j] = n[i + j] * mm; \ | |
1351 | } \ | |
1352 | } \ | |
1353 | clear_tail(d, oprsz, simd_maxsz(desc)); \ | |
1354 | } | |
1355 | ||
1356 | DO_MUL_IDX(gvec_mul_idx_h, uint16_t, H2) | |
1357 | DO_MUL_IDX(gvec_mul_idx_s, uint32_t, H4) | |
6e802db3 | 1358 | DO_MUL_IDX(gvec_mul_idx_d, uint64_t, H8) |
2e5a265e RH |
1359 | |
1360 | #undef DO_MUL_IDX | |
1361 | ||
3607440c RH |
1362 | #define DO_MLA_IDX(NAME, TYPE, OP, H) \ |
1363 | void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \ | |
1364 | { \ | |
d7ce81e5 PM |
1365 | intptr_t i, j, oprsz = simd_oprsz(desc); \ |
1366 | intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ | |
3607440c RH |
1367 | intptr_t idx = simd_data(desc); \ |
1368 | TYPE *d = vd, *n = vn, *m = vm, *a = va; \ | |
1369 | for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ | |
1370 | TYPE mm = m[H(i + idx)]; \ | |
1371 | for (j = 0; j < segment; j++) { \ | |
1372 | d[i + j] = a[i + j] OP n[i + j] * mm; \ | |
1373 | } \ | |
1374 | } \ | |
1375 | clear_tail(d, oprsz, simd_maxsz(desc)); \ | |
1376 | } | |
1377 | ||
1378 | DO_MLA_IDX(gvec_mla_idx_h, uint16_t, +, H2) | |
1379 | DO_MLA_IDX(gvec_mla_idx_s, uint32_t, +, H4) | |
6e802db3 | 1380 | DO_MLA_IDX(gvec_mla_idx_d, uint64_t, +, H8) |
3607440c RH |
1381 | |
1382 | DO_MLA_IDX(gvec_mls_idx_h, uint16_t, -, H2) | |
1383 | DO_MLA_IDX(gvec_mls_idx_s, uint32_t, -, H4) | |
6e802db3 | 1384 | DO_MLA_IDX(gvec_mls_idx_d, uint64_t, -, H8) |
3607440c RH |
1385 | |
1386 | #undef DO_MLA_IDX | |
1387 | ||
c50d8d14 | 1388 | #define DO_FMUL_IDX(NAME, ADD, TYPE, H) \ |
ca40a6e6 RH |
1389 | void HELPER(NAME)(void *vd, void *vn, void *vm, void *stat, uint32_t desc) \ |
1390 | { \ | |
d7ce81e5 PM |
1391 | intptr_t i, j, oprsz = simd_oprsz(desc); \ |
1392 | intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ | |
ca40a6e6 RH |
1393 | intptr_t idx = simd_data(desc); \ |
1394 | TYPE *d = vd, *n = vn, *m = vm; \ | |
1395 | for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ | |
1396 | TYPE mm = m[H(i + idx)]; \ | |
1397 | for (j = 0; j < segment; j++) { \ | |
c50d8d14 PM |
1398 | d[i + j] = TYPE##_##ADD(d[i + j], \ |
1399 | TYPE##_mul(n[i + j], mm, stat), stat); \ | |
ca40a6e6 RH |
1400 | } \ |
1401 | } \ | |
525d9b6d | 1402 | clear_tail(d, oprsz, simd_maxsz(desc)); \ |
ca40a6e6 RH |
1403 | } |
1404 | ||
c50d8d14 PM |
1405 | #define float16_nop(N, M, S) (M) |
1406 | #define float32_nop(N, M, S) (M) | |
1407 | #define float64_nop(N, M, S) (M) | |
ca40a6e6 | 1408 | |
c50d8d14 PM |
1409 | DO_FMUL_IDX(gvec_fmul_idx_h, nop, float16, H2) |
1410 | DO_FMUL_IDX(gvec_fmul_idx_s, nop, float32, H4) | |
6e802db3 | 1411 | DO_FMUL_IDX(gvec_fmul_idx_d, nop, float64, H8) |
c50d8d14 PM |
1412 | |
1413 | /* | |
1414 | * Non-fused multiply-accumulate operations, for Neon. NB that unlike | |
1415 | * the fused ops below they assume accumulate both from and into Vd. | |
1416 | */ | |
1417 | DO_FMUL_IDX(gvec_fmla_nf_idx_h, add, float16, H2) | |
1418 | DO_FMUL_IDX(gvec_fmla_nf_idx_s, add, float32, H4) | |
1419 | DO_FMUL_IDX(gvec_fmls_nf_idx_h, sub, float16, H2) | |
1420 | DO_FMUL_IDX(gvec_fmls_nf_idx_s, sub, float32, H4) | |
1421 | ||
1422 | #undef float16_nop | |
1423 | #undef float32_nop | |
1424 | #undef float64_nop | |
2e5a265e | 1425 | #undef DO_FMUL_IDX |
ca40a6e6 RH |
1426 | |
1427 | #define DO_FMLA_IDX(NAME, TYPE, H) \ | |
1428 | void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, \ | |
1429 | void *stat, uint32_t desc) \ | |
1430 | { \ | |
d7ce81e5 PM |
1431 | intptr_t i, j, oprsz = simd_oprsz(desc); \ |
1432 | intptr_t segment = MIN(16, oprsz) / sizeof(TYPE); \ | |
ca40a6e6 RH |
1433 | TYPE op1_neg = extract32(desc, SIMD_DATA_SHIFT, 1); \ |
1434 | intptr_t idx = desc >> (SIMD_DATA_SHIFT + 1); \ | |
1435 | TYPE *d = vd, *n = vn, *m = vm, *a = va; \ | |
1436 | op1_neg <<= (8 * sizeof(TYPE) - 1); \ | |
1437 | for (i = 0; i < oprsz / sizeof(TYPE); i += segment) { \ | |
1438 | TYPE mm = m[H(i + idx)]; \ | |
1439 | for (j = 0; j < segment; j++) { \ | |
1440 | d[i + j] = TYPE##_muladd(n[i + j] ^ op1_neg, \ | |
1441 | mm, a[i + j], 0, stat); \ | |
1442 | } \ | |
1443 | } \ | |
525d9b6d | 1444 | clear_tail(d, oprsz, simd_maxsz(desc)); \ |
ca40a6e6 RH |
1445 | } |
1446 | ||
1447 | DO_FMLA_IDX(gvec_fmla_idx_h, float16, H2) | |
1448 | DO_FMLA_IDX(gvec_fmla_idx_s, float32, H4) | |
6e802db3 | 1449 | DO_FMLA_IDX(gvec_fmla_idx_d, float64, H8) |
ca40a6e6 RH |
1450 | |
1451 | #undef DO_FMLA_IDX | |
89e68b57 RH |
1452 | |
1453 | #define DO_SAT(NAME, WTYPE, TYPEN, TYPEM, OP, MIN, MAX) \ | |
1454 | void HELPER(NAME)(void *vd, void *vq, void *vn, void *vm, uint32_t desc) \ | |
1455 | { \ | |
1456 | intptr_t i, oprsz = simd_oprsz(desc); \ | |
1457 | TYPEN *d = vd, *n = vn; TYPEM *m = vm; \ | |
1458 | bool q = false; \ | |
1459 | for (i = 0; i < oprsz / sizeof(TYPEN); i++) { \ | |
1460 | WTYPE dd = (WTYPE)n[i] OP m[i]; \ | |
1461 | if (dd < MIN) { \ | |
1462 | dd = MIN; \ | |
1463 | q = true; \ | |
1464 | } else if (dd > MAX) { \ | |
1465 | dd = MAX; \ | |
1466 | q = true; \ | |
1467 | } \ | |
1468 | d[i] = dd; \ | |
1469 | } \ | |
1470 | if (q) { \ | |
1471 | uint32_t *qc = vq; \ | |
1472 | qc[0] = 1; \ | |
1473 | } \ | |
1474 | clear_tail(d, oprsz, simd_maxsz(desc)); \ | |
1475 | } | |
1476 | ||
1477 | DO_SAT(gvec_uqadd_b, int, uint8_t, uint8_t, +, 0, UINT8_MAX) | |
1478 | DO_SAT(gvec_uqadd_h, int, uint16_t, uint16_t, +, 0, UINT16_MAX) | |
1479 | DO_SAT(gvec_uqadd_s, int64_t, uint32_t, uint32_t, +, 0, UINT32_MAX) | |
1480 | ||
1481 | DO_SAT(gvec_sqadd_b, int, int8_t, int8_t, +, INT8_MIN, INT8_MAX) | |
1482 | DO_SAT(gvec_sqadd_h, int, int16_t, int16_t, +, INT16_MIN, INT16_MAX) | |
1483 | DO_SAT(gvec_sqadd_s, int64_t, int32_t, int32_t, +, INT32_MIN, INT32_MAX) | |
1484 | ||
1485 | DO_SAT(gvec_uqsub_b, int, uint8_t, uint8_t, -, 0, UINT8_MAX) | |
1486 | DO_SAT(gvec_uqsub_h, int, uint16_t, uint16_t, -, 0, UINT16_MAX) | |
1487 | DO_SAT(gvec_uqsub_s, int64_t, uint32_t, uint32_t, -, 0, UINT32_MAX) | |
1488 | ||
1489 | DO_SAT(gvec_sqsub_b, int, int8_t, int8_t, -, INT8_MIN, INT8_MAX) | |
1490 | DO_SAT(gvec_sqsub_h, int, int16_t, int16_t, -, INT16_MIN, INT16_MAX) | |
1491 | DO_SAT(gvec_sqsub_s, int64_t, int32_t, int32_t, -, INT32_MIN, INT32_MAX) | |
1492 | ||
1493 | #undef DO_SAT | |
1494 | ||
1495 | void HELPER(gvec_uqadd_d)(void *vd, void *vq, void *vn, | |
1496 | void *vm, uint32_t desc) | |
1497 | { | |
1498 | intptr_t i, oprsz = simd_oprsz(desc); | |
1499 | uint64_t *d = vd, *n = vn, *m = vm; | |
1500 | bool q = false; | |
1501 | ||
1502 | for (i = 0; i < oprsz / 8; i++) { | |
1503 | uint64_t nn = n[i], mm = m[i], dd = nn + mm; | |
1504 | if (dd < nn) { | |
1505 | dd = UINT64_MAX; | |
1506 | q = true; | |
1507 | } | |
1508 | d[i] = dd; | |
1509 | } | |
1510 | if (q) { | |
1511 | uint32_t *qc = vq; | |
1512 | qc[0] = 1; | |
1513 | } | |
1514 | clear_tail(d, oprsz, simd_maxsz(desc)); | |
1515 | } | |
1516 | ||
1517 | void HELPER(gvec_uqsub_d)(void *vd, void *vq, void *vn, | |
1518 | void *vm, uint32_t desc) | |
1519 | { | |
1520 | intptr_t i, oprsz = simd_oprsz(desc); | |
1521 | uint64_t *d = vd, *n = vn, *m = vm; | |
1522 | bool q = false; | |
1523 | ||
1524 | for (i = 0; i < oprsz / 8; i++) { | |
1525 | uint64_t nn = n[i], mm = m[i], dd = nn - mm; | |
1526 | if (nn < mm) { | |
1527 | dd = 0; | |
1528 | q = true; | |
1529 | } | |
1530 | d[i] = dd; | |
1531 | } | |
1532 | if (q) { | |
1533 | uint32_t *qc = vq; | |
1534 | qc[0] = 1; | |
1535 | } | |
1536 | clear_tail(d, oprsz, simd_maxsz(desc)); | |
1537 | } | |
1538 | ||
1539 | void HELPER(gvec_sqadd_d)(void *vd, void *vq, void *vn, | |
1540 | void *vm, uint32_t desc) | |
1541 | { | |
1542 | intptr_t i, oprsz = simd_oprsz(desc); | |
1543 | int64_t *d = vd, *n = vn, *m = vm; | |
1544 | bool q = false; | |
1545 | ||
1546 | for (i = 0; i < oprsz / 8; i++) { | |
1547 | int64_t nn = n[i], mm = m[i], dd = nn + mm; | |
1548 | if (((dd ^ nn) & ~(nn ^ mm)) & INT64_MIN) { | |
1549 | dd = (nn >> 63) ^ ~INT64_MIN; | |
1550 | q = true; | |
1551 | } | |
1552 | d[i] = dd; | |
1553 | } | |
1554 | if (q) { | |
1555 | uint32_t *qc = vq; | |
1556 | qc[0] = 1; | |
1557 | } | |
1558 | clear_tail(d, oprsz, simd_maxsz(desc)); | |
1559 | } | |
1560 | ||
1561 | void HELPER(gvec_sqsub_d)(void *vd, void *vq, void *vn, | |
1562 | void *vm, uint32_t desc) | |
1563 | { | |
1564 | intptr_t i, oprsz = simd_oprsz(desc); | |
1565 | int64_t *d = vd, *n = vn, *m = vm; | |
1566 | bool q = false; | |
1567 | ||
1568 | for (i = 0; i < oprsz / 8; i++) { | |
1569 | int64_t nn = n[i], mm = m[i], dd = nn - mm; | |
1570 | if (((dd ^ nn) & (nn ^ mm)) & INT64_MIN) { | |
1571 | dd = (nn >> 63) ^ ~INT64_MIN; | |
1572 | q = true; | |
1573 | } | |
1574 | d[i] = dd; | |
1575 | } | |
1576 | if (q) { | |
1577 | uint32_t *qc = vq; | |
1578 | qc[0] = 1; | |
1579 | } | |
1580 | clear_tail(d, oprsz, simd_maxsz(desc)); | |
1581 | } | |
a4e943a7 | 1582 | |
631e5654 RH |
1583 | |
/*
 * Shift-right-and-accumulate: d[i] += n[i] >> shift.
 * The signedness of TYPE selects arithmetic (ssra) vs logical (usra)
 * shifts.  The shift count is taken from simd_data(desc).
 */
#define DO_SRA(NAME, TYPE)                              \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        d[i] += n[i] >> shift;                          \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_SRA(gvec_ssra_b, int8_t)
DO_SRA(gvec_ssra_h, int16_t)
DO_SRA(gvec_ssra_s, int32_t)
DO_SRA(gvec_ssra_d, int64_t)

DO_SRA(gvec_usra_b, uint8_t)
DO_SRA(gvec_usra_h, uint16_t)
DO_SRA(gvec_usra_s, uint32_t)
DO_SRA(gvec_usra_d, uint64_t)

#undef DO_SRA
1607 | ||
6ccd48d4 RH |
/*
 * Rounding shift right: d[i] = round(n[i] / 2**shift).
 * Implemented by shifting to one bit above the final position, then
 * adding back the low (rounding) bit: (tmp >> 1) + (tmp & 1).
 * This avoids overflow that n[i] + (1 << (shift - 1)) could cause.
 */
#define DO_RSHR(NAME, TYPE)                             \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        TYPE tmp = n[i] >> (shift - 1);                 \
        d[i] = (tmp >> 1) + (tmp & 1);                  \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_RSHR(gvec_srshr_b, int8_t)
DO_RSHR(gvec_srshr_h, int16_t)
DO_RSHR(gvec_srshr_s, int32_t)
DO_RSHR(gvec_srshr_d, int64_t)

DO_RSHR(gvec_urshr_b, uint8_t)
DO_RSHR(gvec_urshr_h, uint16_t)
DO_RSHR(gvec_urshr_s, uint32_t)
DO_RSHR(gvec_urshr_d, uint64_t)

#undef DO_RSHR
1632 | ||
/*
 * Rounding shift right and accumulate:
 * d[i] += round(n[i] / 2**shift), using the same one-bit-high
 * rounding trick as DO_RSHR above.
 */
#define DO_RSRA(NAME, TYPE)                             \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        TYPE tmp = n[i] >> (shift - 1);                 \
        d[i] += (tmp >> 1) + (tmp & 1);                 \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_RSRA(gvec_srsra_b, int8_t)
DO_RSRA(gvec_srsra_h, int16_t)
DO_RSRA(gvec_srsra_s, int32_t)
DO_RSRA(gvec_srsra_d, int64_t)

DO_RSRA(gvec_ursra_b, uint8_t)
DO_RSRA(gvec_ursra_h, uint16_t)
DO_RSRA(gvec_ursra_s, uint32_t)
DO_RSRA(gvec_ursra_d, uint64_t)

#undef DO_RSRA
1657 | ||
893ab054 RH |
/*
 * Shift right and insert: the low (esize - shift) bits of d[i] are
 * replaced with n[i] >> shift; the top @shift bits of d[i] are kept.
 */
#define DO_SRI(NAME, TYPE)                              \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        d[i] = deposit64(d[i], 0, sizeof(TYPE) * 8 - shift, n[i] >> shift); \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_SRI(gvec_sri_b, uint8_t)
DO_SRI(gvec_sri_h, uint16_t)
DO_SRI(gvec_sri_s, uint32_t)
DO_SRI(gvec_sri_d, uint64_t)

#undef DO_SRI
1676 | ||
/*
 * Shift left and insert: bits [esize-1:shift] of d[i] are replaced
 * with n[i] shifted up; the low @shift bits of d[i] are kept.
 */
#define DO_SLI(NAME, TYPE)                              \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, oprsz = simd_oprsz(desc);               \
    int shift = simd_data(desc);                        \
    TYPE *d = vd, *n = vn;                              \
    for (i = 0; i < oprsz / sizeof(TYPE); i++) {        \
        d[i] = deposit64(d[i], shift, sizeof(TYPE) * 8 - shift, n[i]); \
    }                                                   \
    clear_tail(d, oprsz, simd_maxsz(desc));             \
}

DO_SLI(gvec_sli_b, uint8_t)
DO_SLI(gvec_sli_h, uint16_t)
DO_SLI(gvec_sli_s, uint32_t)
DO_SLI(gvec_sli_d, uint64_t)

#undef DO_SLI
1695 | ||
a4e943a7 RH |
/*
 * Convert float16 to float32, raising no exceptions and
 * preserving exceptional values, including SNaN.
 * This is effectively an unpack+repack operation.
 *
 * @f16:  half-precision value in the low 16 bits
 * @fz16: when true, flush denormal inputs to zero (FZ16 behavior)
 */
static float32 float16_to_float32_by_bits(uint32_t f16, bool fz16)
{
    const int f16_bias = 15;
    const int f32_bias = 127;
    uint32_t sign = extract32(f16, 15, 1);
    uint32_t exp = extract32(f16, 10, 5);
    uint32_t frac = extract32(f16, 0, 10);

    if (exp == 0x1f) {
        /* Inf or NaN: the fraction bits (incl. a possible SNaN) pass through. */
        exp = 0xff;
    } else if (exp == 0) {
        /* Zero or denormal. */
        if (frac != 0) {
            if (fz16) {
                frac = 0;
            } else {
                /*
                 * Denormal; these are all normal float32.
                 * Shift the fraction so that the msb is at bit 11,
                 * then remove bit 11 as the implicit bit of the
                 * normalized float32. Note that we still go through
                 * the shift for normal numbers below, to put the
                 * float32 fraction at the right place.
                 */
                int shift = clz32(frac) - 21;
                frac = (frac << shift) & 0x3ff;
                exp = f32_bias - f16_bias - shift + 1;
            }
        }
    } else {
        /* Normal number; adjust the bias. */
        exp += f32_bias - f16_bias;
    }
    /* Repack into float32 positions. */
    sign <<= 31;
    exp <<= 23;
    frac <<= 23 - 10;

    return sign | exp | frac;
}
1741 | ||
/*
 * Branchless load of four packed float16 values: u32[0], u64[0],
 * u32[1] or u64[1] of the 128-bit register at @ptr.
 * The second qword is selected iff is_q & is_2; the second dword of
 * the first qword is selected iff is_2 & !is_q.  For !is_q & !is_2
 * the upper 32 bits of the result are garbage (callers ignore them).
 */
static uint64_t load4_f16(uint64_t *ptr, int is_q, int is_2)
{
    int qword_index = is_q & is_2;
    int dword_shift = (is_2 & ~is_q) << 5;
    uint64_t qword = ptr[qword_index];

    return qword >> dword_shift;
}
1752 | ||
/*
 * Note that FMLAL requires oprsz == 8 or oprsz == 16,
 * as there is not yet SVE versions that might use blocking.
 */

/*
 * Widening f16 multiply-accumulate into f32 lanes of @d.
 * desc data bit 0 (is_s) negates the products (FMLSL);
 * bit 1 (is_2) selects the high half of the f16 inputs.
 */
static void do_fmlal(float32 *d, void *vn, void *vm, float_status *fpst,
                     uint32_t desc, bool fz16)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    int is_q = oprsz == 16;
    uint64_t n_4, m_4;

    /* Pre-load all of the f16 data, avoiding overlap issues. */
    n_4 = load4_f16(vn, is_q, is_2);
    m_4 = load4_f16(vm, is_q, is_2);

    /* Negate all inputs for FMLSL at once (flip the f16 sign bits). */
    if (is_s) {
        n_4 ^= 0x8000800080008000ull;
    }

    for (i = 0; i < oprsz / 4; i++) {
        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
        float32 m_1 = float16_to_float32_by_bits(m_4 >> (i * 16), fz16);
        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}
1783 | ||
/* AArch32 FMLAL/FMLSL: use the Neon "standard FP" status. */
void HELPER(gvec_fmlal_a32)(void *vd, void *vn, void *vm,
                            void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal(vd, vn, vm, &env->vfp.standard_fp_status, desc,
             get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}
1791 | ||
/* AArch64 FMLAL/FMLSL: use the normal FP status; FZ16 from the f16 status. */
void HELPER(gvec_fmlal_a64)(void *vd, void *vn, void *vm,
                            void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal(vd, vn, vm, &env->vfp.fp_status, desc,
             get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}
1799 | ||
50d102bd SL |
/*
 * SVE2 widening f16->f32 multiply-accumulate (vector form).
 * desc data bit 0 negates the first operand (FMLSL); bit 1 selects
 * the top or bottom f16 element of each 32-bit container.
 * No tail clearing: the loop covers the full vector length.
 */
void HELPER(sve2_fmlal_zzzw_s)(void *vd, void *vn, void *vm, void *va,
                               void *venv, uint32_t desc)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
    CPUARMState *env = venv;
    float_status *status = &env->vfp.fp_status;
    bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);

    for (i = 0; i < oprsz; i += sizeof(float32)) {
        float16 nn_16 = *(float16 *)(vn + H1_2(i + sel)) ^ negn;
        float16 mm_16 = *(float16 *)(vm + H1_2(i + sel));
        float32 nn = float16_to_float32_by_bits(nn_16, fz16);
        float32 mm = float16_to_float32_by_bits(mm_16, fz16);
        float32 aa = *(float32 *)(va + H1_4(i));

        *(float32 *)(vd + H1_4(i)) = float32_muladd(nn, mm, aa, 0, status);
    }
}
1820 | ||
a4e943a7 RH |
/*
 * Indexed form of do_fmlal: a single f16 element of @vm, chosen by
 * desc data bits [4:2], is widened once and multiplied against each
 * widened element of @vn.
 */
static void do_fmlal_idx(float32 *d, void *vn, void *vm, float_status *fpst,
                         uint32_t desc, bool fz16)
{
    intptr_t i, oprsz = simd_oprsz(desc);
    int is_s = extract32(desc, SIMD_DATA_SHIFT, 1);
    int is_2 = extract32(desc, SIMD_DATA_SHIFT + 1, 1);
    int index = extract32(desc, SIMD_DATA_SHIFT + 2, 3);
    int is_q = oprsz == 16;
    uint64_t n_4;
    float32 m_1;

    /* Pre-load all of the f16 data, avoiding overlap issues. */
    n_4 = load4_f16(vn, is_q, is_2);

    /* Negate all inputs for FMLSL at once (flip the f16 sign bits). */
    if (is_s) {
        n_4 ^= 0x8000800080008000ull;
    }

    m_1 = float16_to_float32_by_bits(((float16 *)vm)[H2(index)], fz16);

    for (i = 0; i < oprsz / 4; i++) {
        float32 n_1 = float16_to_float32_by_bits(n_4 >> (i * 16), fz16);
        d[H4(i)] = float32_muladd(n_1, m_1, d[H4(i)], 0, fpst);
    }
    clear_tail(d, oprsz, simd_maxsz(desc));
}
1848 | ||
/* AArch32 FMLAL/FMLSL (by element): use the Neon "standard FP" status. */
void HELPER(gvec_fmlal_idx_a32)(void *vd, void *vn, void *vm,
                                void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal_idx(vd, vn, vm, &env->vfp.standard_fp_status, desc,
                 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}
1856 | ||
/* AArch64 FMLAL/FMLSL (by element): use the normal FP status. */
void HELPER(gvec_fmlal_idx_a64)(void *vd, void *vn, void *vm,
                                void *venv, uint32_t desc)
{
    CPUARMState *env = venv;
    do_fmlal_idx(vd, vn, vm, &env->vfp.fp_status, desc,
                 get_flush_inputs_to_zero(&env->vfp.fp_status_f16));
}
87b74e8b | 1864 | |
50d102bd SL |
/*
 * SVE2 widening f16->f32 multiply-accumulate (indexed form).
 * One f16 element per 128-bit segment of @vm, chosen by desc data
 * bits [4:2], multiplies all four f32 lanes of that segment.
 * No tail clearing: the loops cover the full vector length.
 */
void HELPER(sve2_fmlal_zzxw_s)(void *vd, void *vn, void *vm, void *va,
                               void *venv, uint32_t desc)
{
    intptr_t i, j, oprsz = simd_oprsz(desc);
    uint16_t negn = extract32(desc, SIMD_DATA_SHIFT, 1) << 15;
    intptr_t sel = extract32(desc, SIMD_DATA_SHIFT + 1, 1) * sizeof(float16);
    intptr_t idx = extract32(desc, SIMD_DATA_SHIFT + 2, 3) * sizeof(float16);
    CPUARMState *env = venv;
    float_status *status = &env->vfp.fp_status;
    bool fz16 = get_flush_inputs_to_zero(&env->vfp.fp_status_f16);

    for (i = 0; i < oprsz; i += 16) {
        /* Widen the selected multiplier once per 128-bit segment. */
        float16 mm_16 = *(float16 *)(vm + i + idx);
        float32 mm = float16_to_float32_by_bits(mm_16, fz16);

        for (j = 0; j < 16; j += sizeof(float32)) {
            float16 nn_16 = *(float16 *)(vn + H1_2(i + j + sel)) ^ negn;
            float32 nn = float16_to_float32_by_bits(nn_16, fz16);
            float32 aa = *(float32 *)(va + H1_4(i + j));

            *(float32 *)(vd + H1_4(i + j)) =
                float32_muladd(nn, mm, aa, 0, status);
        }
    }
}
1890 | ||
87b74e8b RH |
1891 | void HELPER(gvec_sshl_b)(void *vd, void *vn, void *vm, uint32_t desc) |
1892 | { | |
1893 | intptr_t i, opr_sz = simd_oprsz(desc); | |
1894 | int8_t *d = vd, *n = vn, *m = vm; | |
1895 | ||
1896 | for (i = 0; i < opr_sz; ++i) { | |
1897 | int8_t mm = m[i]; | |
1898 | int8_t nn = n[i]; | |
1899 | int8_t res = 0; | |
1900 | if (mm >= 0) { | |
1901 | if (mm < 8) { | |
1902 | res = nn << mm; | |
1903 | } | |
1904 | } else { | |
1905 | res = nn >> (mm > -8 ? -mm : 7); | |
1906 | } | |
1907 | d[i] = res; | |
1908 | } | |
1909 | clear_tail(d, opr_sz, simd_maxsz(desc)); | |
1910 | } | |
1911 | ||
/*
 * Signed shift by signed variable, halfword elements.
 * Non-negative counts shift left (zero once all bits shift out);
 * negative counts shift right arithmetically with the count clamped
 * to 15 so large negative counts replicate the sign bit.
 */
void HELPER(gvec_sshl_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        int8_t mm = m[i];   /* only 8 bits of shift are significant */
        int16_t nn = n[i];
        int16_t res = 0;
        if (mm >= 0) {
            if (mm < 16) {
                res = nn << mm;
            }
        } else {
            res = nn >> (mm > -16 ? -mm : 15);
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
1932 | ||
1933 | void HELPER(gvec_ushl_b)(void *vd, void *vn, void *vm, uint32_t desc) | |
1934 | { | |
1935 | intptr_t i, opr_sz = simd_oprsz(desc); | |
1936 | uint8_t *d = vd, *n = vn, *m = vm; | |
1937 | ||
1938 | for (i = 0; i < opr_sz; ++i) { | |
1939 | int8_t mm = m[i]; | |
1940 | uint8_t nn = n[i]; | |
1941 | uint8_t res = 0; | |
1942 | if (mm >= 0) { | |
1943 | if (mm < 8) { | |
1944 | res = nn << mm; | |
1945 | } | |
1946 | } else { | |
1947 | if (mm > -8) { | |
1948 | res = nn >> -mm; | |
1949 | } | |
1950 | } | |
1951 | d[i] = res; | |
1952 | } | |
1953 | clear_tail(d, opr_sz, simd_maxsz(desc)); | |
1954 | } | |
1955 | ||
/*
 * Unsigned shift by signed variable, halfword elements.
 * Non-negative counts shift left, negative counts shift right
 * logically; any count whose magnitude reaches 16 yields zero.
 */
void HELPER(gvec_ushl_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        int8_t mm = m[i];   /* only 8 bits of shift are significant */
        uint16_t nn = n[i];
        uint16_t res = 0;
        if (mm >= 0) {
            if (mm < 16) {
                res = nn << mm;
            }
        } else {
            if (mm > -16) {
                res = nn >> -mm;
            }
        }
        d[i] = res;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
a21bb78e RH |
1978 | |
/*
 * 8x8->8 polynomial multiply.
 *
 * Polynomial multiplication is like integer multiplication except the
 * partial products are XORed, not added.
 *
 * TODO: expose this as a generic vector operation, as it is a common
 * crypto building block.
 */
void HELPER(gvec_pmul_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    /* clmul_8x8_low does eight byte-wise carry-less multiplies at once. */
    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = clmul_8x8_low(n[i], m[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
b9ed510e RH |
1998 | |
/*
 * 64x64->128 polynomial multiply.
 * Because of the lanes are not accessed in strict columns,
 * this probably cannot be turned into a generic helper.
 */
void HELPER(gvec_pmull_q)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, j, opr_sz = simd_oprsz(desc);
    /* hi selects the odd (1) or even (0) 64-bit source lane of each pair. */
    intptr_t hi = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; i += 2) {
        uint64_t nn = n[i + hi];
        uint64_t mm = m[i + hi];
        uint64_t rhi = 0;
        uint64_t rlo = 0;

        /* Bit 0 can only influence the low 64-bit result.  */
        if (nn & 1) {
            rlo = mm;
        }

        /* XOR in the shifted multiplicand for each set bit of nn. */
        for (j = 1; j < 64; ++j) {
            uint64_t mask = -((nn >> j) & 1);
            rlo ^= (mm << j) & mask;
            rhi ^= (mm >> (64 - j)) & mask;
        }
        /* The 128-bit product occupies a pair of adjacent lanes. */
        d[i] = rlo;
        d[i + 1] = rhi;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
e7e96fc5 | 2031 | |
e7e96fc5 RH |
/*
 * Neon 8x8->16 polynomial multiply (long): widen the eight bytes of
 * the selected 64-bit lane into eight 16-bit products, four at a time.
 * simd_data selects the low (0) or high (1) source qword.
 */
void HELPER(neon_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    int hi = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t nn = n[hi], mm = m[hi];

    d[0] = clmul_8x4_packed(nn, mm);
    nn >>= 32;
    mm >>= 32;
    d[1] = clmul_8x4_packed(nn, mm);

    clear_tail(d, 16, simd_maxsz(desc));
}
2045 | ||
2046 | #ifdef TARGET_AARCH64 | |
/*
 * SVE2 8x8->16 polynomial multiply (long): the even (shift == 0) or
 * odd (shift == 8) bytes of each qword are widened to 16-bit products.
 * No tail clearing: the loop covers the full vector length.
 */
void HELPER(sve2_pmull_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    int shift = simd_data(desc) * 8;
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = clmul_8x4_even(n[i] >> shift, m[i] >> shift);
    }
}
e3a56131 RH |
2057 | |
/*
 * 32x32->64 carry-less (polynomial) multiply.
 * Only the low 32 bits of @op1 participate; for each set bit,
 * XOR in @op2 shifted by that bit position.
 */
static uint64_t pmull_d(uint64_t op1, uint64_t op2)
{
    uint64_t acc = 0;

    for (int bit = 0; bit < 32; bit++) {
        if ((op1 >> bit) & 1) {
            acc ^= op2 << bit;
        }
    }
    return acc;
}
2069 | ||
/*
 * SVE2 32x32->64 polynomial multiply (long).
 * sel chooses the even (0) or odd (1) 32-bit element of each pair.
 * No tail clearing: the loop covers the full vector length.
 */
void HELPER(sve2_pmull_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t sel = H4(simd_data(desc));
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *n = vn, *m = vm;
    uint64_t *d = vd;

    for (i = 0; i < opr_sz / 8; ++i) {
        d[i] = pmull_d(n[2 * i + sel], m[2 * i + sel]);
    }
}
e7e96fc5 | 2081 | #endif |
6b375d35 RH |
2082 | |
/*
 * Compare each element against zero with OP; a true comparison
 * writes all-ones to the lane (-(1) = ~0), false writes zero.
 */
#define DO_CMP0(NAME, TYPE, OP)                         \
void HELPER(NAME)(void *vd, void *vn, uint32_t desc)    \
{                                                       \
    intptr_t i, opr_sz = simd_oprsz(desc);              \
    for (i = 0; i < opr_sz; i += sizeof(TYPE)) {        \
        TYPE nn = *(TYPE *)(vn + i);                    \
        *(TYPE *)(vd + i) = -(nn OP 0);                 \
    }                                                   \
    clear_tail(vd, opr_sz, simd_maxsz(desc));           \
}

DO_CMP0(gvec_ceq0_b, int8_t, ==)
DO_CMP0(gvec_clt0_b, int8_t, <)
DO_CMP0(gvec_cle0_b, int8_t, <=)
DO_CMP0(gvec_cgt0_b, int8_t, >)
DO_CMP0(gvec_cge0_b, int8_t, >=)

DO_CMP0(gvec_ceq0_h, int16_t, ==)
DO_CMP0(gvec_clt0_h, int16_t, <)
DO_CMP0(gvec_cle0_h, int16_t, <=)
DO_CMP0(gvec_cgt0_h, int16_t, >)
DO_CMP0(gvec_cge0_h, int16_t, >=)

#undef DO_CMP0
50c160d4 RH |
2107 | |
/*
 * Absolute difference: d[i] = |n[i] - m[i]|, computed without
 * overflow by subtracting the smaller operand from the larger.
 * TYPE's signedness selects signed (sabd) vs unsigned (uabd) compare.
 */
#define DO_ABD(NAME, TYPE)                                      \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    TYPE *d = vd, *n = vn, *m = vm;                             \
                                                                \
    for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
        d[i] = n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];         \
    }                                                           \
    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
}

DO_ABD(gvec_sabd_b, int8_t)
DO_ABD(gvec_sabd_h, int16_t)
DO_ABD(gvec_sabd_s, int32_t)
DO_ABD(gvec_sabd_d, int64_t)

DO_ABD(gvec_uabd_b, uint8_t)
DO_ABD(gvec_uabd_h, uint16_t)
DO_ABD(gvec_uabd_s, uint32_t)
DO_ABD(gvec_uabd_d, uint64_t)

#undef DO_ABD
cfdb2c0c RH |
2131 | |
/*
 * Absolute difference and accumulate: d[i] += |n[i] - m[i]|.
 * Same difference computation as DO_ABD, accumulating into d.
 */
#define DO_ABA(NAME, TYPE)                                      \
void HELPER(NAME)(void *vd, void *vn, void *vm, uint32_t desc)  \
{                                                               \
    intptr_t i, opr_sz = simd_oprsz(desc);                      \
    TYPE *d = vd, *n = vn, *m = vm;                             \
                                                                \
    for (i = 0; i < opr_sz / sizeof(TYPE); ++i) {               \
        d[i] += n[i] < m[i] ? m[i] - n[i] : n[i] - m[i];        \
    }                                                           \
    clear_tail(d, opr_sz, simd_maxsz(desc));                    \
}

DO_ABA(gvec_saba_b, int8_t)
DO_ABA(gvec_saba_h, int16_t)
DO_ABA(gvec_saba_s, int32_t)
DO_ABA(gvec_saba_d, int64_t)

DO_ABA(gvec_uaba_b, uint8_t)
DO_ABA(gvec_uaba_h, uint16_t)
DO_ABA(gvec_uaba_s, uint32_t)
DO_ABA(gvec_uaba_d, uint64_t)

#undef DO_ABA
1dc587ee PM |
2155 | |
/*
 * Neon floating-point pairwise ops on 64-bit vectors:
 * d = { OP(n[0],n[1]), OP(m[0],m[1]) } for float32, and the
 * analogous four-element form for float16.  All inputs are read
 * before any output is written so that vd may alias vn or vm.
 */
#define DO_NEON_PAIRWISE(NAME, OP)                                      \
    void HELPER(NAME##s)(void *vd, void *vn, void *vm,                  \
                         void *stat, uint32_t oprsz)                    \
    {                                                                   \
        float_status *fpst = stat;                                      \
        float32 *d = vd;                                                \
        float32 *n = vn;                                                \
        float32 *m = vm;                                                \
        float32 r0, r1;                                                 \
                                                                        \
        /* Read all inputs before writing outputs in case vm == vd */   \
        r0 = float32_##OP(n[H4(0)], n[H4(1)], fpst);                    \
        r1 = float32_##OP(m[H4(0)], m[H4(1)], fpst);                    \
                                                                        \
        d[H4(0)] = r0;                                                  \
        d[H4(1)] = r1;                                                  \
    }                                                                   \
                                                                        \
    void HELPER(NAME##h)(void *vd, void *vn, void *vm,                  \
                         void *stat, uint32_t oprsz)                    \
    {                                                                   \
        float_status *fpst = stat;                                      \
        float16 *d = vd;                                                \
        float16 *n = vn;                                                \
        float16 *m = vm;                                                \
        float16 r0, r1, r2, r3;                                         \
                                                                        \
        /* Read all inputs before writing outputs in case vm == vd */   \
        r0 = float16_##OP(n[H2(0)], n[H2(1)], fpst);                    \
        r1 = float16_##OP(n[H2(2)], n[H2(3)], fpst);                    \
        r2 = float16_##OP(m[H2(0)], m[H2(1)], fpst);                    \
        r3 = float16_##OP(m[H2(2)], m[H2(3)], fpst);                    \
                                                                        \
        d[H2(0)] = r0;                                                  \
        d[H2(1)] = r1;                                                  \
        d[H2(2)] = r2;                                                  \
        d[H2(3)] = r3;                                                  \
    }

DO_NEON_PAIRWISE(neon_padd, add)
DO_NEON_PAIRWISE(neon_pmax, max)
DO_NEON_PAIRWISE(neon_pmin, min)

#undef DO_NEON_PAIRWISE
7b959c58 PM |
2200 | |
/*
 * Fixed-point <-> float conversions with an immediate fraction-bit
 * count taken from simd_data(desc); FUNC is the per-element softfloat
 * conversion helper.  float->int forms use round-to-zero per the
 * helper name; int->float forms round per @stat.
 */
#define DO_VCVT_FIXED(NAME, FUNC, TYPE)                                 \
    void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
    {                                                                   \
        intptr_t i, oprsz = simd_oprsz(desc);                           \
        int shift = simd_data(desc);                                    \
        TYPE *d = vd, *n = vn;                                          \
        float_status *fpst = stat;                                      \
        for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
            d[i] = FUNC(n[i], shift, fpst);                             \
        }                                                               \
        clear_tail(d, oprsz, simd_maxsz(desc));                         \
    }

DO_VCVT_FIXED(gvec_vcvt_sf, helper_vfp_sltos, uint32_t)
DO_VCVT_FIXED(gvec_vcvt_uf, helper_vfp_ultos, uint32_t)
DO_VCVT_FIXED(gvec_vcvt_fs, helper_vfp_tosls_round_to_zero, uint32_t)
DO_VCVT_FIXED(gvec_vcvt_fu, helper_vfp_touls_round_to_zero, uint32_t)
DO_VCVT_FIXED(gvec_vcvt_sh, helper_vfp_shtoh, uint16_t)
DO_VCVT_FIXED(gvec_vcvt_uh, helper_vfp_uhtoh, uint16_t)
DO_VCVT_FIXED(gvec_vcvt_hs, helper_vfp_toshh_round_to_zero, uint16_t)
DO_VCVT_FIXED(gvec_vcvt_hu, helper_vfp_touhh_round_to_zero, uint16_t)

#undef DO_VCVT_FIXED
ca88a6ef PM |
2224 | |
/*
 * Float->integer conversion with an explicit rounding mode carried
 * in simd_data(desc).  The status rounding mode is saved, overridden
 * for the loop, and restored afterwards.
 */
#define DO_VCVT_RMODE(NAME, FUNC, TYPE)                                 \
    void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
    {                                                                   \
        float_status *fpst = stat;                                      \
        intptr_t i, oprsz = simd_oprsz(desc);                           \
        uint32_t rmode = simd_data(desc);                               \
        uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
        TYPE *d = vd, *n = vn;                                          \
        set_float_rounding_mode(rmode, fpst);                           \
        for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
            d[i] = FUNC(n[i], 0, fpst);                                 \
        }                                                               \
        set_float_rounding_mode(prev_rmode, fpst);                      \
        clear_tail(d, oprsz, simd_maxsz(desc));                         \
    }

DO_VCVT_RMODE(gvec_vcvt_rm_ss, helper_vfp_tosls, uint32_t)
DO_VCVT_RMODE(gvec_vcvt_rm_us, helper_vfp_touls, uint32_t)
DO_VCVT_RMODE(gvec_vcvt_rm_sh, helper_vfp_toshh, uint16_t)
DO_VCVT_RMODE(gvec_vcvt_rm_uh, helper_vfp_touhh, uint16_t)

#undef DO_VCVT_RMODE
18725916 PM |
2247 | |
/*
 * Round-to-integral with an explicit rounding mode carried in
 * simd_data(desc); the status rounding mode is saved and restored
 * around the loop.
 */
#define DO_VRINT_RMODE(NAME, FUNC, TYPE)                                \
    void HELPER(NAME)(void *vd, void *vn, void *stat, uint32_t desc)    \
    {                                                                   \
        float_status *fpst = stat;                                      \
        intptr_t i, oprsz = simd_oprsz(desc);                           \
        uint32_t rmode = simd_data(desc);                               \
        uint32_t prev_rmode = get_float_rounding_mode(fpst);            \
        TYPE *d = vd, *n = vn;                                          \
        set_float_rounding_mode(rmode, fpst);                           \
        for (i = 0; i < oprsz / sizeof(TYPE); i++) {                    \
            d[i] = FUNC(n[i], fpst);                                    \
        }                                                               \
        set_float_rounding_mode(prev_rmode, fpst);                      \
        clear_tail(d, oprsz, simd_maxsz(desc));                         \
    }

DO_VRINT_RMODE(gvec_vrint_rm_h, helper_rinth, uint16_t)
DO_VRINT_RMODE(gvec_vrint_rm_s, helper_rints, uint32_t)

#undef DO_VRINT_RMODE
519183d3 RH |
2268 | |
2269 | #ifdef TARGET_AARCH64 | |
/*
 * AArch64 TBL/TBX table lookup.
 * desc encodes: bits [4:0] = first table register rn, bit 5 = TBX
 * (out-of-range indices keep the destination byte rather than zero),
 * remaining bits = table length in bytes.
 */
void HELPER(simd_tblx)(void *vd, void *vm, void *venv, uint32_t desc)
{
    const uint8_t *indices = vm;
    CPUARMState *env = venv;
    size_t oprsz = simd_oprsz(desc);
    uint32_t rn = extract32(desc, SIMD_DATA_SHIFT, 5);
    bool is_tbx = extract32(desc, SIMD_DATA_SHIFT + 5, 1);
    uint32_t table_len = desc >> (SIMD_DATA_SHIFT + 6);
    union {
        uint8_t b[16];
        uint64_t d[2];
    } result;

    /*
     * We must construct the final result in a temp, lest the output
     * overlaps the input table.  For TBL, begin with zero; for TBX,
     * begin with the original register contents.  Note that we always
     * copy 16 bytes here to avoid an extra branch; clearing the high
     * bits of the register for oprsz == 8 is handled below.
     */
    if (is_tbx) {
        memcpy(&result, vd, 16);
    } else {
        memset(&result, 0, 16);
    }

    for (size_t i = 0; i < oprsz; ++i) {
        uint32_t index = indices[H1(i)];

        if (index < table_len) {
            /*
             * Convert index (a byte offset into the virtual table
             * which is a series of 128-bit vectors concatenated)
             * into the correct register element, bearing in mind
             * that the table can wrap around from V31 to V0.
             */
            const uint8_t *table = (const uint8_t *)
                aa64_vfp_qreg(env, (rn + (index >> 4)) % 32);
            result.b[H1(i)] = table[H1(index % 16)];
        }
    }

    memcpy(vd, &result, 16);
    clear_tail(vd, oprsz, simd_maxsz(desc));
}
2315 | #endif | |
5dad1ba5 RH |
2316 | |
/*
 * NxN -> N highpart multiply
 *
 * TODO: expose this as a generic vector operation.
 */

/* Signed high-half multiply, byte elements: keep bits [15:8] of n*m. */
void HELPER(gvec_smulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ((int32_t)n[i] * m[i]) >> 8;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Signed high-half multiply, halfword elements. */
void HELPER(gvec_smulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = ((int32_t)n[i] * m[i]) >> 16;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Signed high-half multiply, word elements. */
void HELPER(gvec_smulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    int32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = ((int64_t)n[i] * m[i]) >> 32;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Signed high-half multiply, doubleword elements (via 128-bit muls64). */
void HELPER(gvec_smulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t discard;

    for (i = 0; i < opr_sz / 8; ++i) {
        /* muls64 writes (low, high); keep only the high half. */
        muls64(&discard, &d[i], n[i], m[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
2367 | ||
/* Unsigned high-half multiply, byte elements: keep bits [15:8] of n*m. */
void HELPER(gvec_umulh_b)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint8_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ((uint32_t)n[i] * m[i]) >> 8;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Unsigned high-half multiply, halfword elements. */
void HELPER(gvec_umulh_h)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint16_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 2; ++i) {
        d[i] = ((uint32_t)n[i] * m[i]) >> 16;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Unsigned high-half multiply, word elements. */
void HELPER(gvec_umulh_s)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint32_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz / 4; ++i) {
        d[i] = ((uint64_t)n[i] * m[i]) >> 32;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}

/* Unsigned high-half multiply, doubleword elements (via 128-bit mulu64). */
void HELPER(gvec_umulh_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc);
    uint64_t *d = vd, *n = vn, *m = vm;
    uint64_t discard;

    for (i = 0; i < opr_sz / 8; ++i) {
        /* mulu64 writes (low, high); keep only the high half. */
        mulu64(&discard, &d[i], n[i], m[i]);
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
e6eba6e5 RH |
2412 | |
/*
 * XAR: per-doubleword XOR then rotate right by the immediate
 * carried in simd_data(desc).
 */
void HELPER(gvec_xar_d)(void *vd, void *vn, void *vm, uint32_t desc)
{
    intptr_t i, opr_sz = simd_oprsz(desc) / 8;
    int shr = simd_data(desc);
    uint64_t *d = vd, *n = vn, *m = vm;

    for (i = 0; i < opr_sz; ++i) {
        d[i] = ror64(n[i] ^ m[i], shr);
    }
    clear_tail(d, opr_sz * 8, simd_maxsz(desc));
}
2323c5ff RH |
2424 | |
/*
 * Integer matrix-multiply accumulate
 */

/* Dot product of eight signed bytes from vn and vm, added to sum. */
static uint32_t do_smmla_b(uint32_t sum, void *vn, void *vm)
{
    int8_t *n = vn, *m = vm;

    for (intptr_t k = 0; k < 8; ++k) {
        sum += n[H1(k)] * m[H1(k)];
    }
    return sum;
}

/* Dot product of eight unsigned bytes from vn and vm, added to sum. */
static uint32_t do_ummla_b(uint32_t sum, void *vn, void *vm)
{
    uint8_t *n = vn, *m = vm;

    for (intptr_t k = 0; k < 8; ++k) {
        sum += n[H1(k)] * m[H1(k)];
    }
    return sum;
}

/* Mixed dot product: unsigned bytes from vn, signed bytes from vm. */
static uint32_t do_usmmla_b(uint32_t sum, void *vn, void *vm)
{
    uint8_t *n = vn;
    int8_t *m = vm;

    for (intptr_t k = 0; k < 8; ++k) {
        sum += n[H1(k)] * m[H1(k)];
    }
    return sum;
}
2459 | ||
/*
 * 2x2 matrix multiply-accumulate of 2x8 byte rows per 128-bit segment,
 * producing four 32-bit sums.  @inner_loop supplies the 8-element dot
 * product (signed/unsigned/mixed).
 */
static void do_mmla_b(void *vd, void *vn, void *vm, void *va, uint32_t desc,
                      uint32_t (*inner_loop)(uint32_t, void *, void *))
{
    intptr_t seg, opr_sz = simd_oprsz(desc);

    for (seg = 0; seg < opr_sz; seg += 16) {
        uint32_t *d = vd + seg;
        uint32_t *a = va + seg;
        uint32_t sum0, sum1, sum2, sum3;

        /*
         * Process the entire segment at once, writing back the
         * results only after we've consumed all of the inputs.
         *
         * Key to indices by column:
         *               i   j                  i             j
         */
        sum0 = a[H4(0 + 0)];
        sum0 = inner_loop(sum0, vn + seg + 0, vm + seg + 0);
        sum1 = a[H4(0 + 1)];
        sum1 = inner_loop(sum1, vn + seg + 0, vm + seg + 8);
        sum2 = a[H4(2 + 0)];
        sum2 = inner_loop(sum2, vn + seg + 8, vm + seg + 0);
        sum3 = a[H4(2 + 1)];
        sum3 = inner_loop(sum3, vn + seg + 8, vm + seg + 8);

        d[H4(0)] = sum0;
        d[H4(1)] = sum1;
        d[H4(2)] = sum2;
        d[H4(3)] = sum3;
    }
    clear_tail(vd, opr_sz, simd_maxsz(desc));
}
2493 | ||
/*
 * Expand an out-of-line helper that wraps do_mmla_b with the given
 * inner dot-product routine; INNER selects the operand signedness.
 */
#define DO_MMLA_B(NAME, INNER) \
    void HELPER(NAME)(void *vd, void *vn, void *vm, void *va, uint32_t desc) \
    { do_mmla_b(vd, vn, vm, va, desc, INNER); }

DO_MMLA_B(gvec_smmla_b, do_smmla_b)
DO_MMLA_B(gvec_ummla_b, do_ummla_b)
DO_MMLA_B(gvec_usmmla_b, do_usmmla_b)
cb8657f7 RH |
2501 | |
2502 | /* | |
2503 | * BFloat16 Dot Product | |
2504 | */ | |
2505 | ||
/*
 * One step of a BFloat16 dot product:
 *   sum += e1.low * e2.low + e1.high * e2.high
 * where e1 and e2 each hold a pair of bfloat16 values.
 *
 * Returns the new float32 accumulator value.
 *
 * NOTE(review): the fixed status below (round-to-odd with overflow to
 * infinity, flush-to-zero in both directions, default NaN) appears to
 * match the Arm ARM BFDotAdd pseudocode for the non-EBF16 case — the
 * process-wide FPCR deliberately does not participate; confirm against
 * the architecture manual if changing.
 */
float32 bfdotadd(float32 sum, uint32_t e1, uint32_t e2)
{
    /* FPCR is ignored for BFDOT and BFMMLA. */
    float_status bf_status = {
        .tininess_before_rounding = float_tininess_before_rounding,
        .float_rounding_mode = float_round_to_odd_inf,
        .flush_to_zero = true,
        .flush_inputs_to_zero = true,
        .default_nan_mode = true,
    };
    float32 t1, t2;

    /*
     * Extract each BFloat16 from the element pair, and shift
     * them such that they become float32.  (A bfloat16 is exactly
     * the high 16 bits of the corresponding float32.)
     */
    t1 = float32_mul(e1 << 16, e2 << 16, &bf_status);
    t2 = float32_mul(e1 & 0xffff0000u, e2 & 0xffff0000u, &bf_status);
    t1 = float32_add(t1, t2, &bf_status);
    t1 = float32_add(sum, t1, &bf_status);

    return t1;
}
2529 | ||
2530 | void HELPER(gvec_bfdot)(void *vd, void *vn, void *vm, void *va, uint32_t desc) | |
2531 | { | |
2532 | intptr_t i, opr_sz = simd_oprsz(desc); | |
2533 | float32 *d = vd, *a = va; | |
2534 | uint32_t *n = vn, *m = vm; | |
2535 | ||
2536 | for (i = 0; i < opr_sz / 4; ++i) { | |
2537 | d[i] = bfdotadd(a[i], n[i], m[i]); | |
2538 | } | |
2539 | clear_tail(d, opr_sz, simd_maxsz(desc)); | |
2540 | } | |
83914478 RH |
2541 | |
2542 | void HELPER(gvec_bfdot_idx)(void *vd, void *vn, void *vm, | |
2543 | void *va, uint32_t desc) | |
2544 | { | |
2545 | intptr_t i, j, opr_sz = simd_oprsz(desc); | |
2546 | intptr_t index = simd_data(desc); | |
2547 | intptr_t elements = opr_sz / 4; | |
2548 | intptr_t eltspersegment = MIN(16 / 4, elements); | |
2549 | float32 *d = vd, *a = va; | |
2550 | uint32_t *n = vn, *m = vm; | |
2551 | ||
2552 | for (i = 0; i < elements; i += eltspersegment) { | |
2553 | uint32_t m_idx = m[i + H4(index)]; | |
2554 | ||
2555 | for (j = i; j < i + eltspersegment; j++) { | |
2556 | d[j] = bfdotadd(a[j], n[j], m_idx); | |
2557 | } | |
2558 | } | |
2559 | clear_tail(d, opr_sz, simd_maxsz(desc)); | |
2560 | } | |
81266a1f RH |
2561 | |
/*
 * BFMMLA: 2x2 matrix multiply-accumulate of bfloat16 elements into
 * float32 lanes, applied independently to each 16-byte segment.
 * Each 32-bit lane of N and M holds a pair of bfloat16 values, so a
 * full row-by-column dot product takes two bfdotadd() steps.
 *
 * All inputs for a segment are consumed before any output is written,
 * so vd may alias vn, vm or va.
 */
void HELPER(gvec_bfmmla)(void *vd, void *vn, void *vm, void *va, uint32_t desc)
{
    intptr_t s, opr_sz = simd_oprsz(desc);
    float32 *d = vd, *a = va;
    uint32_t *n = vn, *m = vm;

    for (s = 0; s < opr_sz / 4; s += 4) {
        float32 sum00, sum01, sum10, sum11;

        /*
         * Process the entire segment at once, writing back the
         * results only after we've consumed all of the inputs.
         *
         * Key to indices by column:
         *               i   j           i   k             j   k
         */
        sum00 = a[s + H4(0 + 0)];
        sum00 = bfdotadd(sum00, n[s + H4(0 + 0)], m[s + H4(0 + 0)]);
        sum00 = bfdotadd(sum00, n[s + H4(0 + 1)], m[s + H4(0 + 1)]);

        sum01 = a[s + H4(0 + 1)];
        sum01 = bfdotadd(sum01, n[s + H4(0 + 0)], m[s + H4(2 + 0)]);
        sum01 = bfdotadd(sum01, n[s + H4(0 + 1)], m[s + H4(2 + 1)]);

        sum10 = a[s + H4(2 + 0)];
        sum10 = bfdotadd(sum10, n[s + H4(2 + 0)], m[s + H4(0 + 0)]);
        sum10 = bfdotadd(sum10, n[s + H4(2 + 1)], m[s + H4(0 + 1)]);

        sum11 = a[s + H4(2 + 1)];
        sum11 = bfdotadd(sum11, n[s + H4(2 + 0)], m[s + H4(2 + 0)]);
        sum11 = bfdotadd(sum11, n[s + H4(2 + 1)], m[s + H4(2 + 1)]);

        d[s + H4(0 + 0)] = sum00;
        d[s + H4(0 + 1)] = sum01;
        d[s + H4(2 + 0)] = sum10;
        d[s + H4(2 + 1)] = sum11;
    }
    clear_tail(d, opr_sz, simd_maxsz(desc));
}
5693887f RH |
2601 | |
2602 | void HELPER(gvec_bfmlal)(void *vd, void *vn, void *vm, void *va, | |
2603 | void *stat, uint32_t desc) | |
2604 | { | |
2605 | intptr_t i, opr_sz = simd_oprsz(desc); | |
2606 | intptr_t sel = simd_data(desc); | |
2607 | float32 *d = vd, *a = va; | |
2608 | bfloat16 *n = vn, *m = vm; | |
2609 | ||
2610 | for (i = 0; i < opr_sz / 4; ++i) { | |
2611 | float32 nn = n[H2(i * 2 + sel)] << 16; | |
2612 | float32 mm = m[H2(i * 2 + sel)] << 16; | |
2613 | d[H4(i)] = float32_muladd(nn, mm, a[H4(i)], 0, stat); | |
2614 | } | |
2615 | clear_tail(d, opr_sz, simd_maxsz(desc)); | |
2616 | } | |
458d0ab6 RH |
2617 | |
2618 | void HELPER(gvec_bfmlal_idx)(void *vd, void *vn, void *vm, | |
2619 | void *va, void *stat, uint32_t desc) | |
2620 | { | |
2621 | intptr_t i, j, opr_sz = simd_oprsz(desc); | |
2622 | intptr_t sel = extract32(desc, SIMD_DATA_SHIFT, 1); | |
2623 | intptr_t index = extract32(desc, SIMD_DATA_SHIFT + 1, 3); | |
2624 | intptr_t elements = opr_sz / 4; | |
2625 | intptr_t eltspersegment = MIN(16 / 4, elements); | |
2626 | float32 *d = vd, *a = va; | |
2627 | bfloat16 *n = vn, *m = vm; | |
2628 | ||
2629 | for (i = 0; i < elements; i += eltspersegment) { | |
2630 | float32 m_idx = m[H2(2 * i + index)] << 16; | |
2631 | ||
2632 | for (j = i; j < i + eltspersegment; j++) { | |
2633 | float32 n_j = n[H2(2 * j + sel)] << 16; | |
2634 | d[H4(j)] = float32_muladd(n_j, m_idx, a[H4(j)], 0, stat); | |
2635 | } | |
2636 | } | |
2637 | clear_tail(d, opr_sz, simd_maxsz(desc)); | |
2638 | } | |
6b5a3bdf RH |
2639 | |
2640 | #define DO_CLAMP(NAME, TYPE) \ | |
2641 | void HELPER(NAME)(void *d, void *n, void *m, void *a, uint32_t desc) \ | |
2642 | { \ | |
2643 | intptr_t i, opr_sz = simd_oprsz(desc); \ | |
2644 | for (i = 0; i < opr_sz; i += sizeof(TYPE)) { \ | |
2645 | TYPE aa = *(TYPE *)(a + i); \ | |
2646 | TYPE nn = *(TYPE *)(n + i); \ | |
2647 | TYPE mm = *(TYPE *)(m + i); \ | |
2648 | TYPE dd = MIN(MAX(aa, nn), mm); \ | |
2649 | *(TYPE *)(d + i) = dd; \ | |
2650 | } \ | |
2651 | clear_tail(d, opr_sz, simd_maxsz(desc)); \ | |
2652 | } | |
2653 | ||
2654 | DO_CLAMP(gvec_sclamp_b, int8_t) | |
2655 | DO_CLAMP(gvec_sclamp_h, int16_t) | |
2656 | DO_CLAMP(gvec_sclamp_s, int32_t) | |
2657 | DO_CLAMP(gvec_sclamp_d, int64_t) | |
2658 | ||
2659 | DO_CLAMP(gvec_uclamp_b, uint8_t) | |
2660 | DO_CLAMP(gvec_uclamp_h, uint16_t) | |
2661 | DO_CLAMP(gvec_uclamp_s, uint32_t) | |
2662 | DO_CLAMP(gvec_uclamp_d, uint64_t) |