]>
Commit | Line | Data |
---|---|---|
a0c9400a SG |
1 | /* SPDX-License-Identifier: GPL-2.0-or-later */ |
2 | /* | |
1dc33f26 | 3 | * QEMU LoongArch vector helper functions. |
a0c9400a SG |
4 | * |
5 | * Copyright (c) 2022-2023 Loongson Technology Corporation Limited | |
6 | */ | |
c037fbc9 SG |
7 | |
8 | #include "qemu/osdep.h" | |
9 | #include "cpu.h" | |
10 | #include "exec/exec-all.h" | |
11 | #include "exec/helper-proto.h" | |
aca67472 SG |
12 | #include "fpu/softfloat.h" |
13 | #include "internals.h" | |
d0dfa19a | 14 | #include "tcg/tcg.h" |
008a3b16 | 15 | #include "vec.h" |
64cf6b99 | 16 | #include "tcg/tcg-gvec-desc.h" |
c037fbc9 | 17 | |
/*
 * Generate helper NAME.  Destination element i (BIT bits wide, accessed
 * through E1) receives DO_OP applied to source element 2i+1 of vj and
 * source element 2i of vk (both accessed through E2), each cast to the
 * destination element type before the operation.
 */
#define DO_ODD_EVEN(NAME, BIT, E1, E2, DO_OP)                             \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)            \
{                                                                         \
    VReg *dst = vd;                                                       \
    VReg *lhs = vj;                                                       \
    VReg *rhs = vk;                                                       \
    typedef __typeof(dst->E1(0)) TD;                                      \
    int nelem = simd_oprsz(desc) / (BIT / 8);                             \
                                                                          \
    for (int n = 0; n < nelem; n++) {                                     \
        dst->E1(n) = DO_OP((TD)lhs->E2(2 * n + 1), (TD)rhs->E2(2 * n));   \
    }                                                                     \
}
32 | ||
33 | DO_ODD_EVEN(vhaddw_h_b, 16, H, B, DO_ADD) | |
34 | DO_ODD_EVEN(vhaddw_w_h, 32, W, H, DO_ADD) | |
35 | DO_ODD_EVEN(vhaddw_d_w, 64, D, W, DO_ADD) | |
36 | ||
04711da1 | 37 | void HELPER(vhaddw_q_d)(void *vd, void *vj, void *vk, uint32_t desc) |
c037fbc9 | 38 | { |
64cf6b99 | 39 | int i; |
04711da1 SG |
40 | VReg *Vd = (VReg *)vd; |
41 | VReg *Vj = (VReg *)vj; | |
42 | VReg *Vk = (VReg *)vk; | |
64cf6b99 | 43 | int oprsz = simd_oprsz(desc); |
c037fbc9 | 44 | |
64cf6b99 SG |
45 | for (i = 0; i < oprsz / 16 ; i++) { |
46 | Vd->Q(i) = int128_add(int128_makes64(Vj->D(2 * i + 1)), | |
47 | int128_makes64(Vk->D(2 * i))); | |
48 | } | |
c037fbc9 SG |
49 | } |
50 | ||
51 | DO_ODD_EVEN(vhsubw_h_b, 16, H, B, DO_SUB) | |
52 | DO_ODD_EVEN(vhsubw_w_h, 32, W, H, DO_SUB) | |
53 | DO_ODD_EVEN(vhsubw_d_w, 64, D, W, DO_SUB) | |
54 | ||
04711da1 | 55 | void HELPER(vhsubw_q_d)(void *vd, void *vj, void *vk, uint32_t desc) |
c037fbc9 | 56 | { |
64cf6b99 | 57 | int i; |
04711da1 SG |
58 | VReg *Vd = (VReg *)vd; |
59 | VReg *Vj = (VReg *)vj; | |
60 | VReg *Vk = (VReg *)vk; | |
64cf6b99 | 61 | int oprsz = simd_oprsz(desc); |
c037fbc9 | 62 | |
64cf6b99 SG |
63 | for (i = 0; i < oprsz / 16; i++) { |
64 | Vd->Q(i) = int128_sub(int128_makes64(Vj->D(2 * i + 1)), | |
65 | int128_makes64(Vk->D(2 * i))); | |
66 | } | |
c037fbc9 SG |
67 | } |
68 | ||
69 | DO_ODD_EVEN(vhaddw_hu_bu, 16, UH, UB, DO_ADD) | |
70 | DO_ODD_EVEN(vhaddw_wu_hu, 32, UW, UH, DO_ADD) | |
71 | DO_ODD_EVEN(vhaddw_du_wu, 64, UD, UW, DO_ADD) | |
72 | ||
04711da1 | 73 | void HELPER(vhaddw_qu_du)(void *vd, void *vj, void *vk, uint32_t desc) |
c037fbc9 | 74 | { |
64cf6b99 | 75 | int i; |
04711da1 SG |
76 | VReg *Vd = (VReg *)vd; |
77 | VReg *Vj = (VReg *)vj; | |
78 | VReg *Vk = (VReg *)vk; | |
64cf6b99 | 79 | int oprsz = simd_oprsz(desc); |
c037fbc9 | 80 | |
64cf6b99 SG |
81 | for (i = 0; i < oprsz / 16; i ++) { |
82 | Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i + 1)), | |
83 | int128_make64(Vk->UD(2 * i))); | |
84 | } | |
c037fbc9 SG |
85 | } |
86 | ||
87 | DO_ODD_EVEN(vhsubw_hu_bu, 16, UH, UB, DO_SUB) | |
88 | DO_ODD_EVEN(vhsubw_wu_hu, 32, UW, UH, DO_SUB) | |
89 | DO_ODD_EVEN(vhsubw_du_wu, 64, UD, UW, DO_SUB) | |
90 | ||
04711da1 | 91 | void HELPER(vhsubw_qu_du)(void *vd, void *vj, void *vk, uint32_t desc) |
c037fbc9 | 92 | { |
64cf6b99 | 93 | int i; |
04711da1 SG |
94 | VReg *Vd = (VReg *)vd; |
95 | VReg *Vj = (VReg *)vj; | |
96 | VReg *Vk = (VReg *)vk; | |
64cf6b99 | 97 | int oprsz = simd_oprsz(desc); |
c037fbc9 | 98 | |
64cf6b99 SG |
99 | for (i = 0; i < oprsz / 16; i++) { |
100 | Vd->Q(i) = int128_sub(int128_make64(Vj->UD(2 * i + 1)), | |
101 | int128_make64(Vk->UD(2 * i))); | |
102 | } | |
c037fbc9 | 103 | } |
2d5f950c SG |
104 | |
/*
 * Generate helper NAME.  Destination element i (BIT bits wide, accessed
 * through E1) receives DO_OP applied to the even source elements 2i of
 * vj and vk (accessed through E2), each cast to the destination element
 * type before the operation.
 */
#define DO_EVEN(NAME, BIT, E1, E2, DO_OP)                                 \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)            \
{                                                                         \
    VReg *dst = vd;                                                       \
    VReg *lhs = vj;                                                       \
    VReg *rhs = vk;                                                       \
    typedef __typeof(dst->E1(0)) TD;                                      \
    int nelem = simd_oprsz(desc) / (BIT / 8);                             \
                                                                          \
    for (int n = 0; n < nelem; n++) {                                     \
        dst->E1(n) = DO_OP((TD)lhs->E2(2 * n), (TD)rhs->E2(2 * n));       \
    }                                                                     \
}
119 | ||
/*
 * Generate helper NAME.  Destination element i (BIT bits wide, accessed
 * through E1) receives DO_OP applied to the odd source elements 2i+1 of
 * vj and vk (accessed through E2), each cast to the destination element
 * type before the operation.
 */
#define DO_ODD(NAME, BIT, E1, E2, DO_OP)                                  \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)            \
{                                                                         \
    VReg *dst = vd;                                                       \
    VReg *lhs = vj;                                                       \
    VReg *rhs = vk;                                                       \
    typedef __typeof(dst->E1(0)) TD;                                      \
    int nelem = simd_oprsz(desc) / (BIT / 8);                             \
                                                                          \
    for (int n = 0; n < nelem; n++) {                                     \
        dst->E1(n) = DO_OP((TD)lhs->E2(2 * n + 1),                        \
                           (TD)rhs->E2(2 * n + 1));                       \
    }                                                                     \
}
134 | ||
85995f07 | 135 | void HELPER(vaddwev_q_d)(void *vd, void *vj, void *vk, uint32_t desc) |
2d5f950c | 136 | { |
85995f07 | 137 | int i; |
2d5f950c SG |
138 | VReg *Vd = (VReg *)vd; |
139 | VReg *Vj = (VReg *)vj; | |
140 | VReg *Vk = (VReg *)vk; | |
85995f07 | 141 | int oprsz = simd_oprsz(desc); |
2d5f950c | 142 | |
85995f07 SG |
143 | for (i = 0; i < oprsz / 16; i++) { |
144 | Vd->Q(i) = int128_add(int128_makes64(Vj->D(2 * i)), | |
145 | int128_makes64(Vk->D(2 * i))); | |
146 | } | |
2d5f950c SG |
147 | } |
148 | ||
149 | DO_EVEN(vaddwev_h_b, 16, H, B, DO_ADD) | |
150 | DO_EVEN(vaddwev_w_h, 32, W, H, DO_ADD) | |
151 | DO_EVEN(vaddwev_d_w, 64, D, W, DO_ADD) | |
152 | ||
85995f07 | 153 | void HELPER(vaddwod_q_d)(void *vd, void *vj, void *vk, uint32_t desc) |
2d5f950c | 154 | { |
85995f07 | 155 | int i; |
2d5f950c SG |
156 | VReg *Vd = (VReg *)vd; |
157 | VReg *Vj = (VReg *)vj; | |
158 | VReg *Vk = (VReg *)vk; | |
85995f07 | 159 | int oprsz = simd_oprsz(desc); |
2d5f950c | 160 | |
85995f07 SG |
161 | for (i = 0; i < oprsz / 16; i++) { |
162 | Vd->Q(i) = int128_add(int128_makes64(Vj->D(2 * i +1)), | |
163 | int128_makes64(Vk->D(2 * i +1))); | |
164 | } | |
2d5f950c SG |
165 | } |
166 | ||
167 | DO_ODD(vaddwod_h_b, 16, H, B, DO_ADD) | |
168 | DO_ODD(vaddwod_w_h, 32, W, H, DO_ADD) | |
169 | DO_ODD(vaddwod_d_w, 64, D, W, DO_ADD) | |
170 | ||
85995f07 | 171 | void HELPER(vsubwev_q_d)(void *vd, void *vj, void *vk, uint32_t desc) |
2d5f950c | 172 | { |
85995f07 | 173 | int i; |
2d5f950c SG |
174 | VReg *Vd = (VReg *)vd; |
175 | VReg *Vj = (VReg *)vj; | |
176 | VReg *Vk = (VReg *)vk; | |
85995f07 | 177 | int oprsz = simd_oprsz(desc); |
2d5f950c | 178 | |
85995f07 SG |
179 | for (i = 0; i < oprsz / 16; i++) { |
180 | Vd->Q(i) = int128_sub(int128_makes64(Vj->D(2 * i)), | |
181 | int128_makes64(Vk->D(2 * i))); | |
182 | } | |
2d5f950c SG |
183 | } |
184 | ||
185 | DO_EVEN(vsubwev_h_b, 16, H, B, DO_SUB) | |
186 | DO_EVEN(vsubwev_w_h, 32, W, H, DO_SUB) | |
187 | DO_EVEN(vsubwev_d_w, 64, D, W, DO_SUB) | |
188 | ||
85995f07 | 189 | void HELPER(vsubwod_q_d)(void *vd, void *vj, void *vk, uint32_t desc) |
2d5f950c | 190 | { |
85995f07 | 191 | int i; |
2d5f950c SG |
192 | VReg *Vd = (VReg *)vd; |
193 | VReg *Vj = (VReg *)vj; | |
194 | VReg *Vk = (VReg *)vk; | |
85995f07 | 195 | int oprsz = simd_oprsz(desc); |
2d5f950c | 196 | |
85995f07 SG |
197 | for (i = 0; i < oprsz / 16; i++) { |
198 | Vd->Q(i) = int128_sub(int128_makes64(Vj->D(2 * i + 1)), | |
199 | int128_makes64(Vk->D(2 * i + 1))); | |
200 | } | |
2d5f950c SG |
201 | } |
202 | ||
203 | DO_ODD(vsubwod_h_b, 16, H, B, DO_SUB) | |
204 | DO_ODD(vsubwod_w_h, 32, W, H, DO_SUB) | |
205 | DO_ODD(vsubwod_d_w, 64, D, W, DO_SUB) | |
206 | ||
85995f07 | 207 | void HELPER(vaddwev_q_du)(void *vd, void *vj, void *vk, uint32_t desc) |
2d5f950c | 208 | { |
85995f07 | 209 | int i; |
2d5f950c SG |
210 | VReg *Vd = (VReg *)vd; |
211 | VReg *Vj = (VReg *)vj; | |
212 | VReg *Vk = (VReg *)vk; | |
85995f07 | 213 | int oprsz = simd_oprsz(desc); |
2d5f950c | 214 | |
85995f07 SG |
215 | for (i = 0; i < oprsz / 16; i++) { |
216 | Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i)), | |
217 | int128_make64(Vk->UD(2 * i))); | |
218 | } | |
2d5f950c SG |
219 | } |
220 | ||
221 | DO_EVEN(vaddwev_h_bu, 16, UH, UB, DO_ADD) | |
222 | DO_EVEN(vaddwev_w_hu, 32, UW, UH, DO_ADD) | |
223 | DO_EVEN(vaddwev_d_wu, 64, UD, UW, DO_ADD) | |
224 | ||
85995f07 | 225 | void HELPER(vaddwod_q_du)(void *vd, void *vj, void *vk, uint32_t desc) |
2d5f950c | 226 | { |
85995f07 | 227 | int i; |
2d5f950c SG |
228 | VReg *Vd = (VReg *)vd; |
229 | VReg *Vj = (VReg *)vj; | |
230 | VReg *Vk = (VReg *)vk; | |
85995f07 | 231 | int oprsz = simd_oprsz(desc); |
2d5f950c | 232 | |
85995f07 SG |
233 | for (i = 0; i < oprsz / 16; i++) { |
234 | Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i + 1)), | |
235 | int128_make64(Vk->UD(2 * i + 1))); | |
236 | } | |
2d5f950c SG |
237 | } |
238 | ||
239 | DO_ODD(vaddwod_h_bu, 16, UH, UB, DO_ADD) | |
240 | DO_ODD(vaddwod_w_hu, 32, UW, UH, DO_ADD) | |
241 | DO_ODD(vaddwod_d_wu, 64, UD, UW, DO_ADD) | |
242 | ||
85995f07 | 243 | void HELPER(vsubwev_q_du)(void *vd, void *vj, void *vk, uint32_t desc) |
2d5f950c | 244 | { |
85995f07 | 245 | int i; |
2d5f950c SG |
246 | VReg *Vd = (VReg *)vd; |
247 | VReg *Vj = (VReg *)vj; | |
248 | VReg *Vk = (VReg *)vk; | |
85995f07 | 249 | int oprsz = simd_oprsz(desc); |
2d5f950c | 250 | |
85995f07 SG |
251 | for (i = 0; i < oprsz / 16; i++) { |
252 | Vd->Q(i) = int128_sub(int128_make64(Vj->UD(2 * i)), | |
253 | int128_make64(Vk->UD(2 * i))); | |
254 | } | |
2d5f950c SG |
255 | } |
256 | ||
257 | DO_EVEN(vsubwev_h_bu, 16, UH, UB, DO_SUB) | |
258 | DO_EVEN(vsubwev_w_hu, 32, UW, UH, DO_SUB) | |
259 | DO_EVEN(vsubwev_d_wu, 64, UD, UW, DO_SUB) | |
260 | ||
85995f07 | 261 | void HELPER(vsubwod_q_du)(void *vd, void *vj, void *vk, uint32_t desc) |
2d5f950c | 262 | { |
85995f07 | 263 | int i; |
2d5f950c SG |
264 | VReg *Vd = (VReg *)vd; |
265 | VReg *Vj = (VReg *)vj; | |
266 | VReg *Vk = (VReg *)vk; | |
85995f07 | 267 | int oprsz = simd_oprsz(desc); |
2d5f950c | 268 | |
85995f07 SG |
269 | for (i = 0; i < oprsz / 16; i++) { |
270 | Vd->Q(i) = int128_sub(int128_make64(Vj->UD(2 * i + 1)), | |
271 | int128_make64(Vk->UD(2 * i + 1))); | |
272 | } | |
2d5f950c SG |
273 | } |
274 | ||
275 | DO_ODD(vsubwod_h_bu, 16, UH, UB, DO_SUB) | |
276 | DO_ODD(vsubwod_w_hu, 32, UW, UH, DO_SUB) | |
277 | DO_ODD(vsubwod_d_wu, 64, UD, UW, DO_SUB) | |
278 | ||
/*
 * Generate helper NAME with mixed operand accessors.  Destination
 * element i (accessed through ES1) receives DO_OP applied to even
 * element 2i of vj read through EU2 and cast to the EU1 element type,
 * and even element 2i of vk read through ES2 and cast to the ES1
 * element type.
 */
#define DO_EVEN_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP)                 \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)            \
{                                                                         \
    VReg *dst = vd;                                                       \
    VReg *lhs = vj;                                                       \
    VReg *rhs = vk;                                                       \
    typedef __typeof(dst->ES1(0)) TDS;                                    \
    typedef __typeof(dst->EU1(0)) TDU;                                    \
    int nelem = simd_oprsz(desc) / (BIT / 8);                             \
                                                                          \
    for (int n = 0; n < nelem; n++) {                                     \
        dst->ES1(n) = DO_OP((TDU)lhs->EU2(2 * n),                         \
                            (TDS)rhs->ES2(2 * n));                        \
    }                                                                     \
}
294 | ||
/*
 * Generate helper NAME with mixed operand accessors.  Destination
 * element i (accessed through ES1) receives DO_OP applied to odd
 * element 2i+1 of vj read through EU2 and cast to the EU1 element type,
 * and odd element 2i+1 of vk read through ES2 and cast to the ES1
 * element type.
 */
#define DO_ODD_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP)                  \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)            \
{                                                                         \
    VReg *dst = vd;                                                       \
    VReg *lhs = vj;                                                       \
    VReg *rhs = vk;                                                       \
    typedef __typeof(dst->ES1(0)) TDS;                                    \
    typedef __typeof(dst->EU1(0)) TDU;                                    \
    int nelem = simd_oprsz(desc) / (BIT / 8);                             \
                                                                          \
    for (int n = 0; n < nelem; n++) {                                     \
        dst->ES1(n) = DO_OP((TDU)lhs->EU2(2 * n + 1),                     \
                            (TDS)rhs->ES2(2 * n + 1));                    \
    }                                                                     \
}
310 | ||
85995f07 | 311 | void HELPER(vaddwev_q_du_d)(void *vd, void *vj, void *vk, uint32_t desc) |
2d5f950c | 312 | { |
85995f07 | 313 | int i; |
2d5f950c SG |
314 | VReg *Vd = (VReg *)vd; |
315 | VReg *Vj = (VReg *)vj; | |
316 | VReg *Vk = (VReg *)vk; | |
85995f07 | 317 | int oprsz = simd_oprsz(desc); |
2d5f950c | 318 | |
85995f07 SG |
319 | for (i = 0; i < oprsz / 16; i++) { |
320 | Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i)), | |
321 | int128_makes64(Vk->D(2 * i))); | |
322 | } | |
2d5f950c SG |
323 | } |
324 | ||
325 | DO_EVEN_U_S(vaddwev_h_bu_b, 16, H, UH, B, UB, DO_ADD) | |
326 | DO_EVEN_U_S(vaddwev_w_hu_h, 32, W, UW, H, UH, DO_ADD) | |
327 | DO_EVEN_U_S(vaddwev_d_wu_w, 64, D, UD, W, UW, DO_ADD) | |
328 | ||
85995f07 | 329 | void HELPER(vaddwod_q_du_d)(void *vd, void *vj, void *vk, uint32_t desc) |
2d5f950c | 330 | { |
85995f07 | 331 | int i; |
2d5f950c SG |
332 | VReg *Vd = (VReg *)vd; |
333 | VReg *Vj = (VReg *)vj; | |
334 | VReg *Vk = (VReg *)vk; | |
85995f07 | 335 | int oprsz = simd_oprsz(desc); |
2d5f950c | 336 | |
85995f07 SG |
337 | for (i = 0; i < oprsz / 16; i++) { |
338 | Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i + 1)), | |
339 | int128_makes64(Vk->D(2 * i + 1))); | |
340 | } | |
2d5f950c SG |
341 | } |
342 | ||
343 | DO_ODD_U_S(vaddwod_h_bu_b, 16, H, UH, B, UB, DO_ADD) | |
344 | DO_ODD_U_S(vaddwod_w_hu_h, 32, W, UW, H, UH, DO_ADD) | |
345 | DO_ODD_U_S(vaddwod_d_wu_w, 64, D, UD, W, UW, DO_ADD) | |
39e9b0a7 | 346 | |
ee7250d0 SG |
347 | #define DO_3OP(NAME, BIT, E, DO_OP) \ |
348 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
349 | { \ | |
350 | int i; \ | |
351 | VReg *Vd = (VReg *)vd; \ | |
352 | VReg *Vj = (VReg *)vj; \ | |
353 | VReg *Vk = (VReg *)vk; \ | |
354 | int oprsz = simd_oprsz(desc); \ | |
355 | \ | |
356 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ | |
357 | Vd->E(i) = DO_OP(Vj->E(i), Vk->E(i)); \ | |
358 | } \ | |
39e9b0a7 SG |
359 | } |
360 | ||
361 | DO_3OP(vavg_b, 8, B, DO_VAVG) | |
362 | DO_3OP(vavg_h, 16, H, DO_VAVG) | |
363 | DO_3OP(vavg_w, 32, W, DO_VAVG) | |
364 | DO_3OP(vavg_d, 64, D, DO_VAVG) | |
365 | DO_3OP(vavgr_b, 8, B, DO_VAVGR) | |
366 | DO_3OP(vavgr_h, 16, H, DO_VAVGR) | |
367 | DO_3OP(vavgr_w, 32, W, DO_VAVGR) | |
368 | DO_3OP(vavgr_d, 64, D, DO_VAVGR) | |
369 | DO_3OP(vavg_bu, 8, UB, DO_VAVG) | |
370 | DO_3OP(vavg_hu, 16, UH, DO_VAVG) | |
371 | DO_3OP(vavg_wu, 32, UW, DO_VAVG) | |
372 | DO_3OP(vavg_du, 64, UD, DO_VAVG) | |
373 | DO_3OP(vavgr_bu, 8, UB, DO_VAVGR) | |
374 | DO_3OP(vavgr_hu, 16, UH, DO_VAVGR) | |
375 | DO_3OP(vavgr_wu, 32, UW, DO_VAVGR) | |
376 | DO_3OP(vavgr_du, 64, UD, DO_VAVGR) | |
49725659 | 377 | |
49725659 SG |
378 | DO_3OP(vabsd_b, 8, B, DO_VABSD) |
379 | DO_3OP(vabsd_h, 16, H, DO_VABSD) | |
380 | DO_3OP(vabsd_w, 32, W, DO_VABSD) | |
381 | DO_3OP(vabsd_d, 64, D, DO_VABSD) | |
382 | DO_3OP(vabsd_bu, 8, UB, DO_VABSD) | |
383 | DO_3OP(vabsd_hu, 16, UH, DO_VABSD) | |
384 | DO_3OP(vabsd_wu, 32, UW, DO_VABSD) | |
385 | DO_3OP(vabsd_du, 64, UD, DO_VABSD) | |
af448cb3 | 386 | |
27f5485d SG |
387 | #define DO_VADDA(NAME, BIT, E) \ |
388 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
389 | { \ | |
390 | int i; \ | |
391 | VReg *Vd = (VReg *)vd; \ | |
392 | VReg *Vj = (VReg *)vj; \ | |
393 | VReg *Vk = (VReg *)vk; \ | |
394 | int oprsz = simd_oprsz(desc); \ | |
395 | \ | |
396 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ | |
397 | Vd->E(i) = DO_VABS(Vj->E(i)) + DO_VABS(Vk->E(i)); \ | |
398 | } \ | |
af448cb3 SG |
399 | } |
400 | ||
27f5485d SG |
401 | DO_VADDA(vadda_b, 8, B) |
402 | DO_VADDA(vadda_h, 16, H) | |
403 | DO_VADDA(vadda_w, 32, W) | |
404 | DO_VADDA(vadda_d, 64, D) | |
9ab29520 | 405 | |
c09360fa SG |
406 | #define VMINMAXI(NAME, BIT, E, DO_OP) \ |
407 | void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ | |
408 | { \ | |
409 | int i; \ | |
410 | VReg *Vd = (VReg *)vd; \ | |
411 | VReg *Vj = (VReg *)vj; \ | |
412 | typedef __typeof(Vd->E(0)) TD; \ | |
413 | int oprsz = simd_oprsz(desc); \ | |
414 | \ | |
415 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ | |
416 | Vd->E(i) = DO_OP(Vj->E(i), (TD)imm); \ | |
417 | } \ | |
9ab29520 SG |
418 | } |
419 | ||
420 | VMINMAXI(vmini_b, 8, B, DO_MIN) | |
421 | VMINMAXI(vmini_h, 16, H, DO_MIN) | |
422 | VMINMAXI(vmini_w, 32, W, DO_MIN) | |
423 | VMINMAXI(vmini_d, 64, D, DO_MIN) | |
424 | VMINMAXI(vmaxi_b, 8, B, DO_MAX) | |
425 | VMINMAXI(vmaxi_h, 16, H, DO_MAX) | |
426 | VMINMAXI(vmaxi_w, 32, W, DO_MAX) | |
427 | VMINMAXI(vmaxi_d, 64, D, DO_MAX) | |
428 | VMINMAXI(vmini_bu, 8, UB, DO_MIN) | |
429 | VMINMAXI(vmini_hu, 16, UH, DO_MIN) | |
430 | VMINMAXI(vmini_wu, 32, UW, DO_MIN) | |
431 | VMINMAXI(vmini_du, 64, UD, DO_MIN) | |
432 | VMINMAXI(vmaxi_bu, 8, UB, DO_MAX) | |
433 | VMINMAXI(vmaxi_hu, 16, UH, DO_MAX) | |
434 | VMINMAXI(vmaxi_wu, 32, UW, DO_MAX) | |
435 | VMINMAXI(vmaxi_du, 64, UD, DO_MAX) | |
cd1c49ad | 436 | |
342dc1cf SG |
/*
 * Generate helper NAME returning the high half of a product.  Each
 * BIT-wide element (accessed through E2) of vj and vk is cast to the
 * element type of E1, multiplied, and the product shifted right by BIT
 * before being stored back through E2.
 *
 * NOTE: the DO_OP argument is accepted but not used by the expansion.
 */
#define DO_VMUH(NAME, BIT, E1, E2, DO_OP)                                 \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)            \
{                                                                         \
    VReg *dst = vd;                                                       \
    VReg *lhs = vj;                                                       \
    VReg *rhs = vk;                                                       \
    typedef __typeof(dst->E1(0)) T;                                       \
    int nelem = simd_oprsz(desc) / (BIT / 8);                             \
                                                                          \
    for (int n = 0; n < nelem; n++) {                                     \
        dst->E2(n) = ((T)lhs->E2(n) * (T)rhs->E2(n)) >> BIT;              \
    }                                                                     \
}
451 | ||
342dc1cf | 452 | void HELPER(vmuh_d)(void *vd, void *vj, void *vk, uint32_t desc) |
cd1c49ad | 453 | { |
342dc1cf SG |
454 | int i; |
455 | uint64_t l, h; | |
cd1c49ad SG |
456 | VReg *Vd = (VReg *)vd; |
457 | VReg *Vj = (VReg *)vj; | |
458 | VReg *Vk = (VReg *)vk; | |
342dc1cf | 459 | int oprsz = simd_oprsz(desc); |
cd1c49ad | 460 | |
342dc1cf SG |
461 | for (i = 0; i < oprsz / 8; i++) { |
462 | muls64(&l, &h, Vj->D(i), Vk->D(i)); | |
463 | Vd->D(i) = h; | |
464 | } | |
cd1c49ad SG |
465 | } |
466 | ||
467 | DO_VMUH(vmuh_b, 8, H, B, DO_MUH) | |
468 | DO_VMUH(vmuh_h, 16, W, H, DO_MUH) | |
469 | DO_VMUH(vmuh_w, 32, D, W, DO_MUH) | |
470 | ||
342dc1cf | 471 | void HELPER(vmuh_du)(void *vd, void *vj, void *vk, uint32_t desc) |
cd1c49ad | 472 | { |
342dc1cf SG |
473 | int i; |
474 | uint64_t l, h; | |
cd1c49ad SG |
475 | VReg *Vd = (VReg *)vd; |
476 | VReg *Vj = (VReg *)vj; | |
477 | VReg *Vk = (VReg *)vk; | |
342dc1cf | 478 | int oprsz = simd_oprsz(desc); |
cd1c49ad | 479 | |
342dc1cf SG |
480 | for (i = 0; i < oprsz / 8; i++) { |
481 | mulu64(&l, &h, Vj->D(i), Vk->D(i)); | |
482 | Vd->D(i) = h; | |
483 | } | |
cd1c49ad SG |
484 | } |
485 | ||
486 | DO_VMUH(vmuh_bu, 8, UH, UB, DO_MUH) | |
487 | DO_VMUH(vmuh_hu, 16, UW, UH, DO_MUH) | |
488 | DO_VMUH(vmuh_wu, 32, UD, UW, DO_MUH) | |
489 | ||
cd1c49ad SG |
490 | DO_EVEN(vmulwev_h_b, 16, H, B, DO_MUL) |
491 | DO_EVEN(vmulwev_w_h, 32, W, H, DO_MUL) | |
492 | DO_EVEN(vmulwev_d_w, 64, D, W, DO_MUL) | |
493 | ||
494 | DO_ODD(vmulwod_h_b, 16, H, B, DO_MUL) | |
495 | DO_ODD(vmulwod_w_h, 32, W, H, DO_MUL) | |
496 | DO_ODD(vmulwod_d_w, 64, D, W, DO_MUL) | |
497 | ||
498 | DO_EVEN(vmulwev_h_bu, 16, UH, UB, DO_MUL) | |
499 | DO_EVEN(vmulwev_w_hu, 32, UW, UH, DO_MUL) | |
500 | DO_EVEN(vmulwev_d_wu, 64, UD, UW, DO_MUL) | |
501 | ||
502 | DO_ODD(vmulwod_h_bu, 16, UH, UB, DO_MUL) | |
503 | DO_ODD(vmulwod_w_hu, 32, UW, UH, DO_MUL) | |
504 | DO_ODD(vmulwod_d_wu, 64, UD, UW, DO_MUL) | |
505 | ||
506 | DO_EVEN_U_S(vmulwev_h_bu_b, 16, H, UH, B, UB, DO_MUL) | |
507 | DO_EVEN_U_S(vmulwev_w_hu_h, 32, W, UW, H, UH, DO_MUL) | |
508 | DO_EVEN_U_S(vmulwev_d_wu_w, 64, D, UD, W, UW, DO_MUL) | |
509 | ||
510 | DO_ODD_U_S(vmulwod_h_bu_b, 16, H, UH, B, UB, DO_MUL) | |
511 | DO_ODD_U_S(vmulwod_w_hu_h, 32, W, UW, H, UH, DO_MUL) | |
512 | DO_ODD_U_S(vmulwod_d_wu_w, 64, D, UD, W, UW, DO_MUL) | |
d3aec65b | 513 | |
3f450c17 SG |
514 | #define VMADDSUB(NAME, BIT, E, DO_OP) \ |
515 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
516 | { \ | |
517 | int i; \ | |
518 | VReg *Vd = (VReg *)vd; \ | |
519 | VReg *Vj = (VReg *)vj; \ | |
520 | VReg *Vk = (VReg *)vk; \ | |
521 | int oprsz = simd_oprsz(desc); \ | |
522 | \ | |
523 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ | |
524 | Vd->E(i) = DO_OP(Vd->E(i), Vj->E(i) ,Vk->E(i)); \ | |
525 | } \ | |
d3aec65b SG |
526 | } |
527 | ||
528 | VMADDSUB(vmadd_b, 8, B, DO_MADD) | |
529 | VMADDSUB(vmadd_h, 16, H, DO_MADD) | |
530 | VMADDSUB(vmadd_w, 32, W, DO_MADD) | |
531 | VMADDSUB(vmadd_d, 64, D, DO_MADD) | |
532 | VMADDSUB(vmsub_b, 8, B, DO_MSUB) | |
533 | VMADDSUB(vmsub_h, 16, H, DO_MSUB) | |
534 | VMADDSUB(vmsub_w, 32, W, DO_MSUB) | |
535 | VMADDSUB(vmsub_d, 64, D, DO_MSUB) | |
536 | ||
537 | #define VMADDWEV(NAME, BIT, E1, E2, DO_OP) \ | |
3f450c17 | 538 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ |
d3aec65b SG |
539 | { \ |
540 | int i; \ | |
541 | VReg *Vd = (VReg *)vd; \ | |
542 | VReg *Vj = (VReg *)vj; \ | |
543 | VReg *Vk = (VReg *)vk; \ | |
544 | typedef __typeof(Vd->E1(0)) TD; \ | |
3f450c17 | 545 | int oprsz = simd_oprsz(desc); \ |
d3aec65b | 546 | \ |
3f450c17 | 547 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ |
d3aec65b SG |
548 | Vd->E1(i) += DO_OP((TD)Vj->E2(2 * i), (TD)Vk->E2(2 * i)); \ |
549 | } \ | |
550 | } | |
551 | ||
552 | VMADDWEV(vmaddwev_h_b, 16, H, B, DO_MUL) | |
553 | VMADDWEV(vmaddwev_w_h, 32, W, H, DO_MUL) | |
554 | VMADDWEV(vmaddwev_d_w, 64, D, W, DO_MUL) | |
555 | VMADDWEV(vmaddwev_h_bu, 16, UH, UB, DO_MUL) | |
556 | VMADDWEV(vmaddwev_w_hu, 32, UW, UH, DO_MUL) | |
557 | VMADDWEV(vmaddwev_d_wu, 64, UD, UW, DO_MUL) | |
558 | ||
3f450c17 SG |
559 | #define VMADDWOD(NAME, BIT, E1, E2, DO_OP) \ |
560 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
561 | { \ | |
562 | int i; \ | |
563 | VReg *Vd = (VReg *)vd; \ | |
564 | VReg *Vj = (VReg *)vj; \ | |
565 | VReg *Vk = (VReg *)vk; \ | |
566 | typedef __typeof(Vd->E1(0)) TD; \ | |
567 | int oprsz = simd_oprsz(desc); \ | |
568 | \ | |
569 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ | |
570 | Vd->E1(i) += DO_OP((TD)Vj->E2(2 * i + 1), \ | |
571 | (TD)Vk->E2(2 * i + 1)); \ | |
572 | } \ | |
d3aec65b SG |
573 | } |
574 | ||
575 | VMADDWOD(vmaddwod_h_b, 16, H, B, DO_MUL) | |
576 | VMADDWOD(vmaddwod_w_h, 32, W, H, DO_MUL) | |
577 | VMADDWOD(vmaddwod_d_w, 64, D, W, DO_MUL) | |
578 | VMADDWOD(vmaddwod_h_bu, 16, UH, UB, DO_MUL) | |
579 | VMADDWOD(vmaddwod_w_hu, 32, UW, UH, DO_MUL) | |
580 | VMADDWOD(vmaddwod_d_wu, 64, UD, UW, DO_MUL) | |
581 | ||
3f450c17 SG |
582 | #define VMADDWEV_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP) \ |
583 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
584 | { \ | |
585 | int i; \ | |
586 | VReg *Vd = (VReg *)vd; \ | |
587 | VReg *Vj = (VReg *)vj; \ | |
588 | VReg *Vk = (VReg *)vk; \ | |
589 | typedef __typeof(Vd->ES1(0)) TS1; \ | |
590 | typedef __typeof(Vd->EU1(0)) TU1; \ | |
591 | int oprsz = simd_oprsz(desc); \ | |
592 | \ | |
593 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ | |
594 | Vd->ES1(i) += DO_OP((TU1)Vj->EU2(2 * i), \ | |
595 | (TS1)Vk->ES2(2 * i)); \ | |
596 | } \ | |
d3aec65b SG |
597 | } |
598 | ||
599 | VMADDWEV_U_S(vmaddwev_h_bu_b, 16, H, UH, B, UB, DO_MUL) | |
600 | VMADDWEV_U_S(vmaddwev_w_hu_h, 32, W, UW, H, UH, DO_MUL) | |
601 | VMADDWEV_U_S(vmaddwev_d_wu_w, 64, D, UD, W, UW, DO_MUL) | |
602 | ||
3f450c17 SG |
603 | #define VMADDWOD_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP) \ |
604 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
605 | { \ | |
606 | int i; \ | |
607 | VReg *Vd = (VReg *)vd; \ | |
608 | VReg *Vj = (VReg *)vj; \ | |
609 | VReg *Vk = (VReg *)vk; \ | |
610 | typedef __typeof(Vd->ES1(0)) TS1; \ | |
611 | typedef __typeof(Vd->EU1(0)) TU1; \ | |
612 | int oprsz = simd_oprsz(desc); \ | |
613 | \ | |
614 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ | |
615 | Vd->ES1(i) += DO_OP((TU1)Vj->EU2(2 * i + 1), \ | |
616 | (TS1)Vk->ES2(2 * i + 1)); \ | |
617 | } \ | |
d3aec65b SG |
618 | } |
619 | ||
620 | VMADDWOD_U_S(vmaddwod_h_bu_b, 16, H, UH, B, UB, DO_MUL) | |
621 | VMADDWOD_U_S(vmaddwod_w_hu_h, 32, W, UW, H, UH, DO_MUL) | |
622 | VMADDWOD_U_S(vmaddwod_d_wu_w, 64, D, UD, W, UW, DO_MUL) | |
4cc4c0f7 | 623 | |
04711da1 SG |
624 | #define VDIV(NAME, BIT, E, DO_OP) \ |
625 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
626 | { \ | |
627 | int i; \ | |
628 | VReg *Vd = (VReg *)vd; \ | |
629 | VReg *Vj = (VReg *)vj; \ | |
630 | VReg *Vk = (VReg *)vk; \ | |
abb693de SG |
631 | int oprsz = simd_oprsz(desc); \ |
632 | \ | |
633 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ | |
04711da1 SG |
634 | Vd->E(i) = DO_OP(Vj->E(i), Vk->E(i)); \ |
635 | } \ | |
4cc4c0f7 SG |
636 | } |
637 | ||
638 | VDIV(vdiv_b, 8, B, DO_DIV) | |
639 | VDIV(vdiv_h, 16, H, DO_DIV) | |
640 | VDIV(vdiv_w, 32, W, DO_DIV) | |
641 | VDIV(vdiv_d, 64, D, DO_DIV) | |
642 | VDIV(vdiv_bu, 8, UB, DO_DIVU) | |
643 | VDIV(vdiv_hu, 16, UH, DO_DIVU) | |
644 | VDIV(vdiv_wu, 32, UW, DO_DIVU) | |
645 | VDIV(vdiv_du, 64, UD, DO_DIVU) | |
646 | VDIV(vmod_b, 8, B, DO_REM) | |
647 | VDIV(vmod_h, 16, H, DO_REM) | |
648 | VDIV(vmod_w, 32, W, DO_REM) | |
649 | VDIV(vmod_d, 64, D, DO_REM) | |
650 | VDIV(vmod_bu, 8, UB, DO_REMU) | |
651 | VDIV(vmod_hu, 16, UH, DO_REMU) | |
652 | VDIV(vmod_wu, 32, UW, DO_REMU) | |
653 | VDIV(vmod_du, 64, UD, DO_REMU) | |
cbe44190 | 654 | |
e5c7f031 SG |
655 | #define VSAT_S(NAME, BIT, E) \ |
656 | void HELPER(NAME)(void *vd, void *vj, uint64_t max, uint32_t desc) \ | |
657 | { \ | |
658 | int i; \ | |
659 | VReg *Vd = (VReg *)vd; \ | |
660 | VReg *Vj = (VReg *)vj; \ | |
661 | typedef __typeof(Vd->E(0)) TD; \ | |
662 | int oprsz = simd_oprsz(desc); \ | |
663 | \ | |
664 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ | |
665 | Vd->E(i) = Vj->E(i) > (TD)max ? (TD)max : \ | |
666 | Vj->E(i) < (TD)~max ? (TD)~max: Vj->E(i); \ | |
667 | } \ | |
cbe44190 SG |
668 | } |
669 | ||
670 | VSAT_S(vsat_b, 8, B) | |
671 | VSAT_S(vsat_h, 16, H) | |
672 | VSAT_S(vsat_w, 32, W) | |
673 | VSAT_S(vsat_d, 64, D) | |
674 | ||
e5c7f031 SG |
675 | #define VSAT_U(NAME, BIT, E) \ |
676 | void HELPER(NAME)(void *vd, void *vj, uint64_t max, uint32_t desc) \ | |
677 | { \ | |
678 | int i; \ | |
679 | VReg *Vd = (VReg *)vd; \ | |
680 | VReg *Vj = (VReg *)vj; \ | |
681 | typedef __typeof(Vd->E(0)) TD; \ | |
682 | int oprsz = simd_oprsz(desc); \ | |
683 | \ | |
684 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ | |
685 | Vd->E(i) = Vj->E(i) > (TD)max ? (TD)max : Vj->E(i); \ | |
686 | } \ | |
cbe44190 SG |
687 | } |
688 | ||
689 | VSAT_U(vsat_bu, 8, UB) | |
690 | VSAT_U(vsat_hu, 16, UH) | |
691 | VSAT_U(vsat_wu, 32, UW) | |
692 | VSAT_U(vsat_du, 64, UD) | |
3734ad93 | 693 | |
f0db0beb SG |
/*
 * Generate helper NAME that widens the upper half of each 16-byte
 * chunk.  With cnt = LSX_LEN / BIT destination elements per chunk,
 * destination element k of chunk c (accessed through E1) is loaded from
 * source element cnt + k of the same chunk (accessed through E2).
 */
#define VEXTH(NAME, BIT, E1, E2)                                          \
void HELPER(NAME)(void *vd, void *vj, uint32_t desc)                      \
{                                                                         \
    VReg *dst = vd;                                                       \
    VReg *src = vj;                                                       \
    int nchunk = simd_oprsz(desc) / 16;                                   \
    int cnt = LSX_LEN / BIT;                                              \
                                                                          \
    for (int c = 0; c < nchunk; c++) {                                    \
        for (int k = 0; k < cnt; k++) {                                   \
            dst->E1(k + c * cnt) = src->E2(k + cnt + cnt * 2 * c);        \
        }                                                                 \
    }                                                                     \
}
709 | ||
710 | void HELPER(vexth_q_d)(void *vd, void *vj, uint32_t desc) | |
3734ad93 | 711 | { |
f0db0beb | 712 | int i; |
ff27e335 SG |
713 | VReg *Vd = (VReg *)vd; |
714 | VReg *Vj = (VReg *)vj; | |
f0db0beb | 715 | int oprsz = simd_oprsz(desc); |
3734ad93 | 716 | |
f0db0beb SG |
717 | for (i = 0; i < oprsz / 16; i++) { |
718 | Vd->Q(i) = int128_makes64(Vj->D(2 * i + 1)); | |
719 | } | |
3734ad93 SG |
720 | } |
721 | ||
ff27e335 | 722 | void HELPER(vexth_qu_du)(void *vd, void *vj, uint32_t desc) |
3734ad93 | 723 | { |
f0db0beb | 724 | int i; |
ff27e335 SG |
725 | VReg *Vd = (VReg *)vd; |
726 | VReg *Vj = (VReg *)vj; | |
f0db0beb | 727 | int oprsz = simd_oprsz(desc); |
3734ad93 | 728 | |
f0db0beb SG |
729 | for (i = 0; i < oprsz / 16; i++) { |
730 | Vd->Q(i) = int128_make64(Vj->UD(2 * i + 1)); | |
731 | } | |
3734ad93 SG |
732 | } |
733 | ||
734 | VEXTH(vexth_h_b, 16, H, B) | |
735 | VEXTH(vexth_w_h, 32, W, H) | |
736 | VEXTH(vexth_d_w, 64, D, W) | |
737 | VEXTH(vexth_hu_bu, 16, UH, UB) | |
738 | VEXTH(vexth_wu_hu, 32, UW, UH) | |
739 | VEXTH(vexth_du_wu, 64, UD, UW) | |
f0e395df | 740 | |
790acb2a SG |
741 | #define VEXT2XV(NAME, BIT, E1, E2) \ |
742 | void HELPER(NAME)(void *vd, void *vj, uint32_t desc) \ | |
743 | { \ | |
744 | int i; \ | |
745 | VReg temp = {}; \ | |
746 | VReg *Vd = (VReg *)vd; \ | |
747 | VReg *Vj = (VReg *)vj; \ | |
748 | int oprsz = simd_oprsz(desc); \ | |
749 | \ | |
750 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ | |
751 | temp.E1(i) = Vj->E2(i); \ | |
752 | } \ | |
753 | *Vd = temp; \ | |
754 | } | |
755 | ||
756 | VEXT2XV(vext2xv_h_b, 16, H, B) | |
757 | VEXT2XV(vext2xv_w_b, 32, W, B) | |
758 | VEXT2XV(vext2xv_d_b, 64, D, B) | |
759 | VEXT2XV(vext2xv_w_h, 32, W, H) | |
760 | VEXT2XV(vext2xv_d_h, 64, D, H) | |
761 | VEXT2XV(vext2xv_d_w, 64, D, W) | |
762 | VEXT2XV(vext2xv_hu_bu, 16, UH, UB) | |
763 | VEXT2XV(vext2xv_wu_bu, 32, UW, UB) | |
764 | VEXT2XV(vext2xv_du_bu, 64, UD, UB) | |
765 | VEXT2XV(vext2xv_wu_hu, 32, UW, UH) | |
766 | VEXT2XV(vext2xv_du_hu, 64, UD, UH) | |
767 | VEXT2XV(vext2xv_du_wu, 64, UD, UW) | |
768 | ||
f0e395df SG |
769 | DO_3OP(vsigncov_b, 8, B, DO_SIGNCOV) |
770 | DO_3OP(vsigncov_h, 16, H, DO_SIGNCOV) | |
771 | DO_3OP(vsigncov_w, 32, W, DO_SIGNCOV) | |
772 | DO_3OP(vsigncov_d, 64, D, DO_SIGNCOV) | |
789f4a4c SG |
773 | |
/*
 * Gather the top bit of each of the eight bytes in val into an 8-bit
 * mask: bit k of the result is bit 7 of byte k.
 */
static uint64_t do_vmskltz_b(int64_t val)
{
    const uint64_t top_bits = 0x8080808080808080ULL;
    uint64_t acc = (uint64_t)val & top_bits;

    /* Fold the eight isolated bits together into the top byte. */
    acc |= acc << 7;
    acc |= acc << 14;
    acc |= acc << 28;

    return acc >> 56;
}
783 | ||
ff27e335 | 784 | void HELPER(vmskltz_b)(void *vd, void *vj, uint32_t desc) |
789f4a4c | 785 | { |
97074674 | 786 | int i; |
789f4a4c | 787 | uint16_t temp = 0; |
ff27e335 SG |
788 | VReg *Vd = (VReg *)vd; |
789 | VReg *Vj = (VReg *)vj; | |
97074674 | 790 | int oprsz = simd_oprsz(desc); |
789f4a4c | 791 | |
97074674 SG |
792 | for (i = 0; i < oprsz / 16; i++) { |
793 | temp = 0; | |
794 | temp = do_vmskltz_b(Vj->D(2 * i)); | |
795 | temp |= (do_vmskltz_b(Vj->D(2 * i + 1)) << 8); | |
796 | Vd->D(2 * i) = temp; | |
797 | Vd->D(2 * i + 1) = 0; | |
798 | } | |
789f4a4c SG |
799 | } |
800 | ||
/*
 * Gather the top bit of each of the four 16-bit fields in val into a
 * 4-bit mask: bit k of the result is bit 15 of halfword k.
 */
static uint64_t do_vmskltz_h(int64_t val)
{
    const uint64_t top_bits = 0x8000800080008000ULL;
    uint64_t acc = (uint64_t)val & top_bits;

    /* Fold the four isolated bits together into the top nibble. */
    acc |= acc << 15;
    acc |= acc << 30;

    return acc >> 60;
}
809 | ||
ff27e335 | 810 | void HELPER(vmskltz_h)(void *vd, void *vj, uint32_t desc) |
789f4a4c | 811 | { |
97074674 | 812 | int i; |
789f4a4c | 813 | uint16_t temp = 0; |
ff27e335 SG |
814 | VReg *Vd = (VReg *)vd; |
815 | VReg *Vj = (VReg *)vj; | |
97074674 | 816 | int oprsz = simd_oprsz(desc); |
789f4a4c | 817 | |
97074674 SG |
818 | for (i = 0; i < oprsz / 16; i++) { |
819 | temp = 0; | |
820 | temp = do_vmskltz_h(Vj->D(2 * i)); | |
821 | temp |= (do_vmskltz_h(Vj->D(2 * i + 1)) << 4); | |
822 | Vd->D(2 * i) = temp; | |
823 | Vd->D(2 * i + 1) = 0; | |
824 | } | |
789f4a4c SG |
825 | } |
826 | ||
/*
 * Gather the top bit of each of the two 32-bit fields in val into a
 * 2-bit mask: bit k of the result is bit 31 of word k.
 */
static uint64_t do_vmskltz_w(int64_t val)
{
    const uint64_t top_bits = 0x8000000080000000ULL;
    uint64_t acc = (uint64_t)val & top_bits;

    /* Move the low word's bit next to the high word's bit. */
    acc |= acc << 31;

    return acc >> 62;
}
834 | ||
ff27e335 | 835 | void HELPER(vmskltz_w)(void *vd, void *vj, uint32_t desc) |
789f4a4c | 836 | { |
97074674 | 837 | int i; |
789f4a4c | 838 | uint16_t temp = 0; |
ff27e335 SG |
839 | VReg *Vd = (VReg *)vd; |
840 | VReg *Vj = (VReg *)vj; | |
97074674 | 841 | int oprsz = simd_oprsz(desc); |
789f4a4c | 842 | |
97074674 SG |
843 | for (i = 0; i < oprsz / 16; i++) { |
844 | temp = 0; | |
845 | temp = do_vmskltz_w(Vj->D(2 * i)); | |
846 | temp |= (do_vmskltz_w(Vj->D(2 * i + 1)) << 2); | |
847 | Vd->D(2 * i) = temp; | |
848 | Vd->D(2 * i + 1) = 0; | |
849 | } | |
789f4a4c SG |
850 | } |
851 | ||
/* Return bit 63 of val as 0 or 1. */
static uint64_t do_vmskltz_d(int64_t val)
{
    uint64_t raw = (uint64_t)val;

    return raw >> 63;
}
ff27e335 | 856 | void HELPER(vmskltz_d)(void *vd, void *vj, uint32_t desc) |
789f4a4c | 857 | { |
97074674 | 858 | int i; |
789f4a4c | 859 | uint16_t temp = 0; |
ff27e335 SG |
860 | VReg *Vd = (VReg *)vd; |
861 | VReg *Vj = (VReg *)vj; | |
97074674 | 862 | int oprsz = simd_oprsz(desc); |
789f4a4c | 863 | |
97074674 SG |
864 | for (i = 0; i < oprsz / 16; i++) { |
865 | temp = 0; | |
866 | temp = do_vmskltz_d(Vj->D(2 * i)); | |
867 | temp |= (do_vmskltz_d(Vj->D(2 * i + 1)) << 1); | |
868 | Vd->D(2 * i) = temp; | |
869 | Vd->D(2 * i + 1) = 0; | |
870 | } | |
789f4a4c SG |
871 | } |
872 | ||
ff27e335 | 873 | void HELPER(vmskgez_b)(void *vd, void *vj, uint32_t desc) |
789f4a4c | 874 | { |
97074674 | 875 | int i; |
789f4a4c | 876 | uint16_t temp = 0; |
ff27e335 SG |
877 | VReg *Vd = (VReg *)vd; |
878 | VReg *Vj = (VReg *)vj; | |
97074674 | 879 | int oprsz = simd_oprsz(desc); |
789f4a4c | 880 | |
97074674 SG |
881 | for (i = 0; i < oprsz / 16; i++) { |
882 | temp = 0; | |
883 | temp = do_vmskltz_b(Vj->D(2 * i)); | |
884 | temp |= (do_vmskltz_b(Vj->D(2 * i + 1)) << 8); | |
885 | Vd->D(2 * i) = (uint16_t)(~temp); | |
886 | Vd->D(2 * i + 1) = 0; | |
887 | } | |
789f4a4c SG |
888 | } |
889 | ||
/*
 * Build an 8-bit mask from the eight bytes of @a: bit n of the result is
 * set when byte n of @a is zero.
 */
static uint64_t do_vmskez_b(uint64_t a)
{
    const uint64_t low7 = 0x7f7f7f7f7f7f7f7fULL;
    /* Bit 7 of each byte ends up set iff that byte is non-zero. */
    uint64_t nonzero = ((a & low7) + low7) | a | low7;
    uint64_t acc = ~nonzero;

    /* Fold the per-byte flags (bits 7, 15, ..., 63) into bits 56..63. */
    acc |= acc << 7;
    acc |= acc << 14;
    acc |= acc << 28;
    return acc >> 56;
}
899 | ||
ff27e335 | 900 | void HELPER(vmsknz_b)(void *vd, void *vj, uint32_t desc) |
789f4a4c | 901 | { |
97074674 | 902 | int i; |
789f4a4c | 903 | uint16_t temp = 0; |
ff27e335 SG |
904 | VReg *Vd = (VReg *)vd; |
905 | VReg *Vj = (VReg *)vj; | |
97074674 | 906 | int oprsz = simd_oprsz(desc); |
789f4a4c | 907 | |
97074674 SG |
908 | for (i = 0; i < oprsz / 16; i++) { |
909 | temp = 0; | |
910 | temp = do_vmskez_b(Vj->D(2 * i)); | |
911 | temp |= (do_vmskez_b(Vj->D(2 * i + 1)) << 8); | |
912 | Vd->D(2 * i) = (uint16_t)(~temp); | |
913 | Vd->D(2 * i + 1) = 0; | |
914 | } | |
789f4a4c | 915 | } |
f205a539 | 916 | |
4472a45a | 917 | void HELPER(vnori_b)(void *vd, void *vj, uint64_t imm, uint32_t desc) |
f205a539 SG |
918 | { |
919 | int i; | |
920 | VReg *Vd = (VReg *)vd; | |
921 | VReg *Vj = (VReg *)vj; | |
922 | ||
4472a45a | 923 | for (i = 0; i < simd_oprsz(desc); i++) { |
f205a539 SG |
924 | Vd->B(i) = ~(Vj->B(i) | (uint8_t)imm); |
925 | } | |
926 | } | |
9b21a7a5 | 927 | |
6567eac7 SG |
/*
 * Widening shift-left by immediate.  For each 16-byte lane the first
 * LSX_LEN / BIT source elements (type E2) of that lane are converted to
 * the destination element type (E1) and shifted left by imm % BIT.  The
 * result is assembled in a zeroed temporary and then copied to Vd.
 */
#define VSLLWIL(NAME, BIT, E1, E2) \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
{ \
    VReg *Vd = (VReg *)vd; \
    VReg *Vj = (VReg *)vj; \
    VReg result = {}; \
    typedef __typeof(result.E1(0)) TD; \
    int oprsz = simd_oprsz(desc); \
    int per_lane = LSX_LEN / BIT; \
    int lane, n, src; \
    \
    for (lane = 0; lane < oprsz / 16; lane++) { \
        for (n = 0; n < per_lane; n++) { \
            src = n + per_lane * 2 * lane; \
            result.E1(n + per_lane * lane) = (TD)Vj->E2(src) << (imm % BIT); \
        } \
    } \
    *Vd = result; \
}
946 | ||
6567eac7 | 947 | |
ff27e335 | 948 | void HELPER(vextl_q_d)(void *vd, void *vj, uint32_t desc) |
9b21a7a5 | 949 | { |
6567eac7 | 950 | int i; |
ff27e335 SG |
951 | VReg *Vd = (VReg *)vd; |
952 | VReg *Vj = (VReg *)vj; | |
6567eac7 | 953 | int oprsz = simd_oprsz(desc); |
9b21a7a5 | 954 | |
6567eac7 SG |
955 | for (i = 0; i < oprsz / 16; i++) { |
956 | Vd->Q(i) = int128_makes64(Vj->D(2 * i)); | |
957 | } | |
9b21a7a5 SG |
958 | } |
959 | ||
ff27e335 | 960 | void HELPER(vextl_qu_du)(void *vd, void *vj, uint32_t desc) |
9b21a7a5 | 961 | { |
6567eac7 | 962 | int i; |
ff27e335 SG |
963 | VReg *Vd = (VReg *)vd; |
964 | VReg *Vj = (VReg *)vj; | |
6567eac7 | 965 | int oprsz = simd_oprsz(desc); |
9b21a7a5 | 966 | |
6567eac7 SG |
967 | for (i = 0; i < oprsz / 16; i++) { |
968 | Vd->Q(i) = int128_make64(Vj->UD(2 * i)); | |
969 | } | |
9b21a7a5 SG |
970 | } |
971 | ||
972 | VSLLWIL(vsllwil_h_b, 16, H, B) | |
973 | VSLLWIL(vsllwil_w_h, 32, W, H) | |
974 | VSLLWIL(vsllwil_d_w, 64, D, W) | |
975 | VSLLWIL(vsllwil_hu_bu, 16, UH, UB) | |
976 | VSLLWIL(vsllwil_wu_hu, 32, UW, UH) | |
977 | VSLLWIL(vsllwil_du_wu, 64, UD, UW) | |
ecb93716 SG |
978 | |
/*
 * Generate do_vsrlr_<E>(): right shift of an unsigned value of type T
 * with rounding, i.e. the last bit shifted out is added back to the
 * result.  A shift count of zero returns the input unchanged.
 */
#define do_vsrlr(E, T) \
static T do_vsrlr_ ##E(T s1, int sh) \
{ \
    T round_bit; \
    \
    if (sh == 0) { \
        return s1; \
    } \
    round_bit = (s1 >> (sh - 1)) & 0x1; \
    return (s1 >> sh) + round_bit; \
}

do_vsrlr(B, uint8_t)
do_vsrlr(H, uint16_t)
do_vsrlr(W, uint32_t)
do_vsrlr(D, uint64_t)
993 | ||
994 | #define VSRLR(NAME, BIT, T, E) \ | |
04711da1 | 995 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ |
ecb93716 SG |
996 | { \ |
997 | int i; \ | |
04711da1 SG |
998 | VReg *Vd = (VReg *)vd; \ |
999 | VReg *Vj = (VReg *)vj; \ | |
1000 | VReg *Vk = (VReg *)vk; \ | |
8c272fe8 | 1001 | int oprsz = simd_oprsz(desc); \ |
ecb93716 | 1002 | \ |
8c272fe8 | 1003 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ |
ecb93716 SG |
1004 | Vd->E(i) = do_vsrlr_ ## E(Vj->E(i), ((T)Vk->E(i))%BIT); \ |
1005 | } \ | |
1006 | } | |
1007 | ||
1008 | VSRLR(vsrlr_b, 8, uint8_t, B) | |
1009 | VSRLR(vsrlr_h, 16, uint16_t, H) | |
1010 | VSRLR(vsrlr_w, 32, uint32_t, W) | |
1011 | VSRLR(vsrlr_d, 64, uint64_t, D) | |
1012 | ||
329517d5 SG |
1013 | #define VSRLRI(NAME, BIT, E) \ |
1014 | void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ | |
1015 | { \ | |
1016 | int i; \ | |
1017 | VReg *Vd = (VReg *)vd; \ | |
1018 | VReg *Vj = (VReg *)vj; \ | |
8c272fe8 | 1019 | int oprsz = simd_oprsz(desc); \ |
329517d5 | 1020 | \ |
8c272fe8 | 1021 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ |
329517d5 SG |
1022 | Vd->E(i) = do_vsrlr_ ## E(Vj->E(i), imm); \ |
1023 | } \ | |
ecb93716 SG |
1024 | } |
1025 | ||
1026 | VSRLRI(vsrlri_b, 8, B) | |
1027 | VSRLRI(vsrlri_h, 16, H) | |
1028 | VSRLRI(vsrlri_w, 32, W) | |
1029 | VSRLRI(vsrlri_d, 64, D) | |
1030 | ||
/*
 * Generate do_vsrar_<E>(): right shift of a signed value of type T with
 * rounding, i.e. the last bit shifted out is added back to the result.
 * A shift count of zero returns the input unchanged.
 */
#define do_vsrar(E, T) \
static T do_vsrar_ ##E(T s1, int sh) \
{ \
    T round_bit; \
    \
    if (sh == 0) { \
        return s1; \
    } \
    round_bit = (s1 >> (sh - 1)) & 0x1; \
    return (s1 >> sh) + round_bit; \
}

do_vsrar(B, int8_t)
do_vsrar(H, int16_t)
do_vsrar(W, int32_t)
do_vsrar(D, int64_t)
1045 | ||
1046 | #define VSRAR(NAME, BIT, T, E) \ | |
04711da1 | 1047 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ |
ecb93716 SG |
1048 | { \ |
1049 | int i; \ | |
04711da1 SG |
1050 | VReg *Vd = (VReg *)vd; \ |
1051 | VReg *Vj = (VReg *)vj; \ | |
1052 | VReg *Vk = (VReg *)vk; \ | |
8c272fe8 | 1053 | int oprsz = simd_oprsz(desc); \ |
ecb93716 | 1054 | \ |
8c272fe8 | 1055 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ |
ecb93716 SG |
1056 | Vd->E(i) = do_vsrar_ ## E(Vj->E(i), ((T)Vk->E(i))%BIT); \ |
1057 | } \ | |
1058 | } | |
1059 | ||
1060 | VSRAR(vsrar_b, 8, uint8_t, B) | |
1061 | VSRAR(vsrar_h, 16, uint16_t, H) | |
1062 | VSRAR(vsrar_w, 32, uint32_t, W) | |
1063 | VSRAR(vsrar_d, 64, uint64_t, D) | |
1064 | ||
329517d5 SG |
1065 | #define VSRARI(NAME, BIT, E) \ |
1066 | void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ | |
1067 | { \ | |
1068 | int i; \ | |
1069 | VReg *Vd = (VReg *)vd; \ | |
1070 | VReg *Vj = (VReg *)vj; \ | |
8c272fe8 | 1071 | int oprsz = simd_oprsz(desc); \ |
329517d5 | 1072 | \ |
8c272fe8 | 1073 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ |
329517d5 SG |
1074 | Vd->E(i) = do_vsrar_ ## E(Vj->E(i), imm); \ |
1075 | } \ | |
ecb93716 SG |
1076 | } |
1077 | ||
1078 | VSRARI(vsrari_b, 8, B) | |
1079 | VSRARI(vsrari_h, 16, H) | |
1080 | VSRARI(vsrari_w, 32, W) | |
1081 | VSRARI(vsrari_d, 64, D) | |
d79fb8dd | 1082 | |
40c7674e SG |
1083 | #define VSRLN(NAME, BIT, E1, E2) \ |
1084 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
1085 | { \ | |
1086 | int i, j, ofs; \ | |
1087 | VReg *Vd = (VReg *)vd; \ | |
1088 | VReg *Vj = (VReg *)vj; \ | |
1089 | VReg *Vk = (VReg *)vk; \ | |
1090 | int oprsz = simd_oprsz(desc); \ | |
1091 | \ | |
1092 | ofs = LSX_LEN / BIT; \ | |
1093 | for (i = 0; i < oprsz / 16; i++) { \ | |
1094 | for (j = 0; j < ofs; j++) { \ | |
1095 | Vd->E1(j + ofs * 2 * i) = R_SHIFT(Vj->E2(j + ofs * i), \ | |
1096 | Vk->E2(j + ofs * i) % BIT); \ | |
1097 | } \ | |
1098 | Vd->D(2 * i + 1) = 0; \ | |
1099 | } \ | |
1100 | } | |
1101 | ||
1102 | VSRLN(vsrln_b_h, 16, B, UH) | |
1103 | VSRLN(vsrln_h_w, 32, H, UW) | |
1104 | VSRLN(vsrln_w_d, 64, W, UD) | |
1105 | ||
1106 | #define VSRAN(NAME, BIT, E1, E2, E3) \ | |
1107 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
1108 | { \ | |
1109 | int i, j, ofs; \ | |
1110 | VReg *Vd = (VReg *)vd; \ | |
1111 | VReg *Vj = (VReg *)vj; \ | |
1112 | VReg *Vk = (VReg *)vk; \ | |
1113 | int oprsz = simd_oprsz(desc); \ | |
1114 | \ | |
1115 | ofs = LSX_LEN / BIT; \ | |
1116 | for (i = 0; i < oprsz / 16; i++) { \ | |
1117 | for (j = 0; j < ofs; j++) { \ | |
1118 | Vd->E1(j + ofs * 2 * i) = R_SHIFT(Vj->E2(j + ofs * i), \ | |
1119 | Vk->E3(j + ofs * i) % BIT); \ | |
1120 | } \ | |
1121 | Vd->D(2 * i + 1) = 0; \ | |
1122 | } \ | |
1123 | } | |
1124 | ||
1125 | VSRAN(vsran_b_h, 16, B, H, UH) | |
1126 | VSRAN(vsran_h_w, 32, H, W, UW) | |
1127 | VSRAN(vsran_w_d, 64, W, D, UD) | |
1128 | ||
/*
 * Narrowing right shift by immediate.  Per 16-byte lane, the shifted wide
 * elements of Vj fill the first half of the result lane and the shifted
 * wide elements of the old Vd fill the second half.  The result is built
 * in a zeroed temporary and copied to Vd at the end.
 */
#define VSRLNI(NAME, BIT, E1, E2) \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
{ \
    VReg *Vd = (VReg *)vd; \
    VReg *Vj = (VReg *)vj; \
    VReg result = {}; \
    int oprsz = simd_oprsz(desc); \
    int per_lane = LSX_LEN / BIT; \
    int lane, n, src, lo, hi; \
    \
    for (lane = 0; lane < oprsz / 16; lane++) { \
        for (n = 0; n < per_lane; n++) { \
            src = n + per_lane * lane; \
            lo = n + per_lane * 2 * lane; \
            hi = n + per_lane * (2 * lane + 1); \
            result.E1(lo) = R_SHIFT(Vj->E2(src), imm); \
            result.E1(hi) = R_SHIFT(Vd->E2(src), imm); \
        } \
    } \
    *Vd = result; \
}
1148 | ||
1149 | void HELPER(vsrlni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) | |
d79fb8dd | 1150 | { |
40c7674e SG |
1151 | int i; |
1152 | VReg temp = {}; | |
329517d5 SG |
1153 | VReg *Vd = (VReg *)vd; |
1154 | VReg *Vj = (VReg *)vj; | |
d79fb8dd | 1155 | |
40c7674e SG |
1156 | for (i = 0; i < 2; i++) { |
1157 | temp.D(2 * i) = int128_getlo(int128_urshift(Vj->Q(i), imm % 128)); | |
1158 | temp.D(2 * i +1) = int128_getlo(int128_urshift(Vd->Q(i), imm % 128)); | |
1159 | } | |
d79fb8dd SG |
1160 | *Vd = temp; |
1161 | } | |
1162 | ||
40c7674e SG |
1163 | VSRLNI(vsrlni_b_h, 16, B, UH) |
1164 | VSRLNI(vsrlni_h_w, 32, H, UW) | |
1165 | VSRLNI(vsrlni_w_d, 64, W, UD) | |
1166 | ||
/*
 * Narrowing right shift by immediate (instantiated with signed source
 * accessors).  Per 16-byte lane, the shifted wide elements of Vj fill the
 * first half of the result lane and the shifted wide elements of the old
 * Vd fill the second half.  The result is built in a zeroed temporary and
 * copied to Vd at the end.
 */
#define VSRANI(NAME, BIT, E1, E2) \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
{ \
    VReg *Vd = (VReg *)vd; \
    VReg *Vj = (VReg *)vj; \
    VReg result = {}; \
    int oprsz = simd_oprsz(desc); \
    int per_lane = LSX_LEN / BIT; \
    int lane, n, src, lo, hi; \
    \
    for (lane = 0; lane < oprsz / 16; lane++) { \
        for (n = 0; n < per_lane; n++) { \
            src = n + per_lane * lane; \
            lo = n + per_lane * 2 * lane; \
            hi = n + per_lane * (2 * lane + 1); \
            result.E1(lo) = R_SHIFT(Vj->E2(src), imm); \
            result.E1(hi) = R_SHIFT(Vd->E2(src), imm); \
        } \
    } \
    *Vd = result; \
}
1186 | ||
1187 | void HELPER(vsrani_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) | |
d79fb8dd | 1188 | { |
40c7674e SG |
1189 | int i; |
1190 | VReg temp = {}; | |
329517d5 SG |
1191 | VReg *Vd = (VReg *)vd; |
1192 | VReg *Vj = (VReg *)vj; | |
d79fb8dd | 1193 | |
40c7674e SG |
1194 | for (i = 0; i < 2; i++) { |
1195 | temp.D(2 * i) = int128_getlo(int128_rshift(Vj->Q(i), imm % 128)); | |
1196 | temp.D(2 * i + 1) = int128_getlo(int128_rshift(Vd->Q(i), imm % 128)); | |
1197 | } | |
d79fb8dd SG |
1198 | *Vd = temp; |
1199 | } | |
1200 | ||
1201 | VSRANI(vsrani_b_h, 16, B, H) | |
1202 | VSRANI(vsrani_h_w, 32, H, W) | |
1203 | VSRANI(vsrani_w_d, 64, W, D) | |
a5200a17 | 1204 | |
c50ce38a SG |
1205 | #define VSRLRN(NAME, BIT, E1, E2, E3) \ |
1206 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
1207 | { \ | |
1208 | int i, j, ofs; \ | |
1209 | VReg *Vd = (VReg *)vd; \ | |
1210 | VReg *Vj = (VReg *)vj; \ | |
1211 | VReg *Vk = (VReg *)vk; \ | |
1212 | int oprsz = simd_oprsz(desc); \ | |
1213 | \ | |
1214 | ofs = LSX_LEN / BIT; \ | |
1215 | for (i = 0; i < oprsz / 16; i++) { \ | |
1216 | for (j = 0; j < ofs; j++) { \ | |
1217 | Vd->E1(j + ofs * 2 * i) = do_vsrlr_ ##E2(Vj->E2(j + ofs * i), \ | |
1218 | Vk->E3(j + ofs * i) % BIT); \ | |
1219 | } \ | |
1220 | Vd->D(2 * i + 1) = 0; \ | |
1221 | } \ | |
1222 | } | |
1223 | ||
1224 | VSRLRN(vsrlrn_b_h, 16, B, H, UH) | |
1225 | VSRLRN(vsrlrn_h_w, 32, H, W, UW) | |
1226 | VSRLRN(vsrlrn_w_d, 64, W, D, UD) | |
1227 | ||
1228 | #define VSRARN(NAME, BIT, E1, E2, E3) \ | |
1229 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
1230 | { \ | |
1231 | int i, j, ofs; \ | |
1232 | VReg *Vd = (VReg *)vd; \ | |
1233 | VReg *Vj = (VReg *)vj; \ | |
1234 | VReg *Vk = (VReg *)vk; \ | |
1235 | int oprsz = simd_oprsz(desc); \ | |
1236 | \ | |
1237 | ofs = LSX_LEN / BIT; \ | |
1238 | for (i = 0; i < oprsz / 16; i++) { \ | |
1239 | for (j = 0; j < ofs; j++) { \ | |
1240 | Vd->E1(j + ofs * 2 * i) = do_vsrar_ ## E2(Vj->E2(j + ofs * i), \ | |
1241 | Vk->E3(j + ofs * i) % BIT); \ | |
1242 | } \ | |
1243 | Vd->D(2 * i + 1) = 0; \ | |
1244 | } \ | |
1245 | } | |
1246 | ||
1247 | VSRARN(vsrarn_b_h, 16, B, H, UH) | |
1248 | VSRARN(vsrarn_h_w, 32, H, W, UW) | |
1249 | VSRARN(vsrarn_w_d, 64, W, D, UD) | |
1250 | ||
/*
 * Narrowing rounding logical right shift by immediate.  Per 16-byte lane,
 * the wide elements of Vj (first half of the result lane) and of the old
 * Vd (second half) go through do_vsrlr_<E2>().  The result is built in a
 * zeroed temporary and copied to Vd at the end.
 */
#define VSRLRNI(NAME, BIT, E1, E2) \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
{ \
    VReg *Vd = (VReg *)vd; \
    VReg *Vj = (VReg *)vj; \
    VReg result = {}; \
    int oprsz = simd_oprsz(desc); \
    int per_lane = LSX_LEN / BIT; \
    int lane, n, src, lo, hi; \
    \
    for (lane = 0; lane < oprsz / 16; lane++) { \
        for (n = 0; n < per_lane; n++) { \
            src = n + per_lane * lane; \
            lo = n + per_lane * 2 * lane; \
            hi = n + per_lane * (2 * lane + 1); \
            result.E1(lo) = do_vsrlr_ ## E2(Vj->E2(src), imm); \
            result.E1(hi) = do_vsrlr_ ## E2(Vd->E2(src), imm); \
        } \
    } \
    *Vd = result; \
}
1270 | ||
1271 | void HELPER(vsrlrni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) | |
a5200a17 | 1272 | { |
c50ce38a SG |
1273 | int i; |
1274 | VReg temp = {}; | |
329517d5 SG |
1275 | VReg *Vd = (VReg *)vd; |
1276 | VReg *Vj = (VReg *)vj; | |
c50ce38a SG |
1277 | Int128 r[4]; |
1278 | int oprsz = simd_oprsz(desc); | |
a5200a17 | 1279 | |
c50ce38a SG |
1280 | for (i = 0; i < oprsz / 16; i++) { |
1281 | if (imm == 0) { | |
1282 | temp.D(2 * i) = int128_getlo(Vj->Q(i)); | |
1283 | temp.D(2 * i + 1) = int128_getlo(Vd->Q(i)); | |
1284 | } else { | |
1285 | r[2 * i] = int128_and(int128_urshift(Vj->Q(i), (imm - 1)), | |
1286 | int128_one()); | |
1287 | r[2 * i + 1] = int128_and(int128_urshift(Vd->Q(i), (imm - 1)), | |
1288 | int128_one()); | |
1289 | temp.D(2 * i) = int128_getlo(int128_add(int128_urshift(Vj->Q(i), | |
1290 | imm), r[2 * i])); | |
1291 | temp.D(2 * i + 1) = int128_getlo(int128_add(int128_urshift(Vd->Q(i), | |
1292 | imm), r[ 2 * i + 1])); | |
1293 | } | |
a5200a17 SG |
1294 | } |
1295 | *Vd = temp; | |
1296 | } | |
1297 | ||
1298 | VSRLRNI(vsrlrni_b_h, 16, B, H) | |
1299 | VSRLRNI(vsrlrni_h_w, 32, H, W) | |
1300 | VSRLRNI(vsrlrni_w_d, 64, W, D) | |
1301 | ||
c50ce38a SG |
/*
 * Narrowing rounding arithmetic right shift by immediate.  Per 16-byte
 * lane, the wide elements of Vj (first half of the result lane) and of the
 * old Vd (second half) go through do_vsrar_<E2>().  The result is built in
 * a zeroed temporary and copied to Vd at the end.
 */
#define VSRARNI(NAME, BIT, E1, E2) \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
{ \
    VReg *Vd = (VReg *)vd; \
    VReg *Vj = (VReg *)vj; \
    VReg result = {}; \
    int oprsz = simd_oprsz(desc); \
    int per_lane = LSX_LEN / BIT; \
    int lane, n, src, lo, hi; \
    \
    for (lane = 0; lane < oprsz / 16; lane++) { \
        for (n = 0; n < per_lane; n++) { \
            src = n + per_lane * lane; \
            lo = n + per_lane * 2 * lane; \
            hi = n + per_lane * (2 * lane + 1); \
            result.E1(lo) = do_vsrar_ ## E2(Vj->E2(src), imm); \
            result.E1(hi) = do_vsrar_ ## E2(Vd->E2(src), imm); \
        } \
    } \
    *Vd = result; \
}
1321 | ||
1322 | void HELPER(vsrarni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) | |
a5200a17 | 1323 | { |
c50ce38a SG |
1324 | int i; |
1325 | VReg temp = {}; | |
329517d5 SG |
1326 | VReg *Vd = (VReg *)vd; |
1327 | VReg *Vj = (VReg *)vj; | |
c50ce38a SG |
1328 | Int128 r[4]; |
1329 | int oprsz = simd_oprsz(desc); | |
a5200a17 | 1330 | |
c50ce38a SG |
1331 | for (i = 0; i < oprsz / 16; i++) { |
1332 | if (imm == 0) { | |
1333 | temp.D(2 * i) = int128_getlo(Vj->Q(i)); | |
1334 | temp.D(2 * i + 1) = int128_getlo(Vd->Q(i)); | |
1335 | } else { | |
1336 | r[2 * i] = int128_and(int128_rshift(Vj->Q(i), (imm - 1)), | |
1337 | int128_one()); | |
1338 | r[2 * i + 1] = int128_and(int128_rshift(Vd->Q(i), (imm - 1)), | |
1339 | int128_one()); | |
1340 | temp.D(2 * i) = int128_getlo(int128_add(int128_rshift(Vj->Q(i), | |
1341 | imm), r[2 * i])); | |
1342 | temp.D(2 * i + 1) = int128_getlo(int128_add(int128_rshift(Vd->Q(i), | |
1343 | imm), r[2 * i + 1])); | |
1344 | } | |
a5200a17 SG |
1345 | } |
1346 | *Vd = temp; | |
1347 | } | |
1348 | ||
1349 | VSRARNI(vsrarni_b_h, 16, B, H) | |
1350 | VSRARNI(vsrarni_h_w, 32, H, W) | |
1351 | VSRARNI(vsrarni_w_d, 64, W, D) | |
83b3815d SG |
1352 | |
/*
 * Generate do_ssrlns_<NAME>(): reinterpret e2 (type T2) as the unsigned
 * type T1, shift it right by sa and clamp the result to (1 << sh) - 1,
 * the limit being held in type T3.
 */
#define SSRLNS(NAME, T1, T2, T3) \
static T1 do_ssrlns_ ## NAME(T2 e2, int sa, int sh) \
{ \
    const T3 limit = (1ull << sh) -1; \
    T1 val = (T1)e2; \
    \
    if (sa != 0) { \
        val = val >> sa; \
    } \
    return (val > limit) ? limit : val; \
}

SSRLNS(B, uint16_t, int16_t, uint8_t)
SSRLNS(H, uint32_t, int32_t, uint16_t)
SSRLNS(W, uint64_t, int64_t, uint32_t)
1374 | ||
6256c8ca SG |
1375 | #define VSSRLN(NAME, BIT, E1, E2, E3) \ |
1376 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
1377 | { \ | |
1378 | int i, j, ofs; \ | |
1379 | VReg *Vd = (VReg *)vd; \ | |
1380 | VReg *Vj = (VReg *)vj; \ | |
1381 | VReg *Vk = (VReg *)vk; \ | |
1382 | int oprsz = simd_oprsz(desc); \ | |
1383 | \ | |
1384 | ofs = LSX_LEN / BIT; \ | |
1385 | for (i = 0; i < oprsz / 16; i++) { \ | |
1386 | for (j = 0; j < ofs; j++) { \ | |
1387 | Vd->E1(j + ofs * 2 * i) = do_ssrlns_ ## E1(Vj->E2(j + ofs * i), \ | |
1388 | Vk->E3(j + ofs * i) % BIT, \ | |
1389 | BIT / 2 - 1); \ | |
1390 | } \ | |
1391 | Vd->D(2 * i + 1) = 0; \ | |
1392 | } \ | |
83b3815d SG |
1393 | } |
1394 | ||
6256c8ca SG |
1395 | VSSRLN(vssrln_b_h, 16, B, H, UH) |
1396 | VSSRLN(vssrln_h_w, 32, H, W, UW) | |
1397 | VSSRLN(vssrln_w_d, 64, W, D, UD) | |
83b3815d SG |
1398 | |
/*
 * Generate do_ssrans_<E>(): arithmetic right shift of e2 (wide signed type
 * T1) by sa, saturated to the signed range [-(1 << sh), (1 << sh) - 1].
 * The bounds are held in the narrow signed type T2.
 *
 * The lower bound is computed as ~max rather than -(max + 1): for the W
 * instantiation max is INT32_MAX, so max + 1 would overflow a signed int.
 */
#define SSRANS(E, T1, T2) \
static T1 do_ssrans_ ## E(T1 e2, int sa, int sh) \
{ \
    T1 shft_res; \
    T2 max, min; \
    \
    if (sa == 0) { \
        shft_res = e2; \
    } else { \
        shft_res = e2 >> sa; \
    } \
    max = (1ll << sh) - 1; \
    min = ~max; \
    if (shft_res > max) { \
        return max; \
    } else if (shft_res < min) { \
        return min; \
    } else { \
        return shft_res; \
    } \
}

SSRANS(B, int16_t, int8_t)
SSRANS(H, int32_t, int16_t)
SSRANS(W, int64_t, int32_t)
1422 | ||
6256c8ca SG |
1423 | #define VSSRAN(NAME, BIT, E1, E2, E3) \ |
1424 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
1425 | { \ | |
1426 | int i, j, ofs; \ | |
1427 | VReg *Vd = (VReg *)vd; \ | |
1428 | VReg *Vj = (VReg *)vj; \ | |
1429 | VReg *Vk = (VReg *)vk; \ | |
1430 | int oprsz = simd_oprsz(desc); \ | |
1431 | \ | |
1432 | ofs = LSX_LEN / BIT; \ | |
1433 | for (i = 0; i < oprsz / 16; i++) { \ | |
1434 | for (j = 0; j < ofs; j++) { \ | |
1435 | Vd->E1(j + ofs * 2 * i) = do_ssrans_ ## E1(Vj->E2(j + ofs * i), \ | |
1436 | Vk->E3(j + ofs * i) % BIT, \ | |
1437 | BIT / 2 - 1); \ | |
1438 | } \ | |
1439 | Vd->D(2 * i + 1) = 0; \ | |
1440 | } \ | |
83b3815d SG |
1441 | } |
1442 | ||
6256c8ca SG |
1443 | VSSRAN(vssran_b_h, 16, B, H, UH) |
1444 | VSSRAN(vssran_h_w, 32, H, W, UW) | |
1445 | VSSRAN(vssran_w_d, 64, W, D, UD) | |
83b3815d SG |
1446 | |
/*
 * Generate do_ssrlnu_<E>(): reinterpret e2 (type T3) as the unsigned type
 * T1, shift it right by sa and clamp the result to (1 << sh) - 1, the
 * limit being held in the narrow unsigned type T2.
 */
#define SSRLNU(E, T1, T2, T3) \
static T1 do_ssrlnu_ ## E(T3 e2, int sa, int sh) \
{ \
    const T2 limit = (1ull << sh) - 1; \
    T1 val = (T1)e2; \
    \
    if (sa != 0) { \
        val = val >> sa; \
    } \
    return (val > limit) ? limit : val; \
}

SSRLNU(B, uint16_t, uint8_t,  int16_t)
SSRLNU(H, uint32_t, uint16_t, int32_t)
SSRLNU(W, uint64_t, uint32_t, int64_t)
1468 | ||
6256c8ca SG |
1469 | #define VSSRLNU(NAME, BIT, E1, E2, E3) \ |
1470 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
1471 | { \ | |
1472 | int i, j, ofs; \ | |
1473 | VReg *Vd = (VReg *)vd; \ | |
1474 | VReg *Vj = (VReg *)vj; \ | |
1475 | VReg *Vk = (VReg *)vk; \ | |
1476 | int oprsz = simd_oprsz(desc); \ | |
1477 | \ | |
1478 | ofs = LSX_LEN / BIT; \ | |
1479 | for (i = 0; i < oprsz / 16; i++) { \ | |
1480 | for (j = 0; j < ofs; j++) { \ | |
1481 | Vd->E1(j + ofs * 2 * i) = do_ssrlnu_ ## E1(Vj->E2(j + ofs * i), \ | |
1482 | Vk->E3(j + ofs * i) % BIT, \ | |
1483 | BIT / 2); \ | |
1484 | } \ | |
1485 | Vd->D(2 * i + 1) = 0; \ | |
1486 | } \ | |
83b3815d SG |
1487 | } |
1488 | ||
6256c8ca SG |
1489 | VSSRLNU(vssrln_bu_h, 16, B, H, UH) |
1490 | VSSRLNU(vssrln_hu_w, 32, H, W, UW) | |
1491 | VSSRLNU(vssrln_wu_d, 64, W, D, UD) | |
83b3815d SG |
1492 | |
/*
 * Generate do_ssranu_<E>(): arithmetic right shift of the signed value e2
 * (type T3) by sa, saturated to the unsigned range [0, (1 << sh) - 1].
 * Negative inputs yield 0; the upper limit is held in type T2.
 */
#define SSRANU(E, T1, T2, T3) \
static T1 do_ssranu_ ## E(T3 e2, int sa, int sh) \
{ \
    const T2 limit = (1ull << sh) - 1; \
    T1 val = 0; \
    \
    if (!(e2 < 0)) { \
        if (sa == 0) { \
            val = e2; \
        } else { \
            val = e2 >> sa; \
        } \
    } \
    return (val > limit) ? limit : val; \
}

SSRANU(B, uint16_t, uint8_t,  int16_t)
SSRANU(H, uint32_t, uint16_t, int32_t)
SSRANU(W, uint64_t, uint32_t, int64_t)
1517 | ||
6256c8ca SG |
1518 | #define VSSRANU(NAME, BIT, E1, E2, E3) \ |
1519 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
1520 | { \ | |
1521 | int i, j, ofs; \ | |
1522 | VReg *Vd = (VReg *)vd; \ | |
1523 | VReg *Vj = (VReg *)vj; \ | |
1524 | VReg *Vk = (VReg *)vk; \ | |
1525 | int oprsz = simd_oprsz(desc); \ | |
1526 | \ | |
1527 | ofs = LSX_LEN / BIT; \ | |
1528 | for (i = 0; i < oprsz / 16; i++) { \ | |
1529 | for (j = 0; j < ofs; j++) { \ | |
1530 | Vd->E1(j + ofs * 2 * i) = do_ssranu_ ## E1(Vj->E2(j + ofs * i), \ | |
1531 | Vk->E3(j + ofs * i) % BIT, \ | |
1532 | BIT / 2); \ | |
1533 | } \ | |
1534 | Vd->D(2 * i + 1) = 0; \ | |
1535 | } \ | |
83b3815d SG |
1536 | } |
1537 | ||
6256c8ca SG |
1538 | VSSRANU(vssran_bu_h, 16, B, H, UH) |
1539 | VSSRANU(vssran_hu_w, 32, H, W, UW) | |
1540 | VSSRANU(vssran_wu_d, 64, W, D, UD) | |
1541 | ||
/*
 * Saturating narrowing logical right shift by immediate.  Per 16-byte
 * lane, the wide elements of Vj (first half of the result lane) and of the
 * old Vd (second half) go through do_ssrlns_<E1>() with saturation width
 * BIT / 2 - 1.  The result is built in a zeroed temporary and copied to Vd
 * at the end.
 */
#define VSSRLNI(NAME, BIT, E1, E2) \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
{ \
    VReg *Vd = (VReg *)vd; \
    VReg *Vj = (VReg *)vj; \
    VReg result = {}; \
    int oprsz = simd_oprsz(desc); \
    int per_lane = LSX_LEN / BIT; \
    int lane, n, src, lo, hi; \
    \
    for (lane = 0; lane < oprsz / 16; lane++) { \
        for (n = 0; n < per_lane; n++) { \
            src = n + per_lane * lane; \
            lo = n + per_lane * 2 * lane; \
            hi = n + per_lane * (2 * lane + 1); \
            result.E1(lo) = do_ssrlns_ ## E1(Vj->E2(src), imm, BIT / 2 - 1); \
            result.E1(hi) = do_ssrlns_ ## E1(Vd->E2(src), imm, BIT / 2 - 1); \
        } \
    } \
    *Vd = result; \
}
1562 | ||
1563 | static void do_vssrlni_q(VReg *Vd, VReg *Vj, | |
1564 | uint64_t imm, int idx, Int128 mask) | |
83b3815d | 1565 | { |
6256c8ca | 1566 | Int128 shft_res1, shft_res2; |
83b3815d SG |
1567 | |
1568 | if (imm == 0) { | |
6256c8ca SG |
1569 | shft_res1 = Vj->Q(idx); |
1570 | shft_res2 = Vd->Q(idx); | |
83b3815d | 1571 | } else { |
6256c8ca SG |
1572 | shft_res1 = int128_urshift(Vj->Q(idx), imm); |
1573 | shft_res2 = int128_urshift(Vd->Q(idx), imm); | |
83b3815d | 1574 | } |
83b3815d SG |
1575 | |
1576 | if (int128_ult(mask, shft_res1)) { | |
6256c8ca | 1577 | Vd->D(idx * 2) = int128_getlo(mask); |
83b3815d | 1578 | }else { |
6256c8ca | 1579 | Vd->D(idx * 2) = int128_getlo(shft_res1); |
83b3815d SG |
1580 | } |
1581 | ||
1582 | if (int128_ult(mask, shft_res2)) { | |
6256c8ca | 1583 | Vd->D(idx * 2 + 1) = int128_getlo(mask); |
83b3815d | 1584 | }else { |
6256c8ca SG |
1585 | Vd->D(idx * 2 + 1) = int128_getlo(shft_res2); |
1586 | } | |
1587 | } | |
1588 | ||
1589 | void HELPER(vssrlni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) | |
1590 | { | |
1591 | int i; | |
1592 | Int128 mask; | |
1593 | VReg *Vd = (VReg *)vd; | |
1594 | VReg *Vj = (VReg *)vj; | |
1595 | int oprsz = simd_oprsz(desc); | |
1596 | ||
1597 | mask = int128_sub(int128_lshift(int128_one(), 63), int128_one()); | |
1598 | ||
1599 | for (i = 0; i < oprsz / 16; i++) { | |
1600 | do_vssrlni_q(Vd, Vj, imm, i, mask); | |
83b3815d SG |
1601 | } |
1602 | } | |
1603 | ||
1604 | VSSRLNI(vssrlni_b_h, 16, B, H) | |
1605 | VSSRLNI(vssrlni_h_w, 32, H, W) | |
1606 | VSSRLNI(vssrlni_w_d, 64, W, D) | |
1607 | ||
6256c8ca SG |
/*
 * Saturating narrowing arithmetic right shift by immediate.  Per 16-byte
 * lane, the wide elements of Vj (first half of the result lane) and of the
 * old Vd (second half) go through do_ssrans_<E1>() with saturation width
 * BIT / 2 - 1.  The result is built in a zeroed temporary and copied to Vd
 * at the end.
 */
#define VSSRANI(NAME, BIT, E1, E2) \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
{ \
    VReg *Vd = (VReg *)vd; \
    VReg *Vj = (VReg *)vj; \
    VReg result = {}; \
    int oprsz = simd_oprsz(desc); \
    int per_lane = LSX_LEN / BIT; \
    int lane, n, src, lo, hi; \
    \
    for (lane = 0; lane < oprsz / 16; lane++) { \
        for (n = 0; n < per_lane; n++) { \
            src = n + per_lane * lane; \
            lo = n + per_lane * 2 * lane; \
            hi = n + per_lane * (2 * lane + 1); \
            result.E1(lo) = do_ssrans_ ## E1(Vj->E2(src), imm, BIT / 2 - 1); \
            result.E1(hi) = do_ssrans_ ## E1(Vd->E2(src), imm, BIT / 2 - 1); \
        } \
    } \
    *Vd = result; \
}
1628 | ||
1629 | static void do_vssrani_d_q(VReg *Vd, VReg *Vj, | |
1630 | uint64_t imm, int idx, Int128 mask, Int128 min) | |
83b3815d | 1631 | { |
6256c8ca | 1632 | Int128 shft_res1, shft_res2; |
83b3815d SG |
1633 | |
1634 | if (imm == 0) { | |
6256c8ca SG |
1635 | shft_res1 = Vj->Q(idx); |
1636 | shft_res2 = Vd->Q(idx); | |
83b3815d | 1637 | } else { |
6256c8ca SG |
1638 | shft_res1 = int128_rshift(Vj->Q(idx), imm); |
1639 | shft_res2 = int128_rshift(Vd->Q(idx), imm); | |
83b3815d | 1640 | } |
83b3815d | 1641 | |
6256c8ca SG |
1642 | if (int128_gt(shft_res1, mask)) { |
1643 | Vd->D(idx * 2) = int128_getlo(mask); | |
83b3815d | 1644 | } else if (int128_lt(shft_res1, int128_neg(min))) { |
6256c8ca | 1645 | Vd->D(idx * 2) = int128_getlo(min); |
83b3815d | 1646 | } else { |
6256c8ca | 1647 | Vd->D(idx * 2) = int128_getlo(shft_res1); |
83b3815d SG |
1648 | } |
1649 | ||
1650 | if (int128_gt(shft_res2, mask)) { | |
6256c8ca | 1651 | Vd->D(idx * 2 + 1) = int128_getlo(mask); |
83b3815d | 1652 | } else if (int128_lt(shft_res2, int128_neg(min))) { |
6256c8ca | 1653 | Vd->D(idx * 2 + 1) = int128_getlo(min); |
83b3815d | 1654 | } else { |
6256c8ca SG |
1655 | Vd->D(idx * 2 + 1) = int128_getlo(shft_res2); |
1656 | } | |
1657 | } | |
1658 | ||
1659 | void HELPER(vssrani_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) | |
1660 | { | |
1661 | int i; | |
1662 | Int128 mask, min; | |
1663 | VReg *Vd = (VReg *)vd; | |
1664 | VReg *Vj = (VReg *)vj; | |
1665 | int oprsz = simd_oprsz(desc); | |
1666 | ||
1667 | mask = int128_sub(int128_lshift(int128_one(), 63), int128_one()); | |
1668 | min = int128_lshift(int128_one(), 63); | |
1669 | ||
1670 | for (i = 0; i < oprsz / 16; i++) { | |
1671 | do_vssrani_d_q(Vd, Vj, imm, i, mask, min); | |
83b3815d SG |
1672 | } |
1673 | } | |
1674 | ||
6256c8ca | 1675 | |
83b3815d SG |
1676 | VSSRANI(vssrani_b_h, 16, B, H) |
1677 | VSSRANI(vssrani_h_w, 32, H, W) | |
1678 | VSSRANI(vssrani_w_d, 64, W, D) | |
1679 | ||
6256c8ca SG |
/*
 * Unsigned-saturating narrowing logical right shift by immediate.  Per
 * 16-byte lane, the wide elements of Vj (first half of the result lane)
 * and of the old Vd (second half) go through do_ssrlnu_<E1>() with
 * saturation width BIT / 2.  The result is built in a zeroed temporary and
 * copied to Vd at the end.
 */
#define VSSRLNUI(NAME, BIT, E1, E2) \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
{ \
    VReg *Vd = (VReg *)vd; \
    VReg *Vj = (VReg *)vj; \
    VReg result = {}; \
    int oprsz = simd_oprsz(desc); \
    int per_lane = LSX_LEN / BIT; \
    int lane, n, src, lo, hi; \
    \
    for (lane = 0; lane < oprsz / 16; lane++) { \
        for (n = 0; n < per_lane; n++) { \
            src = n + per_lane * lane; \
            lo = n + per_lane * 2 * lane; \
            hi = n + per_lane * (2 * lane + 1); \
            result.E1(lo) = do_ssrlnu_ ## E1(Vj->E2(src), imm, BIT / 2); \
            result.E1(hi) = do_ssrlnu_ ## E1(Vd->E2(src), imm, BIT / 2); \
        } \
    } \
    *Vd = result; \
}
1700 | ||
329517d5 | 1701 | void HELPER(vssrlni_du_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) |
83b3815d | 1702 | { |
6256c8ca SG |
1703 | int i; |
1704 | Int128 mask; | |
329517d5 SG |
1705 | VReg *Vd = (VReg *)vd; |
1706 | VReg *Vj = (VReg *)vj; | |
6256c8ca | 1707 | int oprsz = simd_oprsz(desc); |
83b3815d | 1708 | |
83b3815d SG |
1709 | mask = int128_sub(int128_lshift(int128_one(), 64), int128_one()); |
1710 | ||
6256c8ca SG |
1711 | for (i = 0; i < oprsz / 16; i++) { |
1712 | do_vssrlni_q(Vd, Vj, imm, i, mask); | |
83b3815d SG |
1713 | } |
1714 | } | |
1715 | ||
1716 | VSSRLNUI(vssrlni_bu_h, 16, B, H) | |
1717 | VSSRLNUI(vssrlni_hu_w, 32, H, W) | |
1718 | VSSRLNUI(vssrlni_wu_d, 64, W, D) | |
1719 | ||
6256c8ca SG |
/*
 * Unsigned-saturating narrowing arithmetic right shift by immediate.  Per
 * 16-byte lane, the wide elements of Vj (first half of the result lane)
 * and of the old Vd (second half) go through do_ssranu_<E1>() with
 * saturation width BIT / 2.  The result is built in a zeroed temporary and
 * copied to Vd at the end.
 */
#define VSSRANUI(NAME, BIT, E1, E2) \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \
{ \
    VReg *Vd = (VReg *)vd; \
    VReg *Vj = (VReg *)vj; \
    VReg result = {}; \
    int oprsz = simd_oprsz(desc); \
    int per_lane = LSX_LEN / BIT; \
    int lane, n, src, lo, hi; \
    \
    for (lane = 0; lane < oprsz / 16; lane++) { \
        for (n = 0; n < per_lane; n++) { \
            src = n + per_lane * lane; \
            lo = n + per_lane * 2 * lane; \
            hi = n + per_lane * (2 * lane + 1); \
            result.E1(lo) = do_ssranu_ ## E1(Vj->E2(src), imm, BIT / 2); \
            result.E1(hi) = do_ssranu_ ## E1(Vd->E2(src), imm, BIT / 2); \
        } \
    } \
    *Vd = result; \
}
1740 | ||
1741 | static void do_vssrani_du_q(VReg *Vd, VReg *Vj, | |
1742 | uint64_t imm, int idx, Int128 mask) | |
83b3815d | 1743 | { |
6256c8ca | 1744 | Int128 shft_res1, shft_res2; |
83b3815d SG |
1745 | |
1746 | if (imm == 0) { | |
6256c8ca SG |
1747 | shft_res1 = Vj->Q(idx); |
1748 | shft_res2 = Vd->Q(idx); | |
83b3815d | 1749 | } else { |
6256c8ca SG |
1750 | shft_res1 = int128_rshift(Vj->Q(idx), imm); |
1751 | shft_res2 = int128_rshift(Vd->Q(idx), imm); | |
83b3815d SG |
1752 | } |
1753 | ||
6256c8ca | 1754 | if (int128_lt(Vj->Q(idx), int128_zero())) { |
83b3815d SG |
1755 | shft_res1 = int128_zero(); |
1756 | } | |
1757 | ||
6256c8ca | 1758 | if (int128_lt(Vd->Q(idx), int128_zero())) { |
83b3815d SG |
1759 | shft_res2 = int128_zero(); |
1760 | } | |
83b3815d | 1761 | if (int128_ult(mask, shft_res1)) { |
6256c8ca | 1762 | Vd->D(idx * 2) = int128_getlo(mask); |
83b3815d | 1763 | }else { |
6256c8ca | 1764 | Vd->D(idx * 2) = int128_getlo(shft_res1); |
83b3815d SG |
1765 | } |
1766 | ||
1767 | if (int128_ult(mask, shft_res2)) { | |
6256c8ca | 1768 | Vd->D(idx * 2 + 1) = int128_getlo(mask); |
83b3815d | 1769 | }else { |
6256c8ca SG |
1770 | Vd->D(idx * 2 + 1) = int128_getlo(shft_res2); |
1771 | } | |
1772 | ||
1773 | } | |
1774 | ||
1775 | void HELPER(vssrani_du_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) | |
1776 | { | |
1777 | int i; | |
1778 | Int128 mask; | |
1779 | VReg *Vd = (VReg *)vd; | |
1780 | VReg *Vj = (VReg *)vj; | |
1781 | int oprsz = simd_oprsz(desc); | |
1782 | ||
1783 | mask = int128_sub(int128_lshift(int128_one(), 64), int128_one()); | |
1784 | ||
1785 | for (i = 0; i < oprsz / 16; i++) { | |
1786 | do_vssrani_du_q(Vd, Vj, imm, i, mask); | |
83b3815d SG |
1787 | } |
1788 | } | |
1789 | ||
1790 | VSSRANUI(vssrani_bu_h, 16, B, H) | |
1791 | VSSRANUI(vssrani_hu_w, 32, H, W) | |
1792 | VSSRANUI(vssrani_wu_d, 64, W, D) | |
162cd32c SG |
1793 | |
/*
 * SSRLRNS - define do_ssrlrns_<E1>(e2, sa, sh).
 *
 * The value returned by do_vsrlr_<E2>(e2, sa) is held in the unsigned type
 * T1 and clamped to (1 << sh) - 1.  T2 is the parameter type; T3 is not
 * used by the expansion.
 */
#define SSRLRNS(E1, E2, T1, T2, T3)                         \
static T1 do_ssrlrns_ ## E1(T2 e2, int sa, int sh)          \
{                                                           \
    const T1 limit = (1ull << sh) - 1;                      \
    const T1 shifted = do_vsrlr_ ## E2(e2, sa);             \
                                                            \
    return shifted > limit ? limit : shifted;               \
}

SSRLRNS(B, H, uint16_t, int16_t, uint8_t)
SSRLRNS(H, W, uint32_t, int32_t, uint16_t)
SSRLRNS(W, D, uint64_t, int64_t, uint32_t)
1812 | ||
77fca794 SG |
1813 | #define VSSRLRN(NAME, BIT, E1, E2, E3) \ |
1814 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
1815 | { \ | |
1816 | int i, j, ofs; \ | |
1817 | VReg *Vd = (VReg *)vd; \ | |
1818 | VReg *Vj = (VReg *)vj; \ | |
1819 | VReg *Vk = (VReg *)vk; \ | |
1820 | int oprsz = simd_oprsz(desc); \ | |
1821 | \ | |
1822 | ofs = LSX_LEN / BIT; \ | |
1823 | for (i = 0; i < oprsz / 16; i++) { \ | |
1824 | for (j = 0; j < ofs; j++) { \ | |
1825 | Vd->E1(j + ofs * 2 * i) = do_ssrlrns_ ## E1(Vj->E2(j + ofs * i), \ | |
1826 | Vk->E3(j + ofs * i) % BIT, \ | |
1827 | BIT / 2 - 1); \ | |
1828 | } \ | |
1829 | Vd->D(2 * i + 1) = 0; \ | |
1830 | } \ | |
162cd32c SG |
1831 | } |
1832 | ||
77fca794 SG |
1833 | VSSRLRN(vssrlrn_b_h, 16, B, H, UH) |
1834 | VSSRLRN(vssrlrn_h_w, 32, H, W, UW) | |
1835 | VSSRLRN(vssrlrn_w_d, 64, W, D, UD) | |
162cd32c SG |
1836 | |
/*
 * SSRARNS - define do_ssrarns_<E1>(e2, sa, sh).
 *
 * The value returned by do_vsrar_<E2>(e2, sa) is clamped to the signed
 * range [-(2^sh), 2^sh - 1].  T1 is the wide input/return type, T2 the
 * narrow signed type that holds the positive bound.
 *
 * The lower bound is written ~mask instead of -(mask + 1).  Both denote
 * -(2^sh), but mask + 1 overflows int when T2 is int32_t and sh is 31
 * (the W instantiation as called with BIT / 2 - 1), which is only well
 * defined when signed overflow wraps.
 */
#define SSRARNS(E1, E2, T1, T2)                             \
static T1 do_ssrarns_ ## E1(T1 e2, int sa, int sh)          \
{                                                           \
    T1 shft_res;                                            \
    T2 mask;                                                \
                                                            \
    shft_res = do_vsrar_ ## E2(e2, sa);                     \
    mask = (1ll << sh) - 1;                                 \
    if (shft_res > mask) {                                  \
        return mask;                                        \
    } else if (shft_res < ~mask) {                          \
        return ~mask;                                       \
    } else {                                                \
        return shft_res;                                    \
    }                                                       \
}

SSRARNS(B, H, int16_t, int8_t)
SSRARNS(H, W, int32_t, int16_t)
SSRARNS(W, D, int64_t, int32_t)
1857 | ||
77fca794 SG |
1858 | #define VSSRARN(NAME, BIT, E1, E2, E3) \ |
1859 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
1860 | { \ | |
1861 | int i, j, ofs; \ | |
1862 | VReg *Vd = (VReg *)vd; \ | |
1863 | VReg *Vj = (VReg *)vj; \ | |
1864 | VReg *Vk = (VReg *)vk; \ | |
1865 | int oprsz = simd_oprsz(desc); \ | |
1866 | \ | |
1867 | ofs = LSX_LEN / BIT; \ | |
1868 | for (i = 0; i < oprsz / 16; i++) { \ | |
1869 | for (j = 0; j < ofs; j++) { \ | |
1870 | Vd->E1(j + ofs * 2 * i) = do_ssrarns_ ## E1(Vj->E2(j + ofs * i), \ | |
1871 | Vk->E3(j + ofs * i) % BIT, \ | |
1872 | BIT/ 2 - 1); \ | |
1873 | } \ | |
1874 | Vd->D(2 * i + 1) = 0; \ | |
1875 | } \ | |
162cd32c SG |
1876 | } |
1877 | ||
77fca794 SG |
1878 | VSSRARN(vssrarn_b_h, 16, B, H, UH) |
1879 | VSSRARN(vssrarn_h_w, 32, H, W, UW) | |
1880 | VSSRARN(vssrarn_w_d, 64, W, D, UD) | |
162cd32c SG |
1881 | |
/*
 * SSRLRNU - define do_ssrlrnu_<E1>(e2, sa, sh).
 *
 * The value returned by do_vsrlr_<E2>(e2, sa) is stored in the unsigned
 * type T1 and clamped to (1 << sh) - 1, a bound kept in the narrower
 * unsigned type T2.  T3 is the type of the input parameter.
 */
#define SSRLRNU(E1, E2, T1, T2, T3)                         \
static T1 do_ssrlrnu_ ## E1(T3 e2, int sa, int sh)          \
{                                                           \
    const T2 limit = (1ull << sh) - 1;                      \
    const T1 shifted = do_vsrlr_ ## E2(e2, sa);             \
                                                            \
    return shifted > limit ? limit : shifted;               \
}

SSRLRNU(B, H, uint16_t, uint8_t, int16_t)
SSRLRNU(H, W, uint32_t, uint16_t, int32_t)
SSRLRNU(W, D, uint64_t, uint32_t, int64_t)
1901 | ||
77fca794 SG |
1902 | #define VSSRLRNU(NAME, BIT, E1, E2, E3) \ |
1903 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
1904 | { \ | |
1905 | int i, j, ofs; \ | |
1906 | VReg *Vd = (VReg *)vd; \ | |
1907 | VReg *Vj = (VReg *)vj; \ | |
1908 | VReg *Vk = (VReg *)vk; \ | |
1909 | int oprsz = simd_oprsz(desc); \ | |
1910 | \ | |
1911 | ofs = LSX_LEN / BIT; \ | |
1912 | for (i = 0; i < oprsz / 16; i++) { \ | |
1913 | for (j = 0; j < ofs; j++) { \ | |
1914 | Vd->E1(j + ofs * 2 * i) = do_ssrlrnu_ ## E1(Vj->E2(j + ofs * i), \ | |
1915 | Vk->E3(j + ofs * i) % BIT, \ | |
1916 | BIT / 2); \ | |
1917 | } \ | |
1918 | Vd->D(2 * i + 1) = 0; \ | |
1919 | } \ | |
162cd32c SG |
1920 | } |
1921 | ||
77fca794 SG |
1922 | VSSRLRNU(vssrlrn_bu_h, 16, B, H, UH) |
1923 | VSSRLRNU(vssrlrn_hu_w, 32, H, W, UW) | |
1924 | VSSRLRNU(vssrlrn_wu_d, 64, W, D, UD) | |
162cd32c SG |
1925 | |
/*
 * SSRARNU - define do_ssrarnu_<E1>(e2, sa, sh).
 *
 * A negative input yields 0; otherwise the value returned by
 * do_vsrar_<E2>(e2, sa) is used.  The result is then clamped to
 * (1 << sh) - 1.  T1 is the return type, T2 the type holding the bound
 * and T3 the (signed) input type.
 */
#define SSRARNU(E1, E2, T1, T2, T3)                         \
static T1 do_ssrarnu_ ## E1(T3 e2, int sa, int sh)          \
{                                                           \
    const T2 limit = (1ull << sh) - 1;                      \
    T1 shifted = 0;                                         \
                                                            \
    if (e2 >= 0) {                                          \
        shifted = do_vsrar_ ## E2(e2, sa);                  \
    }                                                       \
    return shifted > limit ? limit : shifted;               \
}

SSRARNU(B, H, uint16_t, uint8_t, int16_t)
SSRARNU(H, W, uint32_t, uint16_t, int32_t)
SSRARNU(W, D, uint64_t, uint32_t, int64_t)
1948 | ||
77fca794 SG |
1949 | #define VSSRARNU(NAME, BIT, E1, E2, E3) \ |
1950 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
1951 | { \ | |
1952 | int i, j, ofs; \ | |
1953 | VReg *Vd = (VReg *)vd; \ | |
1954 | VReg *Vj = (VReg *)vj; \ | |
1955 | VReg *Vk = (VReg *)vk; \ | |
1956 | int oprsz = simd_oprsz(desc); \ | |
1957 | \ | |
1958 | ofs = LSX_LEN / BIT; \ | |
1959 | for (i = 0; i < oprsz / 16; i++) { \ | |
1960 | for (j = 0; j < ofs; j++) { \ | |
1961 | Vd->E1(j + ofs * 2 * i) = do_ssrarnu_ ## E1(Vj->E2(j + ofs * i), \ | |
1962 | Vk->E3(j + ofs * i) % BIT, \ | |
1963 | BIT / 2); \ | |
1964 | } \ | |
1965 | Vd->D(2 * i + 1) = 0; \ | |
1966 | } \ | |
162cd32c SG |
1967 | } |
1968 | ||
77fca794 SG |
1969 | VSSRARNU(vssrarn_bu_h, 16, B, H, UH) |
1970 | VSSRARNU(vssrarn_hu_w, 32, H, W, UW) | |
1971 | VSSRARNU(vssrarn_wu_d, 64, W, D, UD) | |
1972 | ||
/*
 * VSSRLRNI - define helper NAME(vd, vj, imm, desc).
 *
 * Per 16-byte group, the E2 elements of Vj go through
 * do_ssrlrns_<E1>(elem, imm, BIT / 2 - 1) into the first half of the
 * group's E1 slots and the E2 elements of the incoming Vd into the second
 * half.  The result is built in a zero-initialised scratch register and
 * copied to Vd at the end, since Vd is also a source.
 */
#define VSSRLRNI(NAME, BIT, E1, E2)                                     \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)      \
{                                                                       \
    VReg *Vd = (VReg *)vd;                                              \
    VReg *Vj = (VReg *)vj;                                              \
    VReg result = {};                                                   \
    int nbytes = simd_oprsz(desc);                                      \
    int per_grp = LSX_LEN / BIT;                                        \
    int grp, n;                                                         \
                                                                        \
    for (grp = 0; grp < nbytes / 16; grp++) {                           \
        int src = per_grp * grp;                                        \
        int lo = per_grp * 2 * grp;                                     \
        int hi = per_grp * (2 * grp + 1);                               \
                                                                        \
        for (n = 0; n < per_grp; n++) {                                 \
            result.E1(n + lo) = do_ssrlrns_ ## E1(Vj->E2(n + src),      \
                                                  imm, BIT / 2 - 1);    \
            result.E1(n + hi) = do_ssrlrns_ ## E1(Vd->E2(n + src),      \
                                                  imm, BIT / 2 - 1);    \
        }                                                               \
    }                                                                   \
    *Vd = result;                                                       \
}
1993 | ||
1994 | static void do_vssrlrni_q(VReg *Vd, VReg * Vj, | |
1995 | uint64_t imm, int idx, Int128 mask) | |
1996 | { | |
1997 | Int128 shft_res1, shft_res2, r1, r2; | |
1998 | if (imm == 0) { | |
1999 | shft_res1 = Vj->Q(idx); | |
2000 | shft_res2 = Vd->Q(idx); | |
2001 | } else { | |
2002 | r1 = int128_and(int128_urshift(Vj->Q(idx), (imm - 1)), int128_one()); | |
2003 | r2 = int128_and(int128_urshift(Vd->Q(idx), (imm - 1)), int128_one()); | |
2004 | shft_res1 = (int128_add(int128_urshift(Vj->Q(idx), imm), r1)); | |
2005 | shft_res2 = (int128_add(int128_urshift(Vd->Q(idx), imm), r2)); | |
2006 | } | |
162cd32c | 2007 | |
77fca794 SG |
2008 | if (int128_ult(mask, shft_res1)) { |
2009 | Vd->D(idx * 2) = int128_getlo(mask); | |
2010 | }else { | |
2011 | Vd->D(idx * 2) = int128_getlo(shft_res1); | |
2012 | } | |
2013 | ||
2014 | if (int128_ult(mask, shft_res2)) { | |
2015 | Vd->D(idx * 2 + 1) = int128_getlo(mask); | |
2016 | }else { | |
2017 | Vd->D(idx * 2 + 1) = int128_getlo(shft_res2); | |
2018 | } | |
162cd32c SG |
2019 | } |
2020 | ||
77fca794 SG |
2021 | void HELPER(vssrlrni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) |
2022 | { | |
2023 | int i; | |
2024 | Int128 mask; | |
2025 | VReg *Vd = (VReg *)vd; | |
2026 | VReg *Vj = (VReg *)vj; | |
2027 | int oprsz = simd_oprsz(desc); | |
2028 | ||
2029 | mask = int128_sub(int128_lshift(int128_one(), 63), int128_one()); | |
2030 | ||
2031 | for (i = 0; i < oprsz / 16; i++) { | |
2032 | do_vssrlrni_q(Vd, Vj, imm, i, mask); | |
2033 | } | |
162cd32c SG |
2034 | } |
2035 | ||
2036 | VSSRLRNI(vssrlrni_b_h, 16, B, H) | |
2037 | VSSRLRNI(vssrlrni_h_w, 32, H, W) | |
2038 | VSSRLRNI(vssrlrni_w_d, 64, W, D) | |
162cd32c | 2039 | |
77fca794 SG |
/*
 * VSSRARNI - define helper NAME(vd, vj, imm, desc).
 *
 * Per 16-byte group, the E2 elements of Vj go through
 * do_ssrarns_<E1>(elem, imm, BIT / 2 - 1) into the first half of the
 * group's E1 slots and the E2 elements of the incoming Vd into the second
 * half.  The result is built in a zero-initialised scratch register and
 * copied to Vd at the end, since Vd is also a source.
 */
#define VSSRARNI(NAME, BIT, E1, E2)                                     \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)      \
{                                                                       \
    VReg *Vd = (VReg *)vd;                                              \
    VReg *Vj = (VReg *)vj;                                              \
    VReg result = {};                                                   \
    int nbytes = simd_oprsz(desc);                                      \
    int per_grp = LSX_LEN / BIT;                                        \
    int grp, n;                                                         \
                                                                        \
    for (grp = 0; grp < nbytes / 16; grp++) {                           \
        int src = per_grp * grp;                                        \
        int lo = per_grp * 2 * grp;                                     \
        int hi = per_grp * (2 * grp + 1);                               \
                                                                        \
        for (n = 0; n < per_grp; n++) {                                 \
            result.E1(n + lo) = do_ssrarns_ ## E1(Vj->E2(n + src),      \
                                                  imm, BIT / 2 - 1);    \
            result.E1(n + hi) = do_ssrarns_ ## E1(Vd->E2(n + src),      \
                                                  imm, BIT / 2 - 1);    \
        }                                                               \
    }                                                                   \
    *Vd = result;                                                       \
}
2060 | ||
2061 | static void do_vssrarni_d_q(VReg *Vd, VReg *Vj, | |
2062 | uint64_t imm, int idx, Int128 mask1, Int128 mask2) | |
162cd32c | 2063 | { |
77fca794 | 2064 | Int128 shft_res1, shft_res2, r1, r2; |
162cd32c SG |
2065 | |
2066 | if (imm == 0) { | |
77fca794 SG |
2067 | shft_res1 = Vj->Q(idx); |
2068 | shft_res2 = Vd->Q(idx); | |
162cd32c | 2069 | } else { |
77fca794 SG |
2070 | r1 = int128_and(int128_rshift(Vj->Q(idx), (imm - 1)), int128_one()); |
2071 | r2 = int128_and(int128_rshift(Vd->Q(idx), (imm - 1)), int128_one()); | |
2072 | shft_res1 = int128_add(int128_rshift(Vj->Q(idx), imm), r1); | |
2073 | shft_res2 = int128_add(int128_rshift(Vd->Q(idx), imm), r2); | |
162cd32c | 2074 | } |
77fca794 SG |
2075 | if (int128_gt(shft_res1, mask1)) { |
2076 | Vd->D(idx * 2) = int128_getlo(mask1); | |
162cd32c | 2077 | } else if (int128_lt(shft_res1, int128_neg(mask2))) { |
77fca794 | 2078 | Vd->D(idx * 2) = int128_getlo(mask2); |
162cd32c | 2079 | } else { |
77fca794 | 2080 | Vd->D(idx * 2) = int128_getlo(shft_res1); |
162cd32c SG |
2081 | } |
2082 | ||
2083 | if (int128_gt(shft_res2, mask1)) { | |
77fca794 | 2084 | Vd->D(idx * 2 + 1) = int128_getlo(mask1); |
162cd32c | 2085 | } else if (int128_lt(shft_res2, int128_neg(mask2))) { |
77fca794 | 2086 | Vd->D(idx * 2 + 1) = int128_getlo(mask2); |
162cd32c | 2087 | } else { |
77fca794 SG |
2088 | Vd->D(idx * 2 + 1) = int128_getlo(shft_res2); |
2089 | } | |
2090 | } | |
2091 | ||
2092 | void HELPER(vssrarni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) | |
2093 | { | |
2094 | int i; | |
2095 | Int128 mask1, mask2; | |
2096 | VReg *Vd = (VReg *)vd; | |
2097 | VReg *Vj = (VReg *)vj; | |
2098 | int oprsz = simd_oprsz(desc); | |
2099 | ||
2100 | mask1 = int128_sub(int128_lshift(int128_one(), 63), int128_one()); | |
2101 | mask2 = int128_lshift(int128_one(), 63); | |
2102 | ||
2103 | for (i = 0; i < oprsz / 16; i++) { | |
2104 | do_vssrarni_d_q(Vd, Vj, imm, i, mask1, mask2); | |
162cd32c SG |
2105 | } |
2106 | } | |
2107 | ||
2108 | VSSRARNI(vssrarni_b_h, 16, B, H) | |
2109 | VSSRARNI(vssrarni_h_w, 32, H, W) | |
2110 | VSSRARNI(vssrarni_w_d, 64, W, D) | |
2111 | ||
77fca794 SG |
/*
 * VSSRLRNUI - define helper NAME(vd, vj, imm, desc).
 *
 * Per 16-byte group, the E2 elements of Vj go through
 * do_ssrlrnu_<E1>(elem, imm, BIT / 2) into the first half of the group's
 * E1 slots and the E2 elements of the incoming Vd into the second half.
 * The result is built in a zero-initialised scratch register and copied
 * to Vd at the end, since Vd is also a source.
 */
#define VSSRLRNUI(NAME, BIT, E1, E2)                                    \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)      \
{                                                                       \
    VReg *Vd = (VReg *)vd;                                              \
    VReg *Vj = (VReg *)vj;                                              \
    VReg result = {};                                                   \
    int nbytes = simd_oprsz(desc);                                      \
    int per_grp = LSX_LEN / BIT;                                        \
    int grp, n;                                                         \
                                                                        \
    for (grp = 0; grp < nbytes / 16; grp++) {                           \
        int src = per_grp * grp;                                        \
        int lo = per_grp * 2 * grp;                                     \
        int hi = per_grp * (2 * grp + 1);                               \
                                                                        \
        for (n = 0; n < per_grp; n++) {                                 \
            result.E1(n + lo) = do_ssrlrnu_ ## E1(Vj->E2(n + src),      \
                                                  imm, BIT / 2);        \
            result.E1(n + hi) = do_ssrlrnu_ ## E1(Vd->E2(n + src),      \
                                                  imm, BIT / 2);        \
        }                                                               \
    }                                                                   \
    *Vd = result;                                                       \
}
2132 | ||
2133 | void HELPER(vssrlrni_du_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) | |
2134 | { | |
2135 | int i; | |
2136 | Int128 mask; | |
2137 | VReg *Vd = (VReg *)vd; | |
2138 | VReg *Vj = (VReg *)vj; | |
2139 | int oprsz = simd_oprsz(desc); | |
2140 | ||
2141 | mask = int128_sub(int128_lshift(int128_one(), 64), int128_one()); | |
2142 | ||
2143 | for (i = 0; i < oprsz / 16; i++) { | |
2144 | do_vssrlrni_q(Vd, Vj, imm, i, mask); | |
2145 | } | |
162cd32c SG |
2146 | } |
2147 | ||
2148 | VSSRLRNUI(vssrlrni_bu_h, 16, B, H) | |
2149 | VSSRLRNUI(vssrlrni_hu_w, 32, H, W) | |
2150 | VSSRLRNUI(vssrlrni_wu_d, 64, W, D) | |
162cd32c | 2151 | |
77fca794 SG |
/*
 * VSSRARNUI - define helper NAME(vd, vj, imm, desc).
 *
 * Per 16-byte group, the E2 elements of Vj go through
 * do_ssrarnu_<E1>(elem, imm, BIT / 2) into the first half of the group's
 * E1 slots and the E2 elements of the incoming Vd into the second half.
 * The result is built in a zero-initialised scratch register and copied
 * to Vd at the end, since Vd is also a source.
 */
#define VSSRARNUI(NAME, BIT, E1, E2)                                    \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)      \
{                                                                       \
    VReg *Vd = (VReg *)vd;                                              \
    VReg *Vj = (VReg *)vj;                                              \
    VReg result = {};                                                   \
    int nbytes = simd_oprsz(desc);                                      \
    int per_grp = LSX_LEN / BIT;                                        \
    int grp, n;                                                         \
                                                                        \
    for (grp = 0; grp < nbytes / 16; grp++) {                           \
        int src = per_grp * grp;                                        \
        int lo = per_grp * 2 * grp;                                     \
        int hi = per_grp * (2 * grp + 1);                               \
                                                                        \
        for (n = 0; n < per_grp; n++) {                                 \
            result.E1(n + lo) = do_ssrarnu_ ## E1(Vj->E2(n + src),      \
                                                  imm, BIT / 2);        \
            result.E1(n + hi) = do_ssrarnu_ ## E1(Vd->E2(n + src),      \
                                                  imm, BIT / 2);        \
        }                                                               \
    }                                                                   \
    *Vd = result;                                                       \
}
2172 | ||
2173 | static void do_vssrarni_du_q(VReg *Vd, VReg *Vj, | |
2174 | uint64_t imm, int idx, Int128 mask1, Int128 mask2) | |
162cd32c | 2175 | { |
77fca794 | 2176 | Int128 shft_res1, shft_res2, r1, r2; |
162cd32c SG |
2177 | |
2178 | if (imm == 0) { | |
77fca794 SG |
2179 | shft_res1 = Vj->Q(idx); |
2180 | shft_res2 = Vd->Q(idx); | |
162cd32c | 2181 | } else { |
77fca794 SG |
2182 | r1 = int128_and(int128_rshift(Vj->Q(idx), (imm - 1)), int128_one()); |
2183 | r2 = int128_and(int128_rshift(Vd->Q(idx), (imm - 1)), int128_one()); | |
2184 | shft_res1 = int128_add(int128_rshift(Vj->Q(idx), imm), r1); | |
2185 | shft_res2 = int128_add(int128_rshift(Vd->Q(idx), imm), r2); | |
162cd32c SG |
2186 | } |
2187 | ||
77fca794 | 2188 | if (int128_lt(Vj->Q(idx), int128_zero())) { |
162cd32c SG |
2189 | shft_res1 = int128_zero(); |
2190 | } | |
77fca794 | 2191 | if (int128_lt(Vd->Q(idx), int128_zero())) { |
162cd32c SG |
2192 | shft_res2 = int128_zero(); |
2193 | } | |
2194 | ||
162cd32c | 2195 | if (int128_gt(shft_res1, mask1)) { |
77fca794 | 2196 | Vd->D(idx * 2) = int128_getlo(mask1); |
162cd32c | 2197 | } else if (int128_lt(shft_res1, int128_neg(mask2))) { |
77fca794 | 2198 | Vd->D(idx * 2) = int128_getlo(mask2); |
162cd32c | 2199 | } else { |
77fca794 | 2200 | Vd->D(idx * 2) = int128_getlo(shft_res1); |
162cd32c SG |
2201 | } |
2202 | ||
2203 | if (int128_gt(shft_res2, mask1)) { | |
77fca794 | 2204 | Vd->D(idx * 2 + 1) = int128_getlo(mask1); |
162cd32c | 2205 | } else if (int128_lt(shft_res2, int128_neg(mask2))) { |
77fca794 | 2206 | Vd->D(idx * 2 + 1) = int128_getlo(mask2); |
162cd32c | 2207 | } else { |
77fca794 SG |
2208 | Vd->D(idx * 2 + 1) = int128_getlo(shft_res2); |
2209 | } | |
2210 | } | |
2211 | ||
2212 | void HELPER(vssrarni_du_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) | |
2213 | { | |
2214 | int i; | |
2215 | Int128 mask1, mask2; | |
2216 | VReg *Vd = (VReg *)vd; | |
2217 | VReg *Vj = (VReg *)vj; | |
2218 | int oprsz = simd_oprsz(desc); | |
2219 | ||
2220 | mask1 = int128_sub(int128_lshift(int128_one(), 64), int128_one()); | |
2221 | mask2 = int128_lshift(int128_one(), 64); | |
2222 | ||
2223 | for (i = 0; i < oprsz / 16; i++) { | |
2224 | do_vssrarni_du_q(Vd, Vj, imm, i, mask1, mask2); | |
162cd32c SG |
2225 | } |
2226 | } | |
2227 | ||
2228 | VSSRARNUI(vssrarni_bu_h, 16, B, H) | |
2229 | VSSRARNUI(vssrarni_hu_w, 32, H, W) | |
2230 | VSSRARNUI(vssrarni_wu_d, 64, W, D) | |
2e105e12 | 2231 | |
ff27e335 SG |
2232 | #define DO_2OP(NAME, BIT, E, DO_OP) \ |
2233 | void HELPER(NAME)(void *vd, void *vj, uint32_t desc) \ | |
2234 | { \ | |
2235 | int i; \ | |
2236 | VReg *Vd = (VReg *)vd; \ | |
2237 | VReg *Vj = (VReg *)vj; \ | |
12ad133f | 2238 | int oprsz = simd_oprsz(desc); \ |
ff27e335 | 2239 | \ |
12ad133f | 2240 | for (i = 0; i < oprsz / (BIT / 8); i++) \ |
ff27e335 SG |
2241 | { \ |
2242 | Vd->E(i) = DO_OP(Vj->E(i)); \ | |
2243 | } \ | |
2e105e12 SG |
2244 | } |
2245 | ||
2e105e12 SG |
2246 | DO_2OP(vclo_b, 8, UB, DO_CLO_B) |
2247 | DO_2OP(vclo_h, 16, UH, DO_CLO_H) | |
2248 | DO_2OP(vclo_w, 32, UW, DO_CLO_W) | |
2249 | DO_2OP(vclo_d, 64, UD, DO_CLO_D) | |
2250 | DO_2OP(vclz_b, 8, UB, DO_CLZ_B) | |
2251 | DO_2OP(vclz_h, 16, UH, DO_CLZ_H) | |
2252 | DO_2OP(vclz_w, 32, UW, DO_CLZ_W) | |
2253 | DO_2OP(vclz_d, 64, UD, DO_CLZ_D) | |
bb22ee57 | 2254 | |
ff27e335 SG |
2255 | #define VPCNT(NAME, BIT, E, FN) \ |
2256 | void HELPER(NAME)(void *vd, void *vj, uint32_t desc) \ | |
2257 | { \ | |
2258 | int i; \ | |
2259 | VReg *Vd = (VReg *)vd; \ | |
2260 | VReg *Vj = (VReg *)vj; \ | |
956dec74 | 2261 | int oprsz = simd_oprsz(desc); \ |
ff27e335 | 2262 | \ |
956dec74 | 2263 | for (i = 0; i < oprsz / (BIT / 8); i++) \ |
ff27e335 SG |
2264 | { \ |
2265 | Vd->E(i) = FN(Vj->E(i)); \ | |
2266 | } \ | |
bb22ee57 SG |
2267 | } |
2268 | ||
2269 | VPCNT(vpcnt_b, 8, UB, ctpop8) | |
2270 | VPCNT(vpcnt_h, 16, UH, ctpop16) | |
2271 | VPCNT(vpcnt_w, 32, UW, ctpop32) | |
2272 | VPCNT(vpcnt_d, 64, UD, ctpop64) | |
0b1e6705 | 2273 | |
1b3e242f SG |
2274 | #define DO_BIT(NAME, BIT, E, DO_OP) \ |
2275 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
2276 | { \ | |
2277 | int i; \ | |
2278 | VReg *Vd = (VReg *)vd; \ | |
2279 | VReg *Vj = (VReg *)vj; \ | |
2280 | VReg *Vk = (VReg *)vk; \ | |
2281 | int oprsz = simd_oprsz(desc); \ | |
2282 | \ | |
2283 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ | |
2284 | Vd->E(i) = DO_OP(Vj->E(i), Vk->E(i)%BIT); \ | |
2285 | } \ | |
0b1e6705 SG |
2286 | } |
2287 | ||
2288 | DO_BIT(vbitclr_b, 8, UB, DO_BITCLR) | |
2289 | DO_BIT(vbitclr_h, 16, UH, DO_BITCLR) | |
2290 | DO_BIT(vbitclr_w, 32, UW, DO_BITCLR) | |
2291 | DO_BIT(vbitclr_d, 64, UD, DO_BITCLR) | |
2292 | DO_BIT(vbitset_b, 8, UB, DO_BITSET) | |
2293 | DO_BIT(vbitset_h, 16, UH, DO_BITSET) | |
2294 | DO_BIT(vbitset_w, 32, UW, DO_BITSET) | |
2295 | DO_BIT(vbitset_d, 64, UD, DO_BITSET) | |
2296 | DO_BIT(vbitrev_b, 8, UB, DO_BITREV) | |
2297 | DO_BIT(vbitrev_h, 16, UH, DO_BITREV) | |
2298 | DO_BIT(vbitrev_w, 32, UW, DO_BITREV) | |
2299 | DO_BIT(vbitrev_d, 64, UD, DO_BITREV) | |
2300 | ||
1b3e242f SG |
2301 | #define DO_BITI(NAME, BIT, E, DO_OP) \ |
2302 | void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ | |
2303 | { \ | |
2304 | int i; \ | |
2305 | VReg *Vd = (VReg *)vd; \ | |
2306 | VReg *Vj = (VReg *)vj; \ | |
2307 | int oprsz = simd_oprsz(desc); \ | |
2308 | \ | |
2309 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ | |
2310 | Vd->E(i) = DO_OP(Vj->E(i), imm); \ | |
2311 | } \ | |
0b1e6705 SG |
2312 | } |
2313 | ||
2314 | DO_BITI(vbitclri_b, 8, UB, DO_BITCLR) | |
2315 | DO_BITI(vbitclri_h, 16, UH, DO_BITCLR) | |
2316 | DO_BITI(vbitclri_w, 32, UW, DO_BITCLR) | |
2317 | DO_BITI(vbitclri_d, 64, UD, DO_BITCLR) | |
2318 | DO_BITI(vbitseti_b, 8, UB, DO_BITSET) | |
2319 | DO_BITI(vbitseti_h, 16, UH, DO_BITSET) | |
2320 | DO_BITI(vbitseti_w, 32, UW, DO_BITSET) | |
2321 | DO_BITI(vbitseti_d, 64, UD, DO_BITSET) | |
2322 | DO_BITI(vbitrevi_b, 8, UB, DO_BITREV) | |
2323 | DO_BITI(vbitrevi_h, 16, UH, DO_BITREV) | |
2324 | DO_BITI(vbitrevi_w, 32, UW, DO_BITREV) | |
2325 | DO_BITI(vbitrevi_d, 64, UD, DO_BITREV) | |
ac95a0b9 | 2326 | |
04711da1 SG |
2327 | #define VFRSTP(NAME, BIT, MASK, E) \ |
2328 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
2329 | { \ | |
abee168e | 2330 | int i, j, m, ofs; \ |
04711da1 SG |
2331 | VReg *Vd = (VReg *)vd; \ |
2332 | VReg *Vj = (VReg *)vj; \ | |
2333 | VReg *Vk = (VReg *)vk; \ | |
abee168e | 2334 | int oprsz = simd_oprsz(desc); \ |
04711da1 | 2335 | \ |
abee168e SG |
2336 | ofs = LSX_LEN / BIT; \ |
2337 | for (i = 0; i < oprsz / 16; i++) { \ | |
2338 | m = Vk->E(i * ofs) & MASK; \ | |
2339 | for (j = 0; j < ofs; j++) { \ | |
2340 | if (Vj->E(j + ofs * i) < 0) { \ | |
2341 | break; \ | |
2342 | } \ | |
04711da1 | 2343 | } \ |
abee168e | 2344 | Vd->E(m + i * ofs) = j; \ |
04711da1 | 2345 | } \ |
ac95a0b9 SG |
2346 | } |
2347 | ||
2348 | VFRSTP(vfrstp_b, 8, 0xf, B) | |
2349 | VFRSTP(vfrstp_h, 16, 0x7, H) | |
2350 | ||
329517d5 SG |
2351 | #define VFRSTPI(NAME, BIT, E) \ |
2352 | void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ | |
2353 | { \ | |
abee168e | 2354 | int i, j, m, ofs; \ |
329517d5 SG |
2355 | VReg *Vd = (VReg *)vd; \ |
2356 | VReg *Vj = (VReg *)vj; \ | |
abee168e | 2357 | int oprsz = simd_oprsz(desc); \ |
329517d5 | 2358 | \ |
abee168e SG |
2359 | ofs = LSX_LEN / BIT; \ |
2360 | m = imm % ofs; \ | |
2361 | for (i = 0; i < oprsz / 16; i++) { \ | |
2362 | for (j = 0; j < ofs; j++) { \ | |
2363 | if (Vj->E(j + ofs * i) < 0) { \ | |
2364 | break; \ | |
2365 | } \ | |
329517d5 | 2366 | } \ |
abee168e | 2367 | Vd->E(m + i * ofs) = j; \ |
329517d5 | 2368 | } \ |
ac95a0b9 SG |
2369 | } |
2370 | ||
2371 | VFRSTPI(vfrstpi_b, 8, B) | |
2372 | VFRSTPI(vfrstpi_h, 16, H) | |
aca67472 SG |
2373 | |
2374 | static void vec_update_fcsr0_mask(CPULoongArchState *env, | |
2375 | uintptr_t pc, int mask) | |
2376 | { | |
2377 | int flags = get_float_exception_flags(&env->fp_status); | |
2378 | ||
2379 | set_float_exception_flags(0, &env->fp_status); | |
2380 | ||
2381 | flags &= ~mask; | |
2382 | ||
2383 | if (flags) { | |
2384 | flags = ieee_ex_to_loongarch(flags); | |
2385 | UPDATE_FP_CAUSE(env->fcsr0, flags); | |
2386 | } | |
2387 | ||
2388 | if (GET_FP_ENABLES(env->fcsr0) & flags) { | |
2389 | do_raise_exception(env, EXCCODE_FPE, pc); | |
2390 | } else { | |
2391 | UPDATE_FP_FLAGS(env->fcsr0, flags); | |
2392 | } | |
2393 | } | |
2394 | ||
2395 | static void vec_update_fcsr0(CPULoongArchState *env, uintptr_t pc) | |
2396 | { | |
2397 | vec_update_fcsr0_mask(env, pc, 0); | |
2398 | } | |
2399 | ||
2400 | static inline void vec_clear_cause(CPULoongArchState *env) | |
2401 | { | |
2402 | SET_FP_CAUSE(env->fcsr0, 0); | |
2403 | } | |
2404 | ||
2405 | #define DO_3OP_F(NAME, BIT, E, FN) \ | |
3b286753 SG |
2406 | void HELPER(NAME)(void *vd, void *vj, void *vk, \ |
2407 | CPULoongArchState *env, uint32_t desc) \ | |
aca67472 SG |
2408 | { \ |
2409 | int i; \ | |
3b286753 SG |
2410 | VReg *Vd = (VReg *)vd; \ |
2411 | VReg *Vj = (VReg *)vj; \ | |
2412 | VReg *Vk = (VReg *)vk; \ | |
c9caf158 | 2413 | int oprsz = simd_oprsz(desc); \ |
aca67472 SG |
2414 | \ |
2415 | vec_clear_cause(env); \ | |
c9caf158 | 2416 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ |
aca67472 SG |
2417 | Vd->E(i) = FN(Vj->E(i), Vk->E(i), &env->fp_status); \ |
2418 | vec_update_fcsr0(env, GETPC()); \ | |
2419 | } \ | |
2420 | } | |
2421 | ||
2422 | DO_3OP_F(vfadd_s, 32, UW, float32_add) | |
2423 | DO_3OP_F(vfadd_d, 64, UD, float64_add) | |
2424 | DO_3OP_F(vfsub_s, 32, UW, float32_sub) | |
2425 | DO_3OP_F(vfsub_d, 64, UD, float64_sub) | |
2426 | DO_3OP_F(vfmul_s, 32, UW, float32_mul) | |
2427 | DO_3OP_F(vfmul_d, 64, UD, float64_mul) | |
2428 | DO_3OP_F(vfdiv_s, 32, UW, float32_div) | |
2429 | DO_3OP_F(vfdiv_d, 64, UD, float64_div) | |
2430 | DO_3OP_F(vfmax_s, 32, UW, float32_maxnum) | |
2431 | DO_3OP_F(vfmax_d, 64, UD, float64_maxnum) | |
2432 | DO_3OP_F(vfmin_s, 32, UW, float32_minnum) | |
2433 | DO_3OP_F(vfmin_d, 64, UD, float64_minnum) | |
2434 | DO_3OP_F(vfmaxa_s, 32, UW, float32_maxnummag) | |
2435 | DO_3OP_F(vfmaxa_d, 64, UD, float64_maxnummag) | |
2436 | DO_3OP_F(vfmina_s, 32, UW, float32_minnummag) | |
2437 | DO_3OP_F(vfmina_d, 64, UD, float64_minnummag) | |
2438 | ||
2439 | #define DO_4OP_F(NAME, BIT, E, FN, flags) \ | |
e2600dad SG |
2440 | void HELPER(NAME)(void *vd, void *vj, void *vk, void *va, \ |
2441 | CPULoongArchState *env, uint32_t desc) \ | |
aca67472 SG |
2442 | { \ |
2443 | int i; \ | |
e2600dad SG |
2444 | VReg *Vd = (VReg *)vd; \ |
2445 | VReg *Vj = (VReg *)vj; \ | |
2446 | VReg *Vk = (VReg *)vk; \ | |
2447 | VReg *Va = (VReg *)va; \ | |
c9caf158 | 2448 | int oprsz = simd_oprsz(desc); \ |
aca67472 SG |
2449 | \ |
2450 | vec_clear_cause(env); \ | |
c9caf158 | 2451 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ |
aca67472 SG |
2452 | Vd->E(i) = FN(Vj->E(i), Vk->E(i), Va->E(i), flags, &env->fp_status); \ |
2453 | vec_update_fcsr0(env, GETPC()); \ | |
2454 | } \ | |
2455 | } | |
2456 | ||
2457 | DO_4OP_F(vfmadd_s, 32, UW, float32_muladd, 0) | |
2458 | DO_4OP_F(vfmadd_d, 64, UD, float64_muladd, 0) | |
2459 | DO_4OP_F(vfmsub_s, 32, UW, float32_muladd, float_muladd_negate_c) | |
2460 | DO_4OP_F(vfmsub_d, 64, UD, float64_muladd, float_muladd_negate_c) | |
2461 | DO_4OP_F(vfnmadd_s, 32, UW, float32_muladd, float_muladd_negate_result) | |
2462 | DO_4OP_F(vfnmadd_d, 64, UD, float64_muladd, float_muladd_negate_result) | |
2463 | DO_4OP_F(vfnmsub_s, 32, UW, float32_muladd, | |
2464 | float_muladd_negate_c | float_muladd_negate_result) | |
2465 | DO_4OP_F(vfnmsub_d, 64, UD, float64_muladd, | |
2466 | float_muladd_negate_c | float_muladd_negate_result) | |
2467 | ||
226bf881 SG |
/*
 * DO_2OP_F - define the one-source floating-point helper
 * NAME(vd, vj, env, desc).
 *
 * The FCSR0 cause field is cleared, then every E element of Vd is set to
 * FN(env, Vj element).  This macro does not update FCSR0 itself; the FN
 * callbacks defined in this file do so.
 */
#define DO_2OP_F(NAME, BIT, E, FN)                                  \
void HELPER(NAME)(void *vd, void *vj,                               \
                  CPULoongArchState *env, uint32_t desc)            \
{                                                                   \
    VReg *Vd = (VReg *)vd;                                          \
    VReg *Vj = (VReg *)vj;                                          \
    int nbytes = simd_oprsz(desc);                                  \
    int count = nbytes / (BIT / 8);                                 \
    int n;                                                          \
                                                                    \
    vec_clear_cause(env);                                           \
    for (n = 0; n < count; n++) {                                   \
        Vd->E(n) = FN(env, Vj->E(n));                               \
    }                                                               \
}
2482 | ||
2483 | #define FLOGB(BIT, T) \ | |
2484 | static T do_flogb_## BIT(CPULoongArchState *env, T fj) \ | |
2485 | { \ | |
2486 | T fp, fd; \ | |
2487 | float_status *status = &env->fp_status; \ | |
2488 | FloatRoundMode old_mode = get_float_rounding_mode(status); \ | |
2489 | \ | |
2490 | set_float_rounding_mode(float_round_down, status); \ | |
2491 | fp = float ## BIT ##_log2(fj, status); \ | |
2492 | fd = float ## BIT ##_round_to_int(fp, status); \ | |
2493 | set_float_rounding_mode(old_mode, status); \ | |
2494 | vec_update_fcsr0_mask(env, GETPC(), float_flag_inexact); \ | |
2495 | return fd; \ | |
2496 | } | |
2497 | ||
2498 | FLOGB(32, uint32_t) | |
2499 | FLOGB(64, uint64_t) | |
2500 | ||
226bf881 SG |
2501 | #define FCLASS(NAME, BIT, E, FN) \ |
2502 | void HELPER(NAME)(void *vd, void *vj, \ | |
2503 | CPULoongArchState *env, uint32_t desc) \ | |
2504 | { \ | |
2505 | int i; \ | |
2506 | VReg *Vd = (VReg *)vd; \ | |
2507 | VReg *Vj = (VReg *)vj; \ | |
c9caf158 | 2508 | int oprsz = simd_oprsz(desc); \ |
226bf881 | 2509 | \ |
c9caf158 | 2510 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ |
226bf881 SG |
2511 | Vd->E(i) = FN(env, Vj->E(i)); \ |
2512 | } \ | |
aca67472 SG |
2513 | } |
2514 | ||
2515 | FCLASS(vfclass_s, 32, UW, helper_fclass_s) | |
2516 | FCLASS(vfclass_d, 64, UD, helper_fclass_d) | |
2517 | ||
2518 | #define FSQRT(BIT, T) \ | |
2519 | static T do_fsqrt_## BIT(CPULoongArchState *env, T fj) \ | |
2520 | { \ | |
2521 | T fd; \ | |
2522 | fd = float ## BIT ##_sqrt(fj, &env->fp_status); \ | |
2523 | vec_update_fcsr0(env, GETPC()); \ | |
2524 | return fd; \ | |
2525 | } | |
2526 | ||
2527 | FSQRT(32, uint32_t) | |
2528 | FSQRT(64, uint64_t) | |
2529 | ||
2530 | #define FRECIP(BIT, T) \ | |
2531 | static T do_frecip_## BIT(CPULoongArchState *env, T fj) \ | |
2532 | { \ | |
2533 | T fd; \ | |
2534 | fd = float ## BIT ##_div(float ## BIT ##_one, fj, &env->fp_status); \ | |
2535 | vec_update_fcsr0(env, GETPC()); \ | |
2536 | return fd; \ | |
2537 | } | |
2538 | ||
2539 | FRECIP(32, uint32_t) | |
2540 | FRECIP(64, uint64_t) | |
2541 | ||
2542 | #define FRSQRT(BIT, T) \ | |
2543 | static T do_frsqrt_## BIT(CPULoongArchState *env, T fj) \ | |
2544 | { \ | |
2545 | T fd, fp; \ | |
2546 | fp = float ## BIT ##_sqrt(fj, &env->fp_status); \ | |
2547 | fd = float ## BIT ##_div(float ## BIT ##_one, fp, &env->fp_status); \ | |
2548 | vec_update_fcsr0(env, GETPC()); \ | |
2549 | return fd; \ | |
2550 | } | |
2551 | ||
2552 | FRSQRT(32, uint32_t) | |
2553 | FRSQRT(64, uint64_t) | |
2554 | ||
2555 | DO_2OP_F(vflogb_s, 32, UW, do_flogb_32) | |
2556 | DO_2OP_F(vflogb_d, 64, UD, do_flogb_64) | |
2557 | DO_2OP_F(vfsqrt_s, 32, UW, do_fsqrt_32) | |
2558 | DO_2OP_F(vfsqrt_d, 64, UD, do_fsqrt_64) | |
2559 | DO_2OP_F(vfrecip_s, 32, UW, do_frecip_32) | |
2560 | DO_2OP_F(vfrecip_d, 64, UD, do_frecip_64) | |
2561 | DO_2OP_F(vfrsqrt_s, 32, UW, do_frsqrt_32) | |
2562 | DO_2OP_F(vfrsqrt_d, 64, UD, do_frsqrt_64) | |
399665d2 SG |
2563 | |
2564 | static uint32_t float16_cvt_float32(uint16_t h, float_status *status) | |
2565 | { | |
2566 | return float16_to_float32(h, true, status); | |
2567 | } | |
2568 | static uint64_t float32_cvt_float64(uint32_t s, float_status *status) | |
2569 | { | |
2570 | return float32_to_float64(s, status); | |
2571 | } | |
2572 | ||
2573 | static uint16_t float32_cvt_float16(uint32_t s, float_status *status) | |
2574 | { | |
2575 | return float32_to_float16(s, true, status); | |
2576 | } | |
2577 | static uint32_t float64_cvt_float32(uint64_t d, float_status *status) | |
2578 | { | |
2579 | return float64_to_float32(d, status); | |
2580 | } | |
2581 | ||
226bf881 SG |
2582 | void HELPER(vfcvtl_s_h)(void *vd, void *vj, |
2583 | CPULoongArchState *env, uint32_t desc) | |
399665d2 | 2584 | { |
60df31a2 SG |
2585 | int i, j, ofs; |
2586 | VReg temp = {}; | |
226bf881 SG |
2587 | VReg *Vd = (VReg *)vd; |
2588 | VReg *Vj = (VReg *)vj; | |
60df31a2 | 2589 | int oprsz = simd_oprsz(desc); |
399665d2 | 2590 | |
60df31a2 | 2591 | ofs = LSX_LEN / 32; |
399665d2 | 2592 | vec_clear_cause(env); |
60df31a2 SG |
2593 | for (i = 0; i < oprsz / 16; i++) { |
2594 | for (j = 0; j < ofs; j++) { | |
2595 | temp.UW(j + ofs * i) =float16_cvt_float32(Vj->UH(j + ofs * 2 * i), | |
2596 | &env->fp_status); | |
2597 | } | |
399665d2 SG |
2598 | vec_update_fcsr0(env, GETPC()); |
2599 | } | |
2600 | *Vd = temp; | |
2601 | } | |
2602 | ||
226bf881 SG |
2603 | void HELPER(vfcvtl_d_s)(void *vd, void *vj, |
2604 | CPULoongArchState *env, uint32_t desc) | |
399665d2 | 2605 | { |
60df31a2 SG |
2606 | int i, j, ofs; |
2607 | VReg temp = {}; | |
226bf881 SG |
2608 | VReg *Vd = (VReg *)vd; |
2609 | VReg *Vj = (VReg *)vj; | |
60df31a2 | 2610 | int oprsz = simd_oprsz(desc); |
399665d2 | 2611 | |
60df31a2 | 2612 | ofs = LSX_LEN / 64; |
399665d2 | 2613 | vec_clear_cause(env); |
60df31a2 SG |
2614 | for (i = 0; i < oprsz / 16; i++) { |
2615 | for (j = 0; j < ofs; j++) { | |
2616 | temp.UD(j + ofs * i) = float32_cvt_float64(Vj->UW(j + ofs * 2 * i), | |
2617 | &env->fp_status); | |
2618 | } | |
399665d2 SG |
2619 | vec_update_fcsr0(env, GETPC()); |
2620 | } | |
2621 | *Vd = temp; | |
2622 | } | |
2623 | ||
226bf881 SG |
2624 | void HELPER(vfcvth_s_h)(void *vd, void *vj, |
2625 | CPULoongArchState *env, uint32_t desc) | |
399665d2 | 2626 | { |
60df31a2 SG |
2627 | int i, j, ofs; |
2628 | VReg temp = {}; | |
226bf881 SG |
2629 | VReg *Vd = (VReg *)vd; |
2630 | VReg *Vj = (VReg *)vj; | |
60df31a2 | 2631 | int oprsz = simd_oprsz(desc); |
399665d2 | 2632 | |
60df31a2 | 2633 | ofs = LSX_LEN / 32; |
399665d2 | 2634 | vec_clear_cause(env); |
60df31a2 SG |
2635 | for (i = 0; i < oprsz / 16; i++) { |
2636 | for (j = 0; j < ofs; j++) { | |
2637 | temp.UW(j + ofs * i) = float16_cvt_float32(Vj->UH(j + ofs * (2 * i + 1)), | |
2638 | &env->fp_status); | |
2639 | } | |
399665d2 SG |
2640 | vec_update_fcsr0(env, GETPC()); |
2641 | } | |
2642 | *Vd = temp; | |
2643 | } | |
2644 | ||
226bf881 SG |
2645 | void HELPER(vfcvth_d_s)(void *vd, void *vj, |
2646 | CPULoongArchState *env, uint32_t desc) | |
399665d2 | 2647 | { |
60df31a2 SG |
2648 | int i, j, ofs; |
2649 | VReg temp = {}; | |
226bf881 SG |
2650 | VReg *Vd = (VReg *)vd; |
2651 | VReg *Vj = (VReg *)vj; | |
60df31a2 | 2652 | int oprsz = simd_oprsz(desc); |
399665d2 | 2653 | |
60df31a2 | 2654 | ofs = LSX_LEN / 64; |
399665d2 | 2655 | vec_clear_cause(env); |
60df31a2 SG |
2656 | for (i = 0; i < oprsz / 16; i++) { |
2657 | for (j = 0; j < ofs; j++) { | |
2658 | temp.UD(j + ofs * i) = float32_cvt_float64(Vj->UW(j + ofs * (2 * i + 1)), | |
2659 | &env->fp_status); | |
2660 | } | |
399665d2 SG |
2661 | vec_update_fcsr0(env, GETPC()); |
2662 | } | |
2663 | *Vd = temp; | |
2664 | } | |
2665 | ||
3b286753 SG |
2666 | void HELPER(vfcvt_h_s)(void *vd, void *vj, void *vk, |
2667 | CPULoongArchState *env, uint32_t desc) | |
399665d2 | 2668 | { |
60df31a2 SG |
2669 | int i, j, ofs; |
2670 | VReg temp = {}; | |
3b286753 SG |
2671 | VReg *Vd = (VReg *)vd; |
2672 | VReg *Vj = (VReg *)vj; | |
2673 | VReg *Vk = (VReg *)vk; | |
60df31a2 | 2674 | int oprsz = simd_oprsz(desc); |
399665d2 | 2675 | |
60df31a2 | 2676 | ofs = LSX_LEN / 32; |
399665d2 | 2677 | vec_clear_cause(env); |
60df31a2 SG |
2678 | for(i = 0; i < oprsz / 16; i++) { |
2679 | for (j = 0; j < ofs; j++) { | |
2680 | temp.UH(j + ofs * (2 * i + 1)) = float32_cvt_float16(Vj->UW(j + ofs * i), | |
2681 | &env->fp_status); | |
2682 | temp.UH(j + ofs * 2 * i) = float32_cvt_float16(Vk->UW(j + ofs * i), | |
2683 | &env->fp_status); | |
2684 | } | |
399665d2 SG |
2685 | vec_update_fcsr0(env, GETPC()); |
2686 | } | |
2687 | *Vd = temp; | |
2688 | } | |
2689 | ||
3b286753 SG |
2690 | void HELPER(vfcvt_s_d)(void *vd, void *vj, void *vk, |
2691 | CPULoongArchState *env, uint32_t desc) | |
399665d2 | 2692 | { |
60df31a2 SG |
2693 | int i, j, ofs; |
2694 | VReg temp = {}; | |
3b286753 SG |
2695 | VReg *Vd = (VReg *)vd; |
2696 | VReg *Vj = (VReg *)vj; | |
2697 | VReg *Vk = (VReg *)vk; | |
60df31a2 | 2698 | int oprsz = simd_oprsz(desc); |
399665d2 | 2699 | |
60df31a2 | 2700 | ofs = LSX_LEN / 64; |
399665d2 | 2701 | vec_clear_cause(env); |
60df31a2 SG |
2702 | for(i = 0; i < oprsz / 16; i++) { |
2703 | for (j = 0; j < ofs; j++) { | |
2704 | temp.UW(j + ofs * (2 * i + 1)) = float64_cvt_float32(Vj->UD(j + ofs * i), | |
2705 | &env->fp_status); | |
2706 | temp.UW(j + ofs * 2 * i) = float64_cvt_float32(Vk->UD(j + ofs * i), | |
2707 | &env->fp_status); | |
2708 | } | |
399665d2 SG |
2709 | vec_update_fcsr0(env, GETPC()); |
2710 | } | |
2711 | *Vd = temp; | |
2712 | } | |
2713 | ||
226bf881 SG |
2714 | void HELPER(vfrint_s)(void *vd, void *vj, |
2715 | CPULoongArchState *env, uint32_t desc) | |
399665d2 SG |
2716 | { |
2717 | int i; | |
226bf881 SG |
2718 | VReg *Vd = (VReg *)vd; |
2719 | VReg *Vj = (VReg *)vj; | |
60df31a2 | 2720 | int oprsz = simd_oprsz(desc); |
399665d2 SG |
2721 | |
2722 | vec_clear_cause(env); | |
60df31a2 | 2723 | for (i = 0; i < oprsz / 4; i++) { |
399665d2 SG |
2724 | Vd->W(i) = float32_round_to_int(Vj->UW(i), &env->fp_status); |
2725 | vec_update_fcsr0(env, GETPC()); | |
2726 | } | |
2727 | } | |
2728 | ||
226bf881 SG |
2729 | void HELPER(vfrint_d)(void *vd, void *vj, |
2730 | CPULoongArchState *env, uint32_t desc) | |
399665d2 SG |
2731 | { |
2732 | int i; | |
226bf881 SG |
2733 | VReg *Vd = (VReg *)vd; |
2734 | VReg *Vj = (VReg *)vj; | |
60df31a2 | 2735 | int oprsz = simd_oprsz(desc); |
399665d2 SG |
2736 | |
2737 | vec_clear_cause(env); | |
60df31a2 | 2738 | for (i = 0; i < oprsz / 8; i++) { |
399665d2 SG |
2739 | Vd->D(i) = float64_round_to_int(Vj->UD(i), &env->fp_status); |
2740 | vec_update_fcsr0(env, GETPC()); | |
2741 | } | |
2742 | } | |
2743 | ||
2744 | #define FCVT_2OP(NAME, BIT, E, MODE) \ | |
226bf881 SG |
2745 | void HELPER(NAME)(void *vd, void *vj, \ |
2746 | CPULoongArchState *env, uint32_t desc) \ | |
399665d2 SG |
2747 | { \ |
2748 | int i; \ | |
226bf881 SG |
2749 | VReg *Vd = (VReg *)vd; \ |
2750 | VReg *Vj = (VReg *)vj; \ | |
60df31a2 | 2751 | int oprsz = simd_oprsz(desc); \ |
399665d2 SG |
2752 | \ |
2753 | vec_clear_cause(env); \ | |
60df31a2 | 2754 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ |
399665d2 SG |
2755 | FloatRoundMode old_mode = get_float_rounding_mode(&env->fp_status); \ |
2756 | set_float_rounding_mode(MODE, &env->fp_status); \ | |
2757 | Vd->E(i) = float## BIT ## _round_to_int(Vj->E(i), &env->fp_status); \ | |
2758 | set_float_rounding_mode(old_mode, &env->fp_status); \ | |
2759 | vec_update_fcsr0(env, GETPC()); \ | |
2760 | } \ | |
2761 | } | |
2762 | ||
2763 | FCVT_2OP(vfrintrne_s, 32, UW, float_round_nearest_even) | |
2764 | FCVT_2OP(vfrintrne_d, 64, UD, float_round_nearest_even) | |
2765 | FCVT_2OP(vfrintrz_s, 32, UW, float_round_to_zero) | |
2766 | FCVT_2OP(vfrintrz_d, 64, UD, float_round_to_zero) | |
2767 | FCVT_2OP(vfrintrp_s, 32, UW, float_round_up) | |
2768 | FCVT_2OP(vfrintrp_d, 64, UD, float_round_up) | |
2769 | FCVT_2OP(vfrintrm_s, 32, UW, float_round_down) | |
2770 | FCVT_2OP(vfrintrm_d, 64, UD, float_round_down) | |
2771 | ||
/*
 * FTINT - define static do_ftint<NAME>(), a wrapper that runs
 * do_<FMT1>_to_<FMT2>() with the rounding mode of env->fp_status temporarily
 * set to MODE.  The previous rounding mode is put back before returning.
 */
#define FTINT(NAME, FMT1, FMT2, T1, T2, MODE)                           \
static T2 do_ftint ## NAME(CPULoongArchState *env, T1 fj)               \
{                                                                       \
    float_status *st = &env->fp_status;                                 \
    const FloatRoundMode saved = get_float_rounding_mode(st);           \
    T2 res;                                                             \
                                                                        \
    set_float_rounding_mode(MODE, st);                                  \
    res = do_ ## FMT1 ## _to_ ## FMT2(env, fj);                         \
    set_float_rounding_mode(saved, st);                                 \
    return res;                                                         \
}
2783 | ||
2784 | #define DO_FTINT(FMT1, FMT2, T1, T2) \ | |
2785 | static T2 do_## FMT1 ##_to_## FMT2(CPULoongArchState *env, T1 fj) \ | |
2786 | { \ | |
2787 | T2 fd; \ | |
2788 | \ | |
2789 | fd = FMT1 ##_to_## FMT2(fj, &env->fp_status); \ | |
2790 | if (get_float_exception_flags(&env->fp_status) & (float_flag_invalid)) { \ | |
2791 | if (FMT1 ##_is_any_nan(fj)) { \ | |
2792 | fd = 0; \ | |
2793 | } \ | |
2794 | } \ | |
2795 | vec_update_fcsr0(env, GETPC()); \ | |
2796 | return fd; \ | |
2797 | } | |
2798 | ||
2799 | DO_FTINT(float32, int32, uint32_t, uint32_t) | |
2800 | DO_FTINT(float64, int64, uint64_t, uint64_t) | |
2801 | DO_FTINT(float32, uint32, uint32_t, uint32_t) | |
2802 | DO_FTINT(float64, uint64, uint64_t, uint64_t) | |
2803 | DO_FTINT(float64, int32, uint64_t, uint32_t) | |
2804 | DO_FTINT(float32, int64, uint32_t, uint64_t) | |
2805 | ||
2806 | FTINT(rne_w_s, float32, int32, uint32_t, uint32_t, float_round_nearest_even) | |
2807 | FTINT(rne_l_d, float64, int64, uint64_t, uint64_t, float_round_nearest_even) | |
2808 | FTINT(rp_w_s, float32, int32, uint32_t, uint32_t, float_round_up) | |
2809 | FTINT(rp_l_d, float64, int64, uint64_t, uint64_t, float_round_up) | |
2810 | FTINT(rz_w_s, float32, int32, uint32_t, uint32_t, float_round_to_zero) | |
2811 | FTINT(rz_l_d, float64, int64, uint64_t, uint64_t, float_round_to_zero) | |
2812 | FTINT(rm_w_s, float32, int32, uint32_t, uint32_t, float_round_down) | |
2813 | FTINT(rm_l_d, float64, int64, uint64_t, uint64_t, float_round_down) | |
2814 | ||
2815 | DO_2OP_F(vftintrne_w_s, 32, UW, do_ftintrne_w_s) | |
2816 | DO_2OP_F(vftintrne_l_d, 64, UD, do_ftintrne_l_d) | |
2817 | DO_2OP_F(vftintrp_w_s, 32, UW, do_ftintrp_w_s) | |
2818 | DO_2OP_F(vftintrp_l_d, 64, UD, do_ftintrp_l_d) | |
2819 | DO_2OP_F(vftintrz_w_s, 32, UW, do_ftintrz_w_s) | |
2820 | DO_2OP_F(vftintrz_l_d, 64, UD, do_ftintrz_l_d) | |
2821 | DO_2OP_F(vftintrm_w_s, 32, UW, do_ftintrm_w_s) | |
2822 | DO_2OP_F(vftintrm_l_d, 64, UD, do_ftintrm_l_d) | |
2823 | DO_2OP_F(vftint_w_s, 32, UW, do_float32_to_int32) | |
2824 | DO_2OP_F(vftint_l_d, 64, UD, do_float64_to_int64) | |
2825 | ||
2826 | FTINT(rz_wu_s, float32, uint32, uint32_t, uint32_t, float_round_to_zero) | |
2827 | FTINT(rz_lu_d, float64, uint64, uint64_t, uint64_t, float_round_to_zero) | |
2828 | ||
2829 | DO_2OP_F(vftintrz_wu_s, 32, UW, do_ftintrz_wu_s) | |
2830 | DO_2OP_F(vftintrz_lu_d, 64, UD, do_ftintrz_lu_d) | |
2831 | DO_2OP_F(vftint_wu_s, 32, UW, do_float32_to_uint32) | |
2832 | DO_2OP_F(vftint_lu_d, 64, UD, do_float64_to_uint64) | |
2833 | ||
2834 | FTINT(rm_w_d, float64, int32, uint64_t, uint32_t, float_round_down) | |
2835 | FTINT(rp_w_d, float64, int32, uint64_t, uint32_t, float_round_up) | |
2836 | FTINT(rz_w_d, float64, int32, uint64_t, uint32_t, float_round_to_zero) | |
2837 | FTINT(rne_w_d, float64, int32, uint64_t, uint32_t, float_round_nearest_even) | |
2838 | ||
60df31a2 SG |
2839 | #define FTINT_W_D(NAME, FN) \ |
2840 | void HELPER(NAME)(void *vd, void *vj, void *vk, \ | |
2841 | CPULoongArchState *env, uint32_t desc) \ | |
2842 | { \ | |
2843 | int i, j, ofs; \ | |
2844 | VReg temp = {}; \ | |
2845 | VReg *Vd = (VReg *)vd; \ | |
2846 | VReg *Vj = (VReg *)vj; \ | |
2847 | VReg *Vk = (VReg *)vk; \ | |
2848 | int oprsz = simd_oprsz(desc); \ | |
2849 | \ | |
2850 | ofs = LSX_LEN / 64; \ | |
2851 | vec_clear_cause(env); \ | |
2852 | for (i = 0; i < oprsz / 16; i++) { \ | |
2853 | for (j = 0; j < ofs; j++) { \ | |
2854 | temp.W(j + ofs * (2 * i + 1)) = FN(env, Vj->UD(j + ofs * i)); \ | |
2855 | temp.W(j + ofs * 2 * i) = FN(env, Vk->UD(j + ofs * i)); \ | |
2856 | } \ | |
2857 | } \ | |
2858 | *Vd = temp; \ | |
399665d2 SG |
2859 | } |
2860 | ||
2861 | FTINT_W_D(vftint_w_d, do_float64_to_int32) | |
2862 | FTINT_W_D(vftintrm_w_d, do_ftintrm_w_d) | |
2863 | FTINT_W_D(vftintrp_w_d, do_ftintrp_w_d) | |
2864 | FTINT_W_D(vftintrz_w_d, do_ftintrz_w_d) | |
2865 | FTINT_W_D(vftintrne_w_d, do_ftintrne_w_d) | |
2866 | ||
2867 | FTINT(rml_l_s, float32, int64, uint32_t, uint64_t, float_round_down) | |
2868 | FTINT(rpl_l_s, float32, int64, uint32_t, uint64_t, float_round_up) | |
2869 | FTINT(rzl_l_s, float32, int64, uint32_t, uint64_t, float_round_to_zero) | |
2870 | FTINT(rnel_l_s, float32, int64, uint32_t, uint64_t, float_round_nearest_even) | |
2871 | FTINT(rmh_l_s, float32, int64, uint32_t, uint64_t, float_round_down) | |
2872 | FTINT(rph_l_s, float32, int64, uint32_t, uint64_t, float_round_up) | |
2873 | FTINT(rzh_l_s, float32, int64, uint32_t, uint64_t, float_round_to_zero) | |
2874 | FTINT(rneh_l_s, float32, int64, uint32_t, uint64_t, float_round_nearest_even) | |
2875 | ||
60df31a2 SG |
2876 | #define FTINTL_L_S(NAME, FN) \ |
2877 | void HELPER(NAME)(void *vd, void *vj, \ | |
2878 | CPULoongArchState *env, uint32_t desc) \ | |
2879 | { \ | |
2880 | int i, j, ofs; \ | |
2881 | VReg temp; \ | |
2882 | VReg *Vd = (VReg *)vd; \ | |
2883 | VReg *Vj = (VReg *)vj; \ | |
2884 | int oprsz = simd_oprsz(desc); \ | |
2885 | \ | |
2886 | ofs = LSX_LEN / 64; \ | |
2887 | vec_clear_cause(env); \ | |
2888 | for (i = 0; i < oprsz / 16; i++) { \ | |
2889 | for (j = 0; j < ofs; j++) { \ | |
2890 | temp.D(j + ofs * i) = FN(env, Vj->UW(j + ofs * 2 * i)); \ | |
2891 | } \ | |
2892 | } \ | |
2893 | *Vd = temp; \ | |
399665d2 SG |
2894 | } |
2895 | ||
2896 | FTINTL_L_S(vftintl_l_s, do_float32_to_int64) | |
2897 | FTINTL_L_S(vftintrml_l_s, do_ftintrml_l_s) | |
2898 | FTINTL_L_S(vftintrpl_l_s, do_ftintrpl_l_s) | |
2899 | FTINTL_L_S(vftintrzl_l_s, do_ftintrzl_l_s) | |
2900 | FTINTL_L_S(vftintrnel_l_s, do_ftintrnel_l_s) | |
2901 | ||
60df31a2 SG |
2902 | #define FTINTH_L_S(NAME, FN) \ |
2903 | void HELPER(NAME)(void *vd, void *vj, \ | |
2904 | CPULoongArchState *env, uint32_t desc) \ | |
2905 | { \ | |
2906 | int i, j, ofs; \ | |
2907 | VReg temp = {}; \ | |
2908 | VReg *Vd = (VReg *)vd; \ | |
2909 | VReg *Vj = (VReg *)vj; \ | |
2910 | int oprsz = simd_oprsz(desc); \ | |
2911 | \ | |
2912 | ofs = LSX_LEN / 64; \ | |
2913 | vec_clear_cause(env); \ | |
2914 | for (i = 0; i < oprsz / 16; i++) { \ | |
2915 | for (j = 0; j < ofs; j++) { \ | |
2916 | temp.D(j + ofs * i) = FN(env, Vj->UW(j + ofs * (2 * i + 1))); \ | |
2917 | } \ | |
2918 | } \ | |
2919 | *Vd = temp; \ | |
399665d2 SG |
2920 | } |
2921 | ||
2922 | FTINTH_L_S(vftinth_l_s, do_float32_to_int64) | |
2923 | FTINTH_L_S(vftintrmh_l_s, do_ftintrmh_l_s) | |
2924 | FTINTH_L_S(vftintrph_l_s, do_ftintrph_l_s) | |
2925 | FTINTH_L_S(vftintrzh_l_s, do_ftintrzh_l_s) | |
2926 | FTINTH_L_S(vftintrneh_l_s, do_ftintrneh_l_s) | |
2927 | ||
2928 | #define FFINT(NAME, FMT1, FMT2, T1, T2) \ | |
2929 | static T2 do_ffint_ ## NAME(CPULoongArchState *env, T1 fj) \ | |
2930 | { \ | |
2931 | T2 fd; \ | |
2932 | \ | |
2933 | fd = FMT1 ##_to_## FMT2(fj, &env->fp_status); \ | |
2934 | vec_update_fcsr0(env, GETPC()); \ | |
2935 | return fd; \ | |
2936 | } | |
2937 | ||
2938 | FFINT(s_w, int32, float32, int32_t, uint32_t) | |
2939 | FFINT(d_l, int64, float64, int64_t, uint64_t) | |
2940 | FFINT(s_wu, uint32, float32, uint32_t, uint32_t) | |
2941 | FFINT(d_lu, uint64, float64, uint64_t, uint64_t) | |
2942 | ||
2943 | DO_2OP_F(vffint_s_w, 32, W, do_ffint_s_w) | |
2944 | DO_2OP_F(vffint_d_l, 64, D, do_ffint_d_l) | |
2945 | DO_2OP_F(vffint_s_wu, 32, UW, do_ffint_s_wu) | |
2946 | DO_2OP_F(vffint_d_lu, 64, UD, do_ffint_d_lu) | |
2947 | ||
226bf881 SG |
2948 | void HELPER(vffintl_d_w)(void *vd, void *vj, |
2949 | CPULoongArchState *env, uint32_t desc) | |
399665d2 | 2950 | { |
60df31a2 SG |
2951 | int i, j, ofs; |
2952 | VReg temp = {}; | |
226bf881 SG |
2953 | VReg *Vd = (VReg *)vd; |
2954 | VReg *Vj = (VReg *)vj; | |
60df31a2 | 2955 | int oprsz = simd_oprsz(desc); |
399665d2 | 2956 | |
60df31a2 | 2957 | ofs = LSX_LEN / 64; |
399665d2 | 2958 | vec_clear_cause(env); |
60df31a2 SG |
2959 | for (i = 0; i < oprsz / 16; i++) { |
2960 | for (j = 0; j < ofs; j++) { | |
2961 | temp.D(j + ofs * i) = int32_to_float64(Vj->W(j + ofs * 2 * i), | |
2962 | &env->fp_status); | |
2963 | } | |
399665d2 SG |
2964 | vec_update_fcsr0(env, GETPC()); |
2965 | } | |
2966 | *Vd = temp; | |
2967 | } | |
2968 | ||
226bf881 SG |
2969 | void HELPER(vffinth_d_w)(void *vd, void *vj, |
2970 | CPULoongArchState *env, uint32_t desc) | |
399665d2 | 2971 | { |
60df31a2 SG |
2972 | int i, j, ofs; |
2973 | VReg temp = {}; | |
226bf881 SG |
2974 | VReg *Vd = (VReg *)vd; |
2975 | VReg *Vj = (VReg *)vj; | |
60df31a2 | 2976 | int oprsz = simd_oprsz(desc); |
399665d2 | 2977 | |
60df31a2 | 2978 | ofs = LSX_LEN / 64; |
399665d2 | 2979 | vec_clear_cause(env); |
60df31a2 SG |
2980 | for (i = 0; i < oprsz /16; i++) { |
2981 | for (j = 0; j < ofs; j++) { | |
2982 | temp.D(j + ofs * i) = int32_to_float64(Vj->W(j + ofs * (2 * i + 1)), | |
2983 | &env->fp_status); | |
2984 | } | |
399665d2 SG |
2985 | vec_update_fcsr0(env, GETPC()); |
2986 | } | |
2987 | *Vd = temp; | |
2988 | } | |
2989 | ||
3b286753 SG |
2990 | void HELPER(vffint_s_l)(void *vd, void *vj, void *vk, |
2991 | CPULoongArchState *env, uint32_t desc) | |
399665d2 | 2992 | { |
60df31a2 SG |
2993 | int i, j, ofs; |
2994 | VReg temp = {}; | |
3b286753 SG |
2995 | VReg *Vd = (VReg *)vd; |
2996 | VReg *Vj = (VReg *)vj; | |
2997 | VReg *Vk = (VReg *)vk; | |
60df31a2 | 2998 | int oprsz = simd_oprsz(desc); |
399665d2 | 2999 | |
60df31a2 | 3000 | ofs = LSX_LEN / 64; |
399665d2 | 3001 | vec_clear_cause(env); |
60df31a2 SG |
3002 | for (i = 0; i < oprsz / 16; i++) { |
3003 | for (j = 0; j < ofs; j++) { | |
3004 | temp.W(j + ofs * (2 * i + 1)) = int64_to_float32(Vj->D(j + ofs * i), | |
3005 | &env->fp_status); | |
3006 | temp.W(j + ofs * 2 * i) = int64_to_float32(Vk->D(j + ofs * i), | |
3007 | &env->fp_status); | |
3008 | } | |
399665d2 SG |
3009 | vec_update_fcsr0(env, GETPC()); |
3010 | } | |
3011 | *Vd = temp; | |
3012 | } | |
f435e1e5 | 3013 | |
4da72d43 SG |
3014 | #define VCMPI(NAME, BIT, E, DO_OP) \ |
3015 | void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ | |
3016 | { \ | |
3017 | int i; \ | |
3018 | VReg *Vd = (VReg *)vd; \ | |
3019 | VReg *Vj = (VReg *)vj; \ | |
3020 | typedef __typeof(Vd->E(0)) TD; \ | |
3021 | int oprsz = simd_oprsz(desc); \ | |
3022 | \ | |
3023 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ | |
3024 | Vd->E(i) = DO_OP(Vj->E(i), (TD)imm); \ | |
3025 | } \ | |
f435e1e5 SG |
3026 | } |
3027 | ||
3028 | VCMPI(vseqi_b, 8, B, VSEQ) | |
3029 | VCMPI(vseqi_h, 16, H, VSEQ) | |
3030 | VCMPI(vseqi_w, 32, W, VSEQ) | |
3031 | VCMPI(vseqi_d, 64, D, VSEQ) | |
3032 | VCMPI(vslei_b, 8, B, VSLE) | |
3033 | VCMPI(vslei_h, 16, H, VSLE) | |
3034 | VCMPI(vslei_w, 32, W, VSLE) | |
3035 | VCMPI(vslei_d, 64, D, VSLE) | |
3036 | VCMPI(vslei_bu, 8, UB, VSLE) | |
3037 | VCMPI(vslei_hu, 16, UH, VSLE) | |
3038 | VCMPI(vslei_wu, 32, UW, VSLE) | |
3039 | VCMPI(vslei_du, 64, UD, VSLE) | |
3040 | VCMPI(vslti_b, 8, B, VSLT) | |
3041 | VCMPI(vslti_h, 16, H, VSLT) | |
3042 | VCMPI(vslti_w, 32, W, VSLT) | |
3043 | VCMPI(vslti_d, 64, D, VSLT) | |
3044 | VCMPI(vslti_bu, 8, UB, VSLT) | |
3045 | VCMPI(vslti_hu, 16, UH, VSLT) | |
3046 | VCMPI(vslti_wu, 32, UW, VSLT) | |
3047 | VCMPI(vslti_du, 64, UD, VSLT) | |
386c4e86 SG |
3048 | |
3049 | static uint64_t vfcmp_common(CPULoongArchState *env, | |
3050 | FloatRelation cmp, uint32_t flags) | |
3051 | { | |
3052 | uint64_t ret = 0; | |
3053 | ||
3054 | switch (cmp) { | |
3055 | case float_relation_less: | |
3056 | ret = (flags & FCMP_LT); | |
3057 | break; | |
3058 | case float_relation_equal: | |
3059 | ret = (flags & FCMP_EQ); | |
3060 | break; | |
3061 | case float_relation_greater: | |
3062 | ret = (flags & FCMP_GT); | |
3063 | break; | |
3064 | case float_relation_unordered: | |
3065 | ret = (flags & FCMP_UN); | |
3066 | break; | |
3067 | default: | |
3068 | g_assert_not_reached(); | |
3069 | } | |
3070 | ||
3071 | if (ret) { | |
3072 | ret = -1; | |
3073 | } | |
3074 | ||
3075 | return ret; | |
3076 | } | |
3077 | ||
3078 | #define VFCMP(NAME, BIT, E, FN) \ | |
3eeda5fe | 3079 | void HELPER(NAME)(CPULoongArchState *env, uint32_t oprsz, \ |
386c4e86 SG |
3080 | uint32_t vd, uint32_t vj, uint32_t vk, uint32_t flags) \ |
3081 | { \ | |
3082 | int i; \ | |
3083 | VReg t; \ | |
3084 | VReg *Vd = &(env->fpr[vd].vreg); \ | |
3085 | VReg *Vj = &(env->fpr[vj].vreg); \ | |
3086 | VReg *Vk = &(env->fpr[vk].vreg); \ | |
3087 | \ | |
3088 | vec_clear_cause(env); \ | |
3eeda5fe | 3089 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ |
386c4e86 SG |
3090 | FloatRelation cmp; \ |
3091 | cmp = FN(Vj->E(i), Vk->E(i), &env->fp_status); \ | |
3092 | t.E(i) = vfcmp_common(env, cmp, flags); \ | |
3093 | vec_update_fcsr0(env, GETPC()); \ | |
3094 | } \ | |
3095 | *Vd = t; \ | |
3096 | } | |
3097 | ||
3098 | VFCMP(vfcmp_c_s, 32, UW, float32_compare_quiet) | |
3099 | VFCMP(vfcmp_s_s, 32, UW, float32_compare) | |
3100 | VFCMP(vfcmp_c_d, 64, UD, float64_compare_quiet) | |
3101 | VFCMP(vfcmp_s_d, 64, UD, float64_compare) | |
d0dfa19a | 3102 | |
f3dfcc8b | 3103 | void HELPER(vbitseli_b)(void *vd, void *vj, uint64_t imm, uint32_t desc) |
d0dfa19a SG |
3104 | { |
3105 | int i; | |
3106 | VReg *Vd = (VReg *)vd; | |
3107 | VReg *Vj = (VReg *)vj; | |
3108 | ||
f3dfcc8b | 3109 | for (i = 0; i < simd_oprsz(desc); i++) { |
d0dfa19a SG |
3110 | Vd->B(i) = (~Vd->B(i) & Vj->B(i)) | (Vd->B(i) & imm); |
3111 | } | |
3112 | } | |
3113 | ||
3114 | /* Copy from target/arm/tcg/sve_helper.c */ | |
3115 | static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz) | |
3116 | { | |
f3dfcc8b | 3117 | int bits = 8 << esz; |
d0dfa19a SG |
3118 | uint64_t ones = dup_const(esz, 1); |
3119 | uint64_t signs = ones << (bits - 1); | |
3120 | uint64_t cmp0, cmp1; | |
3121 | ||
3122 | cmp1 = dup_const(esz, n); | |
3123 | cmp0 = cmp1 ^ m0; | |
3124 | cmp1 = cmp1 ^ m1; | |
3125 | cmp0 = (cmp0 - ones) & ~cmp0; | |
3126 | cmp1 = (cmp1 - ones) & ~cmp1; | |
3127 | return (cmp0 | cmp1) & signs; | |
3128 | } | |
3129 | ||
f3dfcc8b SG |
3130 | #define SETANYEQZ(NAME, MO) \ |
3131 | void HELPER(NAME)(CPULoongArchState *env, \ | |
3132 | uint32_t oprsz, uint32_t cd, uint32_t vj) \ | |
3133 | { \ | |
3134 | VReg *Vj = &(env->fpr[vj].vreg); \ | |
3135 | \ | |
3136 | env->cf[cd & 0x7] = do_match2(0, Vj->D(0), Vj->D(1), MO); \ | |
3137 | if (oprsz == 32) { \ | |
3138 | env->cf[cd & 0x7] = env->cf[cd & 0x7] || \ | |
3139 | do_match2(0, Vj->D(2), Vj->D(3), MO); \ | |
3140 | } \ | |
d0dfa19a | 3141 | } |
f3dfcc8b | 3142 | |
d0dfa19a SG |
3143 | SETANYEQZ(vsetanyeqz_b, MO_8) |
3144 | SETANYEQZ(vsetanyeqz_h, MO_16) | |
3145 | SETANYEQZ(vsetanyeqz_w, MO_32) | |
3146 | SETANYEQZ(vsetanyeqz_d, MO_64) | |
3147 | ||
f3dfcc8b SG |
3148 | #define SETALLNEZ(NAME, MO) \ |
3149 | void HELPER(NAME)(CPULoongArchState *env, \ | |
3150 | uint32_t oprsz, uint32_t cd, uint32_t vj) \ | |
3151 | { \ | |
3152 | VReg *Vj = &(env->fpr[vj].vreg); \ | |
3153 | \ | |
3154 | env->cf[cd & 0x7]= !do_match2(0, Vj->D(0), Vj->D(1), MO); \ | |
3155 | if (oprsz == 32) { \ | |
3156 | env->cf[cd & 0x7] = env->cf[cd & 0x7] && \ | |
3157 | !do_match2(0, Vj->D(2), Vj->D(3), MO); \ | |
3158 | } \ | |
d0dfa19a | 3159 | } |
f3dfcc8b | 3160 | |
d0dfa19a SG |
3161 | SETALLNEZ(vsetallnez_b, MO_8) |
3162 | SETALLNEZ(vsetallnez_h, MO_16) | |
3163 | SETALLNEZ(vsetallnez_w, MO_32) | |
3164 | SETALLNEZ(vsetallnez_d, MO_64) | |
d5e5563c | 3165 | |
df97f338 SG |
3166 | #define XVINSVE0(NAME, E, MASK) \ |
3167 | void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ | |
3168 | { \ | |
3169 | VReg *Vd = (VReg *)vd; \ | |
3170 | VReg *Vj = (VReg *)vj; \ | |
3171 | Vd->E(imm & MASK) = Vj->E(0); \ | |
3172 | } | |
3173 | ||
3174 | XVINSVE0(xvinsve0_w, W, 0x7) | |
3175 | XVINSVE0(xvinsve0_d, D, 0x3) | |
3176 | ||
3177 | #define XVPICKVE(NAME, E, BIT, MASK) \ | |
3178 | void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ | |
3179 | { \ | |
3180 | int i; \ | |
3181 | VReg *Vd = (VReg *)vd; \ | |
3182 | VReg *Vj = (VReg *)vj; \ | |
3183 | int oprsz = simd_oprsz(desc); \ | |
3184 | \ | |
3185 | Vd->E(0) = Vj->E(imm & MASK); \ | |
3186 | for (i = 1; i < oprsz / (BIT / 8); i++) { \ | |
3187 | Vd->E(i) = 0; \ | |
3188 | } \ | |
3189 | } | |
3190 | ||
3191 | XVPICKVE(xvpickve_w, W, 32, 0x7) | |
3192 | XVPICKVE(xvpickve_d, D, 64, 0x3) | |
3193 | ||
04711da1 SG |
3194 | #define VPACKEV(NAME, BIT, E) \ |
3195 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
3196 | { \ | |
3197 | int i; \ | |
ad292148 | 3198 | VReg temp = {}; \ |
04711da1 SG |
3199 | VReg *Vd = (VReg *)vd; \ |
3200 | VReg *Vj = (VReg *)vj; \ | |
3201 | VReg *Vk = (VReg *)vk; \ | |
ad292148 | 3202 | int oprsz = simd_oprsz(desc); \ |
04711da1 | 3203 | \ |
ad292148 | 3204 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ |
04711da1 SG |
3205 | temp.E(2 * i + 1) = Vj->E(2 * i); \ |
3206 | temp.E(2 *i) = Vk->E(2 * i); \ | |
3207 | } \ | |
3208 | *Vd = temp; \ | |
d5e5563c SG |
3209 | } |
3210 | ||
3211 | VPACKEV(vpackev_b, 16, B) | |
3212 | VPACKEV(vpackev_h, 32, H) | |
3213 | VPACKEV(vpackev_w, 64, W) | |
3214 | VPACKEV(vpackev_d, 128, D) | |
3215 | ||
04711da1 SG |
3216 | #define VPACKOD(NAME, BIT, E) \ |
3217 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
3218 | { \ | |
3219 | int i; \ | |
ad292148 | 3220 | VReg temp = {}; \ |
04711da1 SG |
3221 | VReg *Vd = (VReg *)vd; \ |
3222 | VReg *Vj = (VReg *)vj; \ | |
3223 | VReg *Vk = (VReg *)vk; \ | |
ad292148 | 3224 | int oprsz = simd_oprsz(desc); \ |
04711da1 | 3225 | \ |
ad292148 | 3226 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ |
04711da1 SG |
3227 | temp.E(2 * i + 1) = Vj->E(2 * i + 1); \ |
3228 | temp.E(2 * i) = Vk->E(2 * i + 1); \ | |
3229 | } \ | |
3230 | *Vd = temp; \ | |
d5e5563c SG |
3231 | } |
3232 | ||
3233 | VPACKOD(vpackod_b, 16, B) | |
3234 | VPACKOD(vpackod_h, 32, H) | |
3235 | VPACKOD(vpackod_w, 64, W) | |
3236 | VPACKOD(vpackod_d, 128, D) | |
3237 | ||
ad292148 SG |
3238 | #define VPICKEV(NAME, BIT, E) \ |
3239 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
3240 | { \ | |
3241 | int i, j, ofs; \ | |
3242 | VReg temp = {}; \ | |
3243 | VReg *Vd = (VReg *)vd; \ | |
3244 | VReg *Vj = (VReg *)vj; \ | |
3245 | VReg *Vk = (VReg *)vk; \ | |
3246 | int oprsz = simd_oprsz(desc); \ | |
3247 | \ | |
3248 | ofs = LSX_LEN / BIT; \ | |
3249 | for (i = 0; i < oprsz / 16; i++) { \ | |
3250 | for (j = 0; j < ofs; j++) { \ | |
3251 | temp.E(j + ofs * (2 * i + 1)) = Vj->E(2 * (j + ofs * i)); \ | |
3252 | temp.E(j + ofs * 2 * i) = Vk->E(2 * (j + ofs * i)); \ | |
3253 | } \ | |
3254 | } \ | |
3255 | *Vd = temp; \ | |
d5e5563c SG |
3256 | } |
3257 | ||
3258 | VPICKEV(vpickev_b, 16, B) | |
3259 | VPICKEV(vpickev_h, 32, H) | |
3260 | VPICKEV(vpickev_w, 64, W) | |
3261 | VPICKEV(vpickev_d, 128, D) | |
3262 | ||
ad292148 SG |
3263 | #define VPICKOD(NAME, BIT, E) \ |
3264 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
3265 | { \ | |
3266 | int i, j, ofs; \ | |
3267 | VReg temp = {}; \ | |
3268 | VReg *Vd = (VReg *)vd; \ | |
3269 | VReg *Vj = (VReg *)vj; \ | |
3270 | VReg *Vk = (VReg *)vk; \ | |
3271 | int oprsz = simd_oprsz(desc); \ | |
3272 | \ | |
3273 | ofs = LSX_LEN / BIT; \ | |
3274 | for (i = 0; i < oprsz / 16; i++) { \ | |
3275 | for (j = 0; j < ofs; j++) { \ | |
3276 | temp.E(j + ofs * (2 * i + 1)) = Vj->E(2 * (j + ofs * i) + 1); \ | |
3277 | temp.E(j + ofs * 2 * i) = Vk->E(2 * (j + ofs * i) + 1); \ | |
3278 | } \ | |
3279 | } \ | |
3280 | *Vd = temp; \ | |
d5e5563c SG |
3281 | } |
3282 | ||
3283 | VPICKOD(vpickod_b, 16, B) | |
3284 | VPICKOD(vpickod_h, 32, H) | |
3285 | VPICKOD(vpickod_w, 64, W) | |
3286 | VPICKOD(vpickod_d, 128, D) | |
e93dd431 | 3287 | |
ad292148 SG |
3288 | #define VILVL(NAME, BIT, E) \ |
3289 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
3290 | { \ | |
3291 | int i, j, ofs; \ | |
3292 | VReg temp = {}; \ | |
3293 | VReg *Vd = (VReg *)vd; \ | |
3294 | VReg *Vj = (VReg *)vj; \ | |
3295 | VReg *Vk = (VReg *)vk; \ | |
3296 | int oprsz = simd_oprsz(desc); \ | |
3297 | \ | |
3298 | ofs = LSX_LEN / BIT; \ | |
3299 | for (i = 0; i < oprsz / 16; i++) { \ | |
3300 | for (j = 0; j < ofs; j++) { \ | |
3301 | temp.E(2 * (j + ofs * i) + 1) = Vj->E(j + ofs * 2 * i); \ | |
3302 | temp.E(2 * (j + ofs * i)) = Vk->E(j + ofs * 2 * i); \ | |
3303 | } \ | |
3304 | } \ | |
3305 | *Vd = temp; \ | |
e93dd431 SG |
3306 | } |
3307 | ||
3308 | VILVL(vilvl_b, 16, B) | |
3309 | VILVL(vilvl_h, 32, H) | |
3310 | VILVL(vilvl_w, 64, W) | |
3311 | VILVL(vilvl_d, 128, D) | |
3312 | ||
ad292148 SG |
3313 | #define VILVH(NAME, BIT, E) \ |
3314 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
3315 | { \ | |
3316 | int i, j, ofs; \ | |
3317 | VReg temp = {}; \ | |
3318 | VReg *Vd = (VReg *)vd; \ | |
3319 | VReg *Vj = (VReg *)vj; \ | |
3320 | VReg *Vk = (VReg *)vk; \ | |
3321 | int oprsz = simd_oprsz(desc); \ | |
3322 | \ | |
3323 | ofs = LSX_LEN / BIT; \ | |
3324 | for (i = 0; i < oprsz / 16; i++) { \ | |
3325 | for (j = 0; j < ofs; j++) { \ | |
3326 | temp.E(2 * (j + ofs * i) + 1) = Vj->E(j + ofs * (2 * i + 1)); \ | |
3327 | temp.E(2 * (j + ofs * i)) = Vk->E(j + ofs * (2 * i + 1)); \ | |
3328 | } \ | |
3329 | } \ | |
3330 | *Vd = temp; \ | |
e93dd431 SG |
3331 | } |
3332 | ||
3333 | VILVH(vilvh_b, 16, B) | |
3334 | VILVH(vilvh_h, 32, H) | |
3335 | VILVH(vilvh_w, 64, W) | |
3336 | VILVH(vilvh_d, 128, D) | |
3337 | ||
eb48ab22 | 3338 | void HELPER(vshuf_b)(void *vd, void *vj, void *vk, void *va, uint32_t desc) |
e93dd431 | 3339 | { |
513e88a2 SG |
3340 | int i, j, m; |
3341 | VReg temp = {}; | |
eb48ab22 SG |
3342 | VReg *Vd = (VReg *)vd; |
3343 | VReg *Vj = (VReg *)vj; | |
3344 | VReg *Vk = (VReg *)vk; | |
3345 | VReg *Va = (VReg *)va; | |
513e88a2 | 3346 | int oprsz = simd_oprsz(desc); |
e93dd431 | 3347 | |
513e88a2 SG |
3348 | m = LSX_LEN / 8; |
3349 | for (i = 0; i < (oprsz / 16) * m; i++) { | |
3350 | j = i < m ? 0 : 1; | |
e93dd431 | 3351 | uint64_t k = (uint8_t)Va->B(i) % (2 * m); |
513e88a2 | 3352 | temp.B(i) = k < m ? Vk->B(k + j * m): Vj->B(k + (j - 1) * m); |
e93dd431 SG |
3353 | } |
3354 | *Vd = temp; | |
3355 | } | |
3356 | ||
513e88a2 SG |
3357 | #define VSHUF(NAME, BIT, E) \ |
3358 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
3359 | { \ | |
3360 | int i, j, m; \ | |
3361 | VReg temp = {}; \ | |
3362 | VReg *Vd = (VReg *)vd; \ | |
3363 | VReg *Vj = (VReg *)vj; \ | |
3364 | VReg *Vk = (VReg *)vk; \ | |
3365 | int oprsz = simd_oprsz(desc); \ | |
3366 | \ | |
3367 | m = LSX_LEN / BIT; \ | |
3368 | for (i = 0; i < (oprsz / 16) * m; i++) { \ | |
3369 | j = i < m ? 0 : 1; \ | |
3370 | uint64_t k = ((uint8_t)Vd->E(i)) % (2 * m); \ | |
3371 | temp.E(i) = k < m ? Vk->E(k + j * m) : Vj->E(k + (j - 1) * m); \ | |
3372 | } \ | |
3373 | *Vd = temp; \ | |
e93dd431 SG |
3374 | } |
3375 | ||
3376 | VSHUF(vshuf_h, 16, H) | |
3377 | VSHUF(vshuf_w, 32, W) | |
3378 | VSHUF(vshuf_d, 64, D) | |
3379 | ||
513e88a2 SG |
3380 | #define VSHUF4I(NAME, BIT, E) \ |
3381 | void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ | |
3382 | { \ | |
3383 | int i, j, max; \ | |
3384 | VReg temp = {}; \ | |
3385 | VReg *Vd = (VReg *)vd; \ | |
3386 | VReg *Vj = (VReg *)vj; \ | |
3387 | int oprsz = simd_oprsz(desc); \ | |
3388 | \ | |
3389 | max = LSX_LEN / BIT; \ | |
3390 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ | |
3391 | j = i < max ? 1 : 2; \ | |
3392 | temp.E(i) = Vj->E(SHF_POS(i - ((j -1)* max), imm) + (j - 1) * max); \ | |
3393 | } \ | |
3394 | *Vd = temp; \ | |
e93dd431 SG |
3395 | } |
3396 | ||
3397 | VSHUF4I(vshuf4i_b, 8, B) | |
3398 | VSHUF4I(vshuf4i_h, 16, H) | |
3399 | VSHUF4I(vshuf4i_w, 32, W) | |
3400 | ||
329517d5 | 3401 | void HELPER(vshuf4i_d)(void *vd, void *vj, uint64_t imm, uint32_t desc) |
e93dd431 | 3402 | { |
513e88a2 SG |
3403 | int i; |
3404 | VReg temp = {}; | |
329517d5 SG |
3405 | VReg *Vd = (VReg *)vd; |
3406 | VReg *Vj = (VReg *)vj; | |
513e88a2 | 3407 | int oprsz = simd_oprsz(desc); |
e93dd431 | 3408 | |
513e88a2 SG |
3409 | for (i = 0; i < oprsz / 16; i++) { |
3410 | temp.D(2 * i) = (imm & 2 ? Vj : Vd)->D((imm & 1) + 2 * i); | |
3411 | temp.D(2 * i + 1) = (imm & 8 ? Vj : Vd)->D(((imm >> 2) & 1) + 2 * i); | |
3412 | } | |
3413 | *Vd = temp; | |
3414 | } | |
3415 | ||
3416 | void HELPER(vperm_w)(void *vd, void *vj, void *vk, uint32_t desc) | |
3417 | { | |
3418 | int i, m; | |
3419 | VReg temp = {}; | |
3420 | VReg *Vd = (VReg *)vd; | |
3421 | VReg *Vj = (VReg *)vj; | |
3422 | VReg *Vk = (VReg *)vk; | |
3423 | ||
3424 | m = LASX_LEN / 32; | |
3425 | for (i = 0; i < m ; i++) { | |
3426 | uint64_t k = (uint8_t)Vk->W(i) % 8; | |
3427 | temp.W(i) = Vj->W(k); | |
3428 | } | |
e93dd431 SG |
3429 | *Vd = temp; |
3430 | } | |
3431 | ||
329517d5 | 3432 | void HELPER(vpermi_w)(void *vd, void *vj, uint64_t imm, uint32_t desc) |
e93dd431 | 3433 | { |
513e88a2 SG |
3434 | int i; |
3435 | VReg temp = {}; | |
3436 | VReg *Vd = (VReg *)vd; | |
3437 | VReg *Vj = (VReg *)vj; | |
3438 | int oprsz = simd_oprsz(desc); | |
3439 | ||
3440 | for (i = 0; i < oprsz / 16; i++) { | |
3441 | temp.W(4 * i) = Vj->W((imm & 0x3) + 4 * i); | |
3442 | temp.W(4 * i + 1) = Vj->W(((imm >> 2) & 0x3) + 4 * i); | |
3443 | temp.W(4 * i + 2) = Vd->W(((imm >> 4) & 0x3) + 4 * i); | |
3444 | temp.W(4 * i + 3) = Vd->W(((imm >> 6) & 0x3) + 4 * i); | |
3445 | } | |
3446 | *Vd = temp; | |
3447 | } | |
3448 | ||
3449 | void HELPER(vpermi_d)(void *vd, void *vj, uint64_t imm, uint32_t desc) | |
3450 | { | |
3451 | VReg temp = {}; | |
3452 | VReg *Vd = (VReg *)vd; | |
3453 | VReg *Vj = (VReg *)vj; | |
3454 | ||
3455 | temp.D(0) = Vj->D(imm & 0x3); | |
3456 | temp.D(1) = Vj->D((imm >> 2) & 0x3); | |
3457 | temp.D(2) = Vj->D((imm >> 4) & 0x3); | |
3458 | temp.D(3) = Vj->D((imm >> 6) & 0x3); | |
3459 | *Vd = temp; | |
3460 | } | |
3461 | ||
3462 | void HELPER(vpermi_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) | |
3463 | { | |
3464 | int i; | |
e93dd431 | 3465 | VReg temp; |
329517d5 SG |
3466 | VReg *Vd = (VReg *)vd; |
3467 | VReg *Vj = (VReg *)vj; | |
e93dd431 | 3468 | |
513e88a2 SG |
3469 | for (i = 0; i < 2; i++, imm >>= 4) { |
3470 | temp.Q(i) = (imm & 2 ? Vd: Vj)->Q(imm & 1); | |
3471 | } | |
e93dd431 SG |
3472 | *Vd = temp; |
3473 | } | |
3474 | ||
329517d5 SG |
3475 | #define VEXTRINS(NAME, BIT, E, MASK) \ |
3476 | void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ | |
3477 | { \ | |
513e88a2 | 3478 | int i, ins, extr, max; \ |
329517d5 SG |
3479 | VReg *Vd = (VReg *)vd; \ |
3480 | VReg *Vj = (VReg *)vj; \ | |
513e88a2 | 3481 | int oprsz = simd_oprsz(desc); \ |
329517d5 | 3482 | \ |
513e88a2 | 3483 | max = LSX_LEN / BIT; \ |
329517d5 SG |
3484 | ins = (imm >> 4) & MASK; \ |
3485 | extr = imm & MASK; \ | |
513e88a2 SG |
3486 | for (i = 0; i < oprsz / 16; i++) { \ |
3487 | Vd->E(ins + i * max) = Vj->E(extr + i * max); \ | |
3488 | } \ | |
e93dd431 SG |
3489 | } |
3490 | ||
3491 | VEXTRINS(vextrins_b, 8, B, 0xf) | |
3492 | VEXTRINS(vextrins_h, 16, H, 0x7) | |
3493 | VEXTRINS(vextrins_w, 32, W, 0x3) | |
3494 | VEXTRINS(vextrins_d, 64, D, 0x1) |