]>
Commit | Line | Data |
---|---|---|
a0c9400a SG |
1 | /* SPDX-License-Identifier: GPL-2.0-or-later */ |
2 | /* | |
1dc33f26 | 3 | * QEMU LoongArch vector helper functions. |
a0c9400a SG |
4 | * |
5 | * Copyright (c) 2022-2023 Loongson Technology Corporation Limited | |
6 | */ | |
c037fbc9 SG |
7 | |
8 | #include "qemu/osdep.h" | |
9 | #include "cpu.h" | |
10 | #include "exec/exec-all.h" | |
11 | #include "exec/helper-proto.h" | |
aca67472 SG |
12 | #include "fpu/softfloat.h" |
13 | #include "internals.h" | |
d0dfa19a | 14 | #include "tcg/tcg.h" |
008a3b16 | 15 | #include "vec.h" |
64cf6b99 | 16 | #include "tcg/tcg-gvec-desc.h" |
c037fbc9 SG |
17 | |
/*
 * Element-wise add / subtract, passed as DO_OP to the helper-generating
 * macros.  Every argument is parenthesized so that operands containing
 * lower-precedence operators (e.g. "x - 1" or "x & y") expand correctly.
 * Each argument is evaluated exactly once.
 */
#define DO_ADD(a, b)  ((a) + (b))
#define DO_SUB(a, b)  ((a) - (b))
20 | ||
/*
 * DO_ODD_EVEN: generate HELPER(NAME), a widening pairwise operation.
 *
 * For each BIT-wide destination element i (accessor E1), source element
 * 2 * i + 1 of Vj and source element 2 * i of Vk (accessor E2) are
 * converted to the destination element type and combined with DO_OP.
 * The element count is simd_oprsz(desc) divided by the destination
 * element size in bytes.
 */
#define DO_ODD_EVEN(NAME, BIT, E1, E2, DO_OP)                          \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)         \
{                                                                      \
    VReg *dst = vd;                                                    \
    VReg *src_j = vj;                                                  \
    VReg *src_k = vk;                                                  \
    typedef __typeof(dst->E1(0)) TWide;                                \
    const int nelem = simd_oprsz(desc) / (BIT / 8);                    \
                                                                       \
    for (int idx = 0; idx < nelem; idx++) {                            \
        TWide odd_j = (TWide)src_j->E2(2 * idx + 1);                   \
        TWide even_k = (TWide)src_k->E2(2 * idx);                      \
                                                                       \
        dst->E1(idx) = DO_OP(odd_j, even_k);                           \
    }                                                                  \
}
35 | ||
36 | DO_ODD_EVEN(vhaddw_h_b, 16, H, B, DO_ADD) | |
37 | DO_ODD_EVEN(vhaddw_w_h, 32, W, H, DO_ADD) | |
38 | DO_ODD_EVEN(vhaddw_d_w, 64, D, W, DO_ADD) | |
39 | ||
04711da1 | 40 | void HELPER(vhaddw_q_d)(void *vd, void *vj, void *vk, uint32_t desc) |
c037fbc9 | 41 | { |
64cf6b99 | 42 | int i; |
04711da1 SG |
43 | VReg *Vd = (VReg *)vd; |
44 | VReg *Vj = (VReg *)vj; | |
45 | VReg *Vk = (VReg *)vk; | |
64cf6b99 | 46 | int oprsz = simd_oprsz(desc); |
c037fbc9 | 47 | |
64cf6b99 SG |
48 | for (i = 0; i < oprsz / 16 ; i++) { |
49 | Vd->Q(i) = int128_add(int128_makes64(Vj->D(2 * i + 1)), | |
50 | int128_makes64(Vk->D(2 * i))); | |
51 | } | |
c037fbc9 SG |
52 | } |
53 | ||
54 | DO_ODD_EVEN(vhsubw_h_b, 16, H, B, DO_SUB) | |
55 | DO_ODD_EVEN(vhsubw_w_h, 32, W, H, DO_SUB) | |
56 | DO_ODD_EVEN(vhsubw_d_w, 64, D, W, DO_SUB) | |
57 | ||
04711da1 | 58 | void HELPER(vhsubw_q_d)(void *vd, void *vj, void *vk, uint32_t desc) |
c037fbc9 | 59 | { |
64cf6b99 | 60 | int i; |
04711da1 SG |
61 | VReg *Vd = (VReg *)vd; |
62 | VReg *Vj = (VReg *)vj; | |
63 | VReg *Vk = (VReg *)vk; | |
64cf6b99 | 64 | int oprsz = simd_oprsz(desc); |
c037fbc9 | 65 | |
64cf6b99 SG |
66 | for (i = 0; i < oprsz / 16; i++) { |
67 | Vd->Q(i) = int128_sub(int128_makes64(Vj->D(2 * i + 1)), | |
68 | int128_makes64(Vk->D(2 * i))); | |
69 | } | |
c037fbc9 SG |
70 | } |
71 | ||
72 | DO_ODD_EVEN(vhaddw_hu_bu, 16, UH, UB, DO_ADD) | |
73 | DO_ODD_EVEN(vhaddw_wu_hu, 32, UW, UH, DO_ADD) | |
74 | DO_ODD_EVEN(vhaddw_du_wu, 64, UD, UW, DO_ADD) | |
75 | ||
04711da1 | 76 | void HELPER(vhaddw_qu_du)(void *vd, void *vj, void *vk, uint32_t desc) |
c037fbc9 | 77 | { |
64cf6b99 | 78 | int i; |
04711da1 SG |
79 | VReg *Vd = (VReg *)vd; |
80 | VReg *Vj = (VReg *)vj; | |
81 | VReg *Vk = (VReg *)vk; | |
64cf6b99 | 82 | int oprsz = simd_oprsz(desc); |
c037fbc9 | 83 | |
64cf6b99 SG |
84 | for (i = 0; i < oprsz / 16; i ++) { |
85 | Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i + 1)), | |
86 | int128_make64(Vk->UD(2 * i))); | |
87 | } | |
c037fbc9 SG |
88 | } |
89 | ||
90 | DO_ODD_EVEN(vhsubw_hu_bu, 16, UH, UB, DO_SUB) | |
91 | DO_ODD_EVEN(vhsubw_wu_hu, 32, UW, UH, DO_SUB) | |
92 | DO_ODD_EVEN(vhsubw_du_wu, 64, UD, UW, DO_SUB) | |
93 | ||
04711da1 | 94 | void HELPER(vhsubw_qu_du)(void *vd, void *vj, void *vk, uint32_t desc) |
c037fbc9 | 95 | { |
64cf6b99 | 96 | int i; |
04711da1 SG |
97 | VReg *Vd = (VReg *)vd; |
98 | VReg *Vj = (VReg *)vj; | |
99 | VReg *Vk = (VReg *)vk; | |
64cf6b99 | 100 | int oprsz = simd_oprsz(desc); |
c037fbc9 | 101 | |
64cf6b99 SG |
102 | for (i = 0; i < oprsz / 16; i++) { |
103 | Vd->Q(i) = int128_sub(int128_make64(Vj->UD(2 * i + 1)), | |
104 | int128_make64(Vk->UD(2 * i))); | |
105 | } | |
c037fbc9 | 106 | } |
2d5f950c SG |
107 | |
/*
 * DO_EVEN: generate HELPER(NAME), a widening operation on even elements.
 *
 * For each BIT-wide destination element i (accessor E1), source element
 * 2 * i of both Vj and Vk (accessor E2) is converted to the destination
 * element type and the pair is combined with DO_OP.
 */
#define DO_EVEN(NAME, BIT, E1, E2, DO_OP)                              \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)         \
{                                                                      \
    VReg *dst = vd;                                                    \
    VReg *src_j = vj;                                                  \
    VReg *src_k = vk;                                                  \
    typedef __typeof(dst->E1(0)) TWide;                                \
    const int nelem = simd_oprsz(desc) / (BIT / 8);                    \
                                                                       \
    for (int idx = 0; idx < nelem; idx++) {                            \
        TWide lhs = (TWide)src_j->E2(2 * idx);                         \
        TWide rhs = (TWide)src_k->E2(2 * idx);                         \
                                                                       \
        dst->E1(idx) = DO_OP(lhs, rhs);                                \
    }                                                                  \
}
122 | ||
/*
 * DO_ODD: generate HELPER(NAME), a widening operation on odd elements.
 *
 * For each BIT-wide destination element i (accessor E1), source element
 * 2 * i + 1 of both Vj and Vk (accessor E2) is converted to the
 * destination element type and the pair is combined with DO_OP.
 */
#define DO_ODD(NAME, BIT, E1, E2, DO_OP)                               \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)         \
{                                                                      \
    VReg *dst = vd;                                                    \
    VReg *src_j = vj;                                                  \
    VReg *src_k = vk;                                                  \
    typedef __typeof(dst->E1(0)) TWide;                                \
    const int nelem = simd_oprsz(desc) / (BIT / 8);                    \
                                                                       \
    for (int idx = 0; idx < nelem; idx++) {                            \
        TWide lhs = (TWide)src_j->E2(2 * idx + 1);                     \
        TWide rhs = (TWide)src_k->E2(2 * idx + 1);                     \
                                                                       \
        dst->E1(idx) = DO_OP(lhs, rhs);                                \
    }                                                                  \
}
137 | ||
85995f07 | 138 | void HELPER(vaddwev_q_d)(void *vd, void *vj, void *vk, uint32_t desc) |
2d5f950c | 139 | { |
85995f07 | 140 | int i; |
2d5f950c SG |
141 | VReg *Vd = (VReg *)vd; |
142 | VReg *Vj = (VReg *)vj; | |
143 | VReg *Vk = (VReg *)vk; | |
85995f07 | 144 | int oprsz = simd_oprsz(desc); |
2d5f950c | 145 | |
85995f07 SG |
146 | for (i = 0; i < oprsz / 16; i++) { |
147 | Vd->Q(i) = int128_add(int128_makes64(Vj->D(2 * i)), | |
148 | int128_makes64(Vk->D(2 * i))); | |
149 | } | |
2d5f950c SG |
150 | } |
151 | ||
152 | DO_EVEN(vaddwev_h_b, 16, H, B, DO_ADD) | |
153 | DO_EVEN(vaddwev_w_h, 32, W, H, DO_ADD) | |
154 | DO_EVEN(vaddwev_d_w, 64, D, W, DO_ADD) | |
155 | ||
85995f07 | 156 | void HELPER(vaddwod_q_d)(void *vd, void *vj, void *vk, uint32_t desc) |
2d5f950c | 157 | { |
85995f07 | 158 | int i; |
2d5f950c SG |
159 | VReg *Vd = (VReg *)vd; |
160 | VReg *Vj = (VReg *)vj; | |
161 | VReg *Vk = (VReg *)vk; | |
85995f07 | 162 | int oprsz = simd_oprsz(desc); |
2d5f950c | 163 | |
85995f07 SG |
164 | for (i = 0; i < oprsz / 16; i++) { |
165 | Vd->Q(i) = int128_add(int128_makes64(Vj->D(2 * i +1)), | |
166 | int128_makes64(Vk->D(2 * i +1))); | |
167 | } | |
2d5f950c SG |
168 | } |
169 | ||
170 | DO_ODD(vaddwod_h_b, 16, H, B, DO_ADD) | |
171 | DO_ODD(vaddwod_w_h, 32, W, H, DO_ADD) | |
172 | DO_ODD(vaddwod_d_w, 64, D, W, DO_ADD) | |
173 | ||
85995f07 | 174 | void HELPER(vsubwev_q_d)(void *vd, void *vj, void *vk, uint32_t desc) |
2d5f950c | 175 | { |
85995f07 | 176 | int i; |
2d5f950c SG |
177 | VReg *Vd = (VReg *)vd; |
178 | VReg *Vj = (VReg *)vj; | |
179 | VReg *Vk = (VReg *)vk; | |
85995f07 | 180 | int oprsz = simd_oprsz(desc); |
2d5f950c | 181 | |
85995f07 SG |
182 | for (i = 0; i < oprsz / 16; i++) { |
183 | Vd->Q(i) = int128_sub(int128_makes64(Vj->D(2 * i)), | |
184 | int128_makes64(Vk->D(2 * i))); | |
185 | } | |
2d5f950c SG |
186 | } |
187 | ||
188 | DO_EVEN(vsubwev_h_b, 16, H, B, DO_SUB) | |
189 | DO_EVEN(vsubwev_w_h, 32, W, H, DO_SUB) | |
190 | DO_EVEN(vsubwev_d_w, 64, D, W, DO_SUB) | |
191 | ||
85995f07 | 192 | void HELPER(vsubwod_q_d)(void *vd, void *vj, void *vk, uint32_t desc) |
2d5f950c | 193 | { |
85995f07 | 194 | int i; |
2d5f950c SG |
195 | VReg *Vd = (VReg *)vd; |
196 | VReg *Vj = (VReg *)vj; | |
197 | VReg *Vk = (VReg *)vk; | |
85995f07 | 198 | int oprsz = simd_oprsz(desc); |
2d5f950c | 199 | |
85995f07 SG |
200 | for (i = 0; i < oprsz / 16; i++) { |
201 | Vd->Q(i) = int128_sub(int128_makes64(Vj->D(2 * i + 1)), | |
202 | int128_makes64(Vk->D(2 * i + 1))); | |
203 | } | |
2d5f950c SG |
204 | } |
205 | ||
206 | DO_ODD(vsubwod_h_b, 16, H, B, DO_SUB) | |
207 | DO_ODD(vsubwod_w_h, 32, W, H, DO_SUB) | |
208 | DO_ODD(vsubwod_d_w, 64, D, W, DO_SUB) | |
209 | ||
85995f07 | 210 | void HELPER(vaddwev_q_du)(void *vd, void *vj, void *vk, uint32_t desc) |
2d5f950c | 211 | { |
85995f07 | 212 | int i; |
2d5f950c SG |
213 | VReg *Vd = (VReg *)vd; |
214 | VReg *Vj = (VReg *)vj; | |
215 | VReg *Vk = (VReg *)vk; | |
85995f07 | 216 | int oprsz = simd_oprsz(desc); |
2d5f950c | 217 | |
85995f07 SG |
218 | for (i = 0; i < oprsz / 16; i++) { |
219 | Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i)), | |
220 | int128_make64(Vk->UD(2 * i))); | |
221 | } | |
2d5f950c SG |
222 | } |
223 | ||
224 | DO_EVEN(vaddwev_h_bu, 16, UH, UB, DO_ADD) | |
225 | DO_EVEN(vaddwev_w_hu, 32, UW, UH, DO_ADD) | |
226 | DO_EVEN(vaddwev_d_wu, 64, UD, UW, DO_ADD) | |
227 | ||
85995f07 | 228 | void HELPER(vaddwod_q_du)(void *vd, void *vj, void *vk, uint32_t desc) |
2d5f950c | 229 | { |
85995f07 | 230 | int i; |
2d5f950c SG |
231 | VReg *Vd = (VReg *)vd; |
232 | VReg *Vj = (VReg *)vj; | |
233 | VReg *Vk = (VReg *)vk; | |
85995f07 | 234 | int oprsz = simd_oprsz(desc); |
2d5f950c | 235 | |
85995f07 SG |
236 | for (i = 0; i < oprsz / 16; i++) { |
237 | Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i + 1)), | |
238 | int128_make64(Vk->UD(2 * i + 1))); | |
239 | } | |
2d5f950c SG |
240 | } |
241 | ||
242 | DO_ODD(vaddwod_h_bu, 16, UH, UB, DO_ADD) | |
243 | DO_ODD(vaddwod_w_hu, 32, UW, UH, DO_ADD) | |
244 | DO_ODD(vaddwod_d_wu, 64, UD, UW, DO_ADD) | |
245 | ||
85995f07 | 246 | void HELPER(vsubwev_q_du)(void *vd, void *vj, void *vk, uint32_t desc) |
2d5f950c | 247 | { |
85995f07 | 248 | int i; |
2d5f950c SG |
249 | VReg *Vd = (VReg *)vd; |
250 | VReg *Vj = (VReg *)vj; | |
251 | VReg *Vk = (VReg *)vk; | |
85995f07 | 252 | int oprsz = simd_oprsz(desc); |
2d5f950c | 253 | |
85995f07 SG |
254 | for (i = 0; i < oprsz / 16; i++) { |
255 | Vd->Q(i) = int128_sub(int128_make64(Vj->UD(2 * i)), | |
256 | int128_make64(Vk->UD(2 * i))); | |
257 | } | |
2d5f950c SG |
258 | } |
259 | ||
260 | DO_EVEN(vsubwev_h_bu, 16, UH, UB, DO_SUB) | |
261 | DO_EVEN(vsubwev_w_hu, 32, UW, UH, DO_SUB) | |
262 | DO_EVEN(vsubwev_d_wu, 64, UD, UW, DO_SUB) | |
263 | ||
85995f07 | 264 | void HELPER(vsubwod_q_du)(void *vd, void *vj, void *vk, uint32_t desc) |
2d5f950c | 265 | { |
85995f07 | 266 | int i; |
2d5f950c SG |
267 | VReg *Vd = (VReg *)vd; |
268 | VReg *Vj = (VReg *)vj; | |
269 | VReg *Vk = (VReg *)vk; | |
85995f07 | 270 | int oprsz = simd_oprsz(desc); |
2d5f950c | 271 | |
85995f07 SG |
272 | for (i = 0; i < oprsz / 16; i++) { |
273 | Vd->Q(i) = int128_sub(int128_make64(Vj->UD(2 * i + 1)), | |
274 | int128_make64(Vk->UD(2 * i + 1))); | |
275 | } | |
2d5f950c SG |
276 | } |
277 | ||
278 | DO_ODD(vsubwod_h_bu, 16, UH, UB, DO_SUB) | |
279 | DO_ODD(vsubwod_w_hu, 32, UW, UH, DO_SUB) | |
280 | DO_ODD(vsubwod_d_wu, 64, UD, UW, DO_SUB) | |
281 | ||
/*
 * DO_EVEN_U_S: generate HELPER(NAME), a widening operation on even
 * elements with mixed accessors.
 *
 * For each BIT-wide destination element i (written through ES1), element
 * 2 * i of Vj is read through EU2 and cast to the EU1 element type, and
 * element 2 * i of Vk is read through ES2 and cast to the ES1 element
 * type; the two are combined with DO_OP.
 */
#define DO_EVEN_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP)              \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)         \
{                                                                      \
    VReg *dst = vd;                                                    \
    VReg *src_j = vj;                                                  \
    VReg *src_k = vk;                                                  \
    typedef __typeof(dst->ES1(0)) TSig;                                \
    typedef __typeof(dst->EU1(0)) TUns;                                \
    const int nelem = simd_oprsz(desc) / (BIT / 8);                    \
                                                                       \
    for (int idx = 0; idx < nelem; idx++) {                            \
        TUns lhs = (TUns)src_j->EU2(2 * idx);                          \
        TSig rhs = (TSig)src_k->ES2(2 * idx);                          \
                                                                       \
        dst->ES1(idx) = DO_OP(lhs, rhs);                               \
    }                                                                  \
}
297 | ||
/*
 * DO_ODD_U_S: generate HELPER(NAME), a widening operation on odd
 * elements with mixed accessors.
 *
 * For each BIT-wide destination element i (written through ES1), element
 * 2 * i + 1 of Vj is read through EU2 and cast to the EU1 element type,
 * and element 2 * i + 1 of Vk is read through ES2 and cast to the ES1
 * element type; the two are combined with DO_OP.
 */
#define DO_ODD_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP)               \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)         \
{                                                                      \
    VReg *dst = vd;                                                    \
    VReg *src_j = vj;                                                  \
    VReg *src_k = vk;                                                  \
    typedef __typeof(dst->ES1(0)) TSig;                                \
    typedef __typeof(dst->EU1(0)) TUns;                                \
    const int nelem = simd_oprsz(desc) / (BIT / 8);                    \
                                                                       \
    for (int idx = 0; idx < nelem; idx++) {                            \
        TUns lhs = (TUns)src_j->EU2(2 * idx + 1);                      \
        TSig rhs = (TSig)src_k->ES2(2 * idx + 1);                      \
                                                                       \
        dst->ES1(idx) = DO_OP(lhs, rhs);                               \
    }                                                                  \
}
313 | ||
85995f07 | 314 | void HELPER(vaddwev_q_du_d)(void *vd, void *vj, void *vk, uint32_t desc) |
2d5f950c | 315 | { |
85995f07 | 316 | int i; |
2d5f950c SG |
317 | VReg *Vd = (VReg *)vd; |
318 | VReg *Vj = (VReg *)vj; | |
319 | VReg *Vk = (VReg *)vk; | |
85995f07 | 320 | int oprsz = simd_oprsz(desc); |
2d5f950c | 321 | |
85995f07 SG |
322 | for (i = 0; i < oprsz / 16; i++) { |
323 | Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i)), | |
324 | int128_makes64(Vk->D(2 * i))); | |
325 | } | |
2d5f950c SG |
326 | } |
327 | ||
328 | DO_EVEN_U_S(vaddwev_h_bu_b, 16, H, UH, B, UB, DO_ADD) | |
329 | DO_EVEN_U_S(vaddwev_w_hu_h, 32, W, UW, H, UH, DO_ADD) | |
330 | DO_EVEN_U_S(vaddwev_d_wu_w, 64, D, UD, W, UW, DO_ADD) | |
331 | ||
85995f07 | 332 | void HELPER(vaddwod_q_du_d)(void *vd, void *vj, void *vk, uint32_t desc) |
2d5f950c | 333 | { |
85995f07 | 334 | int i; |
2d5f950c SG |
335 | VReg *Vd = (VReg *)vd; |
336 | VReg *Vj = (VReg *)vj; | |
337 | VReg *Vk = (VReg *)vk; | |
85995f07 | 338 | int oprsz = simd_oprsz(desc); |
2d5f950c | 339 | |
85995f07 SG |
340 | for (i = 0; i < oprsz / 16; i++) { |
341 | Vd->Q(i) = int128_add(int128_make64(Vj->UD(2 * i + 1)), | |
342 | int128_makes64(Vk->D(2 * i + 1))); | |
343 | } | |
2d5f950c SG |
344 | } |
345 | ||
346 | DO_ODD_U_S(vaddwod_h_bu_b, 16, H, UH, B, UB, DO_ADD) | |
347 | DO_ODD_U_S(vaddwod_w_hu_h, 32, W, UW, H, UH, DO_ADD) | |
348 | DO_ODD_U_S(vaddwod_d_wu_w, 64, D, UD, W, UW, DO_ADD) | |
39e9b0a7 SG |
349 | |
/*
 * Averages that halve each operand before adding, so the intermediate
 * sum cannot exceed the operand range:
 *   DO_VAVG  adds 1 back only when both low bits are set;
 *   DO_VAVGR adds 1 back when either low bit is set.
 *
 * Arguments are parenthesized so operands containing lower-precedence
 * operators expand correctly.  Note that each argument is still
 * evaluated more than once, so it must be free of side effects.
 */
#define DO_VAVG(a, b)  (((a) >> 1) + ((b) >> 1) + ((a) & (b) & 1))
#define DO_VAVGR(a, b) (((a) >> 1) + ((b) >> 1) + (((a) | (b)) & 1))
352 | ||
ee7250d0 SG |
353 | #define DO_3OP(NAME, BIT, E, DO_OP) \ |
354 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
355 | { \ | |
356 | int i; \ | |
357 | VReg *Vd = (VReg *)vd; \ | |
358 | VReg *Vj = (VReg *)vj; \ | |
359 | VReg *Vk = (VReg *)vk; \ | |
360 | int oprsz = simd_oprsz(desc); \ | |
361 | \ | |
362 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ | |
363 | Vd->E(i) = DO_OP(Vj->E(i), Vk->E(i)); \ | |
364 | } \ | |
39e9b0a7 SG |
365 | } |
366 | ||
367 | DO_3OP(vavg_b, 8, B, DO_VAVG) | |
368 | DO_3OP(vavg_h, 16, H, DO_VAVG) | |
369 | DO_3OP(vavg_w, 32, W, DO_VAVG) | |
370 | DO_3OP(vavg_d, 64, D, DO_VAVG) | |
371 | DO_3OP(vavgr_b, 8, B, DO_VAVGR) | |
372 | DO_3OP(vavgr_h, 16, H, DO_VAVGR) | |
373 | DO_3OP(vavgr_w, 32, W, DO_VAVGR) | |
374 | DO_3OP(vavgr_d, 64, D, DO_VAVGR) | |
375 | DO_3OP(vavg_bu, 8, UB, DO_VAVG) | |
376 | DO_3OP(vavg_hu, 16, UH, DO_VAVG) | |
377 | DO_3OP(vavg_wu, 32, UW, DO_VAVG) | |
378 | DO_3OP(vavg_du, 64, UD, DO_VAVG) | |
379 | DO_3OP(vavgr_bu, 8, UB, DO_VAVGR) | |
380 | DO_3OP(vavgr_hu, 16, UH, DO_VAVGR) | |
381 | DO_3OP(vavgr_wu, 32, UW, DO_VAVGR) | |
382 | DO_3OP(vavgr_du, 64, UD, DO_VAVGR) | |
49725659 SG |
383 | |
/*
 * Absolute difference: the larger operand minus the smaller one.
 * Arguments are parenthesized so compound operands expand correctly;
 * each argument is evaluated twice, so it must have no side effects.
 */
#define DO_VABSD(a, b)  (((a) > (b)) ? ((a) - (b)) : ((b) - (a)))
385 | ||
386 | DO_3OP(vabsd_b, 8, B, DO_VABSD) | |
387 | DO_3OP(vabsd_h, 16, H, DO_VABSD) | |
388 | DO_3OP(vabsd_w, 32, W, DO_VABSD) | |
389 | DO_3OP(vabsd_d, 64, D, DO_VABSD) | |
390 | DO_3OP(vabsd_bu, 8, UB, DO_VABSD) | |
391 | DO_3OP(vabsd_hu, 16, UH, DO_VABSD) | |
392 | DO_3OP(vabsd_wu, 32, UW, DO_VABSD) | |
393 | DO_3OP(vabsd_du, 64, UD, DO_VABSD) | |
af448cb3 SG |
394 | |
/*
 * Absolute value: negates the operand when it is below zero.
 * The argument is parenthesized so compound operands such as "x - y"
 * expand correctly; it is evaluated twice, so it must have no side
 * effects.
 */
#define DO_VABS(a)  (((a) < 0) ? -(a) : (a))
396 | ||
27f5485d SG |
397 | #define DO_VADDA(NAME, BIT, E) \ |
398 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
399 | { \ | |
400 | int i; \ | |
401 | VReg *Vd = (VReg *)vd; \ | |
402 | VReg *Vj = (VReg *)vj; \ | |
403 | VReg *Vk = (VReg *)vk; \ | |
404 | int oprsz = simd_oprsz(desc); \ | |
405 | \ | |
406 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ | |
407 | Vd->E(i) = DO_VABS(Vj->E(i)) + DO_VABS(Vk->E(i)); \ | |
408 | } \ | |
af448cb3 SG |
409 | } |
410 | ||
27f5485d SG |
411 | DO_VADDA(vadda_b, 8, B) |
412 | DO_VADDA(vadda_h, 16, H) | |
413 | DO_VADDA(vadda_w, 32, W) | |
414 | DO_VADDA(vadda_d, 64, D) | |
9ab29520 SG |
415 | |
/*
 * Minimum / maximum of two operands.  Arguments are parenthesized so
 * that operands containing lower-precedence operators (e.g. "x & y")
 * are compared as a whole.  Each argument is evaluated twice, so it
 * must have no side effects.
 */
#define DO_MIN(a, b)  ((a) < (b) ? (a) : (b))
#define DO_MAX(a, b)  ((a) > (b) ? (a) : (b))
418 | ||
c09360fa SG |
419 | #define VMINMAXI(NAME, BIT, E, DO_OP) \ |
420 | void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ | |
421 | { \ | |
422 | int i; \ | |
423 | VReg *Vd = (VReg *)vd; \ | |
424 | VReg *Vj = (VReg *)vj; \ | |
425 | typedef __typeof(Vd->E(0)) TD; \ | |
426 | int oprsz = simd_oprsz(desc); \ | |
427 | \ | |
428 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ | |
429 | Vd->E(i) = DO_OP(Vj->E(i), (TD)imm); \ | |
430 | } \ | |
9ab29520 SG |
431 | } |
432 | ||
433 | VMINMAXI(vmini_b, 8, B, DO_MIN) | |
434 | VMINMAXI(vmini_h, 16, H, DO_MIN) | |
435 | VMINMAXI(vmini_w, 32, W, DO_MIN) | |
436 | VMINMAXI(vmini_d, 64, D, DO_MIN) | |
437 | VMINMAXI(vmaxi_b, 8, B, DO_MAX) | |
438 | VMINMAXI(vmaxi_h, 16, H, DO_MAX) | |
439 | VMINMAXI(vmaxi_w, 32, W, DO_MAX) | |
440 | VMINMAXI(vmaxi_d, 64, D, DO_MAX) | |
441 | VMINMAXI(vmini_bu, 8, UB, DO_MIN) | |
442 | VMINMAXI(vmini_hu, 16, UH, DO_MIN) | |
443 | VMINMAXI(vmini_wu, 32, UW, DO_MIN) | |
444 | VMINMAXI(vmini_du, 64, UD, DO_MIN) | |
445 | VMINMAXI(vmaxi_bu, 8, UB, DO_MAX) | |
446 | VMINMAXI(vmaxi_hu, 16, UH, DO_MAX) | |
447 | VMINMAXI(vmaxi_wu, 32, UW, DO_MAX) | |
448 | VMINMAXI(vmaxi_du, 64, UD, DO_MAX) | |
cd1c49ad | 449 | |
342dc1cf SG |
/*
 * DO_VMUH: generate HELPER(NAME), a "multiply, keep the high part"
 * operation.  Each BIT-wide element (accessor E2) of Vj and Vk is cast
 * to the element type of accessor E1, the two are multiplied, and the
 * product shifted right by BIT is stored in element i of Vd.
 *
 * The DO_OP parameter is accepted but not referenced by the body.
 */
#define DO_VMUH(NAME, BIT, E1, E2, DO_OP)                              \
void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc)         \
{                                                                      \
    VReg *dst = vd;                                                    \
    VReg *src_j = vj;                                                  \
    VReg *src_k = vk;                                                  \
    typedef __typeof(dst->E1(0)) TWide;                                \
    const int nelem = simd_oprsz(desc) / (BIT / 8);                    \
                                                                       \
    for (int idx = 0; idx < nelem; idx++) {                            \
        dst->E2(idx) = ((TWide)src_j->E2(idx) *                        \
                        (TWide)src_k->E2(idx)) >> BIT;                 \
    }                                                                  \
}
464 | ||
342dc1cf | 465 | void HELPER(vmuh_d)(void *vd, void *vj, void *vk, uint32_t desc) |
cd1c49ad | 466 | { |
342dc1cf SG |
467 | int i; |
468 | uint64_t l, h; | |
cd1c49ad SG |
469 | VReg *Vd = (VReg *)vd; |
470 | VReg *Vj = (VReg *)vj; | |
471 | VReg *Vk = (VReg *)vk; | |
342dc1cf | 472 | int oprsz = simd_oprsz(desc); |
cd1c49ad | 473 | |
342dc1cf SG |
474 | for (i = 0; i < oprsz / 8; i++) { |
475 | muls64(&l, &h, Vj->D(i), Vk->D(i)); | |
476 | Vd->D(i) = h; | |
477 | } | |
cd1c49ad SG |
478 | } |
479 | ||
480 | DO_VMUH(vmuh_b, 8, H, B, DO_MUH) | |
481 | DO_VMUH(vmuh_h, 16, W, H, DO_MUH) | |
482 | DO_VMUH(vmuh_w, 32, D, W, DO_MUH) | |
483 | ||
342dc1cf | 484 | void HELPER(vmuh_du)(void *vd, void *vj, void *vk, uint32_t desc) |
cd1c49ad | 485 | { |
342dc1cf SG |
486 | int i; |
487 | uint64_t l, h; | |
cd1c49ad SG |
488 | VReg *Vd = (VReg *)vd; |
489 | VReg *Vj = (VReg *)vj; | |
490 | VReg *Vk = (VReg *)vk; | |
342dc1cf | 491 | int oprsz = simd_oprsz(desc); |
cd1c49ad | 492 | |
342dc1cf SG |
493 | for (i = 0; i < oprsz / 8; i++) { |
494 | mulu64(&l, &h, Vj->D(i), Vk->D(i)); | |
495 | Vd->D(i) = h; | |
496 | } | |
cd1c49ad SG |
497 | } |
498 | ||
499 | DO_VMUH(vmuh_bu, 8, UH, UB, DO_MUH) | |
500 | DO_VMUH(vmuh_hu, 16, UW, UH, DO_MUH) | |
501 | DO_VMUH(vmuh_wu, 32, UD, UW, DO_MUH) | |
502 | ||
/*
 * Product of two operands.  Arguments are parenthesized so compound
 * operands such as "x + 1" are multiplied as a whole.  Each argument is
 * evaluated exactly once.
 */
#define DO_MUL(a, b)  ((a) * (b))
504 | ||
505 | DO_EVEN(vmulwev_h_b, 16, H, B, DO_MUL) | |
506 | DO_EVEN(vmulwev_w_h, 32, W, H, DO_MUL) | |
507 | DO_EVEN(vmulwev_d_w, 64, D, W, DO_MUL) | |
508 | ||
509 | DO_ODD(vmulwod_h_b, 16, H, B, DO_MUL) | |
510 | DO_ODD(vmulwod_w_h, 32, W, H, DO_MUL) | |
511 | DO_ODD(vmulwod_d_w, 64, D, W, DO_MUL) | |
512 | ||
513 | DO_EVEN(vmulwev_h_bu, 16, UH, UB, DO_MUL) | |
514 | DO_EVEN(vmulwev_w_hu, 32, UW, UH, DO_MUL) | |
515 | DO_EVEN(vmulwev_d_wu, 64, UD, UW, DO_MUL) | |
516 | ||
517 | DO_ODD(vmulwod_h_bu, 16, UH, UB, DO_MUL) | |
518 | DO_ODD(vmulwod_w_hu, 32, UW, UH, DO_MUL) | |
519 | DO_ODD(vmulwod_d_wu, 64, UD, UW, DO_MUL) | |
520 | ||
521 | DO_EVEN_U_S(vmulwev_h_bu_b, 16, H, UH, B, UB, DO_MUL) | |
522 | DO_EVEN_U_S(vmulwev_w_hu_h, 32, W, UW, H, UH, DO_MUL) | |
523 | DO_EVEN_U_S(vmulwev_d_wu_w, 64, D, UD, W, UW, DO_MUL) | |
524 | ||
525 | DO_ODD_U_S(vmulwod_h_bu_b, 16, H, UH, B, UB, DO_MUL) | |
526 | DO_ODD_U_S(vmulwod_w_hu_h, 32, W, UW, H, UH, DO_MUL) | |
527 | DO_ODD_U_S(vmulwod_d_wu_w, 64, D, UD, W, UW, DO_MUL) | |
d3aec65b SG |
528 | |
/*
 * Multiply-accumulate: a plus / minus the product of b and c.
 * Arguments are parenthesized so compound operands are treated as a
 * whole.  Each argument is evaluated exactly once.
 */
#define DO_MADD(a, b, c)  ((a) + (b) * (c))
#define DO_MSUB(a, b, c)  ((a) - (b) * (c))
531 | ||
3f450c17 SG |
532 | #define VMADDSUB(NAME, BIT, E, DO_OP) \ |
533 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
534 | { \ | |
535 | int i; \ | |
536 | VReg *Vd = (VReg *)vd; \ | |
537 | VReg *Vj = (VReg *)vj; \ | |
538 | VReg *Vk = (VReg *)vk; \ | |
539 | int oprsz = simd_oprsz(desc); \ | |
540 | \ | |
541 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ | |
542 | Vd->E(i) = DO_OP(Vd->E(i), Vj->E(i) ,Vk->E(i)); \ | |
543 | } \ | |
d3aec65b SG |
544 | } |
545 | ||
546 | VMADDSUB(vmadd_b, 8, B, DO_MADD) | |
547 | VMADDSUB(vmadd_h, 16, H, DO_MADD) | |
548 | VMADDSUB(vmadd_w, 32, W, DO_MADD) | |
549 | VMADDSUB(vmadd_d, 64, D, DO_MADD) | |
550 | VMADDSUB(vmsub_b, 8, B, DO_MSUB) | |
551 | VMADDSUB(vmsub_h, 16, H, DO_MSUB) | |
552 | VMADDSUB(vmsub_w, 32, W, DO_MSUB) | |
553 | VMADDSUB(vmsub_d, 64, D, DO_MSUB) | |
554 | ||
555 | #define VMADDWEV(NAME, BIT, E1, E2, DO_OP) \ | |
3f450c17 | 556 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ |
d3aec65b SG |
557 | { \ |
558 | int i; \ | |
559 | VReg *Vd = (VReg *)vd; \ | |
560 | VReg *Vj = (VReg *)vj; \ | |
561 | VReg *Vk = (VReg *)vk; \ | |
562 | typedef __typeof(Vd->E1(0)) TD; \ | |
3f450c17 | 563 | int oprsz = simd_oprsz(desc); \ |
d3aec65b | 564 | \ |
3f450c17 | 565 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ |
d3aec65b SG |
566 | Vd->E1(i) += DO_OP((TD)Vj->E2(2 * i), (TD)Vk->E2(2 * i)); \ |
567 | } \ | |
568 | } | |
569 | ||
570 | VMADDWEV(vmaddwev_h_b, 16, H, B, DO_MUL) | |
571 | VMADDWEV(vmaddwev_w_h, 32, W, H, DO_MUL) | |
572 | VMADDWEV(vmaddwev_d_w, 64, D, W, DO_MUL) | |
573 | VMADDWEV(vmaddwev_h_bu, 16, UH, UB, DO_MUL) | |
574 | VMADDWEV(vmaddwev_w_hu, 32, UW, UH, DO_MUL) | |
575 | VMADDWEV(vmaddwev_d_wu, 64, UD, UW, DO_MUL) | |
576 | ||
3f450c17 SG |
577 | #define VMADDWOD(NAME, BIT, E1, E2, DO_OP) \ |
578 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
579 | { \ | |
580 | int i; \ | |
581 | VReg *Vd = (VReg *)vd; \ | |
582 | VReg *Vj = (VReg *)vj; \ | |
583 | VReg *Vk = (VReg *)vk; \ | |
584 | typedef __typeof(Vd->E1(0)) TD; \ | |
585 | int oprsz = simd_oprsz(desc); \ | |
586 | \ | |
587 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ | |
588 | Vd->E1(i) += DO_OP((TD)Vj->E2(2 * i + 1), \ | |
589 | (TD)Vk->E2(2 * i + 1)); \ | |
590 | } \ | |
d3aec65b SG |
591 | } |
592 | ||
593 | VMADDWOD(vmaddwod_h_b, 16, H, B, DO_MUL) | |
594 | VMADDWOD(vmaddwod_w_h, 32, W, H, DO_MUL) | |
595 | VMADDWOD(vmaddwod_d_w, 64, D, W, DO_MUL) | |
596 | VMADDWOD(vmaddwod_h_bu, 16, UH, UB, DO_MUL) | |
597 | VMADDWOD(vmaddwod_w_hu, 32, UW, UH, DO_MUL) | |
598 | VMADDWOD(vmaddwod_d_wu, 64, UD, UW, DO_MUL) | |
599 | ||
3f450c17 SG |
600 | #define VMADDWEV_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP) \ |
601 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
602 | { \ | |
603 | int i; \ | |
604 | VReg *Vd = (VReg *)vd; \ | |
605 | VReg *Vj = (VReg *)vj; \ | |
606 | VReg *Vk = (VReg *)vk; \ | |
607 | typedef __typeof(Vd->ES1(0)) TS1; \ | |
608 | typedef __typeof(Vd->EU1(0)) TU1; \ | |
609 | int oprsz = simd_oprsz(desc); \ | |
610 | \ | |
611 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ | |
612 | Vd->ES1(i) += DO_OP((TU1)Vj->EU2(2 * i), \ | |
613 | (TS1)Vk->ES2(2 * i)); \ | |
614 | } \ | |
d3aec65b SG |
615 | } |
616 | ||
617 | VMADDWEV_U_S(vmaddwev_h_bu_b, 16, H, UH, B, UB, DO_MUL) | |
618 | VMADDWEV_U_S(vmaddwev_w_hu_h, 32, W, UW, H, UH, DO_MUL) | |
619 | VMADDWEV_U_S(vmaddwev_d_wu_w, 64, D, UD, W, UW, DO_MUL) | |
620 | ||
3f450c17 SG |
621 | #define VMADDWOD_U_S(NAME, BIT, ES1, EU1, ES2, EU2, DO_OP) \ |
622 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
623 | { \ | |
624 | int i; \ | |
625 | VReg *Vd = (VReg *)vd; \ | |
626 | VReg *Vj = (VReg *)vj; \ | |
627 | VReg *Vk = (VReg *)vk; \ | |
628 | typedef __typeof(Vd->ES1(0)) TS1; \ | |
629 | typedef __typeof(Vd->EU1(0)) TU1; \ | |
630 | int oprsz = simd_oprsz(desc); \ | |
631 | \ | |
632 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ | |
633 | Vd->ES1(i) += DO_OP((TU1)Vj->EU2(2 * i + 1), \ | |
634 | (TS1)Vk->ES2(2 * i + 1)); \ | |
635 | } \ | |
d3aec65b SG |
636 | } |
637 | ||
638 | VMADDWOD_U_S(vmaddwod_h_bu_b, 16, H, UH, B, UB, DO_MUL) | |
639 | VMADDWOD_U_S(vmaddwod_w_hu_h, 32, W, UW, H, UH, DO_MUL) | |
640 | VMADDWOD_U_S(vmaddwod_d_wu_w, 64, D, UD, W, UW, DO_MUL) | |
4cc4c0f7 SG |
641 | |
/*
 * Division / remainder primitives that never trap:
 *   - a zero divisor M yields 0;
 *   - DO_DIV / DO_REM additionally special-case "N == -N with M == -1"
 *     (returning N and 0 respectively) before dividing.
 *     NOTE(review): N == -N also holds for N == 0, and for element types
 *     narrower than int the negation happens after integer promotion, so
 *     that guard presumably only catches the most-negative value for
 *     int-sized and wider types -- confirm if this matters.
 *
 * Arguments are parenthesized so compound operands expand correctly.
 * They are evaluated more than once and must have no side effects.
 */
#define DO_DIVU(N, M) (unlikely((M) == 0) ? 0 : (N) / (M))
#define DO_REMU(N, M) (unlikely((M) == 0) ? 0 : (N) % (M))
#define DO_DIV(N, M)  (unlikely((M) == 0) ? 0 :                        \
        unlikely(((N) == -(N)) && ((M) == (__typeof(N))(-1))) ?        \
        (N) : (N) / (M))
#define DO_REM(N, M)  (unlikely((M) == 0) ? 0 :                        \
        unlikely(((N) == -(N)) && ((M) == (__typeof(N))(-1))) ?        \
        0 : (N) % (M))
648 | ||
04711da1 SG |
649 | #define VDIV(NAME, BIT, E, DO_OP) \ |
650 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
651 | { \ | |
652 | int i; \ | |
653 | VReg *Vd = (VReg *)vd; \ | |
654 | VReg *Vj = (VReg *)vj; \ | |
655 | VReg *Vk = (VReg *)vk; \ | |
abb693de SG |
656 | int oprsz = simd_oprsz(desc); \ |
657 | \ | |
658 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ | |
04711da1 SG |
659 | Vd->E(i) = DO_OP(Vj->E(i), Vk->E(i)); \ |
660 | } \ | |
4cc4c0f7 SG |
661 | } |
662 | ||
663 | VDIV(vdiv_b, 8, B, DO_DIV) | |
664 | VDIV(vdiv_h, 16, H, DO_DIV) | |
665 | VDIV(vdiv_w, 32, W, DO_DIV) | |
666 | VDIV(vdiv_d, 64, D, DO_DIV) | |
667 | VDIV(vdiv_bu, 8, UB, DO_DIVU) | |
668 | VDIV(vdiv_hu, 16, UH, DO_DIVU) | |
669 | VDIV(vdiv_wu, 32, UW, DO_DIVU) | |
670 | VDIV(vdiv_du, 64, UD, DO_DIVU) | |
671 | VDIV(vmod_b, 8, B, DO_REM) | |
672 | VDIV(vmod_h, 16, H, DO_REM) | |
673 | VDIV(vmod_w, 32, W, DO_REM) | |
674 | VDIV(vmod_d, 64, D, DO_REM) | |
675 | VDIV(vmod_bu, 8, UB, DO_REMU) | |
676 | VDIV(vmod_hu, 16, UH, DO_REMU) | |
677 | VDIV(vmod_wu, 32, UW, DO_REMU) | |
678 | VDIV(vmod_du, 64, UD, DO_REMU) | |
cbe44190 | 679 | |
e5c7f031 SG |
680 | #define VSAT_S(NAME, BIT, E) \ |
681 | void HELPER(NAME)(void *vd, void *vj, uint64_t max, uint32_t desc) \ | |
682 | { \ | |
683 | int i; \ | |
684 | VReg *Vd = (VReg *)vd; \ | |
685 | VReg *Vj = (VReg *)vj; \ | |
686 | typedef __typeof(Vd->E(0)) TD; \ | |
687 | int oprsz = simd_oprsz(desc); \ | |
688 | \ | |
689 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ | |
690 | Vd->E(i) = Vj->E(i) > (TD)max ? (TD)max : \ | |
691 | Vj->E(i) < (TD)~max ? (TD)~max: Vj->E(i); \ | |
692 | } \ | |
cbe44190 SG |
693 | } |
694 | ||
695 | VSAT_S(vsat_b, 8, B) | |
696 | VSAT_S(vsat_h, 16, H) | |
697 | VSAT_S(vsat_w, 32, W) | |
698 | VSAT_S(vsat_d, 64, D) | |
699 | ||
e5c7f031 SG |
700 | #define VSAT_U(NAME, BIT, E) \ |
701 | void HELPER(NAME)(void *vd, void *vj, uint64_t max, uint32_t desc) \ | |
702 | { \ | |
703 | int i; \ | |
704 | VReg *Vd = (VReg *)vd; \ | |
705 | VReg *Vj = (VReg *)vj; \ | |
706 | typedef __typeof(Vd->E(0)) TD; \ | |
707 | int oprsz = simd_oprsz(desc); \ | |
708 | \ | |
709 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ | |
710 | Vd->E(i) = Vj->E(i) > (TD)max ? (TD)max : Vj->E(i); \ | |
711 | } \ | |
cbe44190 SG |
712 | } |
713 | ||
714 | VSAT_U(vsat_bu, 8, UB) | |
715 | VSAT_U(vsat_hu, 16, UH) | |
716 | VSAT_U(vsat_wu, 32, UW) | |
717 | VSAT_U(vsat_du, 64, UD) | |
3734ad93 | 718 | |
f0db0beb SG |
/*
 * VEXTH: generate HELPER(NAME), which widens the upper half of every
 * 16-byte lane.  With n = LSX_LEN / BIT destination elements per lane,
 * destination element k of lane l (accessor E1, index l * n + k) is
 * loaded from source element n + k of the same lane (accessor E2,
 * index 2 * n * l + n + k).
 */
#define VEXTH(NAME, BIT, E1, E2)                                       \
void HELPER(NAME)(void *vd, void *vj, uint32_t desc)                   \
{                                                                      \
    VReg *dst = vd;                                                    \
    VReg *src = vj;                                                    \
    const int per_lane = LSX_LEN / BIT;                                \
    const int lanes = simd_oprsz(desc) / 16;                           \
                                                                       \
    for (int lane = 0; lane < lanes; lane++) {                         \
        const int out_base = lane * per_lane;                          \
        const int in_base = per_lane + per_lane * 2 * lane;            \
                                                                       \
        for (int k = 0; k < per_lane; k++) {                           \
            dst->E1(k + out_base) = src->E2(k + in_base);              \
        }                                                              \
    }                                                                  \
}
734 | ||
735 | void HELPER(vexth_q_d)(void *vd, void *vj, uint32_t desc) | |
3734ad93 | 736 | { |
f0db0beb | 737 | int i; |
ff27e335 SG |
738 | VReg *Vd = (VReg *)vd; |
739 | VReg *Vj = (VReg *)vj; | |
f0db0beb | 740 | int oprsz = simd_oprsz(desc); |
3734ad93 | 741 | |
f0db0beb SG |
742 | for (i = 0; i < oprsz / 16; i++) { |
743 | Vd->Q(i) = int128_makes64(Vj->D(2 * i + 1)); | |
744 | } | |
3734ad93 SG |
745 | } |
746 | ||
ff27e335 | 747 | void HELPER(vexth_qu_du)(void *vd, void *vj, uint32_t desc) |
3734ad93 | 748 | { |
f0db0beb | 749 | int i; |
ff27e335 SG |
750 | VReg *Vd = (VReg *)vd; |
751 | VReg *Vj = (VReg *)vj; | |
f0db0beb | 752 | int oprsz = simd_oprsz(desc); |
3734ad93 | 753 | |
f0db0beb SG |
754 | for (i = 0; i < oprsz / 16; i++) { |
755 | Vd->Q(i) = int128_make64(Vj->UD(2 * i + 1)); | |
756 | } | |
3734ad93 SG |
757 | } |
758 | ||
759 | VEXTH(vexth_h_b, 16, H, B) | |
760 | VEXTH(vexth_w_h, 32, W, H) | |
761 | VEXTH(vexth_d_w, 64, D, W) | |
762 | VEXTH(vexth_hu_bu, 16, UH, UB) | |
763 | VEXTH(vexth_wu_hu, 32, UW, UH) | |
764 | VEXTH(vexth_du_wu, 64, UD, UW) | |
f0e395df | 765 | |
790acb2a SG |
766 | #define VEXT2XV(NAME, BIT, E1, E2) \ |
767 | void HELPER(NAME)(void *vd, void *vj, uint32_t desc) \ | |
768 | { \ | |
769 | int i; \ | |
770 | VReg temp = {}; \ | |
771 | VReg *Vd = (VReg *)vd; \ | |
772 | VReg *Vj = (VReg *)vj; \ | |
773 | int oprsz = simd_oprsz(desc); \ | |
774 | \ | |
775 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ | |
776 | temp.E1(i) = Vj->E2(i); \ | |
777 | } \ | |
778 | *Vd = temp; \ | |
779 | } | |
780 | ||
781 | VEXT2XV(vext2xv_h_b, 16, H, B) | |
782 | VEXT2XV(vext2xv_w_b, 32, W, B) | |
783 | VEXT2XV(vext2xv_d_b, 64, D, B) | |
784 | VEXT2XV(vext2xv_w_h, 32, W, H) | |
785 | VEXT2XV(vext2xv_d_h, 64, D, H) | |
786 | VEXT2XV(vext2xv_d_w, 64, D, W) | |
787 | VEXT2XV(vext2xv_hu_bu, 16, UH, UB) | |
788 | VEXT2XV(vext2xv_wu_bu, 32, UW, UB) | |
789 | VEXT2XV(vext2xv_du_bu, 64, UD, UB) | |
790 | VEXT2XV(vext2xv_wu_hu, 32, UW, UH) | |
791 | VEXT2XV(vext2xv_du_hu, 64, UD, UH) | |
792 | VEXT2XV(vext2xv_du_wu, 64, UD, UW) | |
793 | ||
f0e395df SG |
/*
 * Sign transfer: yields 0 when a is zero, -b when a is negative and b
 * otherwise.  Arguments are parenthesized so compound operands (for
 * example "x + y" as b) are negated as a whole.  "a" is evaluated up to
 * twice, so it must have no side effects.
 */
#define DO_SIGNCOV(a, b)  ((a) == 0 ? 0 : (a) < 0 ? -(b) : (b))
795 | ||
796 | DO_3OP(vsigncov_b, 8, B, DO_SIGNCOV) | |
797 | DO_3OP(vsigncov_h, 16, H, DO_SIGNCOV) | |
798 | DO_3OP(vsigncov_w, 32, W, DO_SIGNCOV) | |
799 | DO_3OP(vsigncov_d, 64, D, DO_SIGNCOV) | |
789f4a4c SG |
800 | |
/*
 * Gather the top bit of each of the eight bytes of val into an 8-bit
 * mask: bit k of the result is bit 7 of byte k (byte 0 being the least
 * significant).  The isolated bits are folded towards the top byte with
 * shifts of 7, 14 and 28, then the top byte is extracted.
 */
static uint64_t do_vmskltz_b(int64_t val)
{
    static const unsigned fold[] = { 7, 14, 28 };
    uint64_t bits = (uint64_t)val & 0x8080808080808080ULL;
    unsigned step;

    for (step = 0; step < sizeof(fold) / sizeof(fold[0]); step++) {
        bits |= bits << fold[step];
    }
    return bits >> 56;
}
810 | ||
ff27e335 | 811 | void HELPER(vmskltz_b)(void *vd, void *vj, uint32_t desc) |
789f4a4c | 812 | { |
97074674 | 813 | int i; |
789f4a4c | 814 | uint16_t temp = 0; |
ff27e335 SG |
815 | VReg *Vd = (VReg *)vd; |
816 | VReg *Vj = (VReg *)vj; | |
97074674 | 817 | int oprsz = simd_oprsz(desc); |
789f4a4c | 818 | |
97074674 SG |
819 | for (i = 0; i < oprsz / 16; i++) { |
820 | temp = 0; | |
821 | temp = do_vmskltz_b(Vj->D(2 * i)); | |
822 | temp |= (do_vmskltz_b(Vj->D(2 * i + 1)) << 8); | |
823 | Vd->D(2 * i) = temp; | |
824 | Vd->D(2 * i + 1) = 0; | |
825 | } | |
789f4a4c SG |
826 | } |
827 | ||
/*
 * Gather the top bit of each of the four 16-bit fields of val into a
 * 4-bit mask: bit k of the result is bit 15 of field k (field 0 being
 * the least significant).
 */
static uint64_t do_vmskltz_h(int64_t val)
{
    static const unsigned fold[] = { 15, 30 };
    uint64_t bits = (uint64_t)val & 0x8000800080008000ULL;
    unsigned step;

    for (step = 0; step < sizeof(fold) / sizeof(fold[0]); step++) {
        bits |= bits << fold[step];
    }
    return bits >> 60;
}
836 | ||
ff27e335 | 837 | void HELPER(vmskltz_h)(void *vd, void *vj, uint32_t desc) |
789f4a4c | 838 | { |
97074674 | 839 | int i; |
789f4a4c | 840 | uint16_t temp = 0; |
ff27e335 SG |
841 | VReg *Vd = (VReg *)vd; |
842 | VReg *Vj = (VReg *)vj; | |
97074674 | 843 | int oprsz = simd_oprsz(desc); |
789f4a4c | 844 | |
97074674 SG |
845 | for (i = 0; i < oprsz / 16; i++) { |
846 | temp = 0; | |
847 | temp = do_vmskltz_h(Vj->D(2 * i)); | |
848 | temp |= (do_vmskltz_h(Vj->D(2 * i + 1)) << 4); | |
849 | Vd->D(2 * i) = temp; | |
850 | Vd->D(2 * i + 1) = 0; | |
851 | } | |
789f4a4c SG |
852 | } |
853 | ||
/*
 * Pack the sign bits of the two 32-bit fields of val into the low
 * 2 bits of the result (bit n comes from field n).
 */
static uint64_t do_vmskltz_w(int64_t val)
{
    /* Keep only bit 31 of every word. */
    uint64_t signs = (uint64_t)val & 0x8000000080000000ULL;

    /* Bring the low word's sign bit next to the high word's. */
    signs |= signs << 31;

    return signs >> 62;
}
861 | ||
ff27e335 | 862 | void HELPER(vmskltz_w)(void *vd, void *vj, uint32_t desc) |
789f4a4c | 863 | { |
97074674 | 864 | int i; |
789f4a4c | 865 | uint16_t temp = 0; |
ff27e335 SG |
866 | VReg *Vd = (VReg *)vd; |
867 | VReg *Vj = (VReg *)vj; | |
97074674 | 868 | int oprsz = simd_oprsz(desc); |
789f4a4c | 869 | |
97074674 SG |
870 | for (i = 0; i < oprsz / 16; i++) { |
871 | temp = 0; | |
872 | temp = do_vmskltz_w(Vj->D(2 * i)); | |
873 | temp |= (do_vmskltz_w(Vj->D(2 * i + 1)) << 2); | |
874 | Vd->D(2 * i) = temp; | |
875 | Vd->D(2 * i + 1) = 0; | |
876 | } | |
789f4a4c SG |
877 | } |
878 | ||
/* Return 1 when val is negative (bit 63 set), 0 otherwise. */
static uint64_t do_vmskltz_d(int64_t val)
{
    return val < 0 ? 1 : 0;
}
ff27e335 | 883 | void HELPER(vmskltz_d)(void *vd, void *vj, uint32_t desc) |
789f4a4c | 884 | { |
97074674 | 885 | int i; |
789f4a4c | 886 | uint16_t temp = 0; |
ff27e335 SG |
887 | VReg *Vd = (VReg *)vd; |
888 | VReg *Vj = (VReg *)vj; | |
97074674 | 889 | int oprsz = simd_oprsz(desc); |
789f4a4c | 890 | |
97074674 SG |
891 | for (i = 0; i < oprsz / 16; i++) { |
892 | temp = 0; | |
893 | temp = do_vmskltz_d(Vj->D(2 * i)); | |
894 | temp |= (do_vmskltz_d(Vj->D(2 * i + 1)) << 1); | |
895 | Vd->D(2 * i) = temp; | |
896 | Vd->D(2 * i + 1) = 0; | |
897 | } | |
789f4a4c SG |
898 | } |
899 | ||
ff27e335 | 900 | void HELPER(vmskgez_b)(void *vd, void *vj, uint32_t desc) |
789f4a4c | 901 | { |
97074674 | 902 | int i; |
789f4a4c | 903 | uint16_t temp = 0; |
ff27e335 SG |
904 | VReg *Vd = (VReg *)vd; |
905 | VReg *Vj = (VReg *)vj; | |
97074674 | 906 | int oprsz = simd_oprsz(desc); |
789f4a4c | 907 | |
97074674 SG |
908 | for (i = 0; i < oprsz / 16; i++) { |
909 | temp = 0; | |
910 | temp = do_vmskltz_b(Vj->D(2 * i)); | |
911 | temp |= (do_vmskltz_b(Vj->D(2 * i + 1)) << 8); | |
912 | Vd->D(2 * i) = (uint16_t)(~temp); | |
913 | Vd->D(2 * i + 1) = 0; | |
914 | } | |
789f4a4c SG |
915 | } |
916 | ||
/*
 * Return an 8-bit mask in which bit n is set when byte n of a is zero.
 */
static uint64_t do_vmskez_b(uint64_t a)
{
    const uint64_t low7 = 0x7f7f7f7f7f7f7f7fULL;
    /*
     * Adding 0x7f to the low seven bits of a byte carries into bit 7
     * exactly when those bits are non-zero; OR-ing in the byte itself
     * covers bit 7.  After inversion only bit 7 of zero bytes remains.
     */
    uint64_t zero = ~(((a & low7) + low7) | a | low7);

    /* Shift-and-or until all eight flag bits meet in the top byte. */
    zero |= zero << 7;
    zero |= zero << 14;
    zero |= zero << 28;

    return zero >> 56;
}
926 | ||
ff27e335 | 927 | void HELPER(vmsknz_b)(void *vd, void *vj, uint32_t desc) |
789f4a4c | 928 | { |
97074674 | 929 | int i; |
789f4a4c | 930 | uint16_t temp = 0; |
ff27e335 SG |
931 | VReg *Vd = (VReg *)vd; |
932 | VReg *Vj = (VReg *)vj; | |
97074674 | 933 | int oprsz = simd_oprsz(desc); |
789f4a4c | 934 | |
97074674 SG |
935 | for (i = 0; i < oprsz / 16; i++) { |
936 | temp = 0; | |
937 | temp = do_vmskez_b(Vj->D(2 * i)); | |
938 | temp |= (do_vmskez_b(Vj->D(2 * i + 1)) << 8); | |
939 | Vd->D(2 * i) = (uint16_t)(~temp); | |
940 | Vd->D(2 * i + 1) = 0; | |
941 | } | |
789f4a4c | 942 | } |
f205a539 | 943 | |
4472a45a | 944 | void HELPER(vnori_b)(void *vd, void *vj, uint64_t imm, uint32_t desc) |
f205a539 SG |
945 | { |
946 | int i; | |
947 | VReg *Vd = (VReg *)vd; | |
948 | VReg *Vj = (VReg *)vj; | |
949 | ||
4472a45a | 950 | for (i = 0; i < simd_oprsz(desc); i++) { |
f205a539 SG |
951 | Vd->B(i) = ~(Vj->B(i) | (uint8_t)imm); |
952 | } | |
953 | } | |
9b21a7a5 | 954 | |
6567eac7 SG |
/*
 * Widening shift-left by immediate.  For each 16-byte lane, the first
 * LSX_LEN / BIT source elements (E2) of that lane are converted to the
 * destination element type (E1) and shifted left by imm % BIT.  The
 * result is built in a zeroed temporary because Vd may alias Vj.
 */
#define VSLLWIL(NAME, BIT, E1, E2)                                        \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)       \
{                                                                         \
    VReg *Vd = (VReg *)vd;                                                \
    VReg *Vj = (VReg *)vj;                                                \
    VReg result = {};                                                     \
    typedef __typeof(result.E1(0)) TD;                                    \
    const int lanes = simd_oprsz(desc) / 16;                              \
    const int per_lane = LSX_LEN / BIT;                                   \
    int lane, k;                                                          \
                                                                          \
    for (lane = 0; lane < lanes; lane++) {                                \
        const int dst = per_lane * lane;                                  \
        const int src = per_lane * 2 * lane;                              \
                                                                          \
        for (k = 0; k < per_lane; k++) {                                  \
            result.E1(k + dst) = (TD)Vj->E2(k + src) << (imm % BIT);      \
        }                                                                 \
    }                                                                     \
    *Vd = result;                                                         \
}
973 | ||
6567eac7 | 974 | |
ff27e335 | 975 | void HELPER(vextl_q_d)(void *vd, void *vj, uint32_t desc) |
9b21a7a5 | 976 | { |
6567eac7 | 977 | int i; |
ff27e335 SG |
978 | VReg *Vd = (VReg *)vd; |
979 | VReg *Vj = (VReg *)vj; | |
6567eac7 | 980 | int oprsz = simd_oprsz(desc); |
9b21a7a5 | 981 | |
6567eac7 SG |
982 | for (i = 0; i < oprsz / 16; i++) { |
983 | Vd->Q(i) = int128_makes64(Vj->D(2 * i)); | |
984 | } | |
9b21a7a5 SG |
985 | } |
986 | ||
ff27e335 | 987 | void HELPER(vextl_qu_du)(void *vd, void *vj, uint32_t desc) |
9b21a7a5 | 988 | { |
6567eac7 | 989 | int i; |
ff27e335 SG |
990 | VReg *Vd = (VReg *)vd; |
991 | VReg *Vj = (VReg *)vj; | |
6567eac7 | 992 | int oprsz = simd_oprsz(desc); |
9b21a7a5 | 993 | |
6567eac7 SG |
994 | for (i = 0; i < oprsz / 16; i++) { |
995 | Vd->Q(i) = int128_make64(Vj->UD(2 * i)); | |
996 | } | |
9b21a7a5 SG |
997 | } |
998 | ||
999 | VSLLWIL(vsllwil_h_b, 16, H, B) | |
1000 | VSLLWIL(vsllwil_w_h, 32, W, H) | |
1001 | VSLLWIL(vsllwil_d_w, 64, D, W) | |
1002 | VSLLWIL(vsllwil_hu_bu, 16, UH, UB) | |
1003 | VSLLWIL(vsllwil_wu_hu, 32, UW, UH) | |
1004 | VSLLWIL(vsllwil_du_wu, 64, UD, UW) | |
ecb93716 SG |
1005 | |
/*
 * Logical right shift with rounding: the last bit shifted out is added
 * back to the result.  A shift count of zero returns the value unchanged.
 */
#define do_vsrlr(E, T)                                  \
static T do_vsrlr_ ##E(T s1, int sh)                    \
{                                                       \
    T round_bit;                                        \
                                                        \
    if (sh == 0) {                                      \
        return s1;                                      \
    }                                                   \
    round_bit = (s1 >> (sh - 1)) & 0x1;                 \
    return (s1 >> sh) + round_bit;                      \
}

do_vsrlr(B, uint8_t)
do_vsrlr(H, uint16_t)
do_vsrlr(W, uint32_t)
do_vsrlr(D, uint64_t)
1020 | ||
1021 | #define VSRLR(NAME, BIT, T, E) \ | |
04711da1 | 1022 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ |
ecb93716 SG |
1023 | { \ |
1024 | int i; \ | |
04711da1 SG |
1025 | VReg *Vd = (VReg *)vd; \ |
1026 | VReg *Vj = (VReg *)vj; \ | |
1027 | VReg *Vk = (VReg *)vk; \ | |
8c272fe8 | 1028 | int oprsz = simd_oprsz(desc); \ |
ecb93716 | 1029 | \ |
8c272fe8 | 1030 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ |
ecb93716 SG |
1031 | Vd->E(i) = do_vsrlr_ ## E(Vj->E(i), ((T)Vk->E(i))%BIT); \ |
1032 | } \ | |
1033 | } | |
1034 | ||
1035 | VSRLR(vsrlr_b, 8, uint8_t, B) | |
1036 | VSRLR(vsrlr_h, 16, uint16_t, H) | |
1037 | VSRLR(vsrlr_w, 32, uint32_t, W) | |
1038 | VSRLR(vsrlr_d, 64, uint64_t, D) | |
1039 | ||
329517d5 SG |
1040 | #define VSRLRI(NAME, BIT, E) \ |
1041 | void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ | |
1042 | { \ | |
1043 | int i; \ | |
1044 | VReg *Vd = (VReg *)vd; \ | |
1045 | VReg *Vj = (VReg *)vj; \ | |
8c272fe8 | 1046 | int oprsz = simd_oprsz(desc); \ |
329517d5 | 1047 | \ |
8c272fe8 | 1048 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ |
329517d5 SG |
1049 | Vd->E(i) = do_vsrlr_ ## E(Vj->E(i), imm); \ |
1050 | } \ | |
ecb93716 SG |
1051 | } |
1052 | ||
1053 | VSRLRI(vsrlri_b, 8, B) | |
1054 | VSRLRI(vsrlri_h, 16, H) | |
1055 | VSRLRI(vsrlri_w, 32, W) | |
1056 | VSRLRI(vsrlri_d, 64, D) | |
1057 | ||
/*
 * Right shift of a signed value with rounding: the last bit shifted out
 * is added back to the result.  A shift count of zero returns the value
 * unchanged.
 */
#define do_vsrar(E, T)                                  \
static T do_vsrar_ ##E(T s1, int sh)                    \
{                                                       \
    T shifted, round_bit;                               \
                                                        \
    if (sh == 0) {                                      \
        return s1;                                      \
    }                                                   \
    shifted = s1 >> sh;                                 \
    round_bit = (s1 >> (sh - 1)) & 0x1;                 \
    return shifted + round_bit;                         \
}

do_vsrar(B, int8_t)
do_vsrar(H, int16_t)
do_vsrar(W, int32_t)
do_vsrar(D, int64_t)
1072 | ||
1073 | #define VSRAR(NAME, BIT, T, E) \ | |
04711da1 | 1074 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ |
ecb93716 SG |
1075 | { \ |
1076 | int i; \ | |
04711da1 SG |
1077 | VReg *Vd = (VReg *)vd; \ |
1078 | VReg *Vj = (VReg *)vj; \ | |
1079 | VReg *Vk = (VReg *)vk; \ | |
8c272fe8 | 1080 | int oprsz = simd_oprsz(desc); \ |
ecb93716 | 1081 | \ |
8c272fe8 | 1082 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ |
ecb93716 SG |
1083 | Vd->E(i) = do_vsrar_ ## E(Vj->E(i), ((T)Vk->E(i))%BIT); \ |
1084 | } \ | |
1085 | } | |
1086 | ||
1087 | VSRAR(vsrar_b, 8, uint8_t, B) | |
1088 | VSRAR(vsrar_h, 16, uint16_t, H) | |
1089 | VSRAR(vsrar_w, 32, uint32_t, W) | |
1090 | VSRAR(vsrar_d, 64, uint64_t, D) | |
1091 | ||
329517d5 SG |
1092 | #define VSRARI(NAME, BIT, E) \ |
1093 | void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ | |
1094 | { \ | |
1095 | int i; \ | |
1096 | VReg *Vd = (VReg *)vd; \ | |
1097 | VReg *Vj = (VReg *)vj; \ | |
8c272fe8 | 1098 | int oprsz = simd_oprsz(desc); \ |
329517d5 | 1099 | \ |
8c272fe8 | 1100 | for (i = 0; i < oprsz / (BIT / 8); i++) { \ |
329517d5 SG |
1101 | Vd->E(i) = do_vsrar_ ## E(Vj->E(i), imm); \ |
1102 | } \ | |
ecb93716 SG |
1103 | } |
1104 | ||
1105 | VSRARI(vsrari_b, 8, B) | |
1106 | VSRARI(vsrari_h, 16, H) | |
1107 | VSRARI(vsrari_w, 32, W) | |
1108 | VSRARI(vsrari_d, 64, D) | |
d79fb8dd SG |
1109 | |
/*
 * Right shift a by b using the C >> operator on a's promoted type.
 * Both arguments are parenthesized so that compound expressions (for
 * example a shift count built with | or a conditional) bind as intended.
 */
#define R_SHIFT(a, b) ((a) >> (b))
1111 | ||
40c7674e SG |
1112 | #define VSRLN(NAME, BIT, E1, E2) \ |
1113 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
1114 | { \ | |
1115 | int i, j, ofs; \ | |
1116 | VReg *Vd = (VReg *)vd; \ | |
1117 | VReg *Vj = (VReg *)vj; \ | |
1118 | VReg *Vk = (VReg *)vk; \ | |
1119 | int oprsz = simd_oprsz(desc); \ | |
1120 | \ | |
1121 | ofs = LSX_LEN / BIT; \ | |
1122 | for (i = 0; i < oprsz / 16; i++) { \ | |
1123 | for (j = 0; j < ofs; j++) { \ | |
1124 | Vd->E1(j + ofs * 2 * i) = R_SHIFT(Vj->E2(j + ofs * i), \ | |
1125 | Vk->E2(j + ofs * i) % BIT); \ | |
1126 | } \ | |
1127 | Vd->D(2 * i + 1) = 0; \ | |
1128 | } \ | |
1129 | } | |
1130 | ||
1131 | VSRLN(vsrln_b_h, 16, B, UH) | |
1132 | VSRLN(vsrln_h_w, 32, H, UW) | |
1133 | VSRLN(vsrln_w_d, 64, W, UD) | |
1134 | ||
1135 | #define VSRAN(NAME, BIT, E1, E2, E3) \ | |
1136 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
1137 | { \ | |
1138 | int i, j, ofs; \ | |
1139 | VReg *Vd = (VReg *)vd; \ | |
1140 | VReg *Vj = (VReg *)vj; \ | |
1141 | VReg *Vk = (VReg *)vk; \ | |
1142 | int oprsz = simd_oprsz(desc); \ | |
1143 | \ | |
1144 | ofs = LSX_LEN / BIT; \ | |
1145 | for (i = 0; i < oprsz / 16; i++) { \ | |
1146 | for (j = 0; j < ofs; j++) { \ | |
1147 | Vd->E1(j + ofs * 2 * i) = R_SHIFT(Vj->E2(j + ofs * i), \ | |
1148 | Vk->E3(j + ofs * i) % BIT); \ | |
1149 | } \ | |
1150 | Vd->D(2 * i + 1) = 0; \ | |
1151 | } \ | |
1152 | } | |
1153 | ||
1154 | VSRAN(vsran_b_h, 16, B, H, UH) | |
1155 | VSRAN(vsran_h_w, 32, H, W, UW) | |
1156 | VSRAN(vsran_w_d, 64, W, D, UD) | |
1157 | ||
/*
 * Narrowing right shift by immediate.  For each 16-byte lane the low
 * half of the result holds the shifted elements of Vj and the high half
 * holds the shifted elements of the old Vd.  The result is assembled in
 * a zeroed temporary because Vd is both a source and the destination.
 */
#define VSRLNI(NAME, BIT, E1, E2)                                         \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)       \
{                                                                         \
    VReg *Vd = (VReg *)vd;                                                \
    VReg *Vj = (VReg *)vj;                                                \
    VReg result = {};                                                     \
    const int lanes = simd_oprsz(desc) / 16;                              \
    const int per_lane = LSX_LEN / BIT;                                   \
    int lane, k;                                                          \
                                                                          \
    for (lane = 0; lane < lanes; lane++) {                                \
        for (k = 0; k < per_lane; k++) {                                  \
            const int src = k + per_lane * lane;                          \
            const int lo = k + per_lane * 2 * lane;                       \
            const int hi = k + per_lane * (2 * lane + 1);                 \
                                                                          \
            result.E1(lo) = R_SHIFT(Vj->E2(src), imm);                    \
            result.E1(hi) = R_SHIFT(Vd->E2(src), imm);                    \
        }                                                                 \
    }                                                                     \
    *Vd = result;                                                         \
}
1177 | ||
1178 | void HELPER(vsrlni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) | |
d79fb8dd | 1179 | { |
40c7674e SG |
1180 | int i; |
1181 | VReg temp = {}; | |
329517d5 SG |
1182 | VReg *Vd = (VReg *)vd; |
1183 | VReg *Vj = (VReg *)vj; | |
d79fb8dd | 1184 | |
40c7674e SG |
1185 | for (i = 0; i < 2; i++) { |
1186 | temp.D(2 * i) = int128_getlo(int128_urshift(Vj->Q(i), imm % 128)); | |
1187 | temp.D(2 * i +1) = int128_getlo(int128_urshift(Vd->Q(i), imm % 128)); | |
1188 | } | |
d79fb8dd SG |
1189 | *Vd = temp; |
1190 | } | |
1191 | ||
40c7674e SG |
1192 | VSRLNI(vsrlni_b_h, 16, B, UH) |
1193 | VSRLNI(vsrlni_h_w, 32, H, UW) | |
1194 | VSRLNI(vsrlni_w_d, 64, W, UD) | |
1195 | ||
/*
 * Narrowing right shift by immediate for signed source elements.  For
 * each 16-byte lane the low half of the result holds the shifted
 * elements of Vj and the high half holds the shifted elements of the
 * old Vd.  A zeroed temporary is used because Vd is also a source.
 */
#define VSRANI(NAME, BIT, E1, E2)                                         \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)       \
{                                                                         \
    VReg *Vd = (VReg *)vd;                                                \
    VReg *Vj = (VReg *)vj;                                                \
    VReg result = {};                                                     \
    const int lanes = simd_oprsz(desc) / 16;                              \
    const int per_lane = LSX_LEN / BIT;                                   \
    int lane, k;                                                          \
                                                                          \
    for (lane = 0; lane < lanes; lane++) {                                \
        for (k = 0; k < per_lane; k++) {                                  \
            const int src = k + per_lane * lane;                          \
            const int lo = k + per_lane * 2 * lane;                       \
            const int hi = k + per_lane * (2 * lane + 1);                 \
                                                                          \
            result.E1(lo) = R_SHIFT(Vj->E2(src), imm);                    \
            result.E1(hi) = R_SHIFT(Vd->E2(src), imm);                    \
        }                                                                 \
    }                                                                     \
    *Vd = result;                                                         \
}
1215 | ||
1216 | void HELPER(vsrani_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) | |
d79fb8dd | 1217 | { |
40c7674e SG |
1218 | int i; |
1219 | VReg temp = {}; | |
329517d5 SG |
1220 | VReg *Vd = (VReg *)vd; |
1221 | VReg *Vj = (VReg *)vj; | |
d79fb8dd | 1222 | |
40c7674e SG |
1223 | for (i = 0; i < 2; i++) { |
1224 | temp.D(2 * i) = int128_getlo(int128_rshift(Vj->Q(i), imm % 128)); | |
1225 | temp.D(2 * i + 1) = int128_getlo(int128_rshift(Vd->Q(i), imm % 128)); | |
1226 | } | |
d79fb8dd SG |
1227 | *Vd = temp; |
1228 | } | |
1229 | ||
1230 | VSRANI(vsrani_b_h, 16, B, H) | |
1231 | VSRANI(vsrani_h_w, 32, H, W) | |
1232 | VSRANI(vsrani_w_d, 64, W, D) | |
a5200a17 SG |
1233 | |
1234 | #define VSRLRN(NAME, BIT, T, E1, E2) \ | |
04711da1 | 1235 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ |
a5200a17 SG |
1236 | { \ |
1237 | int i; \ | |
04711da1 SG |
1238 | VReg *Vd = (VReg *)vd; \ |
1239 | VReg *Vj = (VReg *)vj; \ | |
1240 | VReg *Vk = (VReg *)vk; \ | |
a5200a17 SG |
1241 | \ |
1242 | for (i = 0; i < LSX_LEN/BIT; i++) { \ | |
1243 | Vd->E1(i) = do_vsrlr_ ## E2(Vj->E2(i), ((T)Vk->E2(i))%BIT); \ | |
1244 | } \ | |
1245 | Vd->D(1) = 0; \ | |
1246 | } | |
1247 | ||
1248 | VSRLRN(vsrlrn_b_h, 16, uint16_t, B, H) | |
1249 | VSRLRN(vsrlrn_h_w, 32, uint32_t, H, W) | |
1250 | VSRLRN(vsrlrn_w_d, 64, uint64_t, W, D) | |
1251 | ||
1252 | #define VSRARN(NAME, BIT, T, E1, E2) \ | |
04711da1 | 1253 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ |
a5200a17 SG |
1254 | { \ |
1255 | int i; \ | |
04711da1 SG |
1256 | VReg *Vd = (VReg *)vd; \ |
1257 | VReg *Vj = (VReg *)vj; \ | |
1258 | VReg *Vk = (VReg *)vk; \ | |
a5200a17 SG |
1259 | \ |
1260 | for (i = 0; i < LSX_LEN/BIT; i++) { \ | |
1261 | Vd->E1(i) = do_vsrar_ ## E2(Vj->E2(i), ((T)Vk->E2(i))%BIT); \ | |
1262 | } \ | |
1263 | Vd->D(1) = 0; \ | |
1264 | } | |
1265 | ||
1266 | VSRARN(vsrarn_b_h, 16, uint8_t, B, H) | |
1267 | VSRARN(vsrarn_h_w, 32, uint16_t, H, W) | |
1268 | VSRARN(vsrarn_w_d, 64, uint32_t, W, D) | |
1269 | ||
329517d5 SG |
/*
 * Narrowing rounding logical right shift by immediate.  The first
 * LSX_LEN / BIT narrow elements of the result come from Vj and the next
 * LSX_LEN / BIT from the old Vd.
 *
 * The temporary is fully zero-initialized: it is copied to Vd as a whole,
 * so any part the loop does not write must not carry indeterminate stack
 * contents into the register.
 */
#define VSRLRNI(NAME, BIT, E1, E2)                                  \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)  \
{                                                                   \
    int i, max;                                                     \
    VReg temp = {};                                                 \
    VReg *Vd = (VReg *)vd;                                          \
    VReg *Vj = (VReg *)vj;                                          \
                                                                    \
    max = LSX_LEN / BIT;                                            \
    for (i = 0; i < max; i++) {                                     \
        temp.E1(i) = do_vsrlr_ ## E2(Vj->E2(i), imm);               \
        temp.E1(i + max) = do_vsrlr_ ## E2(Vd->E2(i), imm);         \
    }                                                               \
    *Vd = temp;                                                     \
}
1287 | ||
1288 | void HELPER(vsrlrni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) | |
a5200a17 SG |
1289 | { |
1290 | VReg temp; | |
329517d5 SG |
1291 | VReg *Vd = (VReg *)vd; |
1292 | VReg *Vj = (VReg *)vj; | |
a5200a17 SG |
1293 | Int128 r1, r2; |
1294 | ||
1295 | if (imm == 0) { | |
1296 | temp.D(0) = int128_getlo(Vj->Q(0)); | |
1297 | temp.D(1) = int128_getlo(Vd->Q(0)); | |
1298 | } else { | |
1299 | r1 = int128_and(int128_urshift(Vj->Q(0), (imm -1)), int128_one()); | |
1300 | r2 = int128_and(int128_urshift(Vd->Q(0), (imm -1)), int128_one()); | |
1301 | ||
1302 | temp.D(0) = int128_getlo(int128_add(int128_urshift(Vj->Q(0), imm), r1)); | |
1303 | temp.D(1) = int128_getlo(int128_add(int128_urshift(Vd->Q(0), imm), r2)); | |
1304 | } | |
1305 | *Vd = temp; | |
1306 | } | |
1307 | ||
1308 | VSRLRNI(vsrlrni_b_h, 16, B, H) | |
1309 | VSRLRNI(vsrlrni_h_w, 32, H, W) | |
1310 | VSRLRNI(vsrlrni_w_d, 64, W, D) | |
1311 | ||
329517d5 SG |
/*
 * Narrowing rounding right shift by immediate for signed source
 * elements.  The first LSX_LEN / BIT narrow elements of the result come
 * from Vj and the next LSX_LEN / BIT from the old Vd.
 *
 * The temporary is fully zero-initialized: it is copied to Vd as a whole,
 * so any part the loop does not write must not carry indeterminate stack
 * contents into the register.
 */
#define VSRARNI(NAME, BIT, E1, E2)                                  \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)  \
{                                                                   \
    int i, max;                                                     \
    VReg temp = {};                                                 \
    VReg *Vd = (VReg *)vd;                                          \
    VReg *Vj = (VReg *)vj;                                          \
                                                                    \
    max = LSX_LEN / BIT;                                            \
    for (i = 0; i < max; i++) {                                     \
        temp.E1(i) = do_vsrar_ ## E2(Vj->E2(i), imm);               \
        temp.E1(i + max) = do_vsrar_ ## E2(Vd->E2(i), imm);         \
    }                                                               \
    *Vd = temp;                                                     \
}
1329 | ||
1330 | void HELPER(vsrarni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) | |
a5200a17 SG |
1331 | { |
1332 | VReg temp; | |
329517d5 SG |
1333 | VReg *Vd = (VReg *)vd; |
1334 | VReg *Vj = (VReg *)vj; | |
a5200a17 SG |
1335 | Int128 r1, r2; |
1336 | ||
1337 | if (imm == 0) { | |
1338 | temp.D(0) = int128_getlo(Vj->Q(0)); | |
1339 | temp.D(1) = int128_getlo(Vd->Q(0)); | |
1340 | } else { | |
1341 | r1 = int128_and(int128_rshift(Vj->Q(0), (imm -1)), int128_one()); | |
1342 | r2 = int128_and(int128_rshift(Vd->Q(0), (imm -1)), int128_one()); | |
1343 | ||
1344 | temp.D(0) = int128_getlo(int128_add(int128_rshift(Vj->Q(0), imm), r1)); | |
1345 | temp.D(1) = int128_getlo(int128_add(int128_rshift(Vd->Q(0), imm), r2)); | |
1346 | } | |
1347 | *Vd = temp; | |
1348 | } | |
1349 | ||
1350 | VSRARNI(vsrarni_b_h, 16, B, H) | |
1351 | VSRARNI(vsrarni_h_w, 32, H, W) | |
1352 | VSRARNI(vsrarni_w_d, 64, W, D) | |
83b3815d SG |
1353 | |
/*
 * Logical right shift of e2 (reinterpreted as unsigned T1) by sa, then
 * clamp the result to the maximum value (1 << sh) - 1 held in a T3.
 */
#define SSRLNS(NAME, T1, T2, T3)                    \
static T1 do_ssrlns_ ## NAME(T2 e2, int sa, int sh) \
{                                                   \
    const T3 limit = (1ull << sh) - 1;              \
    const T1 shifted = ((T1)e2) >> sa;              \
                                                    \
    return shifted > limit ? limit : shifted;       \
}

SSRLNS(B, uint16_t, int16_t, uint8_t)
SSRLNS(H, uint32_t, int32_t, uint16_t)
SSRLNS(W, uint64_t, int64_t, uint32_t)
1375 | ||
1376 | #define VSSRLN(NAME, BIT, T, E1, E2) \ | |
04711da1 | 1377 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ |
83b3815d SG |
1378 | { \ |
1379 | int i; \ | |
04711da1 SG |
1380 | VReg *Vd = (VReg *)vd; \ |
1381 | VReg *Vj = (VReg *)vj; \ | |
1382 | VReg *Vk = (VReg *)vk; \ | |
83b3815d SG |
1383 | \ |
1384 | for (i = 0; i < LSX_LEN/BIT; i++) { \ | |
1385 | Vd->E1(i) = do_ssrlns_ ## E1(Vj->E2(i), (T)Vk->E2(i)% BIT, BIT/2 -1); \ | |
1386 | } \ | |
1387 | Vd->D(1) = 0; \ | |
1388 | } | |
1389 | ||
1390 | VSSRLN(vssrln_b_h, 16, uint16_t, B, H) | |
1391 | VSSRLN(vssrln_h_w, 32, uint32_t, H, W) | |
1392 | VSSRLN(vssrln_w_d, 64, uint64_t, W, D) | |
1393 | ||
/*
 * Arithmetic right shift of e2 by sa, then saturate the result to the
 * signed range [-(1 << sh), (1 << sh) - 1].
 *
 * The bounds are held in the wide type T1.  Computing the lower bound as
 * -(mask + 1) in the narrow type T2 overflows int when T2 is int32_t and
 * mask is INT32_MAX; ~max in T1 yields the same bound without overflow.
 */
#define SSRANS(E, T1, T2)                        \
static T1 do_ssrans_ ## E(T1 e2, int sa, int sh) \
{                                                \
    T1 shft_res;                                 \
    T2 mask;                                     \
    T1 max, min;                                 \
                                                 \
    if (sa == 0) {                               \
        shft_res = e2;                           \
    } else {                                     \
        shft_res = e2 >> sa;                     \
    }                                            \
    mask = (1ll << sh) - 1;                      \
    max = mask;                                  \
    min = ~max;                                  \
    if (shft_res > max) {                        \
        return max;                              \
    } else if (shft_res < min) {                 \
        return min;                              \
    } else {                                     \
        return shft_res;                         \
    }                                            \
}

SSRANS(B, int16_t, int8_t)
SSRANS(H, int32_t, int16_t)
SSRANS(W, int64_t, int32_t)
1417 | ||
1418 | #define VSSRAN(NAME, BIT, T, E1, E2) \ | |
04711da1 | 1419 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ |
83b3815d SG |
1420 | { \ |
1421 | int i; \ | |
04711da1 SG |
1422 | VReg *Vd = (VReg *)vd; \ |
1423 | VReg *Vj = (VReg *)vj; \ | |
1424 | VReg *Vk = (VReg *)vk; \ | |
83b3815d SG |
1425 | \ |
1426 | for (i = 0; i < LSX_LEN/BIT; i++) { \ | |
1427 | Vd->E1(i) = do_ssrans_ ## E1(Vj->E2(i), (T)Vk->E2(i)%BIT, BIT/2 -1); \ | |
1428 | } \ | |
1429 | Vd->D(1) = 0; \ | |
1430 | } | |
1431 | ||
1432 | VSSRAN(vssran_b_h, 16, uint16_t, B, H) | |
1433 | VSSRAN(vssran_h_w, 32, uint32_t, H, W) | |
1434 | VSSRAN(vssran_w_d, 64, uint64_t, W, D) | |
1435 | ||
/*
 * Logical right shift of e2 (reinterpreted as unsigned T1) by sa, then
 * clamp the result to the maximum value (1 << sh) - 1 held in a T2.
 */
#define SSRLNU(E, T1, T2, T3)                       \
static T1 do_ssrlnu_ ## E(T3 e2, int sa, int sh)    \
{                                                   \
    const T2 limit = (1ull << sh) - 1;              \
    const T1 shifted = ((T1)e2) >> sa;              \
                                                    \
    return shifted > limit ? limit : shifted;       \
}

SSRLNU(B, uint16_t, uint8_t,  int16_t)
SSRLNU(H, uint32_t, uint16_t, int32_t)
SSRLNU(W, uint64_t, uint32_t, int64_t)
1457 | ||
1458 | #define VSSRLNU(NAME, BIT, T, E1, E2) \ | |
04711da1 | 1459 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ |
83b3815d SG |
1460 | { \ |
1461 | int i; \ | |
04711da1 SG |
1462 | VReg *Vd = (VReg *)vd; \ |
1463 | VReg *Vj = (VReg *)vj; \ | |
1464 | VReg *Vk = (VReg *)vk; \ | |
83b3815d SG |
1465 | \ |
1466 | for (i = 0; i < LSX_LEN/BIT; i++) { \ | |
1467 | Vd->E1(i) = do_ssrlnu_ ## E1(Vj->E2(i), (T)Vk->E2(i)%BIT, BIT/2); \ | |
1468 | } \ | |
1469 | Vd->D(1) = 0; \ | |
1470 | } | |
1471 | ||
1472 | VSSRLNU(vssrln_bu_h, 16, uint16_t, B, H) | |
1473 | VSSRLNU(vssrln_hu_w, 32, uint32_t, H, W) | |
1474 | VSSRLNU(vssrln_wu_d, 64, uint64_t, W, D) | |
1475 | ||
/*
 * Arithmetic right shift of e2 by sa with unsigned saturation: negative
 * inputs yield 0, and results above (1 << sh) - 1 are clamped to it.
 */
#define SSRANU(E, T1, T2, T3)                       \
static T1 do_ssranu_ ## E(T3 e2, int sa, int sh)    \
{                                                   \
    const T2 limit = (1ull << sh) - 1;              \
    T1 shifted;                                     \
                                                    \
    if (e2 < 0) {                                   \
        return 0;                                   \
    }                                               \
    shifted = e2 >> sa;                             \
    return shifted > limit ? limit : shifted;       \
}

SSRANU(B, uint16_t, uint8_t,  int16_t)
SSRANU(H, uint32_t, uint16_t, int32_t)
SSRANU(W, uint64_t, uint32_t, int64_t)
1500 | ||
1501 | #define VSSRANU(NAME, BIT, T, E1, E2) \ | |
04711da1 | 1502 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ |
83b3815d SG |
1503 | { \ |
1504 | int i; \ | |
04711da1 SG |
1505 | VReg *Vd = (VReg *)vd; \ |
1506 | VReg *Vj = (VReg *)vj; \ | |
1507 | VReg *Vk = (VReg *)vk; \ | |
83b3815d SG |
1508 | \ |
1509 | for (i = 0; i < LSX_LEN/BIT; i++) { \ | |
1510 | Vd->E1(i) = do_ssranu_ ## E1(Vj->E2(i), (T)Vk->E2(i)%BIT, BIT/2); \ | |
1511 | } \ | |
1512 | Vd->D(1) = 0; \ | |
1513 | } | |
1514 | ||
1515 | VSSRANU(vssran_bu_h, 16, uint16_t, B, H) | |
1516 | VSSRANU(vssran_hu_w, 32, uint32_t, H, W) | |
1517 | VSSRANU(vssran_wu_d, 64, uint64_t, W, D) | |
1518 | ||
/*
 * Saturating narrowing logical right shift by immediate (signed result
 * range).  The first LSX_LEN / BIT narrow elements of the result come
 * from Vj and the next LSX_LEN / BIT from the old Vd.
 *
 * The temporary is zero-initialized: it is copied to Vd as a whole, so
 * any part the loop does not write must not carry indeterminate stack
 * contents into the register.
 */
#define VSSRLNI(NAME, BIT, E1, E2)                                            \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)            \
{                                                                             \
    int i;                                                                    \
    VReg temp = {};                                                           \
    VReg *Vd = (VReg *)vd;                                                    \
    VReg *Vj = (VReg *)vj;                                                    \
                                                                              \
    for (i = 0; i < LSX_LEN / BIT; i++) {                                     \
        temp.E1(i) = do_ssrlns_ ## E1(Vj->E2(i), imm, BIT / 2 - 1);           \
        temp.E1(i + LSX_LEN / BIT) = do_ssrlns_ ## E1(Vd->E2(i), imm,         \
                                                      BIT / 2 - 1);           \
    }                                                                         \
    *Vd = temp;                                                               \
}
1533 | ||
329517d5 | 1534 | void HELPER(vssrlni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) |
83b3815d SG |
1535 | { |
1536 | Int128 shft_res1, shft_res2, mask; | |
329517d5 SG |
1537 | VReg *Vd = (VReg *)vd; |
1538 | VReg *Vj = (VReg *)vj; | |
83b3815d SG |
1539 | |
1540 | if (imm == 0) { | |
1541 | shft_res1 = Vj->Q(0); | |
1542 | shft_res2 = Vd->Q(0); | |
1543 | } else { | |
1544 | shft_res1 = int128_urshift(Vj->Q(0), imm); | |
1545 | shft_res2 = int128_urshift(Vd->Q(0), imm); | |
1546 | } | |
1547 | mask = int128_sub(int128_lshift(int128_one(), 63), int128_one()); | |
1548 | ||
1549 | if (int128_ult(mask, shft_res1)) { | |
1550 | Vd->D(0) = int128_getlo(mask); | |
1551 | }else { | |
1552 | Vd->D(0) = int128_getlo(shft_res1); | |
1553 | } | |
1554 | ||
1555 | if (int128_ult(mask, shft_res2)) { | |
1556 | Vd->D(1) = int128_getlo(mask); | |
1557 | }else { | |
1558 | Vd->D(1) = int128_getlo(shft_res2); | |
1559 | } | |
1560 | } | |
1561 | ||
1562 | VSSRLNI(vssrlni_b_h, 16, B, H) | |
1563 | VSSRLNI(vssrlni_h_w, 32, H, W) | |
1564 | VSSRLNI(vssrlni_w_d, 64, W, D) | |
1565 | ||
/*
 * Saturating narrowing arithmetic right shift by immediate (signed
 * result range).  The first LSX_LEN / BIT narrow elements of the result
 * come from Vj and the next LSX_LEN / BIT from the old Vd.
 *
 * The temporary is zero-initialized: it is copied to Vd as a whole, so
 * any part the loop does not write must not carry indeterminate stack
 * contents into the register.
 */
#define VSSRANI(NAME, BIT, E1, E2)                                            \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)            \
{                                                                             \
    int i;                                                                    \
    VReg temp = {};                                                           \
    VReg *Vd = (VReg *)vd;                                                    \
    VReg *Vj = (VReg *)vj;                                                    \
                                                                              \
    for (i = 0; i < LSX_LEN / BIT; i++) {                                     \
        temp.E1(i) = do_ssrans_ ## E1(Vj->E2(i), imm, BIT / 2 - 1);           \
        temp.E1(i + LSX_LEN / BIT) = do_ssrans_ ## E1(Vd->E2(i), imm,         \
                                                      BIT / 2 - 1);           \
    }                                                                         \
    *Vd = temp;                                                               \
}
1580 | ||
329517d5 | 1581 | void HELPER(vssrani_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) |
83b3815d SG |
1582 | { |
1583 | Int128 shft_res1, shft_res2, mask, min; | |
329517d5 SG |
1584 | VReg *Vd = (VReg *)vd; |
1585 | VReg *Vj = (VReg *)vj; | |
83b3815d SG |
1586 | |
1587 | if (imm == 0) { | |
1588 | shft_res1 = Vj->Q(0); | |
1589 | shft_res2 = Vd->Q(0); | |
1590 | } else { | |
1591 | shft_res1 = int128_rshift(Vj->Q(0), imm); | |
1592 | shft_res2 = int128_rshift(Vd->Q(0), imm); | |
1593 | } | |
1594 | mask = int128_sub(int128_lshift(int128_one(), 63), int128_one()); | |
1595 | min = int128_lshift(int128_one(), 63); | |
1596 | ||
1597 | if (int128_gt(shft_res1, mask)) { | |
1598 | Vd->D(0) = int128_getlo(mask); | |
1599 | } else if (int128_lt(shft_res1, int128_neg(min))) { | |
1600 | Vd->D(0) = int128_getlo(min); | |
1601 | } else { | |
1602 | Vd->D(0) = int128_getlo(shft_res1); | |
1603 | } | |
1604 | ||
1605 | if (int128_gt(shft_res2, mask)) { | |
1606 | Vd->D(1) = int128_getlo(mask); | |
1607 | } else if (int128_lt(shft_res2, int128_neg(min))) { | |
1608 | Vd->D(1) = int128_getlo(min); | |
1609 | } else { | |
1610 | Vd->D(1) = int128_getlo(shft_res2); | |
1611 | } | |
1612 | } | |
1613 | ||
1614 | VSSRANI(vssrani_b_h, 16, B, H) | |
1615 | VSSRANI(vssrani_h_w, 32, H, W) | |
1616 | VSSRANI(vssrani_w_d, 64, W, D) | |
1617 | ||
/*
 * Saturating narrowing logical right shift by immediate (unsigned result
 * range).  The first LSX_LEN / BIT narrow elements of the result come
 * from Vj and the next LSX_LEN / BIT from the old Vd.
 *
 * The temporary is zero-initialized: it is copied to Vd as a whole, so
 * any part the loop does not write must not carry indeterminate stack
 * contents into the register.
 */
#define VSSRLNUI(NAME, BIT, E1, E2)                                           \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)            \
{                                                                             \
    int i;                                                                    \
    VReg temp = {};                                                           \
    VReg *Vd = (VReg *)vd;                                                    \
    VReg *Vj = (VReg *)vj;                                                    \
                                                                              \
    for (i = 0; i < LSX_LEN / BIT; i++) {                                     \
        temp.E1(i) = do_ssrlnu_ ## E1(Vj->E2(i), imm, BIT / 2);               \
        temp.E1(i + LSX_LEN / BIT) = do_ssrlnu_ ## E1(Vd->E2(i), imm,         \
                                                      BIT / 2);               \
    }                                                                         \
    *Vd = temp;                                                               \
}
1632 | ||
329517d5 | 1633 | void HELPER(vssrlni_du_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) |
83b3815d SG |
1634 | { |
1635 | Int128 shft_res1, shft_res2, mask; | |
329517d5 SG |
1636 | VReg *Vd = (VReg *)vd; |
1637 | VReg *Vj = (VReg *)vj; | |
83b3815d SG |
1638 | |
1639 | if (imm == 0) { | |
1640 | shft_res1 = Vj->Q(0); | |
1641 | shft_res2 = Vd->Q(0); | |
1642 | } else { | |
1643 | shft_res1 = int128_urshift(Vj->Q(0), imm); | |
1644 | shft_res2 = int128_urshift(Vd->Q(0), imm); | |
1645 | } | |
1646 | mask = int128_sub(int128_lshift(int128_one(), 64), int128_one()); | |
1647 | ||
1648 | if (int128_ult(mask, shft_res1)) { | |
1649 | Vd->D(0) = int128_getlo(mask); | |
1650 | }else { | |
1651 | Vd->D(0) = int128_getlo(shft_res1); | |
1652 | } | |
1653 | ||
1654 | if (int128_ult(mask, shft_res2)) { | |
1655 | Vd->D(1) = int128_getlo(mask); | |
1656 | }else { | |
1657 | Vd->D(1) = int128_getlo(shft_res2); | |
1658 | } | |
1659 | } | |
1660 | ||
1661 | VSSRLNUI(vssrlni_bu_h, 16, B, H) | |
1662 | VSSRLNUI(vssrlni_hu_w, 32, H, W) | |
1663 | VSSRLNUI(vssrlni_wu_d, 64, W, D) | |
1664 | ||
/*
 * Saturating narrowing arithmetic right shift by immediate (unsigned
 * result range).  The first LSX_LEN / BIT narrow elements of the result
 * come from Vj and the next LSX_LEN / BIT from the old Vd.
 *
 * The temporary is zero-initialized: it is copied to Vd as a whole, so
 * any part the loop does not write must not carry indeterminate stack
 * contents into the register.
 */
#define VSSRANUI(NAME, BIT, E1, E2)                                           \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)            \
{                                                                             \
    int i;                                                                    \
    VReg temp = {};                                                           \
    VReg *Vd = (VReg *)vd;                                                    \
    VReg *Vj = (VReg *)vj;                                                    \
                                                                              \
    for (i = 0; i < LSX_LEN / BIT; i++) {                                     \
        temp.E1(i) = do_ssranu_ ## E1(Vj->E2(i), imm, BIT / 2);               \
        temp.E1(i + LSX_LEN / BIT) = do_ssranu_ ## E1(Vd->E2(i), imm,         \
                                                      BIT / 2);               \
    }                                                                         \
    *Vd = temp;                                                               \
}
1679 | ||
329517d5 | 1680 | void HELPER(vssrani_du_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) |
83b3815d SG |
1681 | { |
1682 | Int128 shft_res1, shft_res2, mask; | |
329517d5 SG |
1683 | VReg *Vd = (VReg *)vd; |
1684 | VReg *Vj = (VReg *)vj; | |
83b3815d SG |
1685 | |
1686 | if (imm == 0) { | |
1687 | shft_res1 = Vj->Q(0); | |
1688 | shft_res2 = Vd->Q(0); | |
1689 | } else { | |
1690 | shft_res1 = int128_rshift(Vj->Q(0), imm); | |
1691 | shft_res2 = int128_rshift(Vd->Q(0), imm); | |
1692 | } | |
1693 | ||
1694 | if (int128_lt(Vj->Q(0), int128_zero())) { | |
1695 | shft_res1 = int128_zero(); | |
1696 | } | |
1697 | ||
1698 | if (int128_lt(Vd->Q(0), int128_zero())) { | |
1699 | shft_res2 = int128_zero(); | |
1700 | } | |
1701 | ||
1702 | mask = int128_sub(int128_lshift(int128_one(), 64), int128_one()); | |
1703 | ||
1704 | if (int128_ult(mask, shft_res1)) { | |
1705 | Vd->D(0) = int128_getlo(mask); | |
1706 | }else { | |
1707 | Vd->D(0) = int128_getlo(shft_res1); | |
1708 | } | |
1709 | ||
1710 | if (int128_ult(mask, shft_res2)) { | |
1711 | Vd->D(1) = int128_getlo(mask); | |
1712 | }else { | |
1713 | Vd->D(1) = int128_getlo(shft_res2); | |
1714 | } | |
1715 | } | |
1716 | ||
1717 | VSSRANUI(vssrani_bu_h, 16, B, H) | |
1718 | VSSRANUI(vssrani_hu_w, 32, H, W) | |
1719 | VSSRANUI(vssrani_wu_d, 64, W, D) | |
162cd32c SG |
1720 | |
/*
 * Rounding logical right shift of e2 by sa (via do_vsrlr_<E2>), then
 * clamp the result to (1 << sh) - 1.  T3 is accepted but not used.
 */
#define SSRLRNS(E1, E2, T1, T2, T3)                \
static T1 do_ssrlrns_ ## E1(T2 e2, int sa, int sh) \
{                                                  \
    const T1 limit = (1ull << sh) - 1;             \
    const T1 rounded = do_vsrlr_ ## E2(e2, sa);    \
                                                   \
    return rounded > limit ? limit : rounded;      \
}

SSRLRNS(B, H, uint16_t, int16_t, uint8_t)
SSRLRNS(H, W, uint32_t, int32_t, uint16_t)
SSRLRNS(W, D, uint64_t, int64_t, uint32_t)
1739 | ||
1740 | #define VSSRLRN(NAME, BIT, T, E1, E2) \ | |
04711da1 | 1741 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ |
162cd32c SG |
1742 | { \ |
1743 | int i; \ | |
04711da1 SG |
1744 | VReg *Vd = (VReg *)vd; \ |
1745 | VReg *Vj = (VReg *)vj; \ | |
1746 | VReg *Vk = (VReg *)vk; \ | |
162cd32c SG |
1747 | \ |
1748 | for (i = 0; i < LSX_LEN/BIT; i++) { \ | |
1749 | Vd->E1(i) = do_ssrlrns_ ## E1(Vj->E2(i), (T)Vk->E2(i)%BIT, BIT/2 -1); \ | |
1750 | } \ | |
1751 | Vd->D(1) = 0; \ | |
1752 | } | |
1753 | ||
1754 | VSSRLRN(vssrlrn_b_h, 16, uint16_t, B, H) | |
1755 | VSSRLRN(vssrlrn_h_w, 32, uint32_t, H, W) | |
1756 | VSSRLRN(vssrlrn_w_d, 64, uint64_t, W, D) | |
1757 | ||
/*
 * Rounding arithmetic right shift of e2 by sa (via do_vsrar_<E2>), then
 * saturate the result to the signed range [-(1 << sh), (1 << sh) - 1].
 *
 * The bounds are held in the wide type T1.  Computing the lower bound as
 * -(mask + 1) in the narrow type T2 overflows int when T2 is int32_t and
 * mask is INT32_MAX; ~max in T1 yields the same bound without overflow.
 */
#define SSRARNS(E1, E2, T1, T2)                    \
static T1 do_ssrarns_ ## E1(T1 e2, int sa, int sh) \
{                                                  \
    T1 shft_res;                                   \
    T2 mask;                                       \
    T1 max, min;                                   \
                                                   \
    shft_res = do_vsrar_ ## E2(e2, sa);            \
    mask = (1ll << sh) - 1;                        \
    max = mask;                                    \
    min = ~max;                                    \
    if (shft_res > max) {                          \
        return max;                                \
    } else if (shft_res < min) {                   \
        return min;                                \
    } else {                                       \
        return shft_res;                           \
    }                                              \
}

SSRARNS(B, H, int16_t, int8_t)
SSRARNS(H, W, int32_t, int16_t)
SSRARNS(W, D, int64_t, int32_t)
1778 | ||
1779 | #define VSSRARN(NAME, BIT, T, E1, E2) \ | |
04711da1 | 1780 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ |
162cd32c SG |
1781 | { \ |
1782 | int i; \ | |
04711da1 SG |
1783 | VReg *Vd = (VReg *)vd; \ |
1784 | VReg *Vj = (VReg *)vj; \ | |
1785 | VReg *Vk = (VReg *)vk; \ | |
162cd32c SG |
1786 | \ |
1787 | for (i = 0; i < LSX_LEN/BIT; i++) { \ | |
1788 | Vd->E1(i) = do_ssrarns_ ## E1(Vj->E2(i), (T)Vk->E2(i)%BIT, BIT/2 -1); \ | |
1789 | } \ | |
1790 | Vd->D(1) = 0; \ | |
1791 | } | |
1792 | ||
1793 | VSSRARN(vssrarn_b_h, 16, uint16_t, B, H) | |
1794 | VSSRARN(vssrarn_h_w, 32, uint32_t, H, W) | |
1795 | VSSRARN(vssrarn_w_d, 64, uint64_t, W, D) | |
1796 | ||
/*
 * SSRLRNU - generate do_ssrlrnu_<E1>().
 *
 * The generated function passes (e2, sa) to do_vsrlr_<E2>() and returns
 * that value clamped so it never exceeds (1 << sh) - 1.
 *
 *   E1 - suffix of the generated function name
 *   E2 - suffix of the do_vsrlr_<E2>() helper that is called
 *   T1 - type of the shifted value and of the return value
 *   T2 - type used to hold the clamp limit
 *   T3 - type of the source element parameter
 */
#define SSRLRNU(E1, E2, T1, T2, T3)                             \
static T1 do_ssrlrnu_ ## E1(T3 e2, int sa, int sh)              \
{                                                               \
    const T1 value = do_vsrlr_ ## E2(e2, sa);                   \
    const T2 limit = (1ull << sh) - 1;                          \
                                                                \
    return value > limit ? limit : value;                       \
}

SSRLRNU(B, H, uint16_t, uint8_t, int16_t)
SSRLRNU(H, W, uint32_t, uint16_t, int32_t)
SSRLRNU(W, D, uint64_t, uint32_t, int64_t)
1816 | ||
1817 | #define VSSRLRNU(NAME, BIT, T, E1, E2) \ | |
04711da1 | 1818 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ |
162cd32c SG |
1819 | { \ |
1820 | int i; \ | |
04711da1 SG |
1821 | VReg *Vd = (VReg *)vd; \ |
1822 | VReg *Vj = (VReg *)vj; \ | |
1823 | VReg *Vk = (VReg *)vk; \ | |
162cd32c SG |
1824 | \ |
1825 | for (i = 0; i < LSX_LEN/BIT; i++) { \ | |
1826 | Vd->E1(i) = do_ssrlrnu_ ## E1(Vj->E2(i), (T)Vk->E2(i)%BIT, BIT/2); \ | |
1827 | } \ | |
1828 | Vd->D(1) = 0; \ | |
1829 | } | |
1830 | ||
1831 | VSSRLRNU(vssrlrn_bu_h, 16, uint16_t, B, H) | |
1832 | VSSRLRNU(vssrlrn_hu_w, 32, uint32_t, H, W) | |
1833 | VSSRLRNU(vssrlrn_wu_d, 64, uint64_t, W, D) | |
1834 | ||
/*
 * SSRARNU - generate do_ssrarnu_<E1>().
 *
 * A negative source element yields 0.  Otherwise (e2, sa) is passed to
 * do_vsrar_<E2>() and the result is clamped so it never exceeds
 * (1 << sh) - 1.
 *
 *   E1 - suffix of the generated function name
 *   E2 - suffix of the do_vsrar_<E2>() helper that is called
 *   T1 - type of the shifted value and of the return value
 *   T2 - type used to hold the clamp limit
 *   T3 - type of the source element parameter
 */
#define SSRARNU(E1, E2, T1, T2, T3)                             \
static T1 do_ssrarnu_ ## E1(T3 e2, int sa, int sh)              \
{                                                               \
    const T2 limit = (1ull << sh) - 1;                          \
    T1 value = 0;                                               \
                                                                \
    if (e2 >= 0) {                                              \
        value = do_vsrar_ ## E2(e2, sa);                        \
    }                                                           \
    return value > limit ? limit : value;                       \
}

SSRARNU(B, H, uint16_t, uint8_t, int16_t)
SSRARNU(H, W, uint32_t, uint16_t, int32_t)
SSRARNU(W, D, uint64_t, uint32_t, int64_t)
1857 | ||
1858 | #define VSSRARNU(NAME, BIT, T, E1, E2) \ | |
04711da1 | 1859 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ |
162cd32c SG |
1860 | { \ |
1861 | int i; \ | |
04711da1 SG |
1862 | VReg *Vd = (VReg *)vd; \ |
1863 | VReg *Vj = (VReg *)vj; \ | |
1864 | VReg *Vk = (VReg *)vk; \ | |
162cd32c SG |
1865 | \ |
1866 | for (i = 0; i < LSX_LEN/BIT; i++) { \ | |
1867 | Vd->E1(i) = do_ssrarnu_ ## E1(Vj->E2(i), (T)Vk->E2(i)%BIT, BIT/2); \ | |
1868 | } \ | |
1869 | Vd->D(1) = 0; \ | |
1870 | } | |
1871 | ||
1872 | VSSRARNU(vssrarn_bu_h, 16, uint16_t, B, H) | |
1873 | VSSRARNU(vssrarn_hu_w, 32, uint32_t, H, W) | |
1874 | VSSRARNU(vssrarn_wu_d, 64, uint64_t, W, D) | |
1875 | ||
/*
 * VSSRLRNI - define HELPER(NAME), the immediate-shift form built on
 * do_ssrlrns_<E1>().
 *
 * The E2 elements of Vj fill the first LSX_LEN / BIT E1 slots of the
 * result and the E2 elements of the old Vd fill the following
 * LSX_LEN / BIT slots.  Every element is shifted by imm with a clamp width
 * of BIT / 2 - 1.  The result is assembled in a temporary and copied to Vd
 * at the end, so Vd is fully read before it is overwritten.
 */
#define VSSRLRNI(NAME, BIT, E1, E2)                                          \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)           \
{                                                                            \
    VReg *Vd = vd;                                                           \
    VReg *Vj = vj;                                                           \
    VReg result;                                                             \
    const int half = LSX_LEN / BIT;                                          \
    int idx;                                                                 \
                                                                             \
    for (idx = 0; idx < half; idx++) {                                       \
        result.E1(idx) =                                                     \
            do_ssrlrns_ ## E1(Vj->E2(idx), imm, BIT / 2 - 1);                \
        result.E1(idx + half) =                                              \
            do_ssrlrns_ ## E1(Vd->E2(idx), imm, BIT / 2 - 1);                \
    }                                                                        \
    *Vd = result;                                                            \
}
1890 | ||
1891 | #define VSSRLRNI_Q(NAME, sh) \ | |
329517d5 | 1892 | void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ |
162cd32c SG |
1893 | { \ |
1894 | Int128 shft_res1, shft_res2, mask, r1, r2; \ | |
329517d5 SG |
1895 | VReg *Vd = (VReg *)vd; \ |
1896 | VReg *Vj = (VReg *)vj; \ | |
162cd32c SG |
1897 | \ |
1898 | if (imm == 0) { \ | |
1899 | shft_res1 = Vj->Q(0); \ | |
1900 | shft_res2 = Vd->Q(0); \ | |
1901 | } else { \ | |
1902 | r1 = int128_and(int128_urshift(Vj->Q(0), (imm -1)), int128_one()); \ | |
1903 | r2 = int128_and(int128_urshift(Vd->Q(0), (imm -1)), int128_one()); \ | |
1904 | \ | |
1905 | shft_res1 = (int128_add(int128_urshift(Vj->Q(0), imm), r1)); \ | |
1906 | shft_res2 = (int128_add(int128_urshift(Vd->Q(0), imm), r2)); \ | |
1907 | } \ | |
1908 | \ | |
1909 | mask = int128_sub(int128_lshift(int128_one(), sh), int128_one()); \ | |
1910 | \ | |
1911 | if (int128_ult(mask, shft_res1)) { \ | |
1912 | Vd->D(0) = int128_getlo(mask); \ | |
1913 | }else { \ | |
1914 | Vd->D(0) = int128_getlo(shft_res1); \ | |
1915 | } \ | |
1916 | \ | |
1917 | if (int128_ult(mask, shft_res2)) { \ | |
1918 | Vd->D(1) = int128_getlo(mask); \ | |
1919 | }else { \ | |
1920 | Vd->D(1) = int128_getlo(shft_res2); \ | |
1921 | } \ | |
1922 | } | |
1923 | ||
1924 | VSSRLRNI(vssrlrni_b_h, 16, B, H) | |
1925 | VSSRLRNI(vssrlrni_h_w, 32, H, W) | |
1926 | VSSRLRNI(vssrlrni_w_d, 64, W, D) | |
1927 | VSSRLRNI_Q(vssrlrni_d_q, 63) | |
1928 | ||
/*
 * VSSRARNI - define HELPER(NAME), the immediate-shift form built on
 * do_ssrarns_<E1>().
 *
 * The E2 elements of Vj fill the first LSX_LEN / BIT E1 slots of the
 * result and the E2 elements of the old Vd fill the following
 * LSX_LEN / BIT slots.  Every element is shifted by imm with a clamp width
 * of BIT / 2 - 1.  The result is assembled in a temporary and copied to Vd
 * at the end.
 */
#define VSSRARNI(NAME, BIT, E1, E2)                                          \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)           \
{                                                                            \
    VReg *Vd = vd;                                                           \
    VReg *Vj = vj;                                                           \
    VReg result;                                                             \
    const int half = LSX_LEN / BIT;                                          \
    int idx;                                                                 \
                                                                             \
    for (idx = 0; idx < half; idx++) {                                       \
        result.E1(idx) =                                                     \
            do_ssrarns_ ## E1(Vj->E2(idx), imm, BIT / 2 - 1);                \
        result.E1(idx + half) =                                              \
            do_ssrarns_ ## E1(Vd->E2(idx), imm, BIT / 2 - 1);                \
    }                                                                        \
    *Vd = result;                                                            \
}
1943 | ||
329517d5 | 1944 | void HELPER(vssrarni_d_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) |
162cd32c SG |
1945 | { |
1946 | Int128 shft_res1, shft_res2, mask1, mask2, r1, r2; | |
329517d5 SG |
1947 | VReg *Vd = (VReg *)vd; |
1948 | VReg *Vj = (VReg *)vj; | |
162cd32c SG |
1949 | |
1950 | if (imm == 0) { | |
1951 | shft_res1 = Vj->Q(0); | |
1952 | shft_res2 = Vd->Q(0); | |
1953 | } else { | |
1954 | r1 = int128_and(int128_rshift(Vj->Q(0), (imm -1)), int128_one()); | |
1955 | r2 = int128_and(int128_rshift(Vd->Q(0), (imm -1)), int128_one()); | |
1956 | ||
1957 | shft_res1 = int128_add(int128_rshift(Vj->Q(0), imm), r1); | |
1958 | shft_res2 = int128_add(int128_rshift(Vd->Q(0), imm), r2); | |
1959 | } | |
1960 | ||
1961 | mask1 = int128_sub(int128_lshift(int128_one(), 63), int128_one()); | |
1962 | mask2 = int128_lshift(int128_one(), 63); | |
1963 | ||
1964 | if (int128_gt(shft_res1, mask1)) { | |
1965 | Vd->D(0) = int128_getlo(mask1); | |
1966 | } else if (int128_lt(shft_res1, int128_neg(mask2))) { | |
1967 | Vd->D(0) = int128_getlo(mask2); | |
1968 | } else { | |
1969 | Vd->D(0) = int128_getlo(shft_res1); | |
1970 | } | |
1971 | ||
1972 | if (int128_gt(shft_res2, mask1)) { | |
1973 | Vd->D(1) = int128_getlo(mask1); | |
1974 | } else if (int128_lt(shft_res2, int128_neg(mask2))) { | |
1975 | Vd->D(1) = int128_getlo(mask2); | |
1976 | } else { | |
1977 | Vd->D(1) = int128_getlo(shft_res2); | |
1978 | } | |
1979 | } | |
1980 | ||
1981 | VSSRARNI(vssrarni_b_h, 16, B, H) | |
1982 | VSSRARNI(vssrarni_h_w, 32, H, W) | |
1983 | VSSRARNI(vssrarni_w_d, 64, W, D) | |
1984 | ||
1985 | #define VSSRLRNUI(NAME, BIT, E1, E2) \ | |
329517d5 | 1986 | void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ |
162cd32c SG |
1987 | { \ |
1988 | int i; \ | |
1989 | VReg temp; \ | |
329517d5 SG |
1990 | VReg *Vd = (VReg *)vd; \ |
1991 | VReg *Vj = (VReg *)vj; \ | |
162cd32c SG |
1992 | \ |
1993 | for (i = 0; i < LSX_LEN/BIT; i++) { \ | |
1994 | temp.E1(i) = do_ssrlrnu_ ## E1(Vj->E2(i), imm, BIT/2); \ | |
1995 | temp.E1(i + LSX_LEN/BIT) = do_ssrlrnu_ ## E1(Vd->E2(i), imm, BIT/2); \ | |
1996 | } \ | |
1997 | *Vd = temp; \ | |
1998 | } | |
1999 | ||
2000 | VSSRLRNUI(vssrlrni_bu_h, 16, B, H) | |
2001 | VSSRLRNUI(vssrlrni_hu_w, 32, H, W) | |
2002 | VSSRLRNUI(vssrlrni_wu_d, 64, W, D) | |
2003 | VSSRLRNI_Q(vssrlrni_du_q, 64) | |
2004 | ||
/*
 * VSSRARNUI - define HELPER(NAME), the immediate-shift form built on
 * do_ssrarnu_<E1>().
 *
 * The E2 elements of Vj fill the first LSX_LEN / BIT E1 slots of the
 * result and the E2 elements of the old Vd fill the following
 * LSX_LEN / BIT slots.  Every element is shifted by imm with a clamp width
 * of BIT / 2.  The result is assembled in a temporary and copied to Vd at
 * the end.
 */
#define VSSRARNUI(NAME, BIT, E1, E2)                                         \
void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc)           \
{                                                                            \
    VReg *Vd = vd;                                                           \
    VReg *Vj = vj;                                                           \
    VReg result;                                                             \
    const int half = LSX_LEN / BIT;                                          \
    int idx;                                                                 \
                                                                             \
    for (idx = 0; idx < half; idx++) {                                       \
        result.E1(idx) =                                                     \
            do_ssrarnu_ ## E1(Vj->E2(idx), imm, BIT / 2);                    \
        result.E1(idx + half) =                                              \
            do_ssrarnu_ ## E1(Vd->E2(idx), imm, BIT / 2);                    \
    }                                                                        \
    *Vd = result;                                                            \
}
2019 | ||
329517d5 | 2020 | void HELPER(vssrarni_du_q)(void *vd, void *vj, uint64_t imm, uint32_t desc) |
162cd32c SG |
2021 | { |
2022 | Int128 shft_res1, shft_res2, mask1, mask2, r1, r2; | |
329517d5 SG |
2023 | VReg *Vd = (VReg *)vd; |
2024 | VReg *Vj = (VReg *)vj; | |
162cd32c SG |
2025 | |
2026 | if (imm == 0) { | |
2027 | shft_res1 = Vj->Q(0); | |
2028 | shft_res2 = Vd->Q(0); | |
2029 | } else { | |
2030 | r1 = int128_and(int128_rshift(Vj->Q(0), (imm -1)), int128_one()); | |
2031 | r2 = int128_and(int128_rshift(Vd->Q(0), (imm -1)), int128_one()); | |
2032 | ||
2033 | shft_res1 = int128_add(int128_rshift(Vj->Q(0), imm), r1); | |
2034 | shft_res2 = int128_add(int128_rshift(Vd->Q(0), imm), r2); | |
2035 | } | |
2036 | ||
2037 | if (int128_lt(Vj->Q(0), int128_zero())) { | |
2038 | shft_res1 = int128_zero(); | |
2039 | } | |
2040 | if (int128_lt(Vd->Q(0), int128_zero())) { | |
2041 | shft_res2 = int128_zero(); | |
2042 | } | |
2043 | ||
2044 | mask1 = int128_sub(int128_lshift(int128_one(), 64), int128_one()); | |
2045 | mask2 = int128_lshift(int128_one(), 64); | |
2046 | ||
2047 | if (int128_gt(shft_res1, mask1)) { | |
2048 | Vd->D(0) = int128_getlo(mask1); | |
2049 | } else if (int128_lt(shft_res1, int128_neg(mask2))) { | |
2050 | Vd->D(0) = int128_getlo(mask2); | |
2051 | } else { | |
2052 | Vd->D(0) = int128_getlo(shft_res1); | |
2053 | } | |
2054 | ||
2055 | if (int128_gt(shft_res2, mask1)) { | |
2056 | Vd->D(1) = int128_getlo(mask1); | |
2057 | } else if (int128_lt(shft_res2, int128_neg(mask2))) { | |
2058 | Vd->D(1) = int128_getlo(mask2); | |
2059 | } else { | |
2060 | Vd->D(1) = int128_getlo(shft_res2); | |
2061 | } | |
2062 | } | |
2063 | ||
2064 | VSSRARNUI(vssrarni_bu_h, 16, B, H) | |
2065 | VSSRARNUI(vssrarni_hu_w, 32, H, W) | |
2066 | VSSRARNUI(vssrarni_wu_d, 64, W, D) | |
2e105e12 | 2067 | |
ff27e335 SG |
/*
 * DO_2OP - define HELPER(NAME): apply DO_OP to each of the LSX_LEN / BIT
 * E elements of Vj and store the result in the matching element of Vd.
 * desc is not read.
 */
#define DO_2OP(NAME, BIT, E, DO_OP)                             \
void HELPER(NAME)(void *vd, void *vj, uint32_t desc)            \
{                                                               \
    VReg *Vd = vd;                                              \
    VReg *Vj = vj;                                              \
    const int count = LSX_LEN / BIT;                            \
    int idx;                                                    \
                                                                \
    for (idx = 0; idx < count; idx++) {                         \
        Vd->E(idx) = DO_OP(Vj->E(idx));                         \
    }                                                           \
}
2080 | ||
2081 | #define DO_CLO_B(N) (clz32(~N & 0xff) - 24) | |
2082 | #define DO_CLO_H(N) (clz32(~N & 0xffff) - 16) | |
2083 | #define DO_CLO_W(N) (clz32(~N)) | |
2084 | #define DO_CLO_D(N) (clz64(~N)) | |
2085 | #define DO_CLZ_B(N) (clz32(N) - 24) | |
2086 | #define DO_CLZ_H(N) (clz32(N) - 16) | |
2087 | #define DO_CLZ_W(N) (clz32(N)) | |
2088 | #define DO_CLZ_D(N) (clz64(N)) | |
2089 | ||
2090 | DO_2OP(vclo_b, 8, UB, DO_CLO_B) | |
2091 | DO_2OP(vclo_h, 16, UH, DO_CLO_H) | |
2092 | DO_2OP(vclo_w, 32, UW, DO_CLO_W) | |
2093 | DO_2OP(vclo_d, 64, UD, DO_CLO_D) | |
2094 | DO_2OP(vclz_b, 8, UB, DO_CLZ_B) | |
2095 | DO_2OP(vclz_h, 16, UH, DO_CLZ_H) | |
2096 | DO_2OP(vclz_w, 32, UW, DO_CLZ_W) | |
2097 | DO_2OP(vclz_d, 64, UD, DO_CLZ_D) | |
bb22ee57 | 2098 | |
ff27e335 SG |
2099 | #define VPCNT(NAME, BIT, E, FN) \ |
2100 | void HELPER(NAME)(void *vd, void *vj, uint32_t desc) \ | |
2101 | { \ | |
2102 | int i; \ | |
2103 | VReg *Vd = (VReg *)vd; \ | |
2104 | VReg *Vj = (VReg *)vj; \ | |
2105 | \ | |
2106 | for (i = 0; i < LSX_LEN/BIT; i++) \ | |
2107 | { \ | |
2108 | Vd->E(i) = FN(Vj->E(i)); \ | |
2109 | } \ | |
bb22ee57 SG |
2110 | } |
2111 | ||
2112 | VPCNT(vpcnt_b, 8, UB, ctpop8) | |
2113 | VPCNT(vpcnt_h, 16, UH, ctpop16) | |
2114 | VPCNT(vpcnt_w, 32, UW, ctpop32) | |
2115 | VPCNT(vpcnt_d, 64, UD, ctpop64) | |
0b1e6705 SG |
2116 | |
/*
 * Single-bit clear / set / toggle of bit number `bit` in `a`.
 * Both arguments are parenthesized so that expression arguments such as
 * `x & 7` are evaluated as a unit before the shift.
 */
#define DO_BITCLR(a, bit) ((a) & ~(1ull << (bit)))
#define DO_BITSET(a, bit) ((a) | (1ull << (bit)))
#define DO_BITREV(a, bit) ((a) ^ (1ull << (bit)))
2120 | ||
2121 | #define DO_BIT(NAME, BIT, E, DO_OP) \ | |
2122 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t v) \ | |
2123 | { \ | |
2124 | int i; \ | |
2125 | VReg *Vd = (VReg *)vd; \ | |
2126 | VReg *Vj = (VReg *)vj; \ | |
2127 | VReg *Vk = (VReg *)vk; \ | |
2128 | \ | |
2129 | for (i = 0; i < LSX_LEN/BIT; i++) { \ | |
2130 | Vd->E(i) = DO_OP(Vj->E(i), Vk->E(i)%BIT); \ | |
2131 | } \ | |
2132 | } | |
2133 | ||
2134 | DO_BIT(vbitclr_b, 8, UB, DO_BITCLR) | |
2135 | DO_BIT(vbitclr_h, 16, UH, DO_BITCLR) | |
2136 | DO_BIT(vbitclr_w, 32, UW, DO_BITCLR) | |
2137 | DO_BIT(vbitclr_d, 64, UD, DO_BITCLR) | |
2138 | DO_BIT(vbitset_b, 8, UB, DO_BITSET) | |
2139 | DO_BIT(vbitset_h, 16, UH, DO_BITSET) | |
2140 | DO_BIT(vbitset_w, 32, UW, DO_BITSET) | |
2141 | DO_BIT(vbitset_d, 64, UD, DO_BITSET) | |
2142 | DO_BIT(vbitrev_b, 8, UB, DO_BITREV) | |
2143 | DO_BIT(vbitrev_h, 16, UH, DO_BITREV) | |
2144 | DO_BIT(vbitrev_w, 32, UW, DO_BITREV) | |
2145 | DO_BIT(vbitrev_d, 64, UD, DO_BITREV) | |
2146 | ||
2147 | #define DO_BITI(NAME, BIT, E, DO_OP) \ | |
2148 | void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t v) \ | |
2149 | { \ | |
2150 | int i; \ | |
2151 | VReg *Vd = (VReg *)vd; \ | |
2152 | VReg *Vj = (VReg *)vj; \ | |
2153 | \ | |
2154 | for (i = 0; i < LSX_LEN/BIT; i++) { \ | |
2155 | Vd->E(i) = DO_OP(Vj->E(i), imm); \ | |
2156 | } \ | |
2157 | } | |
2158 | ||
2159 | DO_BITI(vbitclri_b, 8, UB, DO_BITCLR) | |
2160 | DO_BITI(vbitclri_h, 16, UH, DO_BITCLR) | |
2161 | DO_BITI(vbitclri_w, 32, UW, DO_BITCLR) | |
2162 | DO_BITI(vbitclri_d, 64, UD, DO_BITCLR) | |
2163 | DO_BITI(vbitseti_b, 8, UB, DO_BITSET) | |
2164 | DO_BITI(vbitseti_h, 16, UH, DO_BITSET) | |
2165 | DO_BITI(vbitseti_w, 32, UW, DO_BITSET) | |
2166 | DO_BITI(vbitseti_d, 64, UD, DO_BITSET) | |
2167 | DO_BITI(vbitrevi_b, 8, UB, DO_BITREV) | |
2168 | DO_BITI(vbitrevi_h, 16, UH, DO_BITREV) | |
2169 | DO_BITI(vbitrevi_w, 32, UW, DO_BITREV) | |
2170 | DO_BITI(vbitrevi_d, 64, UD, DO_BITREV) | |
ac95a0b9 | 2171 | |
04711da1 SG |
2172 | #define VFRSTP(NAME, BIT, MASK, E) \ |
2173 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
2174 | { \ | |
2175 | int i, m; \ | |
2176 | VReg *Vd = (VReg *)vd; \ | |
2177 | VReg *Vj = (VReg *)vj; \ | |
2178 | VReg *Vk = (VReg *)vk; \ | |
2179 | \ | |
2180 | for (i = 0; i < LSX_LEN/BIT; i++) { \ | |
2181 | if (Vj->E(i) < 0) { \ | |
2182 | break; \ | |
2183 | } \ | |
2184 | } \ | |
2185 | m = Vk->E(0) & MASK; \ | |
2186 | Vd->E(m) = i; \ | |
ac95a0b9 SG |
2187 | } |
2188 | ||
2189 | VFRSTP(vfrstp_b, 8, 0xf, B) | |
2190 | VFRSTP(vfrstp_h, 16, 0x7, H) | |
2191 | ||
329517d5 SG |
2192 | #define VFRSTPI(NAME, BIT, E) \ |
2193 | void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ | |
2194 | { \ | |
2195 | int i, m; \ | |
2196 | VReg *Vd = (VReg *)vd; \ | |
2197 | VReg *Vj = (VReg *)vj; \ | |
2198 | \ | |
2199 | for (i = 0; i < LSX_LEN/BIT; i++) { \ | |
2200 | if (Vj->E(i) < 0) { \ | |
2201 | break; \ | |
2202 | } \ | |
2203 | } \ | |
2204 | m = imm % (LSX_LEN/BIT); \ | |
2205 | Vd->E(m) = i; \ | |
ac95a0b9 SG |
2206 | } |
2207 | ||
2208 | VFRSTPI(vfrstpi_b, 8, B) | |
2209 | VFRSTPI(vfrstpi_h, 16, H) | |
aca67472 SG |
2210 | |
2211 | static void vec_update_fcsr0_mask(CPULoongArchState *env, | |
2212 | uintptr_t pc, int mask) | |
2213 | { | |
2214 | int flags = get_float_exception_flags(&env->fp_status); | |
2215 | ||
2216 | set_float_exception_flags(0, &env->fp_status); | |
2217 | ||
2218 | flags &= ~mask; | |
2219 | ||
2220 | if (flags) { | |
2221 | flags = ieee_ex_to_loongarch(flags); | |
2222 | UPDATE_FP_CAUSE(env->fcsr0, flags); | |
2223 | } | |
2224 | ||
2225 | if (GET_FP_ENABLES(env->fcsr0) & flags) { | |
2226 | do_raise_exception(env, EXCCODE_FPE, pc); | |
2227 | } else { | |
2228 | UPDATE_FP_FLAGS(env->fcsr0, flags); | |
2229 | } | |
2230 | } | |
2231 | ||
2232 | static void vec_update_fcsr0(CPULoongArchState *env, uintptr_t pc) | |
2233 | { | |
2234 | vec_update_fcsr0_mask(env, pc, 0); | |
2235 | } | |
2236 | ||
2237 | static inline void vec_clear_cause(CPULoongArchState *env) | |
2238 | { | |
2239 | SET_FP_CAUSE(env->fcsr0, 0); | |
2240 | } | |
2241 | ||
2242 | #define DO_3OP_F(NAME, BIT, E, FN) \ | |
3b286753 SG |
2243 | void HELPER(NAME)(void *vd, void *vj, void *vk, \ |
2244 | CPULoongArchState *env, uint32_t desc) \ | |
aca67472 SG |
2245 | { \ |
2246 | int i; \ | |
3b286753 SG |
2247 | VReg *Vd = (VReg *)vd; \ |
2248 | VReg *Vj = (VReg *)vj; \ | |
2249 | VReg *Vk = (VReg *)vk; \ | |
aca67472 SG |
2250 | \ |
2251 | vec_clear_cause(env); \ | |
2252 | for (i = 0; i < LSX_LEN/BIT; i++) { \ | |
2253 | Vd->E(i) = FN(Vj->E(i), Vk->E(i), &env->fp_status); \ | |
2254 | vec_update_fcsr0(env, GETPC()); \ | |
2255 | } \ | |
2256 | } | |
2257 | ||
2258 | DO_3OP_F(vfadd_s, 32, UW, float32_add) | |
2259 | DO_3OP_F(vfadd_d, 64, UD, float64_add) | |
2260 | DO_3OP_F(vfsub_s, 32, UW, float32_sub) | |
2261 | DO_3OP_F(vfsub_d, 64, UD, float64_sub) | |
2262 | DO_3OP_F(vfmul_s, 32, UW, float32_mul) | |
2263 | DO_3OP_F(vfmul_d, 64, UD, float64_mul) | |
2264 | DO_3OP_F(vfdiv_s, 32, UW, float32_div) | |
2265 | DO_3OP_F(vfdiv_d, 64, UD, float64_div) | |
2266 | DO_3OP_F(vfmax_s, 32, UW, float32_maxnum) | |
2267 | DO_3OP_F(vfmax_d, 64, UD, float64_maxnum) | |
2268 | DO_3OP_F(vfmin_s, 32, UW, float32_minnum) | |
2269 | DO_3OP_F(vfmin_d, 64, UD, float64_minnum) | |
2270 | DO_3OP_F(vfmaxa_s, 32, UW, float32_maxnummag) | |
2271 | DO_3OP_F(vfmaxa_d, 64, UD, float64_maxnummag) | |
2272 | DO_3OP_F(vfmina_s, 32, UW, float32_minnummag) | |
2273 | DO_3OP_F(vfmina_d, 64, UD, float64_minnummag) | |
2274 | ||
2275 | #define DO_4OP_F(NAME, BIT, E, FN, flags) \ | |
e2600dad SG |
2276 | void HELPER(NAME)(void *vd, void *vj, void *vk, void *va, \ |
2277 | CPULoongArchState *env, uint32_t desc) \ | |
aca67472 SG |
2278 | { \ |
2279 | int i; \ | |
e2600dad SG |
2280 | VReg *Vd = (VReg *)vd; \ |
2281 | VReg *Vj = (VReg *)vj; \ | |
2282 | VReg *Vk = (VReg *)vk; \ | |
2283 | VReg *Va = (VReg *)va; \ | |
aca67472 SG |
2284 | \ |
2285 | vec_clear_cause(env); \ | |
2286 | for (i = 0; i < LSX_LEN/BIT; i++) { \ | |
2287 | Vd->E(i) = FN(Vj->E(i), Vk->E(i), Va->E(i), flags, &env->fp_status); \ | |
2288 | vec_update_fcsr0(env, GETPC()); \ | |
2289 | } \ | |
2290 | } | |
2291 | ||
2292 | DO_4OP_F(vfmadd_s, 32, UW, float32_muladd, 0) | |
2293 | DO_4OP_F(vfmadd_d, 64, UD, float64_muladd, 0) | |
2294 | DO_4OP_F(vfmsub_s, 32, UW, float32_muladd, float_muladd_negate_c) | |
2295 | DO_4OP_F(vfmsub_d, 64, UD, float64_muladd, float_muladd_negate_c) | |
2296 | DO_4OP_F(vfnmadd_s, 32, UW, float32_muladd, float_muladd_negate_result) | |
2297 | DO_4OP_F(vfnmadd_d, 64, UD, float64_muladd, float_muladd_negate_result) | |
2298 | DO_4OP_F(vfnmsub_s, 32, UW, float32_muladd, | |
2299 | float_muladd_negate_c | float_muladd_negate_result) | |
2300 | DO_4OP_F(vfnmsub_d, 64, UD, float64_muladd, | |
2301 | float_muladd_negate_c | float_muladd_negate_result) | |
2302 | ||
226bf881 SG |
/*
 * DO_2OP_F - define HELPER(NAME) for a one-source floating-point operation.
 *
 * After vec_clear_cause(), each of the LSX_LEN / BIT E elements of Vd is set
 * to FN(env, Vj element).  This wrapper does not call vec_update_fcsr0()
 * itself.  desc is not read.
 */
#define DO_2OP_F(NAME, BIT, E, FN)                              \
void HELPER(NAME)(void *vd, void *vj,                           \
                  CPULoongArchState *env, uint32_t desc)        \
{                                                               \
    VReg *Vd = vd;                                              \
    VReg *Vj = vj;                                              \
    const int count = LSX_LEN / BIT;                            \
    int idx;                                                    \
                                                                \
    vec_clear_cause(env);                                       \
    for (idx = 0; idx < count; idx++) {                         \
        Vd->E(idx) = FN(env, Vj->E(idx));                       \
    }                                                           \
}
2316 | ||
2317 | #define FLOGB(BIT, T) \ | |
2318 | static T do_flogb_## BIT(CPULoongArchState *env, T fj) \ | |
2319 | { \ | |
2320 | T fp, fd; \ | |
2321 | float_status *status = &env->fp_status; \ | |
2322 | FloatRoundMode old_mode = get_float_rounding_mode(status); \ | |
2323 | \ | |
2324 | set_float_rounding_mode(float_round_down, status); \ | |
2325 | fp = float ## BIT ##_log2(fj, status); \ | |
2326 | fd = float ## BIT ##_round_to_int(fp, status); \ | |
2327 | set_float_rounding_mode(old_mode, status); \ | |
2328 | vec_update_fcsr0_mask(env, GETPC(), float_flag_inexact); \ | |
2329 | return fd; \ | |
2330 | } | |
2331 | ||
2332 | FLOGB(32, uint32_t) | |
2333 | FLOGB(64, uint64_t) | |
2334 | ||
226bf881 SG |
2335 | #define FCLASS(NAME, BIT, E, FN) \ |
2336 | void HELPER(NAME)(void *vd, void *vj, \ | |
2337 | CPULoongArchState *env, uint32_t desc) \ | |
2338 | { \ | |
2339 | int i; \ | |
2340 | VReg *Vd = (VReg *)vd; \ | |
2341 | VReg *Vj = (VReg *)vj; \ | |
2342 | \ | |
2343 | for (i = 0; i < LSX_LEN/BIT; i++) { \ | |
2344 | Vd->E(i) = FN(env, Vj->E(i)); \ | |
2345 | } \ | |
aca67472 SG |
2346 | } |
2347 | ||
2348 | FCLASS(vfclass_s, 32, UW, helper_fclass_s) | |
2349 | FCLASS(vfclass_d, 64, UD, helper_fclass_d) | |
2350 | ||
2351 | #define FSQRT(BIT, T) \ | |
2352 | static T do_fsqrt_## BIT(CPULoongArchState *env, T fj) \ | |
2353 | { \ | |
2354 | T fd; \ | |
2355 | fd = float ## BIT ##_sqrt(fj, &env->fp_status); \ | |
2356 | vec_update_fcsr0(env, GETPC()); \ | |
2357 | return fd; \ | |
2358 | } | |
2359 | ||
2360 | FSQRT(32, uint32_t) | |
2361 | FSQRT(64, uint64_t) | |
2362 | ||
2363 | #define FRECIP(BIT, T) \ | |
2364 | static T do_frecip_## BIT(CPULoongArchState *env, T fj) \ | |
2365 | { \ | |
2366 | T fd; \ | |
2367 | fd = float ## BIT ##_div(float ## BIT ##_one, fj, &env->fp_status); \ | |
2368 | vec_update_fcsr0(env, GETPC()); \ | |
2369 | return fd; \ | |
2370 | } | |
2371 | ||
2372 | FRECIP(32, uint32_t) | |
2373 | FRECIP(64, uint64_t) | |
2374 | ||
2375 | #define FRSQRT(BIT, T) \ | |
2376 | static T do_frsqrt_## BIT(CPULoongArchState *env, T fj) \ | |
2377 | { \ | |
2378 | T fd, fp; \ | |
2379 | fp = float ## BIT ##_sqrt(fj, &env->fp_status); \ | |
2380 | fd = float ## BIT ##_div(float ## BIT ##_one, fp, &env->fp_status); \ | |
2381 | vec_update_fcsr0(env, GETPC()); \ | |
2382 | return fd; \ | |
2383 | } | |
2384 | ||
2385 | FRSQRT(32, uint32_t) | |
2386 | FRSQRT(64, uint64_t) | |
2387 | ||
2388 | DO_2OP_F(vflogb_s, 32, UW, do_flogb_32) | |
2389 | DO_2OP_F(vflogb_d, 64, UD, do_flogb_64) | |
2390 | DO_2OP_F(vfsqrt_s, 32, UW, do_fsqrt_32) | |
2391 | DO_2OP_F(vfsqrt_d, 64, UD, do_fsqrt_64) | |
2392 | DO_2OP_F(vfrecip_s, 32, UW, do_frecip_32) | |
2393 | DO_2OP_F(vfrecip_d, 64, UD, do_frecip_64) | |
2394 | DO_2OP_F(vfrsqrt_s, 32, UW, do_frsqrt_32) | |
2395 | DO_2OP_F(vfrsqrt_d, 64, UD, do_frsqrt_64) | |
399665d2 SG |
2396 | |
2397 | static uint32_t float16_cvt_float32(uint16_t h, float_status *status) | |
2398 | { | |
2399 | return float16_to_float32(h, true, status); | |
2400 | } | |
2401 | static uint64_t float32_cvt_float64(uint32_t s, float_status *status) | |
2402 | { | |
2403 | return float32_to_float64(s, status); | |
2404 | } | |
2405 | ||
2406 | static uint16_t float32_cvt_float16(uint32_t s, float_status *status) | |
2407 | { | |
2408 | return float32_to_float16(s, true, status); | |
2409 | } | |
2410 | static uint32_t float64_cvt_float32(uint64_t d, float_status *status) | |
2411 | { | |
2412 | return float64_to_float32(d, status); | |
2413 | } | |
2414 | ||
226bf881 SG |
2415 | void HELPER(vfcvtl_s_h)(void *vd, void *vj, |
2416 | CPULoongArchState *env, uint32_t desc) | |
399665d2 SG |
2417 | { |
2418 | int i; | |
2419 | VReg temp; | |
226bf881 SG |
2420 | VReg *Vd = (VReg *)vd; |
2421 | VReg *Vj = (VReg *)vj; | |
399665d2 SG |
2422 | |
2423 | vec_clear_cause(env); | |
2424 | for (i = 0; i < LSX_LEN/32; i++) { | |
2425 | temp.UW(i) = float16_cvt_float32(Vj->UH(i), &env->fp_status); | |
2426 | vec_update_fcsr0(env, GETPC()); | |
2427 | } | |
2428 | *Vd = temp; | |
2429 | } | |
2430 | ||
226bf881 SG |
2431 | void HELPER(vfcvtl_d_s)(void *vd, void *vj, |
2432 | CPULoongArchState *env, uint32_t desc) | |
399665d2 SG |
2433 | { |
2434 | int i; | |
2435 | VReg temp; | |
226bf881 SG |
2436 | VReg *Vd = (VReg *)vd; |
2437 | VReg *Vj = (VReg *)vj; | |
399665d2 SG |
2438 | |
2439 | vec_clear_cause(env); | |
2440 | for (i = 0; i < LSX_LEN/64; i++) { | |
2441 | temp.UD(i) = float32_cvt_float64(Vj->UW(i), &env->fp_status); | |
2442 | vec_update_fcsr0(env, GETPC()); | |
2443 | } | |
2444 | *Vd = temp; | |
2445 | } | |
2446 | ||
226bf881 SG |
2447 | void HELPER(vfcvth_s_h)(void *vd, void *vj, |
2448 | CPULoongArchState *env, uint32_t desc) | |
399665d2 SG |
2449 | { |
2450 | int i; | |
2451 | VReg temp; | |
226bf881 SG |
2452 | VReg *Vd = (VReg *)vd; |
2453 | VReg *Vj = (VReg *)vj; | |
399665d2 SG |
2454 | |
2455 | vec_clear_cause(env); | |
2456 | for (i = 0; i < LSX_LEN/32; i++) { | |
2457 | temp.UW(i) = float16_cvt_float32(Vj->UH(i + 4), &env->fp_status); | |
2458 | vec_update_fcsr0(env, GETPC()); | |
2459 | } | |
2460 | *Vd = temp; | |
2461 | } | |
2462 | ||
226bf881 SG |
2463 | void HELPER(vfcvth_d_s)(void *vd, void *vj, |
2464 | CPULoongArchState *env, uint32_t desc) | |
399665d2 SG |
2465 | { |
2466 | int i; | |
2467 | VReg temp; | |
226bf881 SG |
2468 | VReg *Vd = (VReg *)vd; |
2469 | VReg *Vj = (VReg *)vj; | |
399665d2 SG |
2470 | |
2471 | vec_clear_cause(env); | |
2472 | for (i = 0; i < LSX_LEN/64; i++) { | |
2473 | temp.UD(i) = float32_cvt_float64(Vj->UW(i + 2), &env->fp_status); | |
2474 | vec_update_fcsr0(env, GETPC()); | |
2475 | } | |
2476 | *Vd = temp; | |
2477 | } | |
2478 | ||
3b286753 SG |
2479 | void HELPER(vfcvt_h_s)(void *vd, void *vj, void *vk, |
2480 | CPULoongArchState *env, uint32_t desc) | |
399665d2 SG |
2481 | { |
2482 | int i; | |
2483 | VReg temp; | |
3b286753 SG |
2484 | VReg *Vd = (VReg *)vd; |
2485 | VReg *Vj = (VReg *)vj; | |
2486 | VReg *Vk = (VReg *)vk; | |
399665d2 SG |
2487 | |
2488 | vec_clear_cause(env); | |
2489 | for(i = 0; i < LSX_LEN/32; i++) { | |
2490 | temp.UH(i + 4) = float32_cvt_float16(Vj->UW(i), &env->fp_status); | |
2491 | temp.UH(i) = float32_cvt_float16(Vk->UW(i), &env->fp_status); | |
2492 | vec_update_fcsr0(env, GETPC()); | |
2493 | } | |
2494 | *Vd = temp; | |
2495 | } | |
2496 | ||
3b286753 SG |
2497 | void HELPER(vfcvt_s_d)(void *vd, void *vj, void *vk, |
2498 | CPULoongArchState *env, uint32_t desc) | |
399665d2 SG |
2499 | { |
2500 | int i; | |
2501 | VReg temp; | |
3b286753 SG |
2502 | VReg *Vd = (VReg *)vd; |
2503 | VReg *Vj = (VReg *)vj; | |
2504 | VReg *Vk = (VReg *)vk; | |
399665d2 SG |
2505 | |
2506 | vec_clear_cause(env); | |
2507 | for(i = 0; i < LSX_LEN/64; i++) { | |
2508 | temp.UW(i + 2) = float64_cvt_float32(Vj->UD(i), &env->fp_status); | |
2509 | temp.UW(i) = float64_cvt_float32(Vk->UD(i), &env->fp_status); | |
2510 | vec_update_fcsr0(env, GETPC()); | |
2511 | } | |
2512 | *Vd = temp; | |
2513 | } | |
2514 | ||
226bf881 SG |
2515 | void HELPER(vfrint_s)(void *vd, void *vj, |
2516 | CPULoongArchState *env, uint32_t desc) | |
399665d2 SG |
2517 | { |
2518 | int i; | |
226bf881 SG |
2519 | VReg *Vd = (VReg *)vd; |
2520 | VReg *Vj = (VReg *)vj; | |
399665d2 SG |
2521 | |
2522 | vec_clear_cause(env); | |
2523 | for (i = 0; i < 4; i++) { | |
2524 | Vd->W(i) = float32_round_to_int(Vj->UW(i), &env->fp_status); | |
2525 | vec_update_fcsr0(env, GETPC()); | |
2526 | } | |
2527 | } | |
2528 | ||
226bf881 SG |
2529 | void HELPER(vfrint_d)(void *vd, void *vj, |
2530 | CPULoongArchState *env, uint32_t desc) | |
399665d2 SG |
2531 | { |
2532 | int i; | |
226bf881 SG |
2533 | VReg *Vd = (VReg *)vd; |
2534 | VReg *Vj = (VReg *)vj; | |
399665d2 SG |
2535 | |
2536 | vec_clear_cause(env); | |
2537 | for (i = 0; i < 2; i++) { | |
2538 | Vd->D(i) = float64_round_to_int(Vj->UD(i), &env->fp_status); | |
2539 | vec_update_fcsr0(env, GETPC()); | |
2540 | } | |
2541 | } | |
2542 | ||
2543 | #define FCVT_2OP(NAME, BIT, E, MODE) \ | |
226bf881 SG |
2544 | void HELPER(NAME)(void *vd, void *vj, \ |
2545 | CPULoongArchState *env, uint32_t desc) \ | |
399665d2 SG |
2546 | { \ |
2547 | int i; \ | |
226bf881 SG |
2548 | VReg *Vd = (VReg *)vd; \ |
2549 | VReg *Vj = (VReg *)vj; \ | |
399665d2 SG |
2550 | \ |
2551 | vec_clear_cause(env); \ | |
2552 | for (i = 0; i < LSX_LEN/BIT; i++) { \ | |
2553 | FloatRoundMode old_mode = get_float_rounding_mode(&env->fp_status); \ | |
2554 | set_float_rounding_mode(MODE, &env->fp_status); \ | |
2555 | Vd->E(i) = float## BIT ## _round_to_int(Vj->E(i), &env->fp_status); \ | |
2556 | set_float_rounding_mode(old_mode, &env->fp_status); \ | |
2557 | vec_update_fcsr0(env, GETPC()); \ | |
2558 | } \ | |
2559 | } | |
2560 | ||
2561 | FCVT_2OP(vfrintrne_s, 32, UW, float_round_nearest_even) | |
2562 | FCVT_2OP(vfrintrne_d, 64, UD, float_round_nearest_even) | |
2563 | FCVT_2OP(vfrintrz_s, 32, UW, float_round_to_zero) | |
2564 | FCVT_2OP(vfrintrz_d, 64, UD, float_round_to_zero) | |
2565 | FCVT_2OP(vfrintrp_s, 32, UW, float_round_up) | |
2566 | FCVT_2OP(vfrintrp_d, 64, UD, float_round_up) | |
2567 | FCVT_2OP(vfrintrm_s, 32, UW, float_round_down) | |
2568 | FCVT_2OP(vfrintrm_d, 64, UD, float_round_down) | |
2569 | ||
/*
 * FTINT - generate do_ftint<NAME>(): run do_<FMT1>_to_<FMT2>() under the
 * fixed rounding mode MODE, restoring the previous rounding mode afterwards.
 *
 *   T1 - type of the source value
 *   T2 - type of the converted value that is returned
 */
#define FTINT(NAME, FMT1, FMT2, T1, T2, MODE)                               \
static T2 do_ftint ## NAME(CPULoongArchState *env, T1 fj)                   \
{                                                                           \
    float_status *fst = &env->fp_status;                                    \
    const FloatRoundMode saved_mode = get_float_rounding_mode(fst);         \
    T2 converted;                                                           \
                                                                            \
    set_float_rounding_mode(MODE, fst);                                     \
    converted = do_## FMT1 ##_to_## FMT2(env, fj);                          \
    set_float_rounding_mode(saved_mode, fst);                               \
    return converted;                                                       \
}
2581 | ||
2582 | #define DO_FTINT(FMT1, FMT2, T1, T2) \ | |
2583 | static T2 do_## FMT1 ##_to_## FMT2(CPULoongArchState *env, T1 fj) \ | |
2584 | { \ | |
2585 | T2 fd; \ | |
2586 | \ | |
2587 | fd = FMT1 ##_to_## FMT2(fj, &env->fp_status); \ | |
2588 | if (get_float_exception_flags(&env->fp_status) & (float_flag_invalid)) { \ | |
2589 | if (FMT1 ##_is_any_nan(fj)) { \ | |
2590 | fd = 0; \ | |
2591 | } \ | |
2592 | } \ | |
2593 | vec_update_fcsr0(env, GETPC()); \ | |
2594 | return fd; \ | |
2595 | } | |
2596 | ||
2597 | DO_FTINT(float32, int32, uint32_t, uint32_t) | |
2598 | DO_FTINT(float64, int64, uint64_t, uint64_t) | |
2599 | DO_FTINT(float32, uint32, uint32_t, uint32_t) | |
2600 | DO_FTINT(float64, uint64, uint64_t, uint64_t) | |
2601 | DO_FTINT(float64, int32, uint64_t, uint32_t) | |
2602 | DO_FTINT(float32, int64, uint32_t, uint64_t) | |
2603 | ||
/*
 * Scalar float-to-integer converters with an explicit rounding mode.
 * The FTINT macro is defined earlier in the file; judging by the names
 * consumed by DO_2OP_F below, FTINT(x, ...) presumably defines
 * do_ftint<x>() - verify against the macro definition.
 * Suffixes: rne = nearest-even, rp = round up, rz = toward zero,
 * rm = round down (see the rounding-mode argument of each line).
 */
2604 | FTINT(rne_w_s, float32, int32, uint32_t, uint32_t, float_round_nearest_even) | |
2605 | FTINT(rne_l_d, float64, int64, uint64_t, uint64_t, float_round_nearest_even) | |
2606 | FTINT(rp_w_s, float32, int32, uint32_t, uint32_t, float_round_up) | |
2607 | FTINT(rp_l_d, float64, int64, uint64_t, uint64_t, float_round_up) | |
2608 | FTINT(rz_w_s, float32, int32, uint32_t, uint32_t, float_round_to_zero) | |
2609 | FTINT(rz_l_d, float64, int64, uint64_t, uint64_t, float_round_to_zero) | |
2610 | FTINT(rm_w_s, float32, int32, uint32_t, uint32_t, float_round_down) | |
2611 | FTINT(rm_l_d, float64, int64, uint64_t, uint64_t, float_round_down) | |
2612 | ||
/*
 * Vector helpers for the signed conversions.  DO_2OP_F is defined
 * elsewhere in the file; each line binds a helper name, an element width,
 * an element accessor and one of the scalar converters above.  The plain
 * vftint_* helpers use the DO_FTINT converters, which do not set a
 * rounding mode themselves.
 */
2613 | DO_2OP_F(vftintrne_w_s, 32, UW, do_ftintrne_w_s) | |
2614 | DO_2OP_F(vftintrne_l_d, 64, UD, do_ftintrne_l_d) | |
2615 | DO_2OP_F(vftintrp_w_s, 32, UW, do_ftintrp_w_s) | |
2616 | DO_2OP_F(vftintrp_l_d, 64, UD, do_ftintrp_l_d) | |
2617 | DO_2OP_F(vftintrz_w_s, 32, UW, do_ftintrz_w_s) | |
2618 | DO_2OP_F(vftintrz_l_d, 64, UD, do_ftintrz_l_d) | |
2619 | DO_2OP_F(vftintrm_w_s, 32, UW, do_ftintrm_w_s) | |
2620 | DO_2OP_F(vftintrm_l_d, 64, UD, do_ftintrm_l_d) | |
2621 | DO_2OP_F(vftint_w_s, 32, UW, do_float32_to_int32) | |
2622 | DO_2OP_F(vftint_l_d, 64, UD, do_float64_to_int64) | |
2623 | ||
/* Unsigned results: only a round-toward-zero variant is generated here. */
2624 | FTINT(rz_wu_s, float32, uint32, uint32_t, uint32_t, float_round_to_zero) | |
2625 | FTINT(rz_lu_d, float64, uint64, uint64_t, uint64_t, float_round_to_zero) | |
2626 | ||
2627 | DO_2OP_F(vftintrz_wu_s, 32, UW, do_ftintrz_wu_s) | |
2628 | DO_2OP_F(vftintrz_lu_d, 64, UD, do_ftintrz_lu_d) | |
2629 | DO_2OP_F(vftint_wu_s, 32, UW, do_float32_to_uint32) | |
2630 | DO_2OP_F(vftint_lu_d, 64, UD, do_float64_to_uint64) | |
2631 | ||
/* float64 -> int32 narrowing converters, consumed by FTINT_W_D below. */
2632 | FTINT(rm_w_d, float64, int32, uint64_t, uint32_t, float_round_down) | |
2633 | FTINT(rp_w_d, float64, int32, uint64_t, uint32_t, float_round_up) | |
2634 | FTINT(rz_w_d, float64, int32, uint64_t, uint32_t, float_round_to_zero) | |
2635 | FTINT(rne_w_d, float64, int32, uint64_t, uint32_t, float_round_nearest_even) | |
2636 | ||
2637 | #define FTINT_W_D(NAME, FN) \ | |
3b286753 SG |
2638 | void HELPER(NAME)(void *vd, void *vj, void *vk, \ |
2639 | CPULoongArchState *env, uint32_t desc) \ | |
399665d2 SG |
2640 | { \ |
2641 | int i; \ | |
2642 | VReg temp; \ | |
3b286753 SG |
2643 | VReg *Vd = (VReg *)vd; \ |
2644 | VReg *Vj = (VReg *)vj; \ | |
2645 | VReg *Vk = (VReg *)vk; \ | |
399665d2 SG |
2646 | \ |
2647 | vec_clear_cause(env); \ | |
2648 | for (i = 0; i < 2; i++) { \ | |
2649 | temp.W(i + 2) = FN(env, Vj->UD(i)); \ | |
2650 | temp.W(i) = FN(env, Vk->UD(i)); \ | |
2651 | } \ | |
2652 | *Vd = temp; \ | |
2653 | } | |
2654 | ||
2655 | FTINT_W_D(vftint_w_d, do_float64_to_int32) | |
2656 | FTINT_W_D(vftintrm_w_d, do_ftintrm_w_d) | |
2657 | FTINT_W_D(vftintrp_w_d, do_ftintrp_w_d) | |
2658 | FTINT_W_D(vftintrz_w_d, do_ftintrz_w_d) | |
2659 | FTINT_W_D(vftintrne_w_d, do_ftintrne_w_d) | |
2660 | ||
2661 | FTINT(rml_l_s, float32, int64, uint32_t, uint64_t, float_round_down) | |
2662 | FTINT(rpl_l_s, float32, int64, uint32_t, uint64_t, float_round_up) | |
2663 | FTINT(rzl_l_s, float32, int64, uint32_t, uint64_t, float_round_to_zero) | |
2664 | FTINT(rnel_l_s, float32, int64, uint32_t, uint64_t, float_round_nearest_even) | |
2665 | FTINT(rmh_l_s, float32, int64, uint32_t, uint64_t, float_round_down) | |
2666 | FTINT(rph_l_s, float32, int64, uint32_t, uint64_t, float_round_up) | |
2667 | FTINT(rzh_l_s, float32, int64, uint32_t, uint64_t, float_round_to_zero) | |
2668 | FTINT(rneh_l_s, float32, int64, uint32_t, uint64_t, float_round_nearest_even) | |
2669 | ||
226bf881 SG |
2670 | #define FTINTL_L_S(NAME, FN) \ |
2671 | void HELPER(NAME)(void *vd, void *vj, \ | |
2672 | CPULoongArchState *env, uint32_t desc) \ | |
2673 | { \ | |
2674 | int i; \ | |
2675 | VReg temp; \ | |
2676 | VReg *Vd = (VReg *)vd; \ | |
2677 | VReg *Vj = (VReg *)vj; \ | |
2678 | \ | |
2679 | vec_clear_cause(env); \ | |
2680 | for (i = 0; i < 2; i++) { \ | |
2681 | temp.D(i) = FN(env, Vj->UW(i)); \ | |
2682 | } \ | |
2683 | *Vd = temp; \ | |
399665d2 SG |
2684 | } |
2685 | ||
2686 | FTINTL_L_S(vftintl_l_s, do_float32_to_int64) | |
2687 | FTINTL_L_S(vftintrml_l_s, do_ftintrml_l_s) | |
2688 | FTINTL_L_S(vftintrpl_l_s, do_ftintrpl_l_s) | |
2689 | FTINTL_L_S(vftintrzl_l_s, do_ftintrzl_l_s) | |
2690 | FTINTL_L_S(vftintrnel_l_s, do_ftintrnel_l_s) | |
2691 | ||
226bf881 SG |
2692 | #define FTINTH_L_S(NAME, FN) \ |
2693 | void HELPER(NAME)(void *vd, void *vj, \ | |
2694 | CPULoongArchState *env, uint32_t desc) \ | |
2695 | { \ | |
2696 | int i; \ | |
2697 | VReg temp; \ | |
2698 | VReg *Vd = (VReg *)vd; \ | |
2699 | VReg *Vj = (VReg *)vj; \ | |
2700 | \ | |
2701 | vec_clear_cause(env); \ | |
2702 | for (i = 0; i < 2; i++) { \ | |
2703 | temp.D(i) = FN(env, Vj->UW(i + 2)); \ | |
2704 | } \ | |
2705 | *Vd = temp; \ | |
399665d2 SG |
2706 | } |
2707 | ||
2708 | FTINTH_L_S(vftinth_l_s, do_float32_to_int64) | |
2709 | FTINTH_L_S(vftintrmh_l_s, do_ftintrmh_l_s) | |
2710 | FTINTH_L_S(vftintrph_l_s, do_ftintrph_l_s) | |
2711 | FTINTH_L_S(vftintrzh_l_s, do_ftintrzh_l_s) | |
2712 | FTINTH_L_S(vftintrneh_l_s, do_ftintrneh_l_s) | |
2713 | ||
2714 | #define FFINT(NAME, FMT1, FMT2, T1, T2) \ | |
2715 | static T2 do_ffint_ ## NAME(CPULoongArchState *env, T1 fj) \ | |
2716 | { \ | |
2717 | T2 fd; \ | |
2718 | \ | |
2719 | fd = FMT1 ##_to_## FMT2(fj, &env->fp_status); \ | |
2720 | vec_update_fcsr0(env, GETPC()); \ | |
2721 | return fd; \ | |
2722 | } | |
2723 | ||
2724 | FFINT(s_w, int32, float32, int32_t, uint32_t) | |
2725 | FFINT(d_l, int64, float64, int64_t, uint64_t) | |
2726 | FFINT(s_wu, uint32, float32, uint32_t, uint32_t) | |
2727 | FFINT(d_lu, uint64, float64, uint64_t, uint64_t) | |
2728 | ||
2729 | DO_2OP_F(vffint_s_w, 32, W, do_ffint_s_w) | |
2730 | DO_2OP_F(vffint_d_l, 64, D, do_ffint_d_l) | |
2731 | DO_2OP_F(vffint_s_wu, 32, UW, do_ffint_s_wu) | |
2732 | DO_2OP_F(vffint_d_lu, 64, UD, do_ffint_d_lu) | |
2733 | ||
226bf881 SG |
2734 | void HELPER(vffintl_d_w)(void *vd, void *vj, |
2735 | CPULoongArchState *env, uint32_t desc) | |
399665d2 SG |
2736 | { |
2737 | int i; | |
2738 | VReg temp; | |
226bf881 SG |
2739 | VReg *Vd = (VReg *)vd; |
2740 | VReg *Vj = (VReg *)vj; | |
399665d2 SG |
2741 | |
2742 | vec_clear_cause(env); | |
2743 | for (i = 0; i < 2; i++) { | |
2744 | temp.D(i) = int32_to_float64(Vj->W(i), &env->fp_status); | |
2745 | vec_update_fcsr0(env, GETPC()); | |
2746 | } | |
2747 | *Vd = temp; | |
2748 | } | |
2749 | ||
226bf881 SG |
2750 | void HELPER(vffinth_d_w)(void *vd, void *vj, |
2751 | CPULoongArchState *env, uint32_t desc) | |
399665d2 SG |
2752 | { |
2753 | int i; | |
2754 | VReg temp; | |
226bf881 SG |
2755 | VReg *Vd = (VReg *)vd; |
2756 | VReg *Vj = (VReg *)vj; | |
399665d2 SG |
2757 | |
2758 | vec_clear_cause(env); | |
2759 | for (i = 0; i < 2; i++) { | |
2760 | temp.D(i) = int32_to_float64(Vj->W(i + 2), &env->fp_status); | |
2761 | vec_update_fcsr0(env, GETPC()); | |
2762 | } | |
2763 | *Vd = temp; | |
2764 | } | |
2765 | ||
3b286753 SG |
2766 | void HELPER(vffint_s_l)(void *vd, void *vj, void *vk, |
2767 | CPULoongArchState *env, uint32_t desc) | |
399665d2 SG |
2768 | { |
2769 | int i; | |
2770 | VReg temp; | |
3b286753 SG |
2771 | VReg *Vd = (VReg *)vd; |
2772 | VReg *Vj = (VReg *)vj; | |
2773 | VReg *Vk = (VReg *)vk; | |
399665d2 SG |
2774 | |
2775 | vec_clear_cause(env); | |
2776 | for (i = 0; i < 2; i++) { | |
2777 | temp.W(i + 2) = int64_to_float32(Vj->D(i), &env->fp_status); | |
2778 | temp.W(i) = int64_to_float32(Vk->D(i), &env->fp_status); | |
2779 | vec_update_fcsr0(env, GETPC()); | |
2780 | } | |
2781 | *Vd = temp; | |
2782 | } | |
f435e1e5 SG |
2783 | |
/*
 * Element comparison primitives: each evaluates to -1 when the relation
 * holds and to 0 otherwise.
 *
 * Both arguments are parenthesised so that operands containing
 * lower-precedence operators (e.g. `x & mask`) are compared as written
 * instead of being re-associated with the comparison operator.
 */
#define VSEQ(a, b) ((a) == (b) ? -1 : 0)
#define VSLE(a, b) ((a) <= (b) ? -1 : 0)
#define VSLT(a, b) ((a) < (b) ? -1 : 0)
2787 | ||
2788 | #define VCMPI(NAME, BIT, E, DO_OP) \ | |
2789 | void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t v) \ | |
2790 | { \ | |
2791 | int i; \ | |
2792 | VReg *Vd = (VReg *)vd; \ | |
2793 | VReg *Vj = (VReg *)vj; \ | |
2794 | typedef __typeof(Vd->E(0)) TD; \ | |
2795 | \ | |
2796 | for (i = 0; i < LSX_LEN/BIT; i++) { \ | |
2797 | Vd->E(i) = DO_OP(Vj->E(i), (TD)imm); \ | |
2798 | } \ | |
2799 | } | |
2800 | ||
2801 | VCMPI(vseqi_b, 8, B, VSEQ) | |
2802 | VCMPI(vseqi_h, 16, H, VSEQ) | |
2803 | VCMPI(vseqi_w, 32, W, VSEQ) | |
2804 | VCMPI(vseqi_d, 64, D, VSEQ) | |
2805 | VCMPI(vslei_b, 8, B, VSLE) | |
2806 | VCMPI(vslei_h, 16, H, VSLE) | |
2807 | VCMPI(vslei_w, 32, W, VSLE) | |
2808 | VCMPI(vslei_d, 64, D, VSLE) | |
2809 | VCMPI(vslei_bu, 8, UB, VSLE) | |
2810 | VCMPI(vslei_hu, 16, UH, VSLE) | |
2811 | VCMPI(vslei_wu, 32, UW, VSLE) | |
2812 | VCMPI(vslei_du, 64, UD, VSLE) | |
2813 | VCMPI(vslti_b, 8, B, VSLT) | |
2814 | VCMPI(vslti_h, 16, H, VSLT) | |
2815 | VCMPI(vslti_w, 32, W, VSLT) | |
2816 | VCMPI(vslti_d, 64, D, VSLT) | |
2817 | VCMPI(vslti_bu, 8, UB, VSLT) | |
2818 | VCMPI(vslti_hu, 16, UH, VSLT) | |
2819 | VCMPI(vslti_wu, 32, UW, VSLT) | |
2820 | VCMPI(vslti_du, 64, UD, VSLT) | |
386c4e86 SG |
2821 | |
2822 | static uint64_t vfcmp_common(CPULoongArchState *env, | |
2823 | FloatRelation cmp, uint32_t flags) | |
2824 | { | |
2825 | uint64_t ret = 0; | |
2826 | ||
2827 | switch (cmp) { | |
2828 | case float_relation_less: | |
2829 | ret = (flags & FCMP_LT); | |
2830 | break; | |
2831 | case float_relation_equal: | |
2832 | ret = (flags & FCMP_EQ); | |
2833 | break; | |
2834 | case float_relation_greater: | |
2835 | ret = (flags & FCMP_GT); | |
2836 | break; | |
2837 | case float_relation_unordered: | |
2838 | ret = (flags & FCMP_UN); | |
2839 | break; | |
2840 | default: | |
2841 | g_assert_not_reached(); | |
2842 | } | |
2843 | ||
2844 | if (ret) { | |
2845 | ret = -1; | |
2846 | } | |
2847 | ||
2848 | return ret; | |
2849 | } | |
2850 | ||
2851 | #define VFCMP(NAME, BIT, E, FN) \ | |
2852 | void HELPER(NAME)(CPULoongArchState *env, \ | |
2853 | uint32_t vd, uint32_t vj, uint32_t vk, uint32_t flags) \ | |
2854 | { \ | |
2855 | int i; \ | |
2856 | VReg t; \ | |
2857 | VReg *Vd = &(env->fpr[vd].vreg); \ | |
2858 | VReg *Vj = &(env->fpr[vj].vreg); \ | |
2859 | VReg *Vk = &(env->fpr[vk].vreg); \ | |
2860 | \ | |
2861 | vec_clear_cause(env); \ | |
2862 | for (i = 0; i < LSX_LEN/BIT ; i++) { \ | |
2863 | FloatRelation cmp; \ | |
2864 | cmp = FN(Vj->E(i), Vk->E(i), &env->fp_status); \ | |
2865 | t.E(i) = vfcmp_common(env, cmp, flags); \ | |
2866 | vec_update_fcsr0(env, GETPC()); \ | |
2867 | } \ | |
2868 | *Vd = t; \ | |
2869 | } | |
2870 | ||
2871 | VFCMP(vfcmp_c_s, 32, UW, float32_compare_quiet) | |
2872 | VFCMP(vfcmp_s_s, 32, UW, float32_compare) | |
2873 | VFCMP(vfcmp_c_d, 64, UD, float64_compare_quiet) | |
2874 | VFCMP(vfcmp_s_d, 64, UD, float64_compare) | |
d0dfa19a SG |
2875 | |
2876 | void HELPER(vbitseli_b)(void *vd, void *vj, uint64_t imm, uint32_t v) | |
2877 | { | |
2878 | int i; | |
2879 | VReg *Vd = (VReg *)vd; | |
2880 | VReg *Vj = (VReg *)vj; | |
2881 | ||
2882 | for (i = 0; i < 16; i++) { | |
2883 | Vd->B(i) = (~Vd->B(i) & Vj->B(i)) | (Vd->B(i) & imm); | |
2884 | } | |
2885 | } | |
2886 | ||
/*
 * Copy from target/arm/tcg/sve_helper.c
 *
 * Test whether any (8 << esz)-bit lane of m0 or m1 matches n.
 *
 * n is expanded with dup_const(esz, n) (presumably replicating it into
 * every lane - see the tcg headers) and XORed with each operand, so a
 * matching lane becomes zero.  "(x - ones) & ~x" then leaves a set top
 * bit somewhere exactly when x contains a zero lane; masking with the
 * per-lane sign bits and testing for non-zero gives the answer.
 */
static inline bool do_match2(uint64_t n, uint64_t m0, uint64_t m1, int esz)
{
    const uint64_t lane_bits = 8 << esz;
    const uint64_t lsb_mask = dup_const(esz, 1);
    const uint64_t msb_mask = lsb_mask << (lane_bits - 1);
    const uint64_t pattern = dup_const(esz, n);
    uint64_t diff0 = pattern ^ m0;
    uint64_t diff1 = pattern ^ m1;
    uint64_t hit0 = (diff0 - lsb_mask) & ~diff0;
    uint64_t hit1 = (diff1 - lsb_mask) & ~diff1;

    return ((hit0 | hit1) & msb_mask) != 0;
}
2902 | ||
2903 | #define SETANYEQZ(NAME, MO) \ | |
2904 | void HELPER(NAME)(CPULoongArchState *env, uint32_t cd, uint32_t vj) \ | |
2905 | { \ | |
2906 | VReg *Vj = &(env->fpr[vj].vreg); \ | |
2907 | \ | |
2908 | env->cf[cd & 0x7] = do_match2(0, Vj->D(0), Vj->D(1), MO); \ | |
2909 | } | |
2910 | SETANYEQZ(vsetanyeqz_b, MO_8) | |
2911 | SETANYEQZ(vsetanyeqz_h, MO_16) | |
2912 | SETANYEQZ(vsetanyeqz_w, MO_32) | |
2913 | SETANYEQZ(vsetanyeqz_d, MO_64) | |
2914 | ||
2915 | #define SETALLNEZ(NAME, MO) \ | |
2916 | void HELPER(NAME)(CPULoongArchState *env, uint32_t cd, uint32_t vj) \ | |
2917 | { \ | |
2918 | VReg *Vj = &(env->fpr[vj].vreg); \ | |
2919 | \ | |
2920 | env->cf[cd & 0x7]= !do_match2(0, Vj->D(0), Vj->D(1), MO); \ | |
2921 | } | |
2922 | SETALLNEZ(vsetallnez_b, MO_8) | |
2923 | SETALLNEZ(vsetallnez_h, MO_16) | |
2924 | SETALLNEZ(vsetallnez_w, MO_32) | |
2925 | SETALLNEZ(vsetallnez_d, MO_64) | |
d5e5563c | 2926 | |
04711da1 SG |
2927 | #define VPACKEV(NAME, BIT, E) \ |
2928 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
2929 | { \ | |
2930 | int i; \ | |
2931 | VReg temp; \ | |
2932 | VReg *Vd = (VReg *)vd; \ | |
2933 | VReg *Vj = (VReg *)vj; \ | |
2934 | VReg *Vk = (VReg *)vk; \ | |
2935 | \ | |
2936 | for (i = 0; i < LSX_LEN/BIT; i++) { \ | |
2937 | temp.E(2 * i + 1) = Vj->E(2 * i); \ | |
2938 | temp.E(2 *i) = Vk->E(2 * i); \ | |
2939 | } \ | |
2940 | *Vd = temp; \ | |
d5e5563c SG |
2941 | } |
2942 | ||
2943 | VPACKEV(vpackev_b, 16, B) | |
2944 | VPACKEV(vpackev_h, 32, H) | |
2945 | VPACKEV(vpackev_w, 64, W) | |
2946 | VPACKEV(vpackev_d, 128, D) | |
2947 | ||
04711da1 SG |
2948 | #define VPACKOD(NAME, BIT, E) \ |
2949 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
2950 | { \ | |
2951 | int i; \ | |
2952 | VReg temp; \ | |
2953 | VReg *Vd = (VReg *)vd; \ | |
2954 | VReg *Vj = (VReg *)vj; \ | |
2955 | VReg *Vk = (VReg *)vk; \ | |
2956 | \ | |
2957 | for (i = 0; i < LSX_LEN/BIT; i++) { \ | |
2958 | temp.E(2 * i + 1) = Vj->E(2 * i + 1); \ | |
2959 | temp.E(2 * i) = Vk->E(2 * i + 1); \ | |
2960 | } \ | |
2961 | *Vd = temp; \ | |
d5e5563c SG |
2962 | } |
2963 | ||
2964 | VPACKOD(vpackod_b, 16, B) | |
2965 | VPACKOD(vpackod_h, 32, H) | |
2966 | VPACKOD(vpackod_w, 64, W) | |
2967 | VPACKOD(vpackod_d, 128, D) | |
2968 | ||
04711da1 SG |
2969 | #define VPICKEV(NAME, BIT, E) \ |
2970 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
2971 | { \ | |
2972 | int i; \ | |
2973 | VReg temp; \ | |
2974 | VReg *Vd = (VReg *)vd; \ | |
2975 | VReg *Vj = (VReg *)vj; \ | |
2976 | VReg *Vk = (VReg *)vk; \ | |
2977 | \ | |
2978 | for (i = 0; i < LSX_LEN/BIT; i++) { \ | |
2979 | temp.E(i + LSX_LEN/BIT) = Vj->E(2 * i); \ | |
2980 | temp.E(i) = Vk->E(2 * i); \ | |
2981 | } \ | |
2982 | *Vd = temp; \ | |
d5e5563c SG |
2983 | } |
2984 | ||
2985 | VPICKEV(vpickev_b, 16, B) | |
2986 | VPICKEV(vpickev_h, 32, H) | |
2987 | VPICKEV(vpickev_w, 64, W) | |
2988 | VPICKEV(vpickev_d, 128, D) | |
2989 | ||
04711da1 SG |
2990 | #define VPICKOD(NAME, BIT, E) \ |
2991 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
2992 | { \ | |
2993 | int i; \ | |
2994 | VReg temp; \ | |
2995 | VReg *Vd = (VReg *)vd; \ | |
2996 | VReg *Vj = (VReg *)vj; \ | |
2997 | VReg *Vk = (VReg *)vk; \ | |
2998 | \ | |
2999 | for (i = 0; i < LSX_LEN/BIT; i++) { \ | |
3000 | temp.E(i + LSX_LEN/BIT) = Vj->E(2 * i + 1); \ | |
3001 | temp.E(i) = Vk->E(2 * i + 1); \ | |
3002 | } \ | |
3003 | *Vd = temp; \ | |
d5e5563c SG |
3004 | } |
3005 | ||
3006 | VPICKOD(vpickod_b, 16, B) | |
3007 | VPICKOD(vpickod_h, 32, H) | |
3008 | VPICKOD(vpickod_w, 64, W) | |
3009 | VPICKOD(vpickod_d, 128, D) | |
e93dd431 | 3010 | |
04711da1 SG |
3011 | #define VILVL(NAME, BIT, E) \ |
3012 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
3013 | { \ | |
3014 | int i; \ | |
3015 | VReg temp; \ | |
3016 | VReg *Vd = (VReg *)vd; \ | |
3017 | VReg *Vj = (VReg *)vj; \ | |
3018 | VReg *Vk = (VReg *)vk; \ | |
3019 | \ | |
3020 | for (i = 0; i < LSX_LEN/BIT; i++) { \ | |
3021 | temp.E(2 * i + 1) = Vj->E(i); \ | |
3022 | temp.E(2 * i) = Vk->E(i); \ | |
3023 | } \ | |
3024 | *Vd = temp; \ | |
e93dd431 SG |
3025 | } |
3026 | ||
3027 | VILVL(vilvl_b, 16, B) | |
3028 | VILVL(vilvl_h, 32, H) | |
3029 | VILVL(vilvl_w, 64, W) | |
3030 | VILVL(vilvl_d, 128, D) | |
3031 | ||
04711da1 SG |
3032 | #define VILVH(NAME, BIT, E) \ |
3033 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
3034 | { \ | |
3035 | int i; \ | |
3036 | VReg temp; \ | |
3037 | VReg *Vd = (VReg *)vd; \ | |
3038 | VReg *Vj = (VReg *)vj; \ | |
3039 | VReg *Vk = (VReg *)vk; \ | |
3040 | \ | |
3041 | for (i = 0; i < LSX_LEN/BIT; i++) { \ | |
3042 | temp.E(2 * i + 1) = Vj->E(i + LSX_LEN/BIT); \ | |
3043 | temp.E(2 * i) = Vk->E(i + LSX_LEN/BIT); \ | |
3044 | } \ | |
3045 | *Vd = temp; \ | |
e93dd431 SG |
3046 | } |
3047 | ||
3048 | VILVH(vilvh_b, 16, B) | |
3049 | VILVH(vilvh_h, 32, H) | |
3050 | VILVH(vilvh_w, 64, W) | |
3051 | VILVH(vilvh_d, 128, D) | |
3052 | ||
eb48ab22 | 3053 | void HELPER(vshuf_b)(void *vd, void *vj, void *vk, void *va, uint32_t desc) |
e93dd431 SG |
3054 | { |
3055 | int i, m; | |
3056 | VReg temp; | |
eb48ab22 SG |
3057 | VReg *Vd = (VReg *)vd; |
3058 | VReg *Vj = (VReg *)vj; | |
3059 | VReg *Vk = (VReg *)vk; | |
3060 | VReg *Va = (VReg *)va; | |
e93dd431 SG |
3061 | |
3062 | m = LSX_LEN/8; | |
3063 | for (i = 0; i < m ; i++) { | |
3064 | uint64_t k = (uint8_t)Va->B(i) % (2 * m); | |
3065 | temp.B(i) = k < m ? Vk->B(k) : Vj->B(k - m); | |
3066 | } | |
3067 | *Vd = temp; | |
3068 | } | |
3069 | ||
04711da1 SG |
3070 | #define VSHUF(NAME, BIT, E) \ |
3071 | void HELPER(NAME)(void *vd, void *vj, void *vk, uint32_t desc) \ | |
3072 | { \ | |
3073 | int i, m; \ | |
3074 | VReg temp; \ | |
3075 | VReg *Vd = (VReg *)vd; \ | |
3076 | VReg *Vj = (VReg *)vj; \ | |
3077 | VReg *Vk = (VReg *)vk; \ | |
3078 | \ | |
3079 | m = LSX_LEN/BIT; \ | |
3080 | for (i = 0; i < m; i++) { \ | |
3081 | uint64_t k = ((uint8_t) Vd->E(i)) % (2 * m); \ | |
3082 | temp.E(i) = k < m ? Vk->E(k) : Vj->E(k - m); \ | |
3083 | } \ | |
3084 | *Vd = temp; \ | |
e93dd431 SG |
3085 | } |
3086 | ||
3087 | VSHUF(vshuf_h, 16, H) | |
3088 | VSHUF(vshuf_w, 32, W) | |
3089 | VSHUF(vshuf_d, 64, D) | |
3090 | ||
329517d5 SG |
3091 | #define VSHUF4I(NAME, BIT, E) \ |
3092 | void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ | |
3093 | { \ | |
3094 | int i; \ | |
3095 | VReg temp; \ | |
3096 | VReg *Vd = (VReg *)vd; \ | |
3097 | VReg *Vj = (VReg *)vj; \ | |
3098 | \ | |
3099 | for (i = 0; i < LSX_LEN/BIT; i++) { \ | |
3100 | temp.E(i) = Vj->E(((i) & 0xfc) + (((imm) >> \ | |
3101 | (2 * ((i) & 0x03))) & 0x03)); \ | |
3102 | } \ | |
3103 | *Vd = temp; \ | |
e93dd431 SG |
3104 | } |
3105 | ||
3106 | VSHUF4I(vshuf4i_b, 8, B) | |
3107 | VSHUF4I(vshuf4i_h, 16, H) | |
3108 | VSHUF4I(vshuf4i_w, 32, W) | |
3109 | ||
329517d5 | 3110 | void HELPER(vshuf4i_d)(void *vd, void *vj, uint64_t imm, uint32_t desc) |
e93dd431 | 3111 | { |
329517d5 SG |
3112 | VReg *Vd = (VReg *)vd; |
3113 | VReg *Vj = (VReg *)vj; | |
e93dd431 SG |
3114 | |
3115 | VReg temp; | |
3116 | temp.D(0) = (imm & 2 ? Vj : Vd)->D(imm & 1); | |
3117 | temp.D(1) = (imm & 8 ? Vj : Vd)->D((imm >> 2) & 1); | |
3118 | *Vd = temp; | |
3119 | } | |
3120 | ||
329517d5 | 3121 | void HELPER(vpermi_w)(void *vd, void *vj, uint64_t imm, uint32_t desc) |
e93dd431 SG |
3122 | { |
3123 | VReg temp; | |
329517d5 SG |
3124 | VReg *Vd = (VReg *)vd; |
3125 | VReg *Vj = (VReg *)vj; | |
e93dd431 SG |
3126 | |
3127 | temp.W(0) = Vj->W(imm & 0x3); | |
3128 | temp.W(1) = Vj->W((imm >> 2) & 0x3); | |
3129 | temp.W(2) = Vd->W((imm >> 4) & 0x3); | |
3130 | temp.W(3) = Vd->W((imm >> 6) & 0x3); | |
3131 | *Vd = temp; | |
3132 | } | |
3133 | ||
329517d5 SG |
3134 | #define VEXTRINS(NAME, BIT, E, MASK) \ |
3135 | void HELPER(NAME)(void *vd, void *vj, uint64_t imm, uint32_t desc) \ | |
3136 | { \ | |
3137 | int ins, extr; \ | |
3138 | VReg *Vd = (VReg *)vd; \ | |
3139 | VReg *Vj = (VReg *)vj; \ | |
3140 | \ | |
3141 | ins = (imm >> 4) & MASK; \ | |
3142 | extr = imm & MASK; \ | |
3143 | Vd->E(ins) = Vj->E(extr); \ | |
e93dd431 SG |
3144 | } |
3145 | ||
3146 | VEXTRINS(vextrins_b, 8, B, 0xf) | |
3147 | VEXTRINS(vextrins_h, 16, H, 0x7) | |
3148 | VEXTRINS(vextrins_w, 32, W, 0x3) | |
3149 | VEXTRINS(vextrins_d, 64, D, 0x1) |