/*
 * Generic vector operation expansion
 *
 * Copyright (c) 2018 Linaro
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
 */

#include "qemu/osdep.h"
#include "qemu-common.h"
#include "tcg.h"
#include "tcg-op.h"
#include "tcg-op-gvec.h"
#include "tcg-gvec-desc.h"

#define MAX_UNROLL  4

/* Verify vector size and alignment rules.  OFS should be the OR of all
   of the operand offsets so that we can check them all at once.  */
static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
{
    uint32_t opr_align = oprsz >= 16 ? 15 : 7;
    uint32_t max_align = maxsz >= 16 || oprsz >= 16 ? 15 : 7;
    tcg_debug_assert(oprsz > 0);
    tcg_debug_assert(oprsz <= maxsz);
    tcg_debug_assert((oprsz & opr_align) == 0);
    tcg_debug_assert((maxsz & max_align) == 0);
    tcg_debug_assert((ofs & max_align) == 0);
}
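
/* For example, with dofs == 0x100, aofs == 0x120 and bofs == 0x140, the
   caller passes ofs == (0x100 | 0x120 | 0x140) == 0x160; any misaligned
   offset would set a low bit in the OR and trip the final assert.  */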

/* Verify vector overlap rules for two operands.  */
static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
{
    tcg_debug_assert(d == a || d + s <= a || a + s <= d);
}

/* Verify vector overlap rules for three operands.  */
static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(a, b, s);
}

/* Verify vector overlap rules for four operands.  */
static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
                            uint32_t c, uint32_t s)
{
    check_overlap_2(d, a, s);
    check_overlap_2(d, b, s);
    check_overlap_2(d, c, s);
    check_overlap_2(a, b, s);
    check_overlap_2(a, c, s);
    check_overlap_2(b, c, s);
}

/* Create a descriptor from components.  */
uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
{
    uint32_t desc = 0;

    assert(oprsz % 8 == 0 && oprsz <= (8 << SIMD_OPRSZ_BITS));
    assert(maxsz % 8 == 0 && maxsz <= (8 << SIMD_MAXSZ_BITS));
    assert(data == sextract32(data, 0, SIMD_DATA_BITS));

    oprsz = (oprsz / 8) - 1;
    maxsz = (maxsz / 8) - 1;
    desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
    desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
    desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);

    return desc;
}
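
/* For example, simd_desc(32, 64, 5) stores (32 / 8) - 1 == 3 in the OPRSZ
   field, (64 / 8) - 1 == 7 in the MAXSZ field, and 5 in the DATA field;
   helpers recover the sizes with simd_oprsz() and simd_maxsz().  */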

/* Generate a call to a gvec-style helper with two vector operands.  */
void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_2 *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with three vector operands.  */
void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t oprsz, uint32_t maxsz, int32_t data,
                        gen_helper_gvec_3 *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);

    fn(a0, a1, a2, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_i32(desc);
}

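/* As an illustration (gen_helper_foo is a hypothetical helper declared by
   a target): a three-operand expansion over 16 operation bytes within a
   32-byte register, passing 0 as the DATA field, could be emitted as
       tcg_gen_gvec_3_ool(dofs, aofs, bofs, 16, 32, 0, gen_helper_foo);  */
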
/* Generate a call to a gvec-style helper with four vector operands.  */
void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_4 *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);

    fn(a0, a1, a2, a3, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with five vector operands.  */
void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, uint32_t xofs, uint32_t oprsz,
                        uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
{
    TCGv_ptr a0, a1, a2, a3, a4;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();
    a4 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);
    tcg_gen_addi_ptr(a4, cpu_env, xofs);

    fn(a0, a1, a2, a3, a4, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_ptr(a4);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with two vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_2_ptr *fn)
{
    TCGv_ptr a0, a1;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);

    fn(a0, a1, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with three vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
                        int32_t data, gen_helper_gvec_3_ptr *fn)
{
    TCGv_ptr a0, a1, a2;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);

    fn(a0, a1, a2, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_i32(desc);
}

/* Generate a call to a gvec-style helper with four vector operands
   and an extra pointer operand.  */
void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                        uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
                        uint32_t maxsz, int32_t data,
                        gen_helper_gvec_4_ptr *fn)
{
    TCGv_ptr a0, a1, a2, a3;
    TCGv_i32 desc = tcg_const_i32(simd_desc(oprsz, maxsz, data));

    a0 = tcg_temp_new_ptr();
    a1 = tcg_temp_new_ptr();
    a2 = tcg_temp_new_ptr();
    a3 = tcg_temp_new_ptr();

    tcg_gen_addi_ptr(a0, cpu_env, dofs);
    tcg_gen_addi_ptr(a1, cpu_env, aofs);
    tcg_gen_addi_ptr(a2, cpu_env, bofs);
    tcg_gen_addi_ptr(a3, cpu_env, cofs);

    fn(a0, a1, a2, a3, ptr, desc);

    tcg_temp_free_ptr(a0);
    tcg_temp_free_ptr(a1);
    tcg_temp_free_ptr(a2);
    tcg_temp_free_ptr(a3);
    tcg_temp_free_i32(desc);
}

/* Return true if we want to implement something of OPRSZ bytes
   in units of LNSZ.  This limits the expansion of inline code.  */
static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
{
    uint32_t lnct = oprsz / lnsz;
    return lnct >= 1 && lnct <= MAX_UNROLL;
}
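
/* For example, with MAX_UNROLL == 4, oprsz == 32 is accepted with 8-byte
   lanes (four iterations) but rejected with 4-byte lanes (eight
   iterations), pushing that case out of line instead.  */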

static void expand_clr(uint32_t dofs, uint32_t maxsz);

/* Duplicate C as per VECE.  */
uint64_t (dup_const)(unsigned vece, uint64_t c)
{
    switch (vece) {
    case MO_8:
        return 0x0101010101010101ull * (uint8_t)c;
    case MO_16:
        return 0x0001000100010001ull * (uint16_t)c;
    case MO_32:
        return 0x0000000100000001ull * (uint32_t)c;
    case MO_64:
        return c;
    default:
        g_assert_not_reached();
    }
}
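
/* For example, dup_const(MO_16, 0x1234) == 0x1234123412341234ull.  The
   parenthesized name above defines the out-of-line function even when a
   function-like macro of the same name is in scope.  */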

/* Duplicate IN into OUT as per VECE.  */
static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i32(out, in);
        tcg_gen_muli_i32(out, out, 0x01010101);
        break;
    case MO_16:
        tcg_gen_deposit_i32(out, in, in, 16, 16);
        break;
    case MO_32:
        tcg_gen_mov_i32(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}

static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
{
    switch (vece) {
    case MO_8:
        tcg_gen_ext8u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
        break;
    case MO_16:
        tcg_gen_ext16u_i64(out, in);
        tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
        break;
    case MO_32:
        tcg_gen_deposit_i64(out, in, in, 32, 32);
        break;
    case MO_64:
        tcg_gen_mov_i64(out, in);
        break;
    default:
        g_assert_not_reached();
    }
}
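
/* For example, gen_dup_i64(MO_16, out, in) replicates the low 16 bits of
   IN into all four 16-bit lanes: in == 0x1234 gives 0x1234123412341234.  */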

/* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C.
 * Only one of IN_32 or IN_64 may be set;
 * IN_C is used if IN_32 and IN_64 are unset.
 */
static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
                   uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
                   uint64_t in_c)
{
    TCGType type;
    TCGv_i64 t_64;
    TCGv_i32 t_32, t_desc;
    TCGv_ptr t_ptr;
    uint32_t i;

    assert(vece <= (in_32 ? MO_32 : MO_64));
    assert(in_32 == NULL || in_64 == NULL);

    /* If we're storing 0, expand oprsz to maxsz.  */
    if (in_32 == NULL && in_64 == NULL) {
        in_c = dup_const(vece, in_c);
        if (in_c == 0) {
            oprsz = maxsz;
        }
    }

    type = 0;
    if (TCG_TARGET_HAS_v256 && check_size_impl(oprsz, 32)) {
        type = TCG_TYPE_V256;
    } else if (TCG_TARGET_HAS_v128 && check_size_impl(oprsz, 16)) {
        type = TCG_TYPE_V128;
    } else if (TCG_TARGET_HAS_v64 && check_size_impl(oprsz, 8)
               /* Prefer integer when 64-bit host and no variable dup.  */
               && !(TCG_TARGET_REG_BITS == 64 && in_32 == NULL
                    && (in_64 == NULL || vece == MO_64))) {
        type = TCG_TYPE_V64;
    }

    /* Implement inline with a vector type, if possible.  */
    if (type != 0) {
        TCGv_vec t_vec = tcg_temp_new_vec(type);

        if (in_32) {
            tcg_gen_dup_i32_vec(vece, t_vec, in_32);
        } else if (in_64) {
            tcg_gen_dup_i64_vec(vece, t_vec, in_64);
        } else {
            switch (vece) {
            case MO_8:
                tcg_gen_dup8i_vec(t_vec, in_c);
                break;
            case MO_16:
                tcg_gen_dup16i_vec(t_vec, in_c);
                break;
            case MO_32:
                tcg_gen_dup32i_vec(t_vec, in_c);
                break;
            default:
                tcg_gen_dup64i_vec(t_vec, in_c);
                break;
            }
        }

        i = 0;
        if (TCG_TARGET_HAS_v256) {
            for (; i + 32 <= oprsz; i += 32) {
                tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
            }
        }
        if (TCG_TARGET_HAS_v128) {
            for (; i + 16 <= oprsz; i += 16) {
                tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
            }
        }
        if (TCG_TARGET_HAS_v64) {
            for (; i < oprsz; i += 8) {
                tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
            }
        }
        tcg_temp_free_vec(t_vec);
        goto done;
    }

    /* Otherwise, inline with an integer type, unless "large".  */
    if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
        t_64 = NULL;
        t_32 = NULL;

        if (in_32) {
            /* We are given a 32-bit variable input.  For a 64-bit host,
               use a 64-bit operation unless the 32-bit operation would
               be simple enough.  */
            if (TCG_TARGET_REG_BITS == 64
                && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
                t_64 = tcg_temp_new_i64();
                tcg_gen_extu_i32_i64(t_64, in_32);
                gen_dup_i64(vece, t_64, t_64);
            } else {
                t_32 = tcg_temp_new_i32();
                gen_dup_i32(vece, t_32, in_32);
            }
        } else if (in_64) {
            /* We are given a 64-bit variable input.  */
            t_64 = tcg_temp_new_i64();
            gen_dup_i64(vece, t_64, in_64);
        } else {
            /* We are given a constant input.  */
            /* For 64-bit hosts, use 64-bit constants for "simple" constants
               or when we'd need too many 32-bit stores, or when a 64-bit
               constant is really required.  */
            if (vece == MO_64
                || (TCG_TARGET_REG_BITS == 64
                    && (in_c == 0 || in_c == -1
                        || !check_size_impl(oprsz, 4)))) {
                t_64 = tcg_const_i64(in_c);
            } else {
                t_32 = tcg_const_i32(in_c);
            }
        }

        /* Implement inline if we picked an implementation size above.  */
        if (t_32) {
            for (i = 0; i < oprsz; i += 4) {
                tcg_gen_st_i32(t_32, cpu_env, dofs + i);
            }
            tcg_temp_free_i32(t_32);
            goto done;
        }
        if (t_64) {
            for (i = 0; i < oprsz; i += 8) {
                tcg_gen_st_i64(t_64, cpu_env, dofs + i);
            }
            tcg_temp_free_i64(t_64);
            goto done;
        }
    }

    /* Otherwise implement out of line.  */
    t_ptr = tcg_temp_new_ptr();
    tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);
    t_desc = tcg_const_i32(simd_desc(oprsz, maxsz, 0));

    if (vece == MO_64) {
        if (in_64) {
            gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
        } else {
            t_64 = tcg_const_i64(in_c);
            gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
            tcg_temp_free_i64(t_64);
        }
    } else {
        typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
        static dup_fn * const fns[3] = {
            gen_helper_gvec_dup8,
            gen_helper_gvec_dup16,
            gen_helper_gvec_dup32
        };

        if (in_32) {
            fns[vece](t_ptr, t_desc, in_32);
        } else {
            t_32 = tcg_temp_new_i32();
            if (in_64) {
                tcg_gen_extrl_i64_i32(t_32, in_64);
            } else if (vece == MO_8) {
                tcg_gen_movi_i32(t_32, in_c & 0xff);
            } else if (vece == MO_16) {
                tcg_gen_movi_i32(t_32, in_c & 0xffff);
            } else {
                tcg_gen_movi_i32(t_32, in_c);
            }
            fns[vece](t_ptr, t_desc, t_32);
            tcg_temp_free_i32(t_32);
        }
    }

    tcg_temp_free_ptr(t_ptr);
    tcg_temp_free_i32(t_desc);
    return;

 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Likewise, but with zero.  */
static void expand_clr(uint32_t dofs, uint32_t maxsz)
{
    do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
}

/* Expand OPRSZ bytes worth of two-operand operations using i32 elements.  */
static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         void (*fni)(TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        fni(t0, t0);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
}

static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int32_t c, bool load_dest,
                          void (*fni)(TCGv_i32, TCGv_i32, int32_t))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t1, cpu_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i32(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t0);
    tcg_temp_free_i32(t1);
}

/* Expand OPRSZ bytes worth of three-operand operations using i32 elements.  */
static void expand_3_i32(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i32(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i32(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

/* Expand OPRSZ bytes worth of four-operand operations using i32 elements.  */
static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz,
                         void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    TCGv_i32 t2 = tcg_temp_new_i32();
    TCGv_i32 t3 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t1, cpu_env, aofs + i);
        tcg_gen_ld_i32(t2, cpu_env, bofs + i);
        tcg_gen_ld_i32(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t3);
    tcg_temp_free_i32(t2);
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

/* Expand OPRSZ bytes worth of two-operand operations using i64 elements.  */
static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                         void (*fni)(TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        fni(t0, t0);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
}

static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                          int64_t c, bool load_dest,
                          void (*fni)(TCGv_i64, TCGv_i64, int64_t))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t1, cpu_env, dofs + i);
        }
        fni(t1, t0, c);
        tcg_gen_st_i64(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t0);
    tcg_temp_free_i64(t1);
}

/* Expand OPRSZ bytes worth of three-operand operations using i64 elements.  */
static void expand_3_i64(uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz, bool load_dest,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_i64(t2, cpu_env, dofs + i);
        }
        fni(t2, t0, t1);
        tcg_gen_st_i64(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

/* Expand OPRSZ bytes worth of four-operand operations using i64 elements.  */
static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                         uint32_t cofs, uint32_t oprsz,
                         void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t1, cpu_env, aofs + i);
        tcg_gen_ld_i64(t2, cpu_env, bofs + i);
        tcg_gen_ld_i64(t3, cpu_env, cofs + i);
        fni(t0, t1, t2, t3);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t3);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

/* Expand OPRSZ bytes worth of two-operand operations using host vectors.  */
static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t oprsz, uint32_t tysz, TCGType type,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        fni(vece, t0, t0);
        tcg_gen_st_vec(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
}

/* Expand OPRSZ bytes worth of operations with two vector operands
   and an immediate operand, using host vectors.  */
static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t tysz, TCGType type,
                          int64_t c, bool load_dest,
                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t1, cpu_env, dofs + i);
        }
        fni(vece, t1, t0, c);
        tcg_gen_st_vec(t1, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t0);
    tcg_temp_free_vec(t1);
}

/* Expand OPRSZ bytes worth of three-operand operations using host vectors.  */
static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t oprsz,
                         uint32_t tysz, TCGType type, bool load_dest,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    TCGv_vec t2 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
        if (load_dest) {
            tcg_gen_ld_vec(t2, cpu_env, dofs + i);
        }
        fni(vece, t2, t0, t1);
        tcg_gen_st_vec(t2, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t2);
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t0);
}

/* Expand OPRSZ bytes worth of four-operand operations using host vectors.  */
static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                         uint32_t bofs, uint32_t cofs, uint32_t oprsz,
                         uint32_t tysz, TCGType type,
                         void (*fni)(unsigned, TCGv_vec, TCGv_vec,
                                     TCGv_vec, TCGv_vec))
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    TCGv_vec t2 = tcg_temp_new_vec(type);
    TCGv_vec t3 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t1, cpu_env, aofs + i);
        tcg_gen_ld_vec(t2, cpu_env, bofs + i);
        tcg_gen_ld_vec(t3, cpu_env, cofs + i);
        fni(vece, t0, t1, t2, t3);
        tcg_gen_st_vec(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t3);
    tcg_temp_free_vec(t2);
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t0);
}

/* Expand a vector two-operand operation.  */
void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
{
    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    /* Recall that ARM SVE allows vector sizes that are not a power of 2.
       Expand with successively smaller host vector sizes.  The intent is
       that e.g. oprsz == 80 would be expanded with 2x32 + 1x16.  */
    /* ??? For maxsz > oprsz, the host may be able to use an opr-sized
       operation, zeroing the balance of the register.  We can then
       use a max-sized store to implement the clearing without an extra
       store operation.  This is true for aarch64 and x86_64 hosts.  */

    if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32)
        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) {
        uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256, g->fniv);
        if (some == oprsz) {
            goto done;
        }
        dofs += some;
        aofs += some;
        oprsz -= some;
        maxsz -= some;
    }

    if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16)
        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece))) {
        expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128, g->fniv);
    } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64
               && g->fniv && check_size_impl(oprsz, 8)
               && (!g->opc
                   || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece))) {
        expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64, g->fniv);
    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
        expand_2_i64(dofs, aofs, oprsz, g->fni8);
    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
        expand_2_i32(dofs, aofs, oprsz, g->fni4);
    } else {
        assert(g->fno != NULL);
        tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
        return;
    }

 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
                     uint32_t maxsz, int64_t c, const GVecGen2i *g)
{
    check_size_align(oprsz, maxsz, dofs | aofs);
    check_overlap_2(dofs, aofs, maxsz);

    /* Recall that ARM SVE allows vector sizes that are not a power of 2.
       Expand with successively smaller host vector sizes.  The intent is
       that e.g. oprsz == 80 would be expanded with 2x32 + 1x16.  */

    if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32)
        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) {
        uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
                      c, g->load_dest, g->fniv);
        if (some == oprsz) {
            goto done;
        }
        dofs += some;
        aofs += some;
        oprsz -= some;
        maxsz -= some;
    }

    if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16)
        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece))) {
        expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
                      c, g->load_dest, g->fniv);
    } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64
               && g->fniv && check_size_impl(oprsz, 8)
               && (!g->opc
                   || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece))) {
        expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
                      c, g->load_dest, g->fniv);
    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
        expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
        expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
    } else {
        tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
        return;
    }

 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector three-operand operation.  */
void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
{
    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(dofs, aofs, bofs, maxsz);

    /* Recall that ARM SVE allows vector sizes that are not a power of 2.
       Expand with successively smaller host vector sizes.  The intent is
       that e.g. oprsz == 80 would be expanded with 2x32 + 1x16.  */

    if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32)
        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) {
        uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
                     g->load_dest, g->fniv);
        if (some == oprsz) {
            goto done;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
    }

    if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16)
        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece))) {
        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
                     g->load_dest, g->fniv);
    } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64
               && g->fniv && check_size_impl(oprsz, 8)
               && (!g->opc
                   || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece))) {
        expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
                     g->load_dest, g->fniv);
    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
        expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
        expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
    } else {
        assert(g->fno != NULL);
        tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, g->data, g->fno);
        return;
    }

 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/* Expand a vector four-operand operation.  */
void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
                    uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
{
    check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
    check_overlap_4(dofs, aofs, bofs, cofs, maxsz);

    /* Recall that ARM SVE allows vector sizes that are not a power of 2.
       Expand with successively smaller host vector sizes.  The intent is
       that e.g. oprsz == 80 would be expanded with 2x32 + 1x16.  */

    if (TCG_TARGET_HAS_v256 && g->fniv && check_size_impl(oprsz, 32)
        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V256, g->vece))) {
        uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
                     32, TCG_TYPE_V256, g->fniv);
        if (some == oprsz) {
            goto done;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        cofs += some;
        oprsz -= some;
        maxsz -= some;
    }

    if (TCG_TARGET_HAS_v128 && g->fniv && check_size_impl(oprsz, 16)
        && (!g->opc || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V128, g->vece))) {
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     16, TCG_TYPE_V128, g->fniv);
    } else if (TCG_TARGET_HAS_v64 && !g->prefer_i64
               && g->fniv && check_size_impl(oprsz, 8)
               && (!g->opc
                   || tcg_can_emit_vec_op(g->opc, TCG_TYPE_V64, g->vece))) {
        expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
                     8, TCG_TYPE_V64, g->fniv);
    } else if (g->fni8 && check_size_impl(oprsz, 8)) {
        expand_4_i64(dofs, aofs, bofs, cofs, oprsz, g->fni8);
    } else if (g->fni4 && check_size_impl(oprsz, 4)) {
        expand_4_i32(dofs, aofs, bofs, cofs, oprsz, g->fni4);
    } else {
        assert(g->fno != NULL);
        tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
                           oprsz, maxsz, g->data, g->fno);
        return;
    }

 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}

/*
 * Expand specific vector operations.
 */

static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_mov_vec(a, b);
}

void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g = {
        .fni8 = tcg_gen_mov_i64,
        .fniv = vec_mov2,
        .fno = gen_helper_gvec_mov,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    if (dofs != aofs) {
        tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
    } else {
        check_size_align(oprsz, maxsz, dofs);
        if (oprsz < maxsz) {
            expand_clr(dofs + oprsz, maxsz - oprsz);
        }
    }
}

void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i32 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_32);
    do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
}

void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i64 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_64);
    do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
}

void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
                          uint32_t oprsz, uint32_t maxsz)
{
    if (vece <= MO_32) {
        TCGv_i32 in = tcg_temp_new_i32();
        switch (vece) {
        case MO_8:
            tcg_gen_ld8u_i32(in, cpu_env, aofs);
            break;
        case MO_16:
            tcg_gen_ld16u_i32(in, cpu_env, aofs);
            break;
        case MO_32:
            tcg_gen_ld_i32(in, cpu_env, aofs);
            break;
        }
        tcg_gen_gvec_dup_i32(vece, dofs, oprsz, maxsz, in);
        tcg_temp_free_i32(in);
    } else if (vece == MO_64) {
        TCGv_i64 in = tcg_temp_new_i64();
        tcg_gen_ld_i64(in, cpu_env, aofs);
        tcg_gen_gvec_dup_i64(MO_64, dofs, oprsz, maxsz, in);
        tcg_temp_free_i64(in);
    } else {
        /* 128-bit duplicate.  */
        /* ??? Dup to 256-bit vector.  */
        int i;

        tcg_debug_assert(vece == 4);
        tcg_debug_assert(oprsz >= 16);
        if (TCG_TARGET_HAS_v128) {
            TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);

            tcg_gen_ld_vec(in, cpu_env, aofs);
            for (i = 0; i < oprsz; i += 16) {
                tcg_gen_st_vec(in, cpu_env, dofs + i);
            }
            tcg_temp_free_vec(in);
        } else {
            TCGv_i64 in0 = tcg_temp_new_i64();
            TCGv_i64 in1 = tcg_temp_new_i64();

            tcg_gen_ld_i64(in0, cpu_env, aofs);
            tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
            for (i = 0; i < oprsz; i += 16) {
                tcg_gen_st_i64(in0, cpu_env, dofs + i);
                tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
            }
            tcg_temp_free_i64(in0);
            tcg_temp_free_i64(in1);
        }
    }
}

void tcg_gen_gvec_dup64i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint64_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_64, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup32i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint32_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_32, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup16i(uint32_t dofs, uint32_t oprsz,
                         uint32_t maxsz, uint16_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_16, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_dup8i(uint32_t dofs, uint32_t oprsz,
                        uint32_t maxsz, uint8_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(MO_8, dofs, oprsz, maxsz, NULL, NULL, x);
}

void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g = {
        .fni8 = tcg_gen_not_i64,
        .fniv = tcg_gen_not_vec,
        .fno = gen_helper_gvec_not,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
}

/* Perform a vector addition using normal addition and a mask.  The mask
   should be the sign bit of each lane.  This 6-operation form is more
   efficient than separate additions when there are 4 or more lanes in
   the 64-bit operation.  */
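/* In full: d = ((a & ~m) + (b & ~m)) ^ ((a ^ b) & m).  Clearing the sign
   bits keeps carries from crossing lane boundaries; the final xor then
   recomputes each sign bit as a ^ b ^ carry-in.  */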
static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();

    tcg_gen_andc_i64(t1, a, m);
    tcg_gen_andc_i64(t2, b, m);
    tcg_gen_xor_i64(t3, a, b);
    tcg_gen_add_i64(d, t1, t2);
    tcg_gen_and_i64(t3, t3, m);
    tcg_gen_xor_i64(d, d, t3);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}

void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
    gen_addv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
    gen_addv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

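/* The 32-bit variant needs no mask: T1 has the low half of A cleared, so
   t1 + b cannot carry into bit 32 and yields the high lane, while the
   plain sum T2 supplies the low lane.  */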
void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    tcg_gen_andi_i64(t1, a, ~0xffffffffull);
    tcg_gen_add_i64(t2, a, b);
    tcg_gen_add_i64(t1, t1, b);
    tcg_gen_deposit_i64(d, t1, t2, 0, 32);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fni8 = tcg_gen_vec_add8_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add8,
          .opc = INDEX_op_add_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_add16_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add16,
          .opc = INDEX_op_add_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_add_i32,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add32,
          .opc = INDEX_op_add_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_add_i64,
          .fniv = tcg_gen_add_vec,
          .fno = gen_helper_gvec_add64,
          .opc = INDEX_op_add_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

/* Perform a vector subtraction using normal subtraction and a mask.
   Compare gen_addv_mask above.  */
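/* In full: d = ((a | m) - (b & ~m)) ^ (~(a ^ b) & m).  Presetting the
   minuend's sign bits keeps borrows from crossing lane boundaries; the
   final xor recomputes each sign bit as a ^ b ^ borrow-in.  */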
static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();

    tcg_gen_or_i64(t1, a, m);
    tcg_gen_andc_i64(t2, b, m);
    tcg_gen_eqv_i64(t3, a, b);
    tcg_gen_sub_i64(d, t1, t2);
    tcg_gen_and_i64(t3, t3, m);
    tcg_gen_xor_i64(d, d, t3);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}

void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
    gen_subv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
    gen_subv_mask(d, a, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
    tcg_gen_sub_i64(t2, a, b);
    tcg_gen_sub_i64(t1, a, t1);
    tcg_gen_deposit_i64(d, t1, t2, 0, 32);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g[4] = {
        { .fni8 = tcg_gen_vec_sub8_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub8,
          .opc = INDEX_op_sub_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_sub16_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub16,
          .opc = INDEX_op_sub_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_sub_i32,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub32,
          .opc = INDEX_op_sub_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_sub_i64,
          .fniv = tcg_gen_sub_vec,
          .fno = gen_helper_gvec_sub64,
          .opc = INDEX_op_sub_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
}

/* Perform a vector negation using normal negation and a mask.
   Compare gen_subv_mask above.  */
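/* In full: d = (m - (b & ~m)) ^ (m & ~b), which is gen_subv_mask
   specialized for a == 0.  */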
static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
{
    TCGv_i64 t2 = tcg_temp_new_i64();
    TCGv_i64 t3 = tcg_temp_new_i64();

    tcg_gen_andc_i64(t3, m, b);
    tcg_gen_andc_i64(t2, b, m);
    tcg_gen_sub_i64(d, m, t2);
    tcg_gen_xor_i64(d, d, t3);

    tcg_temp_free_i64(t2);
    tcg_temp_free_i64(t3);
}

void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_8, 0x80));
    gen_negv_mask(d, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
{
    TCGv_i64 m = tcg_const_i64(dup_const(MO_16, 0x8000));
    gen_negv_mask(d, b, m);
    tcg_temp_free_i64(m);
}

void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
{
    TCGv_i64 t1 = tcg_temp_new_i64();
    TCGv_i64 t2 = tcg_temp_new_i64();

    tcg_gen_andi_i64(t1, b, ~0xffffffffull);
    tcg_gen_neg_i64(t2, b);
    tcg_gen_neg_i64(t1, t1);
    tcg_gen_deposit_i64(d, t1, t2, 0, 32);

    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t2);
}

void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2 g[4] = {
        { .fni8 = tcg_gen_vec_neg8_i64,
          .fniv = tcg_gen_neg_vec,
          .fno = gen_helper_gvec_neg8,
          .opc = INDEX_op_neg_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_neg16_i64,
          .fniv = tcg_gen_neg_vec,
          .fno = gen_helper_gvec_neg16,
          .opc = INDEX_op_neg_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_neg_i32,
          .fniv = tcg_gen_neg_vec,
          .fno = gen_helper_gvec_neg32,
          .opc = INDEX_op_neg_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_neg_i64,
          .fniv = tcg_gen_neg_vec,
          .fno = gen_helper_gvec_neg64,
          .opc = INDEX_op_neg_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
}

void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_and_i64,
        .fniv = tcg_gen_and_vec,
        .fno = gen_helper_gvec_and,
        .opc = INDEX_op_and_vec,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
}

void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
                     uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_or_i64,
        .fniv = tcg_gen_or_vec,
        .fno = gen_helper_gvec_or,
        .opc = INDEX_op_or_vec,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
}

void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_xor_i64,
        .fniv = tcg_gen_xor_vec,
        .fno = gen_helper_gvec_xor,
        .opc = INDEX_op_xor_vec,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
}

void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_andc_i64,
        .fniv = tcg_gen_andc_vec,
        .fno = gen_helper_gvec_andc,
        .opc = INDEX_op_andc_vec,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
}

void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen3 g = {
        .fni8 = tcg_gen_orc_i64,
        .fniv = tcg_gen_orc_vec,
        .fno = gen_helper_gvec_orc,
        .opc = INDEX_op_orc_vec,
        .prefer_i64 = TCG_TARGET_REG_BITS == 64,
    };
    tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
}

void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t mask = dup_const(MO_8, 0xff << c);
    tcg_gen_shli_i64(d, a, c);
    tcg_gen_andi_i64(d, d, mask);
}
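
/* For example, with c == 1 the full 64-bit shift moves each byte's top bit
   into its left-hand neighbour; the mask, dup_const(MO_8, 0xfe), clears
   every bit that crossed a lane boundary.  */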

void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t mask = dup_const(MO_16, 0xffff << c);
    tcg_gen_shli_i64(d, a, c);
    tcg_gen_andi_i64(d, d, mask);
}

void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2i g[4] = {
        { .fni8 = tcg_gen_vec_shl8i_i64,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl8i,
          .opc = INDEX_op_shli_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_shl16i_i64,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl16i,
          .opc = INDEX_op_shli_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_shli_i32,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl32i,
          .opc = INDEX_op_shli_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_shli_i64,
          .fniv = tcg_gen_shli_vec,
          .fno = gen_helper_gvec_shl64i,
          .opc = INDEX_op_shli_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
    if (shift == 0) {
        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
    } else {
        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
    }
}

void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t mask = dup_const(MO_8, 0xff >> c);
    tcg_gen_shri_i64(d, a, c);
    tcg_gen_andi_i64(d, d, mask);
}

void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t mask = dup_const(MO_16, 0xffff >> c);
    tcg_gen_shri_i64(d, a, c);
    tcg_gen_andi_i64(d, d, mask);
}

void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2i g[4] = {
        { .fni8 = tcg_gen_vec_shr8i_i64,
          .fniv = tcg_gen_shri_vec,
          .fno = gen_helper_gvec_shr8i,
          .opc = INDEX_op_shri_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_shr16i_i64,
          .fniv = tcg_gen_shri_vec,
          .fno = gen_helper_gvec_shr16i,
          .opc = INDEX_op_shri_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_shri_i32,
          .fniv = tcg_gen_shri_vec,
          .fno = gen_helper_gvec_shr32i,
          .opc = INDEX_op_shri_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_shri_i64,
          .fniv = tcg_gen_shri_vec,
          .fno = gen_helper_gvec_shr64i,
          .opc = INDEX_op_shri_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
    if (shift == 0) {
        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
    } else {
        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
    }
}

void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
    uint64_t c_mask = dup_const(MO_8, 0xff >> c);
    TCGv_i64 s = tcg_temp_new_i64();

    tcg_gen_shri_i64(d, a, c);
    tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
    tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
    tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign */
    tcg_gen_or_i64(d, d, s);         /* include sign extension */
    tcg_temp_free_i64(s);
}
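
/* Worked example for the above, with c == 2 and a lane holding 0x84:
   shri leaves 0x21 in the lane (plus bits shifted in from the left, later
   cleared by c_mask == 0x3f); s isolates the shifted sign bit as 0x20;
   multiplying by (2 << 2) - 2 == 6 smears it into 0xc0; the final or
   yields 0xe1, which is 0x84 >> 2 with sign extension.  */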

void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
{
    uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
    uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
    TCGv_i64 s = tcg_temp_new_i64();

    tcg_gen_shri_i64(d, a, c);
    tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
    tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign */
    tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
    tcg_gen_or_i64(d, d, s);         /* include sign extension */
    tcg_temp_free_i64(s);
}

void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t shift, uint32_t oprsz, uint32_t maxsz)
{
    static const GVecGen2i g[4] = {
        { .fni8 = tcg_gen_vec_sar8i_i64,
          .fniv = tcg_gen_sari_vec,
          .fno = gen_helper_gvec_sar8i,
          .opc = INDEX_op_sari_vec,
          .vece = MO_8 },
        { .fni8 = tcg_gen_vec_sar16i_i64,
          .fniv = tcg_gen_sari_vec,
          .fno = gen_helper_gvec_sar16i,
          .opc = INDEX_op_sari_vec,
          .vece = MO_16 },
        { .fni4 = tcg_gen_sari_i32,
          .fniv = tcg_gen_sari_vec,
          .fno = gen_helper_gvec_sar32i,
          .opc = INDEX_op_sari_vec,
          .vece = MO_32 },
        { .fni8 = tcg_gen_sari_i64,
          .fniv = tcg_gen_sari_vec,
          .fno = gen_helper_gvec_sar64i,
          .opc = INDEX_op_sari_vec,
          .prefer_i64 = TCG_TARGET_REG_BITS == 64,
          .vece = MO_64 },
    };

    tcg_debug_assert(vece <= MO_64);
    tcg_debug_assert(shift >= 0 && shift < (8 << vece));
    if (shift == 0) {
        tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
    } else {
        tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
    }
}

/* Expand OPRSZ bytes worth of three-operand comparisons using i32 elements.  */
static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                           uint32_t oprsz, TCGCond cond)
{
    TCGv_i32 t0 = tcg_temp_new_i32();
    TCGv_i32 t1 = tcg_temp_new_i32();
    uint32_t i;

    for (i = 0; i < oprsz; i += 4) {
        tcg_gen_ld_i32(t0, cpu_env, aofs + i);
        tcg_gen_ld_i32(t1, cpu_env, bofs + i);
        tcg_gen_setcond_i32(cond, t0, t0, t1);
        tcg_gen_neg_i32(t0, t0);
        tcg_gen_st_i32(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i32(t1);
    tcg_temp_free_i32(t0);
}

static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
                           uint32_t oprsz, TCGCond cond)
{
    TCGv_i64 t0 = tcg_temp_new_i64();
    TCGv_i64 t1 = tcg_temp_new_i64();
    uint32_t i;

    for (i = 0; i < oprsz; i += 8) {
        tcg_gen_ld_i64(t0, cpu_env, aofs + i);
        tcg_gen_ld_i64(t1, cpu_env, bofs + i);
        tcg_gen_setcond_i64(cond, t0, t0, t1);
        tcg_gen_neg_i64(t0, t0);
        tcg_gen_st_i64(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_i64(t1);
    tcg_temp_free_i64(t0);
}

static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
                           uint32_t bofs, uint32_t oprsz, uint32_t tysz,
                           TCGType type, TCGCond cond)
{
    TCGv_vec t0 = tcg_temp_new_vec(type);
    TCGv_vec t1 = tcg_temp_new_vec(type);
    uint32_t i;

    for (i = 0; i < oprsz; i += tysz) {
        tcg_gen_ld_vec(t0, cpu_env, aofs + i);
        tcg_gen_ld_vec(t1, cpu_env, bofs + i);
        tcg_gen_cmp_vec(cond, vece, t0, t0, t1);
        tcg_gen_st_vec(t0, cpu_env, dofs + i);
    }
    tcg_temp_free_vec(t1);
    tcg_temp_free_vec(t0);
}

void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
                      uint32_t aofs, uint32_t bofs,
                      uint32_t oprsz, uint32_t maxsz)
{
    static gen_helper_gvec_3 * const eq_fn[4] = {
        gen_helper_gvec_eq8, gen_helper_gvec_eq16,
        gen_helper_gvec_eq32, gen_helper_gvec_eq64
    };
    static gen_helper_gvec_3 * const ne_fn[4] = {
        gen_helper_gvec_ne8, gen_helper_gvec_ne16,
        gen_helper_gvec_ne32, gen_helper_gvec_ne64
    };
    static gen_helper_gvec_3 * const lt_fn[4] = {
        gen_helper_gvec_lt8, gen_helper_gvec_lt16,
        gen_helper_gvec_lt32, gen_helper_gvec_lt64
    };
    static gen_helper_gvec_3 * const le_fn[4] = {
        gen_helper_gvec_le8, gen_helper_gvec_le16,
        gen_helper_gvec_le32, gen_helper_gvec_le64
    };
    static gen_helper_gvec_3 * const ltu_fn[4] = {
        gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
        gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
    };
    static gen_helper_gvec_3 * const leu_fn[4] = {
        gen_helper_gvec_leu8, gen_helper_gvec_leu16,
        gen_helper_gvec_leu32, gen_helper_gvec_leu64
    };
    static gen_helper_gvec_3 * const * const fns[16] = {
        [TCG_COND_EQ] = eq_fn,
        [TCG_COND_NE] = ne_fn,
        [TCG_COND_LT] = lt_fn,
        [TCG_COND_LE] = le_fn,
        [TCG_COND_LTU] = ltu_fn,
        [TCG_COND_LEU] = leu_fn,
    };

    check_size_align(oprsz, maxsz, dofs | aofs | bofs);
    check_overlap_3(dofs, aofs, bofs, maxsz);

    if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
        do_dup(MO_8, dofs, oprsz, maxsz,
               NULL, NULL, -(cond == TCG_COND_ALWAYS));
        return;
    }

    /* Recall that ARM SVE allows vector sizes that are not a power of 2.
       Expand with successively smaller host vector sizes.  The intent is
       that e.g. oprsz == 80 would be expanded with 2x32 + 1x16.  */

    if (TCG_TARGET_HAS_v256 && check_size_impl(oprsz, 32)
        && tcg_can_emit_vec_op(INDEX_op_cmp_vec, TCG_TYPE_V256, vece)) {
        uint32_t some = QEMU_ALIGN_DOWN(oprsz, 32);
        expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
        if (some == oprsz) {
            goto done;
        }
        dofs += some;
        aofs += some;
        bofs += some;
        oprsz -= some;
        maxsz -= some;
    }

    if (TCG_TARGET_HAS_v128 && check_size_impl(oprsz, 16)
        && tcg_can_emit_vec_op(INDEX_op_cmp_vec, TCG_TYPE_V128, vece)) {
        expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
    } else if (TCG_TARGET_HAS_v64
               && check_size_impl(oprsz, 8)
               && (TCG_TARGET_REG_BITS == 32 || vece != MO_64)
               && tcg_can_emit_vec_op(INDEX_op_cmp_vec, TCG_TYPE_V64, vece)) {
        expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
    } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
        expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
    } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
        expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
    } else {
        gen_helper_gvec_3 * const *fn = fns[cond];

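        /* The table above covers only EQ, NE, LT, LE, LTU and LEU; for
           GT, GE, GTU and GEU, swap the operands and use the swapped
           condition instead.  */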
        if (fn == NULL) {
            uint32_t tmp;
            tmp = aofs, aofs = bofs, bofs = tmp;
            cond = tcg_swap_cond(cond);
            fn = fns[cond];
            assert(fn != NULL);
        }
        tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
        return;
    }

 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}