tcg/tcg-op-gvec.c

   1 /*
   2  * Generic vector operation expansion
   3  *
   4  * Copyright (c) 2018 Linaro
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18  */
  19
  20 #include "qemu/osdep.h"
  21 #include "tcg/tcg.h"
  22 #include "tcg/tcg-op.h"
  23 #include "tcg/tcg-op-gvec.h"
  24 #include "qemu/main-loop.h"
  25 #include "tcg/tcg-gvec-desc.h"
  26
/* Upper bound on the operation count that check_size_impl accepts
   when deciding whether an expansion may be done inline.  */
#define MAX_UNROLL  4

/*
 * Placeholder "empty" opcode list.  With CONFIG_DEBUG_TCG it is a real
 * one-element array holding 0; otherwise it is simply NULL.
 * NOTE(review): its users are outside this part of the file.
 */
#ifdef CONFIG_DEBUG_TCG
static const TCGOpcode vecop_list_empty[1] = { 0 };
#else
#define vecop_list_empty NULL
#endif
  34
  35
  36 /* Verify vector size and alignment rules.  OFS should be the OR of all
  37    of the operand offsets so that we can check them all at once.  */
  38 static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
  39 {
  40     uint32_t max_align;
  41
  42     switch (oprsz) {
  43     case 8:
  44     case 16:
  45     case 32:
  46         tcg_debug_assert(oprsz <= maxsz);
  47         break;
  48     default:
  49         tcg_debug_assert(oprsz == maxsz);
  50         break;
  51     }
  52     tcg_debug_assert(maxsz <= (8 << SIMD_MAXSZ_BITS));
  53
  54     max_align = maxsz >= 16 ? 15 : 7;
  55     tcg_debug_assert((maxsz & max_align) == 0);
  56     tcg_debug_assert((ofs & max_align) == 0);
  57 }
  58
  59 /* Verify vector overlap rules for two operands.  */
  60 static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
  61 {
  62     tcg_debug_assert(d == a || d + s <= a || a + s <= d);
  63 }
  64
  65 /* Verify vector overlap rules for three operands.  */
  66 static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
  67 {
  68     check_overlap_2(d, a, s);
  69     check_overlap_2(d, b, s);
  70     check_overlap_2(a, b, s);
  71 }
  72
  73 /* Verify vector overlap rules for four operands.  */
  74 static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
  75                             uint32_t c, uint32_t s)
  76 {
  77     check_overlap_2(d, a, s);
  78     check_overlap_2(d, b, s);
  79     check_overlap_2(d, c, s);
  80     check_overlap_2(a, b, s);
  81     check_overlap_2(a, c, s);
  82     check_overlap_2(b, c, s);
  83 }
  84
  85 /* Create a descriptor from components.  */
  86 uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
  87 {
  88     uint32_t desc = 0;
  89
  90     check_size_align(oprsz, maxsz, 0);
  91     tcg_debug_assert(data == sextract32(data, 0, SIMD_DATA_BITS));
  92
  93     oprsz = (oprsz / 8) - 1;
  94     maxsz = (maxsz / 8) - 1;
  95
  96     /*
  97      * We have just asserted in check_size_align that either
  98      * oprsz is {8,16,32} or matches maxsz.  Encode the final
  99      * case with '2', as that would otherwise map to 24.
 100      */
 101     if (oprsz == maxsz) {
 102         oprsz = 2;
 103     }
 104
 105     desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
 106     desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
 107     desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);
 108
 109     return desc;
 110 }
 111
 112 /* Generate a call to a gvec-style helper with two vector operands.  */
 113 void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
 114                         uint32_t oprsz, uint32_t maxsz, int32_t data,
 115                         gen_helper_gvec_2 *fn)
 116 {
 117     TCGv_ptr a0, a1;
 118     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 119
 120     a0 = tcg_temp_new_ptr();
 121     a1 = tcg_temp_new_ptr();
 122
 123     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 124     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 125
 126     fn(a0, a1, desc);
 127
 128     tcg_temp_free_ptr(a0);
 129     tcg_temp_free_ptr(a1);
 130 }
 131
 132 /* Generate a call to a gvec-style helper with two vector operands
 133    and one scalar operand.  */
 134 void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
 135                          uint32_t oprsz, uint32_t maxsz, int32_t data,
 136                          gen_helper_gvec_2i *fn)
 137 {
 138     TCGv_ptr a0, a1;
 139     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 140
 141     a0 = tcg_temp_new_ptr();
 142     a1 = tcg_temp_new_ptr();
 143
 144     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 145     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 146
 147     fn(a0, a1, c, desc);
 148
 149     tcg_temp_free_ptr(a0);
 150     tcg_temp_free_ptr(a1);
 151 }
 152
 153 /* Generate a call to a gvec-style helper with three vector operands.  */
 154 void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 155                         uint32_t oprsz, uint32_t maxsz, int32_t data,
 156                         gen_helper_gvec_3 *fn)
 157 {
 158     TCGv_ptr a0, a1, a2;
 159     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 160
 161     a0 = tcg_temp_new_ptr();
 162     a1 = tcg_temp_new_ptr();
 163     a2 = tcg_temp_new_ptr();
 164
 165     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 166     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 167     tcg_gen_addi_ptr(a2, cpu_env, bofs);
 168
 169     fn(a0, a1, a2, desc);
 170
 171     tcg_temp_free_ptr(a0);
 172     tcg_temp_free_ptr(a1);
 173     tcg_temp_free_ptr(a2);
 174 }
 175
 176 /* Generate a call to a gvec-style helper with four vector operands.  */
 177 void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 178                         uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
 179                         int32_t data, gen_helper_gvec_4 *fn)
 180 {
 181     TCGv_ptr a0, a1, a2, a3;
 182     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 183
 184     a0 = tcg_temp_new_ptr();
 185     a1 = tcg_temp_new_ptr();
 186     a2 = tcg_temp_new_ptr();
 187     a3 = tcg_temp_new_ptr();
 188
 189     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 190     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 191     tcg_gen_addi_ptr(a2, cpu_env, bofs);
 192     tcg_gen_addi_ptr(a3, cpu_env, cofs);
 193
 194     fn(a0, a1, a2, a3, desc);
 195
 196     tcg_temp_free_ptr(a0);
 197     tcg_temp_free_ptr(a1);
 198     tcg_temp_free_ptr(a2);
 199     tcg_temp_free_ptr(a3);
 200 }
 201
 202 /* Generate a call to a gvec-style helper with five vector operands.  */
 203 void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 204                         uint32_t cofs, uint32_t xofs, uint32_t oprsz,
 205                         uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
 206 {
 207     TCGv_ptr a0, a1, a2, a3, a4;
 208     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 209
 210     a0 = tcg_temp_new_ptr();
 211     a1 = tcg_temp_new_ptr();
 212     a2 = tcg_temp_new_ptr();
 213     a3 = tcg_temp_new_ptr();
 214     a4 = tcg_temp_new_ptr();
 215
 216     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 217     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 218     tcg_gen_addi_ptr(a2, cpu_env, bofs);
 219     tcg_gen_addi_ptr(a3, cpu_env, cofs);
 220     tcg_gen_addi_ptr(a4, cpu_env, xofs);
 221
 222     fn(a0, a1, a2, a3, a4, desc);
 223
 224     tcg_temp_free_ptr(a0);
 225     tcg_temp_free_ptr(a1);
 226     tcg_temp_free_ptr(a2);
 227     tcg_temp_free_ptr(a3);
 228     tcg_temp_free_ptr(a4);
 229 }
 230
/* Generate a call to a gvec-style helper with two vector operands
   and an extra pointer operand.  */
 233 void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
 234                         TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
 235                         int32_t data, gen_helper_gvec_2_ptr *fn)
 236 {
 237     TCGv_ptr a0, a1;
 238     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 239
 240     a0 = tcg_temp_new_ptr();
 241     a1 = tcg_temp_new_ptr();
 242
 243     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 244     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 245
 246     fn(a0, a1, ptr, desc);
 247
 248     tcg_temp_free_ptr(a0);
 249     tcg_temp_free_ptr(a1);
 250 }
 251
 252 /* Generate a call to a gvec-style helper with three vector operands
 253    and an extra pointer operand.  */
 254 void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 255                         TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
 256                         int32_t data, gen_helper_gvec_3_ptr *fn)
 257 {
 258     TCGv_ptr a0, a1, a2;
 259     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 260
 261     a0 = tcg_temp_new_ptr();
 262     a1 = tcg_temp_new_ptr();
 263     a2 = tcg_temp_new_ptr();
 264
 265     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 266     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 267     tcg_gen_addi_ptr(a2, cpu_env, bofs);
 268
 269     fn(a0, a1, a2, ptr, desc);
 270
 271     tcg_temp_free_ptr(a0);
 272     tcg_temp_free_ptr(a1);
 273     tcg_temp_free_ptr(a2);
 274 }
 275
 276 /* Generate a call to a gvec-style helper with four vector operands
 277    and an extra pointer operand.  */
 278 void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 279                         uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
 280                         uint32_t maxsz, int32_t data,
 281                         gen_helper_gvec_4_ptr *fn)
 282 {
 283     TCGv_ptr a0, a1, a2, a3;
 284     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 285
 286     a0 = tcg_temp_new_ptr();
 287     a1 = tcg_temp_new_ptr();
 288     a2 = tcg_temp_new_ptr();
 289     a3 = tcg_temp_new_ptr();
 290
 291     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 292     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 293     tcg_gen_addi_ptr(a2, cpu_env, bofs);
 294     tcg_gen_addi_ptr(a3, cpu_env, cofs);
 295
 296     fn(a0, a1, a2, a3, ptr, desc);
 297
 298     tcg_temp_free_ptr(a0);
 299     tcg_temp_free_ptr(a1);
 300     tcg_temp_free_ptr(a2);
 301     tcg_temp_free_ptr(a3);
 302 }
 303
 304 /* Generate a call to a gvec-style helper with five vector operands
 305    and an extra pointer operand.  */
 306 void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 307                         uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
 308                         uint32_t oprsz, uint32_t maxsz, int32_t data,
 309                         gen_helper_gvec_5_ptr *fn)
 310 {
 311     TCGv_ptr a0, a1, a2, a3, a4;
 312     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 313
 314     a0 = tcg_temp_new_ptr();
 315     a1 = tcg_temp_new_ptr();
 316     a2 = tcg_temp_new_ptr();
 317     a3 = tcg_temp_new_ptr();
 318     a4 = tcg_temp_new_ptr();
 319
 320     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 321     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 322     tcg_gen_addi_ptr(a2, cpu_env, bofs);
 323     tcg_gen_addi_ptr(a3, cpu_env, cofs);
 324     tcg_gen_addi_ptr(a4, cpu_env, eofs);
 325
 326     fn(a0, a1, a2, a3, a4, ptr, desc);
 327
 328     tcg_temp_free_ptr(a0);
 329     tcg_temp_free_ptr(a1);
 330     tcg_temp_free_ptr(a2);
 331     tcg_temp_free_ptr(a3);
 332     tcg_temp_free_ptr(a4);
 333 }
 334
 335 /* Return true if we want to implement something of OPRSZ bytes
 336    in units of LNSZ.  This limits the expansion of inline code.  */
 337 static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
 338 {
 339     uint32_t q, r;
 340
 341     if (oprsz < lnsz) {
 342         return false;
 343     }
 344
 345     q = oprsz / lnsz;
 346     r = oprsz % lnsz;
 347     tcg_debug_assert((r & 7) == 0);
 348
 349     if (lnsz < 16) {
 350         /* For sizes below 16, accept no remainder. */
 351         if (r != 0) {
 352             return false;
 353         }
 354     } else {
 355         /*
 356          * Recall that ARM SVE allows vector sizes that are not a
 357          * power of 2, but always a multiple of 16.  The intent is
 358          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
 359          * In addition, expand_clr needs to handle a multiple of 8.
 360          * Thus we can handle the tail with one more operation per
 361          * diminishing power of 2.
 362          */
 363         q += ctpop32(r);
 364     }
 365
 366     return q <= MAX_UNROLL;
 367 }
 368
/* Forward declaration: do_dup_store and do_dup call expand_clr
   before its definition further down.  */
static void expand_clr(uint32_t dofs, uint32_t maxsz);
 370
 371 /* Duplicate C as per VECE.  */
 372 uint64_t (dup_const)(unsigned vece, uint64_t c)
 373 {
 374     switch (vece) {
 375     case MO_8:
 376         return 0x0101010101010101ull * (uint8_t)c;
 377     case MO_16:
 378         return 0x0001000100010001ull * (uint16_t)c;
 379     case MO_32:
 380         return 0x0000000100000001ull * (uint32_t)c;
 381     case MO_64:
 382         return c;
 383     default:
 384         g_assert_not_reached();
 385     }
 386 }
 387
 388 /* Duplicate IN into OUT as per VECE.  */
 389 void tcg_gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
 390 {
 391     switch (vece) {
 392     case MO_8:
 393         tcg_gen_ext8u_i32(out, in);
 394         tcg_gen_muli_i32(out, out, 0x01010101);
 395         break;
 396     case MO_16:
 397         tcg_gen_deposit_i32(out, in, in, 16, 16);
 398         break;
 399     case MO_32:
 400         tcg_gen_mov_i32(out, in);
 401         break;
 402     default:
 403         g_assert_not_reached();
 404     }
 405 }
 406
 407 void tcg_gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
 408 {
 409     switch (vece) {
 410     case MO_8:
 411         tcg_gen_ext8u_i64(out, in);
 412         tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
 413         break;
 414     case MO_16:
 415         tcg_gen_ext16u_i64(out, in);
 416         tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
 417         break;
 418     case MO_32:
 419         tcg_gen_deposit_i64(out, in, in, 32, 32);
 420         break;
 421     case MO_64:
 422         tcg_gen_mov_i64(out, in);
 423         break;
 424     default:
 425         g_assert_not_reached();
 426     }
 427 }
 428
/* Select a supported vector type for implementing an operation on SIZE
 * bytes.  LIST and VECE are handed to tcg_can_emit_vecop_list for each
 * candidate type; a type is selected only if that check succeeds.
 * LIST may be NULL (do_dup passes NULL); presumably that means only
 * operations required of all backends are needed -- confirm against
 * tcg_can_emit_vecop_list.  Do not select V64 if PREFER_I64 is true.
 * Return 0 if no vector type is selected.
 */
static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
                                  uint32_t size, bool prefer_i64)
{
    /*
     * Recall that ARM SVE allows vector sizes that are not a
     * power of 2, but always a multiple of 16.  The intent is
     * that e.g. size == 80 would be expanded with 2x32 + 1x16.
     * It is hard to imagine a case in which v256 is supported
     * but v128 is not, but check anyway.
     * In addition, expand_clr needs to handle a multiple of 8.
     */
    /*
     * V256 is usable only if any 16-byte and 8-byte remainder of SIZE
     * can also be handled by V128 and V64 respectively.
     */
    if (TCG_TARGET_HAS_v256 &&
        check_size_impl(size, 32) &&
        tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) &&
        (!(size & 16) ||
         (TCG_TARGET_HAS_v128 &&
          tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) &&
        (!(size & 8) ||
         (TCG_TARGET_HAS_v64 &&
          tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
        return TCG_TYPE_V256;
    }
    /* Likewise, V128 needs V64 support when SIZE has an 8-byte remainder.  */
    if (TCG_TARGET_HAS_v128 &&
        check_size_impl(size, 16) &&
        tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece) &&
        (!(size & 8) ||
         (TCG_TARGET_HAS_v64 &&
          tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
        return TCG_TYPE_V128;
    }
    /* V64 is the last resort, and is skipped when the caller prefers i64.  */
    if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
        && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) {
        return TCG_TYPE_V64;
    }
    return 0;
}
 471
 472 static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz,
 473                          uint32_t maxsz, TCGv_vec t_vec)
 474 {
 475     uint32_t i = 0;
 476
 477     tcg_debug_assert(oprsz >= 8);
 478
 479     /*
 480      * This may be expand_clr for the tail of an operation, e.g.
 481      * oprsz == 8 && maxsz == 64.  The first 8 bytes of this store
 482      * are misaligned wrt the maximum vector size, so do that first.
 483      */
 484     if (dofs & 8) {
 485         tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
 486         i += 8;
 487     }
 488
 489     switch (type) {
 490     case TCG_TYPE_V256:
 491         /*
 492          * Recall that ARM SVE allows vector sizes that are not a
 493          * power of 2, but always a multiple of 16.  The intent is
 494          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
 495          */
 496         for (; i + 32 <= oprsz; i += 32) {
 497             tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
 498         }
 499         /* fallthru */
 500     case TCG_TYPE_V128:
 501         for (; i + 16 <= oprsz; i += 16) {
 502             tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
 503         }
 504         break;
 505     case TCG_TYPE_V64:
 506         for (; i < oprsz; i += 8) {
 507             tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
 508         }
 509         break;
 510     default:
 511         g_assert_not_reached();
 512     }
 513
 514     if (oprsz < maxsz) {
 515         expand_clr(dofs + oprsz, maxsz - oprsz);
 516     }
 517 }
 518
/* Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C.
 * Only one of IN_32 or IN_64 may be set;
 * IN_C is used if IN_32 and IN_64 are unset.
 *
 * VECE selects the element size being replicated.  Three strategies
 * are tried in order: inline vector stores, inline integer stores,
 * and finally an out-of-line helper call.  On the inline paths the
 * bytes between OPRSZ and MAXSZ are cleared with expand_clr.
 */
static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
                   uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
                   uint64_t in_c)
{
    TCGType type;
    TCGv_i64 t_64;
    TCGv_i32 t_32, t_desc;
    TCGv_ptr t_ptr;
    uint32_t i;

    /* A 32-bit variable input cannot supply a 64-bit element.  */
    assert(vece <= (in_32 ? MO_32 : MO_64));
    assert(in_32 == NULL || in_64 == NULL);

    /* If we're storing 0, expand oprsz to maxsz.  */
    if (in_32 == NULL && in_64 == NULL) {
        /* Constant input: replicate it to 64 bits up front.  */
        in_c = dup_const(vece, in_c);
        if (in_c == 0) {
            oprsz = maxsz;
            vece = MO_8;
        } else if (in_c == dup_const(MO_8, in_c)) {
            /* Every byte is identical, so treat it as a byte dup.  */
            vece = MO_8;
        }
    }

    /* Implement inline with a vector type, if possible.
     * Prefer integer when 64-bit host and no variable dup.
     */
    type = choose_vector_type(NULL, vece, oprsz,
                              (TCG_TARGET_REG_BITS == 64 && in_32 == NULL
                               && (in_64 == NULL || vece == MO_64)));
    if (type != 0) {
        TCGv_vec t_vec = tcg_temp_new_vec(type);

        if (in_32) {
            tcg_gen_dup_i32_vec(vece, t_vec, in_32);
        } else if (in_64) {
            tcg_gen_dup_i64_vec(vece, t_vec, in_64);
        } else {
            tcg_gen_dupi_vec(vece, t_vec, in_c);
        }
        /* do_dup_store also clears the tail up to maxsz.  */
        do_dup_store(type, dofs, oprsz, maxsz, t_vec);
        tcg_temp_free_vec(t_vec);
        return;
    }

    /* Otherwise, inline with an integer type, unless "large".  */
    if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
        /* Exactly one of t_64 / t_32 is set below; it selects the store width.  */
        t_64 = NULL;
        t_32 = NULL;

        if (in_32) {
            /* We are given a 32-bit variable input.  For a 64-bit host,
               use a 64-bit operation unless the 32-bit operation would
               be simple enough.  */
            if (TCG_TARGET_REG_BITS == 64
                && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
                t_64 = tcg_temp_new_i64();
                tcg_gen_extu_i32_i64(t_64, in_32);
                tcg_gen_dup_i64(vece, t_64, t_64);
            } else {
                t_32 = tcg_temp_new_i32();
                tcg_gen_dup_i32(vece, t_32, in_32);
            }
        } else if (in_64) {
            /* We are given a 64-bit variable input.  */
            t_64 = tcg_temp_new_i64();
            tcg_gen_dup_i64(vece, t_64, in_64);
        } else {
            /* We are given a constant input.  */
            /* For 64-bit hosts, use 64-bit constants for "simple" constants
               or when we'd need too many 32-bit stores, or when a 64-bit
               constant is really required.  */
            if (vece == MO_64
                || (TCG_TARGET_REG_BITS == 64
                    && (in_c == 0 || in_c == -1
                        || !check_size_impl(oprsz, 4)))) {
                t_64 = tcg_constant_i64(in_c);
            } else {
                t_32 = tcg_constant_i32(in_c);
            }
        }

        /* Implement inline if we picked an implementation size above.  */
        if (t_32) {
            for (i = 0; i < oprsz; i += 4) {
                tcg_gen_st_i32(t_32, cpu_env, dofs + i);
            }
            tcg_temp_free_i32(t_32);
            goto done;
        }
        if (t_64) {
            for (i = 0; i < oprsz; i += 8) {
                tcg_gen_st_i64(t_64, cpu_env, dofs + i);
            }
            tcg_temp_free_i64(t_64);
            goto done;
        }
    }

    /* Otherwise implement out of line.  */
    t_ptr = tcg_temp_new_ptr();
    tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);

    /*
     * This may be expand_clr for the tail of an operation, e.g.
     * oprsz == 8 && maxsz == 64.  The size of the clear is misaligned
     * wrt simd_desc and will assert.  Simply pass all replicated byte
     * stores through to memset.
     */
    if (oprsz == maxsz && vece == MO_8) {
        TCGv_ptr t_size = tcg_const_ptr(oprsz);
        TCGv_i32 t_val;

        /* The fill value must be 32-bit; narrow a 64-bit input.  */
        if (in_32) {
            t_val = in_32;
        } else if (in_64) {
            t_val = tcg_temp_new_i32();
            tcg_gen_extrl_i64_i32(t_val, in_64);
        } else {
            t_val = tcg_constant_i32(in_c);
        }
        gen_helper_memset(t_ptr, t_ptr, t_val, t_size);

        /* Only the in_64 case allocated a temporary for t_val.  */
        if (in_64) {
            tcg_temp_free_i32(t_val);
        }
        tcg_temp_free_ptr(t_size);
        tcg_temp_free_ptr(t_ptr);
        return;
    }

    /* The dup helpers receive both sizes through the descriptor.  */
    t_desc = tcg_constant_i32(simd_desc(oprsz, maxsz, 0));

    if (vece == MO_64) {
        if (in_64) {
            gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
        } else {
            t_64 = tcg_constant_i64(in_c);
            gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
        }
    } else {
        /* Helpers for the sub-64-bit element sizes, indexed by vece.  */
        typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
        static dup_fn * const fns[3] = {
            gen_helper_gvec_dup8,
            gen_helper_gvec_dup16,
            gen_helper_gvec_dup32
        };

        if (in_32) {
            fns[vece](t_ptr, t_desc, in_32);
        } else if (in_64) {
            t_32 = tcg_temp_new_i32();
            tcg_gen_extrl_i64_i32(t_32, in_64);
            fns[vece](t_ptr, t_desc, t_32);
            tcg_temp_free_i32(t_32);
        } else {
            /* in_c was replicated above; cut it back to a single element.  */
            if (vece == MO_8) {
                in_c &= 0xff;
            } else if (vece == MO_16) {
                in_c &= 0xffff;
            }
            t_32 = tcg_constant_i32(in_c);
            fns[vece](t_ptr, t_desc, t_32);
        }
    }

    tcg_temp_free_ptr(t_ptr);
    return;

    /* Reached only from the inline integer stores above.  */
 done:
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
 697
 698 /* Likewise, but with zero.  */
 699 static void expand_clr(uint32_t dofs, uint32_t maxsz)
 700 {
 701     do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
 702 }
 703
 704 /* Expand OPSZ bytes worth of two-operand operations using i32 elements.  */
 705 static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 706                          bool load_dest, void (*fni)(TCGv_i32, TCGv_i32))
 707 {
 708     TCGv_i32 t0 = tcg_temp_new_i32();
 709     TCGv_i32 t1 = tcg_temp_new_i32();
 710     uint32_t i;
 711
 712     for (i = 0; i < oprsz; i += 4) {
 713         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 714         if (load_dest) {
 715             tcg_gen_ld_i32(t1, cpu_env, dofs + i);
 716         }
 717         fni(t1, t0);
 718         tcg_gen_st_i32(t1, cpu_env, dofs + i);
 719     }
 720     tcg_temp_free_i32(t0);
 721     tcg_temp_free_i32(t1);
 722 }
 723
 724 static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 725                           int32_t c, bool load_dest,
 726                           void (*fni)(TCGv_i32, TCGv_i32, int32_t))
 727 {
 728     TCGv_i32 t0 = tcg_temp_new_i32();
 729     TCGv_i32 t1 = tcg_temp_new_i32();
 730     uint32_t i;
 731
 732     for (i = 0; i < oprsz; i += 4) {
 733         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 734         if (load_dest) {
 735             tcg_gen_ld_i32(t1, cpu_env, dofs + i);
 736         }
 737         fni(t1, t0, c);
 738         tcg_gen_st_i32(t1, cpu_env, dofs + i);
 739     }
 740     tcg_temp_free_i32(t0);
 741     tcg_temp_free_i32(t1);
 742 }
 743
 744 static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 745                           TCGv_i32 c, bool scalar_first,
 746                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
 747 {
 748     TCGv_i32 t0 = tcg_temp_new_i32();
 749     TCGv_i32 t1 = tcg_temp_new_i32();
 750     uint32_t i;
 751
 752     for (i = 0; i < oprsz; i += 4) {
 753         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 754         if (scalar_first) {
 755             fni(t1, c, t0);
 756         } else {
 757             fni(t1, t0, c);
 758         }
 759         tcg_gen_st_i32(t1, cpu_env, dofs + i);
 760     }
 761     tcg_temp_free_i32(t0);
 762     tcg_temp_free_i32(t1);
 763 }
 764
 765 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
 766 static void expand_3_i32(uint32_t dofs, uint32_t aofs,
 767                          uint32_t bofs, uint32_t oprsz, bool load_dest,
 768                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
 769 {
 770     TCGv_i32 t0 = tcg_temp_new_i32();
 771     TCGv_i32 t1 = tcg_temp_new_i32();
 772     TCGv_i32 t2 = tcg_temp_new_i32();
 773     uint32_t i;
 774
 775     for (i = 0; i < oprsz; i += 4) {
 776         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 777         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
 778         if (load_dest) {
 779             tcg_gen_ld_i32(t2, cpu_env, dofs + i);
 780         }
 781         fni(t2, t0, t1);
 782         tcg_gen_st_i32(t2, cpu_env, dofs + i);
 783     }
 784     tcg_temp_free_i32(t2);
 785     tcg_temp_free_i32(t1);
 786     tcg_temp_free_i32(t0);
 787 }
 788
 789 static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 790                           uint32_t oprsz, int32_t c, bool load_dest,
 791                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t))
 792 {
 793     TCGv_i32 t0 = tcg_temp_new_i32();
 794     TCGv_i32 t1 = tcg_temp_new_i32();
 795     TCGv_i32 t2 = tcg_temp_new_i32();
 796     uint32_t i;
 797
 798     for (i = 0; i < oprsz; i += 4) {
 799         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 800         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
 801         if (load_dest) {
 802             tcg_gen_ld_i32(t2, cpu_env, dofs + i);
 803         }
 804         fni(t2, t0, t1, c);
 805         tcg_gen_st_i32(t2, cpu_env, dofs + i);
 806     }
 807     tcg_temp_free_i32(t0);
 808     tcg_temp_free_i32(t1);
 809     tcg_temp_free_i32(t2);
 810 }
 811
/* Expand OPRSZ bytes worth of four-operand operations using i32 elements.  */
 813 static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 814                          uint32_t cofs, uint32_t oprsz, bool write_aofs,
 815                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
 816 {
 817     TCGv_i32 t0 = tcg_temp_new_i32();
 818     TCGv_i32 t1 = tcg_temp_new_i32();
 819     TCGv_i32 t2 = tcg_temp_new_i32();
 820     TCGv_i32 t3 = tcg_temp_new_i32();
 821     uint32_t i;
 822
 823     for (i = 0; i < oprsz; i += 4) {
 824         tcg_gen_ld_i32(t1, cpu_env, aofs + i);
 825         tcg_gen_ld_i32(t2, cpu_env, bofs + i);
 826         tcg_gen_ld_i32(t3, cpu_env, cofs + i);
 827         fni(t0, t1, t2, t3);
 828         tcg_gen_st_i32(t0, cpu_env, dofs + i);
 829         if (write_aofs) {
 830             tcg_gen_st_i32(t1, cpu_env, aofs + i);
 831         }
 832     }
 833     tcg_temp_free_i32(t3);
 834     tcg_temp_free_i32(t2);
 835     tcg_temp_free_i32(t1);
 836     tcg_temp_free_i32(t0);
 837 }
 838
 839 /* Expand OPSZ bytes worth of two-operand operations using i64 elements.  */
 840 static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 841                          bool load_dest, void (*fni)(TCGv_i64, TCGv_i64))
 842 {
 843     TCGv_i64 t0 = tcg_temp_new_i64();
 844     TCGv_i64 t1 = tcg_temp_new_i64();
 845     uint32_t i;
 846
 847     for (i = 0; i < oprsz; i += 8) {
 848         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 849         if (load_dest) {
 850             tcg_gen_ld_i64(t1, cpu_env, dofs + i);
 851         }
 852         fni(t1, t0);
 853         tcg_gen_st_i64(t1, cpu_env, dofs + i);
 854     }
 855     tcg_temp_free_i64(t0);
 856     tcg_temp_free_i64(t1);
 857 }
 858
 859 static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 860                           int64_t c, bool load_dest,
 861                           void (*fni)(TCGv_i64, TCGv_i64, int64_t))
 862 {
 863     TCGv_i64 t0 = tcg_temp_new_i64();
 864     TCGv_i64 t1 = tcg_temp_new_i64();
 865     uint32_t i;
 866
 867     for (i = 0; i < oprsz; i += 8) {
 868         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 869         if (load_dest) {
 870             tcg_gen_ld_i64(t1, cpu_env, dofs + i);
 871         }
 872         fni(t1, t0, c);
 873         tcg_gen_st_i64(t1, cpu_env, dofs + i);
 874     }
 875     tcg_temp_free_i64(t0);
 876     tcg_temp_free_i64(t1);
 877 }
 878
 879 static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 880                           TCGv_i64 c, bool scalar_first,
 881                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
 882 {
 883     TCGv_i64 t0 = tcg_temp_new_i64();
 884     TCGv_i64 t1 = tcg_temp_new_i64();
 885     uint32_t i;
 886
 887     for (i = 0; i < oprsz; i += 8) {
 888         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 889         if (scalar_first) {
 890             fni(t1, c, t0);
 891         } else {
 892             fni(t1, t0, c);
 893         }
 894         tcg_gen_st_i64(t1, cpu_env, dofs + i);
 895     }
 896     tcg_temp_free_i64(t0);
 897     tcg_temp_free_i64(t1);
 898 }
 899
 900 /* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
 901 static void expand_3_i64(uint32_t dofs, uint32_t aofs,
 902                          uint32_t bofs, uint32_t oprsz, bool load_dest,
 903                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
 904 {
 905     TCGv_i64 t0 = tcg_temp_new_i64();
 906     TCGv_i64 t1 = tcg_temp_new_i64();
 907     TCGv_i64 t2 = tcg_temp_new_i64();
 908     uint32_t i;
 909
 910     for (i = 0; i < oprsz; i += 8) {
 911         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 912         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
 913         if (load_dest) {
 914             tcg_gen_ld_i64(t2, cpu_env, dofs + i);
 915         }
 916         fni(t2, t0, t1);
 917         tcg_gen_st_i64(t2, cpu_env, dofs + i);
 918     }
 919     tcg_temp_free_i64(t2);
 920     tcg_temp_free_i64(t1);
 921     tcg_temp_free_i64(t0);
 922 }
 923
 924 static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 925                           uint32_t oprsz, int64_t c, bool load_dest,
 926                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t))
 927 {
 928     TCGv_i64 t0 = tcg_temp_new_i64();
 929     TCGv_i64 t1 = tcg_temp_new_i64();
 930     TCGv_i64 t2 = tcg_temp_new_i64();
 931     uint32_t i;
 932
 933     for (i = 0; i < oprsz; i += 8) {
 934         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 935         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
 936         if (load_dest) {
 937             tcg_gen_ld_i64(t2, cpu_env, dofs + i);
 938         }
 939         fni(t2, t0, t1, c);
 940         tcg_gen_st_i64(t2, cpu_env, dofs + i);
 941     }
 942     tcg_temp_free_i64(t0);
 943     tcg_temp_free_i64(t1);
 944     tcg_temp_free_i64(t2);
 945 }
 946
 947 /* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
 948 static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 949                          uint32_t cofs, uint32_t oprsz, bool write_aofs,
 950                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
 951 {
 952     TCGv_i64 t0 = tcg_temp_new_i64();
 953     TCGv_i64 t1 = tcg_temp_new_i64();
 954     TCGv_i64 t2 = tcg_temp_new_i64();
 955     TCGv_i64 t3 = tcg_temp_new_i64();
 956     uint32_t i;
 957
 958     for (i = 0; i < oprsz; i += 8) {
 959         tcg_gen_ld_i64(t1, cpu_env, aofs + i);
 960         tcg_gen_ld_i64(t2, cpu_env, bofs + i);
 961         tcg_gen_ld_i64(t3, cpu_env, cofs + i);
 962         fni(t0, t1, t2, t3);
 963         tcg_gen_st_i64(t0, cpu_env, dofs + i);
 964         if (write_aofs) {
 965             tcg_gen_st_i64(t1, cpu_env, aofs + i);
 966         }
 967     }
 968     tcg_temp_free_i64(t3);
 969     tcg_temp_free_i64(t2);
 970     tcg_temp_free_i64(t1);
 971     tcg_temp_free_i64(t0);
 972 }
 973
 974 /* Expand OPSZ bytes worth of two-operand operations using host vectors.  */
 975 static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
 976                          uint32_t oprsz, uint32_t tysz, TCGType type,
 977                          bool load_dest,
 978                          void (*fni)(unsigned, TCGv_vec, TCGv_vec))
 979 {
 980     TCGv_vec t0 = tcg_temp_new_vec(type);
 981     TCGv_vec t1 = tcg_temp_new_vec(type);
 982     uint32_t i;
 983
 984     for (i = 0; i < oprsz; i += tysz) {
 985         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
 986         if (load_dest) {
 987             tcg_gen_ld_vec(t1, cpu_env, dofs + i);
 988         }
 989         fni(vece, t1, t0);
 990         tcg_gen_st_vec(t1, cpu_env, dofs + i);
 991     }
 992     tcg_temp_free_vec(t0);
 993     tcg_temp_free_vec(t1);
 994 }
 995
 996 /* Expand OPSZ bytes worth of two-vector operands and an immediate operand
 997    using host vectors.  */
 998 static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
 999                           uint32_t oprsz, uint32_t tysz, TCGType type,
1000                           int64_t c, bool load_dest,
1001                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
1002 {
1003     TCGv_vec t0 = tcg_temp_new_vec(type);
1004     TCGv_vec t1 = tcg_temp_new_vec(type);
1005     uint32_t i;
1006
1007     for (i = 0; i < oprsz; i += tysz) {
1008         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1009         if (load_dest) {
1010             tcg_gen_ld_vec(t1, cpu_env, dofs + i);
1011         }
1012         fni(vece, t1, t0, c);
1013         tcg_gen_st_vec(t1, cpu_env, dofs + i);
1014     }
1015     tcg_temp_free_vec(t0);
1016     tcg_temp_free_vec(t1);
1017 }
1018
1019 static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1020                           uint32_t oprsz, uint32_t tysz, TCGType type,
1021                           TCGv_vec c, bool scalar_first,
1022                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
1023 {
1024     TCGv_vec t0 = tcg_temp_new_vec(type);
1025     TCGv_vec t1 = tcg_temp_new_vec(type);
1026     uint32_t i;
1027
1028     for (i = 0; i < oprsz; i += tysz) {
1029         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1030         if (scalar_first) {
1031             fni(vece, t1, c, t0);
1032         } else {
1033             fni(vece, t1, t0, c);
1034         }
1035         tcg_gen_st_vec(t1, cpu_env, dofs + i);
1036     }
1037     tcg_temp_free_vec(t0);
1038     tcg_temp_free_vec(t1);
1039 }
1040
1041 /* Expand OPSZ bytes worth of three-operand operations using host vectors.  */
1042 static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1043                          uint32_t bofs, uint32_t oprsz,
1044                          uint32_t tysz, TCGType type, bool load_dest,
1045                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
1046 {
1047     TCGv_vec t0 = tcg_temp_new_vec(type);
1048     TCGv_vec t1 = tcg_temp_new_vec(type);
1049     TCGv_vec t2 = tcg_temp_new_vec(type);
1050     uint32_t i;
1051
1052     for (i = 0; i < oprsz; i += tysz) {
1053         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1054         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
1055         if (load_dest) {
1056             tcg_gen_ld_vec(t2, cpu_env, dofs + i);
1057         }
1058         fni(vece, t2, t0, t1);
1059         tcg_gen_st_vec(t2, cpu_env, dofs + i);
1060     }
1061     tcg_temp_free_vec(t2);
1062     tcg_temp_free_vec(t1);
1063     tcg_temp_free_vec(t0);
1064 }
1065
1066 /*
1067  * Expand OPSZ bytes worth of three-vector operands and an immediate operand
1068  * using host vectors.
1069  */
1070 static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1071                           uint32_t bofs, uint32_t oprsz, uint32_t tysz,
1072                           TCGType type, int64_t c, bool load_dest,
1073                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec,
1074                                       int64_t))
1075 {
1076     TCGv_vec t0 = tcg_temp_new_vec(type);
1077     TCGv_vec t1 = tcg_temp_new_vec(type);
1078     TCGv_vec t2 = tcg_temp_new_vec(type);
1079     uint32_t i;
1080
1081     for (i = 0; i < oprsz; i += tysz) {
1082         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1083         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
1084         if (load_dest) {
1085             tcg_gen_ld_vec(t2, cpu_env, dofs + i);
1086         }
1087         fni(vece, t2, t0, t1, c);
1088         tcg_gen_st_vec(t2, cpu_env, dofs + i);
1089     }
1090     tcg_temp_free_vec(t0);
1091     tcg_temp_free_vec(t1);
1092     tcg_temp_free_vec(t2);
1093 }
1094
1095 /* Expand OPSZ bytes worth of four-operand operations using host vectors.  */
1096 static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1097                          uint32_t bofs, uint32_t cofs, uint32_t oprsz,
1098                          uint32_t tysz, TCGType type, bool write_aofs,
1099                          void (*fni)(unsigned, TCGv_vec, TCGv_vec,
1100                                      TCGv_vec, TCGv_vec))
1101 {
1102     TCGv_vec t0 = tcg_temp_new_vec(type);
1103     TCGv_vec t1 = tcg_temp_new_vec(type);
1104     TCGv_vec t2 = tcg_temp_new_vec(type);
1105     TCGv_vec t3 = tcg_temp_new_vec(type);
1106     uint32_t i;
1107
1108     for (i = 0; i < oprsz; i += tysz) {
1109         tcg_gen_ld_vec(t1, cpu_env, aofs + i);
1110         tcg_gen_ld_vec(t2, cpu_env, bofs + i);
1111         tcg_gen_ld_vec(t3, cpu_env, cofs + i);
1112         fni(vece, t0, t1, t2, t3);
1113         tcg_gen_st_vec(t0, cpu_env, dofs + i);
1114         if (write_aofs) {
1115             tcg_gen_st_vec(t1, cpu_env, aofs + i);
1116         }
1117     }
1118     tcg_temp_free_vec(t3);
1119     tcg_temp_free_vec(t2);
1120     tcg_temp_free_vec(t1);
1121     tcg_temp_free_vec(t0);
1122 }
1123
1124 /* Expand a vector two-operand operation.  */
1125 void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
1126                     uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
1127 {
1128     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1129     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1130     TCGType type;
1131     uint32_t some;
1132
1133     check_size_align(oprsz, maxsz, dofs | aofs);
1134     check_overlap_2(dofs, aofs, maxsz);
1135
1136     type = 0;
1137     if (g->fniv) {
1138         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1139     }
1140     switch (type) {
1141     case TCG_TYPE_V256:
1142         /* Recall that ARM SVE allows vector sizes that are not a
1143          * power of 2, but always a multiple of 16.  The intent is
1144          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1145          */
1146         some = QEMU_ALIGN_DOWN(oprsz, 32);
1147         expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1148                      g->load_dest, g->fniv);
1149         if (some == oprsz) {
1150             break;
1151         }
1152         dofs += some;
1153         aofs += some;
1154         oprsz -= some;
1155         maxsz -= some;
1156         /* fallthru */
1157     case TCG_TYPE_V128:
1158         expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1159                      g->load_dest, g->fniv);
1160         break;
1161     case TCG_TYPE_V64:
1162         expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1163                      g->load_dest, g->fniv);
1164         break;
1165
1166     case 0:
1167         if (g->fni8 && check_size_impl(oprsz, 8)) {
1168             expand_2_i64(dofs, aofs, oprsz, g->load_dest, g->fni8);
1169         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1170             expand_2_i32(dofs, aofs, oprsz, g->load_dest, g->fni4);
1171         } else {
1172             assert(g->fno != NULL);
1173             tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
1174             oprsz = maxsz;
1175         }
1176         break;
1177
1178     default:
1179         g_assert_not_reached();
1180     }
1181     tcg_swap_vecop_list(hold_list);
1182
1183     if (oprsz < maxsz) {
1184         expand_clr(dofs + oprsz, maxsz - oprsz);
1185     }
1186 }
1187
1188 /* Expand a vector operation with two vectors and an immediate.  */
1189 void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1190                      uint32_t maxsz, int64_t c, const GVecGen2i *g)
1191 {
1192     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1193     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1194     TCGType type;
1195     uint32_t some;
1196
1197     check_size_align(oprsz, maxsz, dofs | aofs);
1198     check_overlap_2(dofs, aofs, maxsz);
1199
1200     type = 0;
1201     if (g->fniv) {
1202         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1203     }
1204     switch (type) {
1205     case TCG_TYPE_V256:
1206         /* Recall that ARM SVE allows vector sizes that are not a
1207          * power of 2, but always a multiple of 16.  The intent is
1208          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1209          */
1210         some = QEMU_ALIGN_DOWN(oprsz, 32);
1211         expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1212                       c, g->load_dest, g->fniv);
1213         if (some == oprsz) {
1214             break;
1215         }
1216         dofs += some;
1217         aofs += some;
1218         oprsz -= some;
1219         maxsz -= some;
1220         /* fallthru */
1221     case TCG_TYPE_V128:
1222         expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1223                       c, g->load_dest, g->fniv);
1224         break;
1225     case TCG_TYPE_V64:
1226         expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1227                       c, g->load_dest, g->fniv);
1228         break;
1229
1230     case 0:
1231         if (g->fni8 && check_size_impl(oprsz, 8)) {
1232             expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
1233         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1234             expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
1235         } else {
1236             if (g->fno) {
1237                 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
1238             } else {
1239                 TCGv_i64 tcg_c = tcg_constant_i64(c);
1240                 tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz,
1241                                     maxsz, c, g->fnoi);
1242             }
1243             oprsz = maxsz;
1244         }
1245         break;
1246
1247     default:
1248         g_assert_not_reached();
1249     }
1250     tcg_swap_vecop_list(hold_list);
1251
1252     if (oprsz < maxsz) {
1253         expand_clr(dofs + oprsz, maxsz - oprsz);
1254     }
1255 }
1256
1257 /* Expand a vector operation with two vectors and a scalar.  */
1258 void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1259                      uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
1260 {
1261     TCGType type;
1262
1263     check_size_align(oprsz, maxsz, dofs | aofs);
1264     check_overlap_2(dofs, aofs, maxsz);
1265
1266     type = 0;
1267     if (g->fniv) {
1268         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1269     }
1270     if (type != 0) {
1271         const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1272         const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1273         TCGv_vec t_vec = tcg_temp_new_vec(type);
1274         uint32_t some;
1275
1276         tcg_gen_dup_i64_vec(g->vece, t_vec, c);
1277
1278         switch (type) {
1279         case TCG_TYPE_V256:
1280             /* Recall that ARM SVE allows vector sizes that are not a
1281              * power of 2, but always a multiple of 16.  The intent is
1282              * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1283              */
1284             some = QEMU_ALIGN_DOWN(oprsz, 32);
1285             expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1286                           t_vec, g->scalar_first, g->fniv);
1287             if (some == oprsz) {
1288                 break;
1289             }
1290             dofs += some;
1291             aofs += some;
1292             oprsz -= some;
1293             maxsz -= some;
1294             /* fallthru */
1295
1296         case TCG_TYPE_V128:
1297             expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1298                           t_vec, g->scalar_first, g->fniv);
1299             break;
1300
1301         case TCG_TYPE_V64:
1302             expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1303                           t_vec, g->scalar_first, g->fniv);
1304             break;
1305
1306         default:
1307             g_assert_not_reached();
1308         }
1309         tcg_temp_free_vec(t_vec);
1310         tcg_swap_vecop_list(hold_list);
1311     } else if (g->fni8 && check_size_impl(oprsz, 8)) {
1312         TCGv_i64 t64 = tcg_temp_new_i64();
1313
1314         tcg_gen_dup_i64(g->vece, t64, c);
1315         expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
1316         tcg_temp_free_i64(t64);
1317     } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1318         TCGv_i32 t32 = tcg_temp_new_i32();
1319
1320         tcg_gen_extrl_i64_i32(t32, c);
1321         tcg_gen_dup_i32(g->vece, t32, t32);
1322         expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
1323         tcg_temp_free_i32(t32);
1324     } else {
1325         tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
1326         return;
1327     }
1328
1329     if (oprsz < maxsz) {
1330         expand_clr(dofs + oprsz, maxsz - oprsz);
1331     }
1332 }
1333
1334 /* Expand a vector three-operand operation.  */
1335 void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1336                     uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
1337 {
1338     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1339     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1340     TCGType type;
1341     uint32_t some;
1342
1343     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1344     check_overlap_3(dofs, aofs, bofs, maxsz);
1345
1346     type = 0;
1347     if (g->fniv) {
1348         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1349     }
1350     switch (type) {
1351     case TCG_TYPE_V256:
1352         /* Recall that ARM SVE allows vector sizes that are not a
1353          * power of 2, but always a multiple of 16.  The intent is
1354          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1355          */
1356         some = QEMU_ALIGN_DOWN(oprsz, 32);
1357         expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1358                      g->load_dest, g->fniv);
1359         if (some == oprsz) {
1360             break;
1361         }
1362         dofs += some;
1363         aofs += some;
1364         bofs += some;
1365         oprsz -= some;
1366         maxsz -= some;
1367         /* fallthru */
1368     case TCG_TYPE_V128:
1369         expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1370                      g->load_dest, g->fniv);
1371         break;
1372     case TCG_TYPE_V64:
1373         expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1374                      g->load_dest, g->fniv);
1375         break;
1376
1377     case 0:
1378         if (g->fni8 && check_size_impl(oprsz, 8)) {
1379             expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
1380         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1381             expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
1382         } else {
1383             assert(g->fno != NULL);
1384             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz,
1385                                maxsz, g->data, g->fno);
1386             oprsz = maxsz;
1387         }
1388         break;
1389
1390     default:
1391         g_assert_not_reached();
1392     }
1393     tcg_swap_vecop_list(hold_list);
1394
1395     if (oprsz < maxsz) {
1396         expand_clr(dofs + oprsz, maxsz - oprsz);
1397     }
1398 }
1399
1400 /* Expand a vector operation with three vectors and an immediate.  */
1401 void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1402                      uint32_t oprsz, uint32_t maxsz, int64_t c,
1403                      const GVecGen3i *g)
1404 {
1405     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1406     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1407     TCGType type;
1408     uint32_t some;
1409
1410     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1411     check_overlap_3(dofs, aofs, bofs, maxsz);
1412
1413     type = 0;
1414     if (g->fniv) {
1415         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1416     }
1417     switch (type) {
1418     case TCG_TYPE_V256:
1419         /*
1420          * Recall that ARM SVE allows vector sizes that are not a
1421          * power of 2, but always a multiple of 16.  The intent is
1422          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1423          */
1424         some = QEMU_ALIGN_DOWN(oprsz, 32);
1425         expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1426                       c, g->load_dest, g->fniv);
1427         if (some == oprsz) {
1428             break;
1429         }
1430         dofs += some;
1431         aofs += some;
1432         bofs += some;
1433         oprsz -= some;
1434         maxsz -= some;
1435         /* fallthru */
1436     case TCG_TYPE_V128:
1437         expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1438                       c, g->load_dest, g->fniv);
1439         break;
1440     case TCG_TYPE_V64:
1441         expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1442                       c, g->load_dest, g->fniv);
1443         break;
1444
1445     case 0:
1446         if (g->fni8 && check_size_impl(oprsz, 8)) {
1447             expand_3i_i64(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni8);
1448         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1449             expand_3i_i32(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni4);
1450         } else {
1451             assert(g->fno != NULL);
1452             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno);
1453             oprsz = maxsz;
1454         }
1455         break;
1456
1457     default:
1458         g_assert_not_reached();
1459     }
1460     tcg_swap_vecop_list(hold_list);
1461
1462     if (oprsz < maxsz) {
1463         expand_clr(dofs + oprsz, maxsz - oprsz);
1464     }
1465 }
1466
1467 /* Expand a vector four-operand operation.  */
1468 void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
1469                     uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
1470 {
1471     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1472     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1473     TCGType type;
1474     uint32_t some;
1475
1476     check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
1477     check_overlap_4(dofs, aofs, bofs, cofs, maxsz);
1478
1479     type = 0;
1480     if (g->fniv) {
1481         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1482     }
1483     switch (type) {
1484     case TCG_TYPE_V256:
1485         /* Recall that ARM SVE allows vector sizes that are not a
1486          * power of 2, but always a multiple of 16.  The intent is
1487          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1488          */
1489         some = QEMU_ALIGN_DOWN(oprsz, 32);
1490         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
1491                      32, TCG_TYPE_V256, g->write_aofs, g->fniv);
1492         if (some == oprsz) {
1493             break;
1494         }
1495         dofs += some;
1496         aofs += some;
1497         bofs += some;
1498         cofs += some;
1499         oprsz -= some;
1500         maxsz -= some;
1501         /* fallthru */
1502     case TCG_TYPE_V128:
1503         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1504                      16, TCG_TYPE_V128, g->write_aofs, g->fniv);
1505         break;
1506     case TCG_TYPE_V64:
1507         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1508                      8, TCG_TYPE_V64, g->write_aofs, g->fniv);
1509         break;
1510
1511     case 0:
1512         if (g->fni8 && check_size_impl(oprsz, 8)) {
1513             expand_4_i64(dofs, aofs, bofs, cofs, oprsz,
1514                          g->write_aofs, g->fni8);
1515         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1516             expand_4_i32(dofs, aofs, bofs, cofs, oprsz,
1517                          g->write_aofs, g->fni4);
1518         } else {
1519             assert(g->fno != NULL);
1520             tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
1521                                oprsz, maxsz, g->data, g->fno);
1522             oprsz = maxsz;
1523         }
1524         break;
1525
1526     default:
1527         g_assert_not_reached();
1528     }
1529     tcg_swap_vecop_list(hold_list);
1530
1531     if (oprsz < maxsz) {
1532         expand_clr(dofs + oprsz, maxsz - oprsz);
1533     }
1534 }
1535
1536 /*
1537  * Expand specific vector operations.
1538  */
1539
/*
 * Adapter giving tcg_gen_mov_vec the (vece, dest, src) signature used
 * for the fniv callback of GVecGen2.  VECE is accepted but unused.
 */
static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
{
    tcg_gen_mov_vec(a, b);
}
1544
1545 void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
1546                       uint32_t oprsz, uint32_t maxsz)
1547 {
1548     static const GVecGen2 g = {
1549         .fni8 = tcg_gen_mov_i64,
1550         .fniv = vec_mov2,
1551         .fno = gen_helper_gvec_mov,
1552         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1553     };
1554     if (dofs != aofs) {
1555         tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1556     } else {
1557         check_size_align(oprsz, maxsz, dofs);
1558         if (oprsz < maxsz) {
1559             expand_clr(dofs + oprsz, maxsz - oprsz);
1560         }
1561     }
1562 }
1563
/*
 * Duplicate the 32-bit value IN into the vector at DOFS.
 *
 * Validates OPRSZ/MAXSZ/DOFS, requires VECE to be at most MO_32 (debug
 * assertion only), then delegates to do_dup with IN as the i32 source
 * and no i64 source or immediate.
 */
void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i32 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_32);
    do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
}
1571
/*
 * Duplicate the 64-bit value IN into the vector at DOFS.
 *
 * Validates OPRSZ/MAXSZ/DOFS, requires VECE to be at most MO_64 (debug
 * assertion only), then delegates to do_dup with IN as the i64 source
 * and no i32 source or immediate.
 */
void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, TCGv_i64 in)
{
    check_size_align(oprsz, maxsz, dofs);
    tcg_debug_assert(vece <= MO_64);
    do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
}
1579
1580 void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
1581                           uint32_t oprsz, uint32_t maxsz)
1582 {
1583     check_size_align(oprsz, maxsz, dofs);
1584     if (vece <= MO_64) {
1585         TCGType type = choose_vector_type(NULL, vece, oprsz, 0);
1586         if (type != 0) {
1587             TCGv_vec t_vec = tcg_temp_new_vec(type);
1588             tcg_gen_dup_mem_vec(vece, t_vec, cpu_env, aofs);
1589             do_dup_store(type, dofs, oprsz, maxsz, t_vec);
1590             tcg_temp_free_vec(t_vec);
1591         } else if (vece <= MO_32) {
1592             TCGv_i32 in = tcg_temp_new_i32();
1593             switch (vece) {
1594             case MO_8:
1595                 tcg_gen_ld8u_i32(in, cpu_env, aofs);
1596                 break;
1597             case MO_16:
1598                 tcg_gen_ld16u_i32(in, cpu_env, aofs);
1599                 break;
1600             default:
1601                 tcg_gen_ld_i32(in, cpu_env, aofs);
1602                 break;
1603             }
1604             do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
1605             tcg_temp_free_i32(in);
1606         } else {
1607             TCGv_i64 in = tcg_temp_new_i64();
1608             tcg_gen_ld_i64(in, cpu_env, aofs);
1609             do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
1610             tcg_temp_free_i64(in);
1611         }
1612     } else if (vece == 4) {
1613         /* 128-bit duplicate.  */
1614         int i;
1615
1616         tcg_debug_assert(oprsz >= 16);
1617         if (TCG_TARGET_HAS_v128) {
1618             TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);
1619
1620             tcg_gen_ld_vec(in, cpu_env, aofs);
1621             for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
1622                 tcg_gen_st_vec(in, cpu_env, dofs + i);
1623             }
1624             tcg_temp_free_vec(in);
1625         } else {
1626             TCGv_i64 in0 = tcg_temp_new_i64();
1627             TCGv_i64 in1 = tcg_temp_new_i64();
1628
1629             tcg_gen_ld_i64(in0, cpu_env, aofs);
1630             tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
1631             for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
1632                 tcg_gen_st_i64(in0, cpu_env, dofs + i);
1633                 tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
1634             }
1635             tcg_temp_free_i64(in0);
1636             tcg_temp_free_i64(in1);
1637         }
1638         if (oprsz < maxsz) {
1639             expand_clr(dofs + oprsz, maxsz - oprsz);
1640         }
1641     } else if (vece == 5) {
1642         /* 256-bit duplicate.  */
1643         int i;
1644
1645         tcg_debug_assert(oprsz >= 32);
1646         tcg_debug_assert(oprsz % 32 == 0);
1647         if (TCG_TARGET_HAS_v256) {
1648             TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V256);
1649
1650             tcg_gen_ld_vec(in, cpu_env, aofs);
1651             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1652                 tcg_gen_st_vec(in, cpu_env, dofs + i);
1653             }
1654             tcg_temp_free_vec(in);
1655         } else if (TCG_TARGET_HAS_v128) {
1656             TCGv_vec in0 = tcg_temp_new_vec(TCG_TYPE_V128);
1657             TCGv_vec in1 = tcg_temp_new_vec(TCG_TYPE_V128);
1658
1659             tcg_gen_ld_vec(in0, cpu_env, aofs);
1660             tcg_gen_ld_vec(in1, cpu_env, aofs + 16);
1661             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1662                 tcg_gen_st_vec(in0, cpu_env, dofs + i);
1663                 tcg_gen_st_vec(in1, cpu_env, dofs + i + 16);
1664             }
1665             tcg_temp_free_vec(in0);
1666             tcg_temp_free_vec(in1);
1667         } else {
1668             TCGv_i64 in[4];
1669             int j;
1670
1671             for (j = 0; j < 4; ++j) {
1672                 in[j] = tcg_temp_new_i64();
1673                 tcg_gen_ld_i64(in[j], cpu_env, aofs + j * 8);
1674             }
1675             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1676                 for (j = 0; j < 4; ++j) {
1677                     tcg_gen_st_i64(in[j], cpu_env, dofs + i + j * 8);
1678                 }
1679             }
1680             for (j = 0; j < 4; ++j) {
1681                 tcg_temp_free_i64(in[j]);
1682             }
1683         }
1684         if (oprsz < maxsz) {
1685             expand_clr(dofs + oprsz, maxsz - oprsz);
1686         }
1687     } else {
1688         g_assert_not_reached();
1689     }
1690 }
1691
/*
 * Duplicate the immediate X into the vector at DOFS.
 *
 * Validates OPRSZ/MAXSZ/DOFS, then delegates to do_dup with X as the
 * immediate and no i32 or i64 source temporary.
 */
void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, uint64_t x)
{
    check_size_align(oprsz, maxsz, dofs);
    do_dup(vece, dofs, oprsz, maxsz, NULL, NULL, x);
}
1698
1699 void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
1700                       uint32_t oprsz, uint32_t maxsz)
1701 {
1702     static const GVecGen2 g = {
1703         .fni8 = tcg_gen_not_i64,
1704         .fniv = tcg_gen_not_vec,
1705         .fno = gen_helper_gvec_not,
1706         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1707     };
1708     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1709 }
1710
1711 /* Perform a vector addition using normal addition and a mask.  The mask
1712    should be the sign bit of each lane.  This 6-operation form is more
1713    efficient than separate additions when there are 4 or more lanes in
1714    the 64-bit operation.  */
1715 static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1716 {
1717     TCGv_i64 t1 = tcg_temp_new_i64();
1718     TCGv_i64 t2 = tcg_temp_new_i64();
1719     TCGv_i64 t3 = tcg_temp_new_i64();
1720
1721     tcg_gen_andc_i64(t1, a, m);
1722     tcg_gen_andc_i64(t2, b, m);
1723     tcg_gen_xor_i64(t3, a, b);
1724     tcg_gen_add_i64(d, t1, t2);
1725     tcg_gen_and_i64(t3, t3, m);
1726     tcg_gen_xor_i64(d, d, t3);
1727
1728     tcg_temp_free_i64(t1);
1729     tcg_temp_free_i64(t2);
1730     tcg_temp_free_i64(t3);
1731 }
1732
1733 void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1734 {
1735     TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
1736     gen_addv_mask(d, a, b, m);
1737 }
1738
1739 void tcg_gen_vec_add8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1740 {
1741     TCGv_i32 m = tcg_constant_i32((int32_t)dup_const(MO_8, 0x80));
1742     TCGv_i32 t1 = tcg_temp_new_i32();
1743     TCGv_i32 t2 = tcg_temp_new_i32();
1744     TCGv_i32 t3 = tcg_temp_new_i32();
1745
1746     tcg_gen_andc_i32(t1, a, m);
1747     tcg_gen_andc_i32(t2, b, m);
1748     tcg_gen_xor_i32(t3, a, b);
1749     tcg_gen_add_i32(d, t1, t2);
1750     tcg_gen_and_i32(t3, t3, m);
1751     tcg_gen_xor_i32(d, d, t3);
1752
1753     tcg_temp_free_i32(t1);
1754     tcg_temp_free_i32(t2);
1755     tcg_temp_free_i32(t3);
1756 }
1757
1758 void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1759 {
1760     TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
1761     gen_addv_mask(d, a, b, m);
1762 }
1763
1764 void tcg_gen_vec_add16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1765 {
1766     TCGv_i32 t1 = tcg_temp_new_i32();
1767     TCGv_i32 t2 = tcg_temp_new_i32();
1768
1769     tcg_gen_andi_i32(t1, a, ~0xffff);
1770     tcg_gen_add_i32(t2, a, b);
1771     tcg_gen_add_i32(t1, t1, b);
1772     tcg_gen_deposit_i32(d, t1, t2, 0, 16);
1773
1774     tcg_temp_free_i32(t1);
1775     tcg_temp_free_i32(t2);
1776 }
1777
1778 void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1779 {
1780     TCGv_i64 t1 = tcg_temp_new_i64();
1781     TCGv_i64 t2 = tcg_temp_new_i64();
1782
1783     tcg_gen_andi_i64(t1, a, ~0xffffffffull);
1784     tcg_gen_add_i64(t2, a, b);
1785     tcg_gen_add_i64(t1, t1, b);
1786     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1787
1788     tcg_temp_free_i64(t1);
1789     tcg_temp_free_i64(t2);
1790 }
1791
1792 static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 };
1793
1794 void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
1795                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1796 {
1797     static const GVecGen3 g[4] = {
1798         { .fni8 = tcg_gen_vec_add8_i64,
1799           .fniv = tcg_gen_add_vec,
1800           .fno = gen_helper_gvec_add8,
1801           .opt_opc = vecop_list_add,
1802           .vece = MO_8 },
1803         { .fni8 = tcg_gen_vec_add16_i64,
1804           .fniv = tcg_gen_add_vec,
1805           .fno = gen_helper_gvec_add16,
1806           .opt_opc = vecop_list_add,
1807           .vece = MO_16 },
1808         { .fni4 = tcg_gen_add_i32,
1809           .fniv = tcg_gen_add_vec,
1810           .fno = gen_helper_gvec_add32,
1811           .opt_opc = vecop_list_add,
1812           .vece = MO_32 },
1813         { .fni8 = tcg_gen_add_i64,
1814           .fniv = tcg_gen_add_vec,
1815           .fno = gen_helper_gvec_add64,
1816           .opt_opc = vecop_list_add,
1817           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1818           .vece = MO_64 },
1819     };
1820
1821     tcg_debug_assert(vece <= MO_64);
1822     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1823 }
1824
1825 void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
1826                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1827 {
1828     static const GVecGen2s g[4] = {
1829         { .fni8 = tcg_gen_vec_add8_i64,
1830           .fniv = tcg_gen_add_vec,
1831           .fno = gen_helper_gvec_adds8,
1832           .opt_opc = vecop_list_add,
1833           .vece = MO_8 },
1834         { .fni8 = tcg_gen_vec_add16_i64,
1835           .fniv = tcg_gen_add_vec,
1836           .fno = gen_helper_gvec_adds16,
1837           .opt_opc = vecop_list_add,
1838           .vece = MO_16 },
1839         { .fni4 = tcg_gen_add_i32,
1840           .fniv = tcg_gen_add_vec,
1841           .fno = gen_helper_gvec_adds32,
1842           .opt_opc = vecop_list_add,
1843           .vece = MO_32 },
1844         { .fni8 = tcg_gen_add_i64,
1845           .fniv = tcg_gen_add_vec,
1846           .fno = gen_helper_gvec_adds64,
1847           .opt_opc = vecop_list_add,
1848           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1849           .vece = MO_64 },
1850     };
1851
1852     tcg_debug_assert(vece <= MO_64);
1853     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1854 }
1855
/*
 * Immediate form of tcg_gen_gvec_adds(): C is wrapped in a TCG
 * constant and passed on as the scalar operand.
 */
void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    tcg_gen_gvec_adds(vece, dofs, aofs, tcg_constant_i64(c), oprsz, maxsz);
}
1862
1863 static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 };
1864
1865 void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
1866                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1867 {
1868     static const GVecGen2s g[4] = {
1869         { .fni8 = tcg_gen_vec_sub8_i64,
1870           .fniv = tcg_gen_sub_vec,
1871           .fno = gen_helper_gvec_subs8,
1872           .opt_opc = vecop_list_sub,
1873           .vece = MO_8 },
1874         { .fni8 = tcg_gen_vec_sub16_i64,
1875           .fniv = tcg_gen_sub_vec,
1876           .fno = gen_helper_gvec_subs16,
1877           .opt_opc = vecop_list_sub,
1878           .vece = MO_16 },
1879         { .fni4 = tcg_gen_sub_i32,
1880           .fniv = tcg_gen_sub_vec,
1881           .fno = gen_helper_gvec_subs32,
1882           .opt_opc = vecop_list_sub,
1883           .vece = MO_32 },
1884         { .fni8 = tcg_gen_sub_i64,
1885           .fniv = tcg_gen_sub_vec,
1886           .fno = gen_helper_gvec_subs64,
1887           .opt_opc = vecop_list_sub,
1888           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1889           .vece = MO_64 },
1890     };
1891
1892     tcg_debug_assert(vece <= MO_64);
1893     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1894 }
1895
1896 /* Perform a vector subtraction using normal subtraction and a mask.
1897    Compare gen_addv_mask above.  */
1898 static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1899 {
1900     TCGv_i64 t1 = tcg_temp_new_i64();
1901     TCGv_i64 t2 = tcg_temp_new_i64();
1902     TCGv_i64 t3 = tcg_temp_new_i64();
1903
1904     tcg_gen_or_i64(t1, a, m);
1905     tcg_gen_andc_i64(t2, b, m);
1906     tcg_gen_eqv_i64(t3, a, b);
1907     tcg_gen_sub_i64(d, t1, t2);
1908     tcg_gen_and_i64(t3, t3, m);
1909     tcg_gen_xor_i64(d, d, t3);
1910
1911     tcg_temp_free_i64(t1);
1912     tcg_temp_free_i64(t2);
1913     tcg_temp_free_i64(t3);
1914 }
1915
1916 void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1917 {
1918     TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
1919     gen_subv_mask(d, a, b, m);
1920 }
1921
1922 void tcg_gen_vec_sub8_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1923 {
1924     TCGv_i32 m = tcg_constant_i32((int32_t)dup_const(MO_8, 0x80));
1925     TCGv_i32 t1 = tcg_temp_new_i32();
1926     TCGv_i32 t2 = tcg_temp_new_i32();
1927     TCGv_i32 t3 = tcg_temp_new_i32();
1928
1929     tcg_gen_or_i32(t1, a, m);
1930     tcg_gen_andc_i32(t2, b, m);
1931     tcg_gen_eqv_i32(t3, a, b);
1932     tcg_gen_sub_i32(d, t1, t2);
1933     tcg_gen_and_i32(t3, t3, m);
1934     tcg_gen_xor_i32(d, d, t3);
1935
1936     tcg_temp_free_i32(t1);
1937     tcg_temp_free_i32(t2);
1938     tcg_temp_free_i32(t3);
1939 }
1940
1941 void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1942 {
1943     TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
1944     gen_subv_mask(d, a, b, m);
1945 }
1946
1947 void tcg_gen_vec_sub16_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
1948 {
1949     TCGv_i32 t1 = tcg_temp_new_i32();
1950     TCGv_i32 t2 = tcg_temp_new_i32();
1951
1952     tcg_gen_andi_i32(t1, b, ~0xffff);
1953     tcg_gen_sub_i32(t2, a, b);
1954     tcg_gen_sub_i32(t1, a, t1);
1955     tcg_gen_deposit_i32(d, t1, t2, 0, 16);
1956
1957     tcg_temp_free_i32(t1);
1958     tcg_temp_free_i32(t2);
1959 }
1960
1961 void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1962 {
1963     TCGv_i64 t1 = tcg_temp_new_i64();
1964     TCGv_i64 t2 = tcg_temp_new_i64();
1965
1966     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
1967     tcg_gen_sub_i64(t2, a, b);
1968     tcg_gen_sub_i64(t1, a, t1);
1969     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1970
1971     tcg_temp_free_i64(t1);
1972     tcg_temp_free_i64(t2);
1973 }
1974
1975 void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
1976                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1977 {
1978     static const GVecGen3 g[4] = {
1979         { .fni8 = tcg_gen_vec_sub8_i64,
1980           .fniv = tcg_gen_sub_vec,
1981           .fno = gen_helper_gvec_sub8,
1982           .opt_opc = vecop_list_sub,
1983           .vece = MO_8 },
1984         { .fni8 = tcg_gen_vec_sub16_i64,
1985           .fniv = tcg_gen_sub_vec,
1986           .fno = gen_helper_gvec_sub16,
1987           .opt_opc = vecop_list_sub,
1988           .vece = MO_16 },
1989         { .fni4 = tcg_gen_sub_i32,
1990           .fniv = tcg_gen_sub_vec,
1991           .fno = gen_helper_gvec_sub32,
1992           .opt_opc = vecop_list_sub,
1993           .vece = MO_32 },
1994         { .fni8 = tcg_gen_sub_i64,
1995           .fniv = tcg_gen_sub_vec,
1996           .fno = gen_helper_gvec_sub64,
1997           .opt_opc = vecop_list_sub,
1998           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1999           .vece = MO_64 },
2000     };
2001
2002     tcg_debug_assert(vece <= MO_64);
2003     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2004 }
2005
2006 static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 };
2007
2008 void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
2009                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2010 {
2011     static const GVecGen3 g[4] = {
2012         { .fniv = tcg_gen_mul_vec,
2013           .fno = gen_helper_gvec_mul8,
2014           .opt_opc = vecop_list_mul,
2015           .vece = MO_8 },
2016         { .fniv = tcg_gen_mul_vec,
2017           .fno = gen_helper_gvec_mul16,
2018           .opt_opc = vecop_list_mul,
2019           .vece = MO_16 },
2020         { .fni4 = tcg_gen_mul_i32,
2021           .fniv = tcg_gen_mul_vec,
2022           .fno = gen_helper_gvec_mul32,
2023           .opt_opc = vecop_list_mul,
2024           .vece = MO_32 },
2025         { .fni8 = tcg_gen_mul_i64,
2026           .fniv = tcg_gen_mul_vec,
2027           .fno = gen_helper_gvec_mul64,
2028           .opt_opc = vecop_list_mul,
2029           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2030           .vece = MO_64 },
2031     };
2032
2033     tcg_debug_assert(vece <= MO_64);
2034     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2035 }
2036
2037 void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
2038                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2039 {
2040     static const GVecGen2s g[4] = {
2041         { .fniv = tcg_gen_mul_vec,
2042           .fno = gen_helper_gvec_muls8,
2043           .opt_opc = vecop_list_mul,
2044           .vece = MO_8 },
2045         { .fniv = tcg_gen_mul_vec,
2046           .fno = gen_helper_gvec_muls16,
2047           .opt_opc = vecop_list_mul,
2048           .vece = MO_16 },
2049         { .fni4 = tcg_gen_mul_i32,
2050           .fniv = tcg_gen_mul_vec,
2051           .fno = gen_helper_gvec_muls32,
2052           .opt_opc = vecop_list_mul,
2053           .vece = MO_32 },
2054         { .fni8 = tcg_gen_mul_i64,
2055           .fniv = tcg_gen_mul_vec,
2056           .fno = gen_helper_gvec_muls64,
2057           .opt_opc = vecop_list_mul,
2058           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2059           .vece = MO_64 },
2060     };
2061
2062     tcg_debug_assert(vece <= MO_64);
2063     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
2064 }
2065
/*
 * Immediate form of tcg_gen_gvec_muls(): C is wrapped in a TCG
 * constant and passed on as the scalar operand.
 */
void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
                       int64_t c, uint32_t oprsz, uint32_t maxsz)
{
    tcg_gen_gvec_muls(vece, dofs, aofs, tcg_constant_i64(c), oprsz, maxsz);
}
2072
2073 void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
2074                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2075 {
2076     static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 };
2077     static const GVecGen3 g[4] = {
2078         { .fniv = tcg_gen_ssadd_vec,
2079           .fno = gen_helper_gvec_ssadd8,
2080           .opt_opc = vecop_list,
2081           .vece = MO_8 },
2082         { .fniv = tcg_gen_ssadd_vec,
2083           .fno = gen_helper_gvec_ssadd16,
2084           .opt_opc = vecop_list,
2085           .vece = MO_16 },
2086         { .fniv = tcg_gen_ssadd_vec,
2087           .fno = gen_helper_gvec_ssadd32,
2088           .opt_opc = vecop_list,
2089           .vece = MO_32 },
2090         { .fniv = tcg_gen_ssadd_vec,
2091           .fno = gen_helper_gvec_ssadd64,
2092           .opt_opc = vecop_list,
2093           .vece = MO_64 },
2094     };
2095     tcg_debug_assert(vece <= MO_64);
2096     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2097 }
2098
2099 void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
2100                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2101 {
2102     static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 };
2103     static const GVecGen3 g[4] = {
2104         { .fniv = tcg_gen_sssub_vec,
2105           .fno = gen_helper_gvec_sssub8,
2106           .opt_opc = vecop_list,
2107           .vece = MO_8 },
2108         { .fniv = tcg_gen_sssub_vec,
2109           .fno = gen_helper_gvec_sssub16,
2110           .opt_opc = vecop_list,
2111           .vece = MO_16 },
2112         { .fniv = tcg_gen_sssub_vec,
2113           .fno = gen_helper_gvec_sssub32,
2114           .opt_opc = vecop_list,
2115           .vece = MO_32 },
2116         { .fniv = tcg_gen_sssub_vec,
2117           .fno = gen_helper_gvec_sssub64,
2118           .opt_opc = vecop_list,
2119           .vece = MO_64 },
2120     };
2121     tcg_debug_assert(vece <= MO_64);
2122     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2123 }
2124
2125 static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2126 {
2127     TCGv_i32 max = tcg_constant_i32(-1);
2128     tcg_gen_add_i32(d, a, b);
2129     tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
2130 }
2131
2132 static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2133 {
2134     TCGv_i64 max = tcg_constant_i64(-1);
2135     tcg_gen_add_i64(d, a, b);
2136     tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
2137 }
2138
2139 void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
2140                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2141 {
2142     static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 };
2143     static const GVecGen3 g[4] = {
2144         { .fniv = tcg_gen_usadd_vec,
2145           .fno = gen_helper_gvec_usadd8,
2146           .opt_opc = vecop_list,
2147           .vece = MO_8 },
2148         { .fniv = tcg_gen_usadd_vec,
2149           .fno = gen_helper_gvec_usadd16,
2150           .opt_opc = vecop_list,
2151           .vece = MO_16 },
2152         { .fni4 = tcg_gen_usadd_i32,
2153           .fniv = tcg_gen_usadd_vec,
2154           .fno = gen_helper_gvec_usadd32,
2155           .opt_opc = vecop_list,
2156           .vece = MO_32 },
2157         { .fni8 = tcg_gen_usadd_i64,
2158           .fniv = tcg_gen_usadd_vec,
2159           .fno = gen_helper_gvec_usadd64,
2160           .opt_opc = vecop_list,
2161           .vece = MO_64 }
2162     };
2163     tcg_debug_assert(vece <= MO_64);
2164     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2165 }
2166
2167 static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2168 {
2169     TCGv_i32 min = tcg_constant_i32(0);
2170     tcg_gen_sub_i32(d, a, b);
2171     tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
2172 }
2173
2174 static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2175 {
2176     TCGv_i64 min = tcg_constant_i64(0);
2177     tcg_gen_sub_i64(d, a, b);
2178     tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
2179 }
2180
2181 void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
2182                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2183 {
2184     static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 };
2185     static const GVecGen3 g[4] = {
2186         { .fniv = tcg_gen_ussub_vec,
2187           .fno = gen_helper_gvec_ussub8,
2188           .opt_opc = vecop_list,
2189           .vece = MO_8 },
2190         { .fniv = tcg_gen_ussub_vec,
2191           .fno = gen_helper_gvec_ussub16,
2192           .opt_opc = vecop_list,
2193           .vece = MO_16 },
2194         { .fni4 = tcg_gen_ussub_i32,
2195           .fniv = tcg_gen_ussub_vec,
2196           .fno = gen_helper_gvec_ussub32,
2197           .opt_opc = vecop_list,
2198           .vece = MO_32 },
2199         { .fni8 = tcg_gen_ussub_i64,
2200           .fniv = tcg_gen_ussub_vec,
2201           .fno = gen_helper_gvec_ussub64,
2202           .opt_opc = vecop_list,
2203           .vece = MO_64 }
2204     };
2205     tcg_debug_assert(vece <= MO_64);
2206     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2207 }
2208
2209 void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
2210                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2211 {
2212     static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 };
2213     static const GVecGen3 g[4] = {
2214         { .fniv = tcg_gen_smin_vec,
2215           .fno = gen_helper_gvec_smin8,
2216           .opt_opc = vecop_list,
2217           .vece = MO_8 },
2218         { .fniv = tcg_gen_smin_vec,
2219           .fno = gen_helper_gvec_smin16,
2220           .opt_opc = vecop_list,
2221           .vece = MO_16 },
2222         { .fni4 = tcg_gen_smin_i32,
2223           .fniv = tcg_gen_smin_vec,
2224           .fno = gen_helper_gvec_smin32,
2225           .opt_opc = vecop_list,
2226           .vece = MO_32 },
2227         { .fni8 = tcg_gen_smin_i64,
2228           .fniv = tcg_gen_smin_vec,
2229           .fno = gen_helper_gvec_smin64,
2230           .opt_opc = vecop_list,
2231           .vece = MO_64 }
2232     };
2233     tcg_debug_assert(vece <= MO_64);
2234     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2235 }
2236
2237 void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
2238                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2239 {
2240     static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 };
2241     static const GVecGen3 g[4] = {
2242         { .fniv = tcg_gen_umin_vec,
2243           .fno = gen_helper_gvec_umin8,
2244           .opt_opc = vecop_list,
2245           .vece = MO_8 },
2246         { .fniv = tcg_gen_umin_vec,
2247           .fno = gen_helper_gvec_umin16,
2248           .opt_opc = vecop_list,
2249           .vece = MO_16 },
2250         { .fni4 = tcg_gen_umin_i32,
2251           .fniv = tcg_gen_umin_vec,
2252           .fno = gen_helper_gvec_umin32,
2253           .opt_opc = vecop_list,
2254           .vece = MO_32 },
2255         { .fni8 = tcg_gen_umin_i64,
2256           .fniv = tcg_gen_umin_vec,
2257           .fno = gen_helper_gvec_umin64,
2258           .opt_opc = vecop_list,
2259           .vece = MO_64 }
2260     };
2261     tcg_debug_assert(vece <= MO_64);
2262     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2263 }
2264
2265 void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
2266                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2267 {
2268     static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 };
2269     static const GVecGen3 g[4] = {
2270         { .fniv = tcg_gen_smax_vec,
2271           .fno = gen_helper_gvec_smax8,
2272           .opt_opc = vecop_list,
2273           .vece = MO_8 },
2274         { .fniv = tcg_gen_smax_vec,
2275           .fno = gen_helper_gvec_smax16,
2276           .opt_opc = vecop_list,
2277           .vece = MO_16 },
2278         { .fni4 = tcg_gen_smax_i32,
2279           .fniv = tcg_gen_smax_vec,
2280           .fno = gen_helper_gvec_smax32,
2281           .opt_opc = vecop_list,
2282           .vece = MO_32 },
2283         { .fni8 = tcg_gen_smax_i64,
2284           .fniv = tcg_gen_smax_vec,
2285           .fno = gen_helper_gvec_smax64,
2286           .opt_opc = vecop_list,
2287           .vece = MO_64 }
2288     };
2289     tcg_debug_assert(vece <= MO_64);
2290     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2291 }
2292
2293 void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
2294                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2295 {
2296     static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 };
2297     static const GVecGen3 g[4] = {
2298         { .fniv = tcg_gen_umax_vec,
2299           .fno = gen_helper_gvec_umax8,
2300           .opt_opc = vecop_list,
2301           .vece = MO_8 },
2302         { .fniv = tcg_gen_umax_vec,
2303           .fno = gen_helper_gvec_umax16,
2304           .opt_opc = vecop_list,
2305           .vece = MO_16 },
2306         { .fni4 = tcg_gen_umax_i32,
2307           .fniv = tcg_gen_umax_vec,
2308           .fno = gen_helper_gvec_umax32,
2309           .opt_opc = vecop_list,
2310           .vece = MO_32 },
2311         { .fni8 = tcg_gen_umax_i64,
2312           .fniv = tcg_gen_umax_vec,
2313           .fno = gen_helper_gvec_umax64,
2314           .opt_opc = vecop_list,
2315           .vece = MO_64 }
2316     };
2317     tcg_debug_assert(vece <= MO_64);
2318     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2319 }
2320
2321 /* Perform a vector negation using normal negation and a mask.
2322    Compare gen_subv_mask above.  */
2323 static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
2324 {
2325     TCGv_i64 t2 = tcg_temp_new_i64();
2326     TCGv_i64 t3 = tcg_temp_new_i64();
2327
2328     tcg_gen_andc_i64(t3, m, b);
2329     tcg_gen_andc_i64(t2, b, m);
2330     tcg_gen_sub_i64(d, m, t2);
2331     tcg_gen_xor_i64(d, d, t3);
2332
2333     tcg_temp_free_i64(t2);
2334     tcg_temp_free_i64(t3);
2335 }
2336
2337 void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
2338 {
2339     TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
2340     gen_negv_mask(d, b, m);
2341 }
2342
2343 void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
2344 {
2345     TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
2346     gen_negv_mask(d, b, m);
2347 }
2348
2349 void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
2350 {
2351     TCGv_i64 t1 = tcg_temp_new_i64();
2352     TCGv_i64 t2 = tcg_temp_new_i64();
2353
2354     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
2355     tcg_gen_neg_i64(t2, b);
2356     tcg_gen_neg_i64(t1, t1);
2357     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
2358
2359     tcg_temp_free_i64(t1);
2360     tcg_temp_free_i64(t2);
2361 }
2362
2363 void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
2364                       uint32_t oprsz, uint32_t maxsz)
2365 {
2366     static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 };
2367     static const GVecGen2 g[4] = {
2368         { .fni8 = tcg_gen_vec_neg8_i64,
2369           .fniv = tcg_gen_neg_vec,
2370           .fno = gen_helper_gvec_neg8,
2371           .opt_opc = vecop_list,
2372           .vece = MO_8 },
2373         { .fni8 = tcg_gen_vec_neg16_i64,
2374           .fniv = tcg_gen_neg_vec,
2375           .fno = gen_helper_gvec_neg16,
2376           .opt_opc = vecop_list,
2377           .vece = MO_16 },
2378         { .fni4 = tcg_gen_neg_i32,
2379           .fniv = tcg_gen_neg_vec,
2380           .fno = gen_helper_gvec_neg32,
2381           .opt_opc = vecop_list,
2382           .vece = MO_32 },
2383         { .fni8 = tcg_gen_neg_i64,
2384           .fniv = tcg_gen_neg_vec,
2385           .fno = gen_helper_gvec_neg64,
2386           .opt_opc = vecop_list,
2387           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2388           .vece = MO_64 },
2389     };
2390
2391     tcg_debug_assert(vece <= MO_64);
2392     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2393 }
2394
2395 static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece)
2396 {
2397     TCGv_i64 t = tcg_temp_new_i64();
2398     int nbit = 8 << vece;
2399
2400     /* Create -1 for each negative element.  */
2401     tcg_gen_shri_i64(t, b, nbit - 1);
2402     tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2403     tcg_gen_muli_i64(t, t, (1 << nbit) - 1);
2404
2405     /*
2406      * Invert (via xor -1) and add one.
2407      * Because of the ordering the msb is cleared,
2408      * so we never have carry into the next element.
2409      */
2410     tcg_gen_xor_i64(d, b, t);
2411     tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2412     tcg_gen_add_i64(d, d, t);
2413
2414     tcg_temp_free_i64(t);
2415 }
2416
2417 static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b)
2418 {
2419     gen_absv_mask(d, b, MO_8);
2420 }
2421
2422 static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b)
2423 {
2424     gen_absv_mask(d, b, MO_16);
2425 }
2426
2427 void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs,
2428                       uint32_t oprsz, uint32_t maxsz)
2429 {
2430     static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 };
2431     static const GVecGen2 g[4] = {
2432         { .fni8 = tcg_gen_vec_abs8_i64,
2433           .fniv = tcg_gen_abs_vec,
2434           .fno = gen_helper_gvec_abs8,
2435           .opt_opc = vecop_list,
2436           .vece = MO_8 },
2437         { .fni8 = tcg_gen_vec_abs16_i64,
2438           .fniv = tcg_gen_abs_vec,
2439           .fno = gen_helper_gvec_abs16,
2440           .opt_opc = vecop_list,
2441           .vece = MO_16 },
2442         { .fni4 = tcg_gen_abs_i32,
2443           .fniv = tcg_gen_abs_vec,
2444           .fno = gen_helper_gvec_abs32,
2445           .opt_opc = vecop_list,
2446           .vece = MO_32 },
2447         { .fni8 = tcg_gen_abs_i64,
2448           .fniv = tcg_gen_abs_vec,
2449           .fno = gen_helper_gvec_abs64,
2450           .opt_opc = vecop_list,
2451           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2452           .vece = MO_64 },
2453     };
2454
2455     tcg_debug_assert(vece <= MO_64);
2456     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2457 }
2458
2459 void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
2460                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2461 {
2462     static const GVecGen3 g = {
2463         .fni8 = tcg_gen_and_i64,
2464         .fniv = tcg_gen_and_vec,
2465         .fno = gen_helper_gvec_and,
2466         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2467     };
2468
2469     if (aofs == bofs) {
2470         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2471     } else {
2472         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2473     }
2474 }
2475
2476 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
2477                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2478 {
2479     static const GVecGen3 g = {
2480         .fni8 = tcg_gen_or_i64,
2481         .fniv = tcg_gen_or_vec,
2482         .fno = gen_helper_gvec_or,
2483         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2484     };
2485
2486     if (aofs == bofs) {
2487         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2488     } else {
2489         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2490     }
2491 }
2492
2493 void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
2494                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2495 {
2496     static const GVecGen3 g = {
2497         .fni8 = tcg_gen_xor_i64,
2498         .fniv = tcg_gen_xor_vec,
2499         .fno = gen_helper_gvec_xor,
2500         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2501     };
2502
2503     if (aofs == bofs) {
2504         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
2505     } else {
2506         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2507     }
2508 }
2509
2510 void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
2511                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2512 {
2513     static const GVecGen3 g = {
2514         .fni8 = tcg_gen_andc_i64,
2515         .fniv = tcg_gen_andc_vec,
2516         .fno = gen_helper_gvec_andc,
2517         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2518     };
2519
2520     if (aofs == bofs) {
2521         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
2522     } else {
2523         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2524     }
2525 }
2526
2527 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
2528                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2529 {
2530     static const GVecGen3 g = {
2531         .fni8 = tcg_gen_orc_i64,
2532         .fniv = tcg_gen_orc_vec,
2533         .fno = gen_helper_gvec_orc,
2534         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2535     };
2536
2537     if (aofs == bofs) {
2538         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
2539     } else {
2540         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2541     }
2542 }
2543
2544 void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs,
2545                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2546 {
2547     static const GVecGen3 g = {
2548         .fni8 = tcg_gen_nand_i64,
2549         .fniv = tcg_gen_nand_vec,
2550         .fno = gen_helper_gvec_nand,
2551         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2552     };
2553
2554     if (aofs == bofs) {
2555         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2556     } else {
2557         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2558     }
2559 }
2560
2561 void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs,
2562                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2563 {
2564     static const GVecGen3 g = {
2565         .fni8 = tcg_gen_nor_i64,
2566         .fniv = tcg_gen_nor_vec,
2567         .fno = gen_helper_gvec_nor,
2568         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2569     };
2570
2571     if (aofs == bofs) {
2572         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2573     } else {
2574         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2575     }
2576 }
2577
2578 void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
2579                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2580 {
2581     static const GVecGen3 g = {
2582         .fni8 = tcg_gen_eqv_i64,
2583         .fniv = tcg_gen_eqv_vec,
2584         .fno = gen_helper_gvec_eqv,
2585         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2586     };
2587
2588     if (aofs == bofs) {
2589         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
2590     } else {
2591         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2592     }
2593 }
2594
2595 static const GVecGen2s gop_ands = {
2596     .fni8 = tcg_gen_and_i64,
2597     .fniv = tcg_gen_and_vec,
2598     .fno = gen_helper_gvec_ands,
2599     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2600     .vece = MO_64
2601 };
2602
2603 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
2604                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2605 {
2606     TCGv_i64 tmp = tcg_temp_new_i64();
2607     tcg_gen_dup_i64(vece, tmp, c);
2608     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2609     tcg_temp_free_i64(tmp);
2610 }
2611
2612 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
2613                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2614 {
2615     TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2616     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2617 }
2618
2619 static const GVecGen2s gop_xors = {
2620     .fni8 = tcg_gen_xor_i64,
2621     .fniv = tcg_gen_xor_vec,
2622     .fno = gen_helper_gvec_xors,
2623     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2624     .vece = MO_64
2625 };
2626
2627 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
2628                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2629 {
2630     TCGv_i64 tmp = tcg_temp_new_i64();
2631     tcg_gen_dup_i64(vece, tmp, c);
2632     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2633     tcg_temp_free_i64(tmp);
2634 }
2635
2636 void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
2637                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2638 {
2639     TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2640     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2641 }
2642
2643 static const GVecGen2s gop_ors = {
2644     .fni8 = tcg_gen_or_i64,
2645     .fniv = tcg_gen_or_vec,
2646     .fno = gen_helper_gvec_ors,
2647     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2648     .vece = MO_64
2649 };
2650
2651 void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
2652                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2653 {
2654     TCGv_i64 tmp = tcg_temp_new_i64();
2655     tcg_gen_dup_i64(vece, tmp, c);
2656     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2657     tcg_temp_free_i64(tmp);
2658 }
2659
2660 void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
2661                       int64_t c, uint32_t oprsz, uint32_t maxsz)
2662 {
2663     TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2664     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2665 }
2666
2667 void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2668 {
2669     uint64_t mask = dup_const(MO_8, 0xff << c);
2670     tcg_gen_shli_i64(d, a, c);
2671     tcg_gen_andi_i64(d, d, mask);
2672 }
2673
2674 void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2675 {
2676     uint64_t mask = dup_const(MO_16, 0xffff << c);
2677     tcg_gen_shli_i64(d, a, c);
2678     tcg_gen_andi_i64(d, d, mask);
2679 }
2680
2681 void tcg_gen_vec_shl8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2682 {
2683     uint32_t mask = dup_const(MO_8, 0xff << c);
2684     tcg_gen_shli_i32(d, a, c);
2685     tcg_gen_andi_i32(d, d, mask);
2686 }
2687
2688 void tcg_gen_vec_shl16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2689 {
2690     uint32_t mask = dup_const(MO_16, 0xffff << c);
2691     tcg_gen_shli_i32(d, a, c);
2692     tcg_gen_andi_i32(d, d, mask);
2693 }
2694
2695 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
2696                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2697 {
2698     static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
2699     static const GVecGen2i g[4] = {
2700         { .fni8 = tcg_gen_vec_shl8i_i64,
2701           .fniv = tcg_gen_shli_vec,
2702           .fno = gen_helper_gvec_shl8i,
2703           .opt_opc = vecop_list,
2704           .vece = MO_8 },
2705         { .fni8 = tcg_gen_vec_shl16i_i64,
2706           .fniv = tcg_gen_shli_vec,
2707           .fno = gen_helper_gvec_shl16i,
2708           .opt_opc = vecop_list,
2709           .vece = MO_16 },
2710         { .fni4 = tcg_gen_shli_i32,
2711           .fniv = tcg_gen_shli_vec,
2712           .fno = gen_helper_gvec_shl32i,
2713           .opt_opc = vecop_list,
2714           .vece = MO_32 },
2715         { .fni8 = tcg_gen_shli_i64,
2716           .fniv = tcg_gen_shli_vec,
2717           .fno = gen_helper_gvec_shl64i,
2718           .opt_opc = vecop_list,
2719           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2720           .vece = MO_64 },
2721     };
2722
2723     tcg_debug_assert(vece <= MO_64);
2724     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2725     if (shift == 0) {
2726         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2727     } else {
2728         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2729     }
2730 }
2731
2732 void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2733 {
2734     uint64_t mask = dup_const(MO_8, 0xff >> c);
2735     tcg_gen_shri_i64(d, a, c);
2736     tcg_gen_andi_i64(d, d, mask);
2737 }
2738
2739 void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2740 {
2741     uint64_t mask = dup_const(MO_16, 0xffff >> c);
2742     tcg_gen_shri_i64(d, a, c);
2743     tcg_gen_andi_i64(d, d, mask);
2744 }
2745
2746 void tcg_gen_vec_shr8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2747 {
2748     uint32_t mask = dup_const(MO_8, 0xff >> c);
2749     tcg_gen_shri_i32(d, a, c);
2750     tcg_gen_andi_i32(d, d, mask);
2751 }
2752
2753 void tcg_gen_vec_shr16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2754 {
2755     uint32_t mask = dup_const(MO_16, 0xffff >> c);
2756     tcg_gen_shri_i32(d, a, c);
2757     tcg_gen_andi_i32(d, d, mask);
2758 }
2759
2760 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
2761                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2762 {
2763     static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
2764     static const GVecGen2i g[4] = {
2765         { .fni8 = tcg_gen_vec_shr8i_i64,
2766           .fniv = tcg_gen_shri_vec,
2767           .fno = gen_helper_gvec_shr8i,
2768           .opt_opc = vecop_list,
2769           .vece = MO_8 },
2770         { .fni8 = tcg_gen_vec_shr16i_i64,
2771           .fniv = tcg_gen_shri_vec,
2772           .fno = gen_helper_gvec_shr16i,
2773           .opt_opc = vecop_list,
2774           .vece = MO_16 },
2775         { .fni4 = tcg_gen_shri_i32,
2776           .fniv = tcg_gen_shri_vec,
2777           .fno = gen_helper_gvec_shr32i,
2778           .opt_opc = vecop_list,
2779           .vece = MO_32 },
2780         { .fni8 = tcg_gen_shri_i64,
2781           .fniv = tcg_gen_shri_vec,
2782           .fno = gen_helper_gvec_shr64i,
2783           .opt_opc = vecop_list,
2784           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2785           .vece = MO_64 },
2786     };
2787
2788     tcg_debug_assert(vece <= MO_64);
2789     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2790     if (shift == 0) {
2791         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2792     } else {
2793         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2794     }
2795 }
2796
2797 void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2798 {
2799     uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
2800     uint64_t c_mask = dup_const(MO_8, 0xff >> c);
2801     TCGv_i64 s = tcg_temp_new_i64();
2802
2803     tcg_gen_shri_i64(d, a, c);
2804     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2805     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2806     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2807     tcg_gen_or_i64(d, d, s);         /* include sign extension */
2808     tcg_temp_free_i64(s);
2809 }
2810
2811 void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2812 {
2813     uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
2814     uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
2815     TCGv_i64 s = tcg_temp_new_i64();
2816
2817     tcg_gen_shri_i64(d, a, c);
2818     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2819     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2820     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2821     tcg_gen_or_i64(d, d, s);         /* include sign extension */
2822     tcg_temp_free_i64(s);
2823 }
2824
2825 void tcg_gen_vec_sar8i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2826 {
2827     uint32_t s_mask = dup_const(MO_8, 0x80 >> c);
2828     uint32_t c_mask = dup_const(MO_8, 0xff >> c);
2829     TCGv_i32 s = tcg_temp_new_i32();
2830
2831     tcg_gen_shri_i32(d, a, c);
2832     tcg_gen_andi_i32(s, d, s_mask);  /* isolate (shifted) sign bit */
2833     tcg_gen_muli_i32(s, s, (2 << c) - 2); /* replicate isolated signs */
2834     tcg_gen_andi_i32(d, d, c_mask);  /* clear out bits above sign  */
2835     tcg_gen_or_i32(d, d, s);         /* include sign extension */
2836     tcg_temp_free_i32(s);
2837 }
2838
2839 void tcg_gen_vec_sar16i_i32(TCGv_i32 d, TCGv_i32 a, int32_t c)
2840 {
2841     uint32_t s_mask = dup_const(MO_16, 0x8000 >> c);
2842     uint32_t c_mask = dup_const(MO_16, 0xffff >> c);
2843     TCGv_i32 s = tcg_temp_new_i32();
2844
2845     tcg_gen_shri_i32(d, a, c);
2846     tcg_gen_andi_i32(s, d, s_mask);  /* isolate (shifted) sign bit */
2847     tcg_gen_andi_i32(d, d, c_mask);  /* clear out bits above sign  */
2848     tcg_gen_muli_i32(s, s, (2 << c) - 2); /* replicate isolated signs */
2849     tcg_gen_or_i32(d, d, s);         /* include sign extension */
2850     tcg_temp_free_i32(s);
2851 }
2852
2853 void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
2854                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2855 {
2856     static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 };
2857     static const GVecGen2i g[4] = {
2858         { .fni8 = tcg_gen_vec_sar8i_i64,
2859           .fniv = tcg_gen_sari_vec,
2860           .fno = gen_helper_gvec_sar8i,
2861           .opt_opc = vecop_list,
2862           .vece = MO_8 },
2863         { .fni8 = tcg_gen_vec_sar16i_i64,
2864           .fniv = tcg_gen_sari_vec,
2865           .fno = gen_helper_gvec_sar16i,
2866           .opt_opc = vecop_list,
2867           .vece = MO_16 },
2868         { .fni4 = tcg_gen_sari_i32,
2869           .fniv = tcg_gen_sari_vec,
2870           .fno = gen_helper_gvec_sar32i,
2871           .opt_opc = vecop_list,
2872           .vece = MO_32 },
2873         { .fni8 = tcg_gen_sari_i64,
2874           .fniv = tcg_gen_sari_vec,
2875           .fno = gen_helper_gvec_sar64i,
2876           .opt_opc = vecop_list,
2877           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2878           .vece = MO_64 },
2879     };
2880
2881     tcg_debug_assert(vece <= MO_64);
2882     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2883     if (shift == 0) {
2884         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2885     } else {
2886         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2887     }
2888 }
2889
2890 void tcg_gen_vec_rotl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2891 {
2892     uint64_t mask = dup_const(MO_8, 0xff << c);
2893
2894     tcg_gen_shli_i64(d, a, c);
2895     tcg_gen_shri_i64(a, a, 8 - c);
2896     tcg_gen_andi_i64(d, d, mask);
2897     tcg_gen_andi_i64(a, a, ~mask);
2898     tcg_gen_or_i64(d, d, a);
2899 }
2900
2901 void tcg_gen_vec_rotl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2902 {
2903     uint64_t mask = dup_const(MO_16, 0xffff << c);
2904
2905     tcg_gen_shli_i64(d, a, c);
2906     tcg_gen_shri_i64(a, a, 16 - c);
2907     tcg_gen_andi_i64(d, d, mask);
2908     tcg_gen_andi_i64(a, a, ~mask);
2909     tcg_gen_or_i64(d, d, a);
2910 }
2911
2912 void tcg_gen_gvec_rotli(unsigned vece, uint32_t dofs, uint32_t aofs,
2913                         int64_t shift, uint32_t oprsz, uint32_t maxsz)
2914 {
2915     static const TCGOpcode vecop_list[] = { INDEX_op_rotli_vec, 0 };
2916     static const GVecGen2i g[4] = {
2917         { .fni8 = tcg_gen_vec_rotl8i_i64,
2918           .fniv = tcg_gen_rotli_vec,
2919           .fno = gen_helper_gvec_rotl8i,
2920           .opt_opc = vecop_list,
2921           .vece = MO_8 },
2922         { .fni8 = tcg_gen_vec_rotl16i_i64,
2923           .fniv = tcg_gen_rotli_vec,
2924           .fno = gen_helper_gvec_rotl16i,
2925           .opt_opc = vecop_list,
2926           .vece = MO_16 },
2927         { .fni4 = tcg_gen_rotli_i32,
2928           .fniv = tcg_gen_rotli_vec,
2929           .fno = gen_helper_gvec_rotl32i,
2930           .opt_opc = vecop_list,
2931           .vece = MO_32 },
2932         { .fni8 = tcg_gen_rotli_i64,
2933           .fniv = tcg_gen_rotli_vec,
2934           .fno = gen_helper_gvec_rotl64i,
2935           .opt_opc = vecop_list,
2936           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2937           .vece = MO_64 },
2938     };
2939
2940     tcg_debug_assert(vece <= MO_64);
2941     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2942     if (shift == 0) {
2943         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2944     } else {
2945         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2946     }
2947 }
2948
2949 void tcg_gen_gvec_rotri(unsigned vece, uint32_t dofs, uint32_t aofs,
2950                         int64_t shift, uint32_t oprsz, uint32_t maxsz)
2951 {
2952     tcg_debug_assert(vece <= MO_64);
2953     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2954     tcg_gen_gvec_rotli(vece, dofs, aofs, -shift & ((8 << vece) - 1),
2955                        oprsz, maxsz);
2956 }
2957
/*
 * Specialized generation of vector shifts by a non-constant scalar.
 */
2961
2962 typedef struct {
2963     void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
2964     void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
2965     void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32);
2966     void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
2967     gen_helper_gvec_2 *fno[4];
2968     TCGOpcode s_list[2];
2969     TCGOpcode v_list[2];
2970 } GVecGen2sh;
2971
2972 static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
2973                            uint32_t oprsz, uint32_t tysz, TCGType type,
2974                            TCGv_i32 shift,
2975                            void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32))
2976 {
2977     TCGv_vec t0 = tcg_temp_new_vec(type);
2978     uint32_t i;
2979
2980     for (i = 0; i < oprsz; i += tysz) {
2981         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
2982         fni(vece, t0, t0, shift);
2983         tcg_gen_st_vec(t0, cpu_env, dofs + i);
2984     }
2985     tcg_temp_free_vec(t0);
2986 }
2987
2988 static void
2989 do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift,
2990                uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g)
2991 {
2992     TCGType type;
2993     uint32_t some;
2994
2995     check_size_align(oprsz, maxsz, dofs | aofs);
2996     check_overlap_2(dofs, aofs, maxsz);
2997
2998     /* If the backend has a scalar expansion, great.  */
2999     type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64);
3000     if (type) {
3001         const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
3002         switch (type) {
3003         case TCG_TYPE_V256:
3004             some = QEMU_ALIGN_DOWN(oprsz, 32);
3005             expand_2sh_vec(vece, dofs, aofs, some, 32,
3006                            TCG_TYPE_V256, shift, g->fniv_s);
3007             if (some == oprsz) {
3008                 break;
3009             }
3010             dofs += some;
3011             aofs += some;
3012             oprsz -= some;
3013             maxsz -= some;
3014             /* fallthru */
3015         case TCG_TYPE_V128:
3016             expand_2sh_vec(vece, dofs, aofs, oprsz, 16,
3017                            TCG_TYPE_V128, shift, g->fniv_s);
3018             break;
3019         case TCG_TYPE_V64:
3020             expand_2sh_vec(vece, dofs, aofs, oprsz, 8,
3021                            TCG_TYPE_V64, shift, g->fniv_s);
3022             break;
3023         default:
3024             g_assert_not_reached();
3025         }
3026         tcg_swap_vecop_list(hold_list);
3027         goto clear_tail;
3028     }
3029
3030     /* If the backend supports variable vector shifts, also cool.  */
3031     type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64);
3032     if (type) {
3033         const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
3034         TCGv_vec v_shift = tcg_temp_new_vec(type);
3035
3036         if (vece == MO_64) {
3037             TCGv_i64 sh64 = tcg_temp_new_i64();
3038             tcg_gen_extu_i32_i64(sh64, shift);
3039             tcg_gen_dup_i64_vec(MO_64, v_shift, sh64);
3040             tcg_temp_free_i64(sh64);
3041         } else {
3042             tcg_gen_dup_i32_vec(vece, v_shift, shift);
3043         }
3044
3045         switch (type) {
3046         case TCG_TYPE_V256:
3047             some = QEMU_ALIGN_DOWN(oprsz, 32);
3048             expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256,
3049                           v_shift, false, g->fniv_v);
3050             if (some == oprsz) {
3051                 break;
3052             }
3053             dofs += some;
3054             aofs += some;
3055             oprsz -= some;
3056             maxsz -= some;
3057             /* fallthru */
3058         case TCG_TYPE_V128:
3059             expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
3060                           v_shift, false, g->fniv_v);
3061             break;
3062         case TCG_TYPE_V64:
3063             expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
3064                           v_shift, false, g->fniv_v);
3065             break;
3066         default:
3067             g_assert_not_reached();
3068         }
3069         tcg_temp_free_vec(v_shift);
3070         tcg_swap_vecop_list(hold_list);
3071         goto clear_tail;
3072     }
3073
3074     /* Otherwise fall back to integral... */
3075     if (vece == MO_32 && check_size_impl(oprsz, 4)) {
3076         expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4);
3077     } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
3078         TCGv_i64 sh64 = tcg_temp_new_i64();
3079         tcg_gen_extu_i32_i64(sh64, shift);
3080         expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8);
3081         tcg_temp_free_i64(sh64);
3082     } else {
3083         TCGv_ptr a0 = tcg_temp_new_ptr();
3084         TCGv_ptr a1 = tcg_temp_new_ptr();
3085         TCGv_i32 desc = tcg_temp_new_i32();
3086
3087         tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT);
3088         tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0));
3089         tcg_gen_addi_ptr(a0, cpu_env, dofs);
3090         tcg_gen_addi_ptr(a1, cpu_env, aofs);
3091
3092         g->fno[vece](a0, a1, desc);
3093
3094         tcg_temp_free_ptr(a0);
3095         tcg_temp_free_ptr(a1);
3096         tcg_temp_free_i32(desc);
3097         return;
3098     }
3099
3100  clear_tail:
3101     if (oprsz < maxsz) {
3102         expand_clr(dofs + oprsz, maxsz - oprsz);
3103     }
3104 }
3105
3106 void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,
3107                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3108 {
3109     static const GVecGen2sh g = {
3110         .fni4 = tcg_gen_shl_i32,
3111         .fni8 = tcg_gen_shl_i64,
3112         .fniv_s = tcg_gen_shls_vec,
3113         .fniv_v = tcg_gen_shlv_vec,
3114         .fno = {
3115             gen_helper_gvec_shl8i,
3116             gen_helper_gvec_shl16i,
3117             gen_helper_gvec_shl32i,
3118             gen_helper_gvec_shl64i,
3119         },
3120         .s_list = { INDEX_op_shls_vec, 0 },
3121         .v_list = { INDEX_op_shlv_vec, 0 },
3122     };
3123
3124     tcg_debug_assert(vece <= MO_64);
3125     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3126 }
3127
3128 void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs,
3129                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3130 {
3131     static const GVecGen2sh g = {
3132         .fni4 = tcg_gen_shr_i32,
3133         .fni8 = tcg_gen_shr_i64,
3134         .fniv_s = tcg_gen_shrs_vec,
3135         .fniv_v = tcg_gen_shrv_vec,
3136         .fno = {
3137             gen_helper_gvec_shr8i,
3138             gen_helper_gvec_shr16i,
3139             gen_helper_gvec_shr32i,
3140             gen_helper_gvec_shr64i,
3141         },
3142         .s_list = { INDEX_op_shrs_vec, 0 },
3143         .v_list = { INDEX_op_shrv_vec, 0 },
3144     };
3145
3146     tcg_debug_assert(vece <= MO_64);
3147     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3148 }
3149
3150 void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
3151                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3152 {
3153     static const GVecGen2sh g = {
3154         .fni4 = tcg_gen_sar_i32,
3155         .fni8 = tcg_gen_sar_i64,
3156         .fniv_s = tcg_gen_sars_vec,
3157         .fniv_v = tcg_gen_sarv_vec,
3158         .fno = {
3159             gen_helper_gvec_sar8i,
3160             gen_helper_gvec_sar16i,
3161             gen_helper_gvec_sar32i,
3162             gen_helper_gvec_sar64i,
3163         },
3164         .s_list = { INDEX_op_sars_vec, 0 },
3165         .v_list = { INDEX_op_sarv_vec, 0 },
3166     };
3167
3168     tcg_debug_assert(vece <= MO_64);
3169     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3170 }
3171
3172 void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs,
3173                         TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3174 {
3175     static const GVecGen2sh g = {
3176         .fni4 = tcg_gen_rotl_i32,
3177         .fni8 = tcg_gen_rotl_i64,
3178         .fniv_s = tcg_gen_rotls_vec,
3179         .fniv_v = tcg_gen_rotlv_vec,
3180         .fno = {
3181             gen_helper_gvec_rotl8i,
3182             gen_helper_gvec_rotl16i,
3183             gen_helper_gvec_rotl32i,
3184             gen_helper_gvec_rotl64i,
3185         },
3186         .s_list = { INDEX_op_rotls_vec, 0 },
3187         .v_list = { INDEX_op_rotlv_vec, 0 },
3188     };
3189
3190     tcg_debug_assert(vece <= MO_64);
3191     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3192 }
3193
/*
 * Expand D = A << (B % element bits)
 *
 * Unlike scalar shifts, where it is easy for the target front end
 * to include the modulo as part of the expansion, the modulo is
 * applied here as part of the vector expansion.  If the target
 * naturally includes the modulo as part of the operation, great!
 * If the target has some other behaviour from out-of-range shifts,
 * then it could not use this function anyway, and would need to
 * do its own expansion with custom functions.
 */
3204 static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d,
3205                                  TCGv_vec a, TCGv_vec b)
3206 {
3207     TCGv_vec t = tcg_temp_new_vec_matching(d);
3208     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3209
3210     tcg_gen_and_vec(vece, t, b, m);
3211     tcg_gen_shlv_vec(vece, d, a, t);
3212     tcg_temp_free_vec(t);
3213 }
3214
3215 static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3216 {
3217     TCGv_i32 t = tcg_temp_new_i32();
3218
3219     tcg_gen_andi_i32(t, b, 31);
3220     tcg_gen_shl_i32(d, a, t);
3221     tcg_temp_free_i32(t);
3222 }
3223
3224 static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3225 {
3226     TCGv_i64 t = tcg_temp_new_i64();
3227
3228     tcg_gen_andi_i64(t, b, 63);
3229     tcg_gen_shl_i64(d, a, t);
3230     tcg_temp_free_i64(t);
3231 }
3232
3233 void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
3234                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3235 {
3236     static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 };
3237     static const GVecGen3 g[4] = {
3238         { .fniv = tcg_gen_shlv_mod_vec,
3239           .fno = gen_helper_gvec_shl8v,
3240           .opt_opc = vecop_list,
3241           .vece = MO_8 },
3242         { .fniv = tcg_gen_shlv_mod_vec,
3243           .fno = gen_helper_gvec_shl16v,
3244           .opt_opc = vecop_list,
3245           .vece = MO_16 },
3246         { .fni4 = tcg_gen_shl_mod_i32,
3247           .fniv = tcg_gen_shlv_mod_vec,
3248           .fno = gen_helper_gvec_shl32v,
3249           .opt_opc = vecop_list,
3250           .vece = MO_32 },
3251         { .fni8 = tcg_gen_shl_mod_i64,
3252           .fniv = tcg_gen_shlv_mod_vec,
3253           .fno = gen_helper_gvec_shl64v,
3254           .opt_opc = vecop_list,
3255           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3256           .vece = MO_64 },
3257     };
3258
3259     tcg_debug_assert(vece <= MO_64);
3260     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3261 }
3262
3263 /*
3264  * Similarly for logical right shifts.
3265  */
3266
3267 static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d,
3268                                  TCGv_vec a, TCGv_vec b)
3269 {
3270     TCGv_vec t = tcg_temp_new_vec_matching(d);
3271     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3272
3273     tcg_gen_and_vec(vece, t, b, m);
3274     tcg_gen_shrv_vec(vece, d, a, t);
3275     tcg_temp_free_vec(t);
3276 }
3277
3278 static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3279 {
3280     TCGv_i32 t = tcg_temp_new_i32();
3281
3282     tcg_gen_andi_i32(t, b, 31);
3283     tcg_gen_shr_i32(d, a, t);
3284     tcg_temp_free_i32(t);
3285 }
3286
3287 static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3288 {
3289     TCGv_i64 t = tcg_temp_new_i64();
3290
3291     tcg_gen_andi_i64(t, b, 63);
3292     tcg_gen_shr_i64(d, a, t);
3293     tcg_temp_free_i64(t);
3294 }
3295
3296 void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs,
3297                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3298 {
3299     static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 };
3300     static const GVecGen3 g[4] = {
3301         { .fniv = tcg_gen_shrv_mod_vec,
3302           .fno = gen_helper_gvec_shr8v,
3303           .opt_opc = vecop_list,
3304           .vece = MO_8 },
3305         { .fniv = tcg_gen_shrv_mod_vec,
3306           .fno = gen_helper_gvec_shr16v,
3307           .opt_opc = vecop_list,
3308           .vece = MO_16 },
3309         { .fni4 = tcg_gen_shr_mod_i32,
3310           .fniv = tcg_gen_shrv_mod_vec,
3311           .fno = gen_helper_gvec_shr32v,
3312           .opt_opc = vecop_list,
3313           .vece = MO_32 },
3314         { .fni8 = tcg_gen_shr_mod_i64,
3315           .fniv = tcg_gen_shrv_mod_vec,
3316           .fno = gen_helper_gvec_shr64v,
3317           .opt_opc = vecop_list,
3318           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3319           .vece = MO_64 },
3320     };
3321
3322     tcg_debug_assert(vece <= MO_64);
3323     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3324 }
3325
3326 /*
3327  * Similarly for arithmetic right shifts.
3328  */
3329
3330 static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d,
3331                                  TCGv_vec a, TCGv_vec b)
3332 {
3333     TCGv_vec t = tcg_temp_new_vec_matching(d);
3334     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3335
3336     tcg_gen_and_vec(vece, t, b, m);
3337     tcg_gen_sarv_vec(vece, d, a, t);
3338     tcg_temp_free_vec(t);
3339 }
3340
3341 static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3342 {
3343     TCGv_i32 t = tcg_temp_new_i32();
3344
3345     tcg_gen_andi_i32(t, b, 31);
3346     tcg_gen_sar_i32(d, a, t);
3347     tcg_temp_free_i32(t);
3348 }
3349
3350 static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3351 {
3352     TCGv_i64 t = tcg_temp_new_i64();
3353
3354     tcg_gen_andi_i64(t, b, 63);
3355     tcg_gen_sar_i64(d, a, t);
3356     tcg_temp_free_i64(t);
3357 }
3358
3359 void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs,
3360                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3361 {
3362     static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 };
3363     static const GVecGen3 g[4] = {
3364         { .fniv = tcg_gen_sarv_mod_vec,
3365           .fno = gen_helper_gvec_sar8v,
3366           .opt_opc = vecop_list,
3367           .vece = MO_8 },
3368         { .fniv = tcg_gen_sarv_mod_vec,
3369           .fno = gen_helper_gvec_sar16v,
3370           .opt_opc = vecop_list,
3371           .vece = MO_16 },
3372         { .fni4 = tcg_gen_sar_mod_i32,
3373           .fniv = tcg_gen_sarv_mod_vec,
3374           .fno = gen_helper_gvec_sar32v,
3375           .opt_opc = vecop_list,
3376           .vece = MO_32 },
3377         { .fni8 = tcg_gen_sar_mod_i64,
3378           .fniv = tcg_gen_sarv_mod_vec,
3379           .fno = gen_helper_gvec_sar64v,
3380           .opt_opc = vecop_list,
3381           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3382           .vece = MO_64 },
3383     };
3384
3385     tcg_debug_assert(vece <= MO_64);
3386     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3387 }
3388
3389 /*
3390  * Similarly for rotates.
3391  */
3392
3393 static void tcg_gen_rotlv_mod_vec(unsigned vece, TCGv_vec d,
3394                                   TCGv_vec a, TCGv_vec b)
3395 {
3396     TCGv_vec t = tcg_temp_new_vec_matching(d);
3397     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3398
3399     tcg_gen_and_vec(vece, t, b, m);
3400     tcg_gen_rotlv_vec(vece, d, a, t);
3401     tcg_temp_free_vec(t);
3402 }
3403
3404 static void tcg_gen_rotl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3405 {
3406     TCGv_i32 t = tcg_temp_new_i32();
3407
3408     tcg_gen_andi_i32(t, b, 31);
3409     tcg_gen_rotl_i32(d, a, t);
3410     tcg_temp_free_i32(t);
3411 }
3412
3413 static void tcg_gen_rotl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3414 {
3415     TCGv_i64 t = tcg_temp_new_i64();
3416
3417     tcg_gen_andi_i64(t, b, 63);
3418     tcg_gen_rotl_i64(d, a, t);
3419     tcg_temp_free_i64(t);
3420 }
3421
3422 void tcg_gen_gvec_rotlv(unsigned vece, uint32_t dofs, uint32_t aofs,
3423                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3424 {
3425     static const TCGOpcode vecop_list[] = { INDEX_op_rotlv_vec, 0 };
3426     static const GVecGen3 g[4] = {
3427         { .fniv = tcg_gen_rotlv_mod_vec,
3428           .fno = gen_helper_gvec_rotl8v,
3429           .opt_opc = vecop_list,
3430           .vece = MO_8 },
3431         { .fniv = tcg_gen_rotlv_mod_vec,
3432           .fno = gen_helper_gvec_rotl16v,
3433           .opt_opc = vecop_list,
3434           .vece = MO_16 },
3435         { .fni4 = tcg_gen_rotl_mod_i32,
3436           .fniv = tcg_gen_rotlv_mod_vec,
3437           .fno = gen_helper_gvec_rotl32v,
3438           .opt_opc = vecop_list,
3439           .vece = MO_32 },
3440         { .fni8 = tcg_gen_rotl_mod_i64,
3441           .fniv = tcg_gen_rotlv_mod_vec,
3442           .fno = gen_helper_gvec_rotl64v,
3443           .opt_opc = vecop_list,
3444           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3445           .vece = MO_64 },
3446     };
3447
3448     tcg_debug_assert(vece <= MO_64);
3449     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3450 }
3451
3452 static void tcg_gen_rotrv_mod_vec(unsigned vece, TCGv_vec d,
3453                                   TCGv_vec a, TCGv_vec b)
3454 {
3455     TCGv_vec t = tcg_temp_new_vec_matching(d);
3456     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3457
3458     tcg_gen_and_vec(vece, t, b, m);
3459     tcg_gen_rotrv_vec(vece, d, a, t);
3460     tcg_temp_free_vec(t);
3461 }
3462
3463 static void tcg_gen_rotr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3464 {
3465     TCGv_i32 t = tcg_temp_new_i32();
3466
3467     tcg_gen_andi_i32(t, b, 31);
3468     tcg_gen_rotr_i32(d, a, t);
3469     tcg_temp_free_i32(t);
3470 }
3471
3472 static void tcg_gen_rotr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3473 {
3474     TCGv_i64 t = tcg_temp_new_i64();
3475
3476     tcg_gen_andi_i64(t, b, 63);
3477     tcg_gen_rotr_i64(d, a, t);
3478     tcg_temp_free_i64(t);
3479 }
3480
3481 void tcg_gen_gvec_rotrv(unsigned vece, uint32_t dofs, uint32_t aofs,
3482                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3483 {
3484     static const TCGOpcode vecop_list[] = { INDEX_op_rotrv_vec, 0 };
3485     static const GVecGen3 g[4] = {
3486         { .fniv = tcg_gen_rotrv_mod_vec,
3487           .fno = gen_helper_gvec_rotr8v,
3488           .opt_opc = vecop_list,
3489           .vece = MO_8 },
3490         { .fniv = tcg_gen_rotrv_mod_vec,
3491           .fno = gen_helper_gvec_rotr16v,
3492           .opt_opc = vecop_list,
3493           .vece = MO_16 },
3494         { .fni4 = tcg_gen_rotr_mod_i32,
3495           .fniv = tcg_gen_rotrv_mod_vec,
3496           .fno = gen_helper_gvec_rotr32v,
3497           .opt_opc = vecop_list,
3498           .vece = MO_32 },
3499         { .fni8 = tcg_gen_rotr_mod_i64,
3500           .fniv = tcg_gen_rotrv_mod_vec,
3501           .fno = gen_helper_gvec_rotr64v,
3502           .opt_opc = vecop_list,
3503           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3504           .vece = MO_64 },
3505     };
3506
3507     tcg_debug_assert(vece <= MO_64);
3508     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3509 }
3510
3511 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
3512 static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3513                            uint32_t oprsz, TCGCond cond)
3514 {
3515     TCGv_i32 t0 = tcg_temp_new_i32();
3516     TCGv_i32 t1 = tcg_temp_new_i32();
3517     uint32_t i;
3518
3519     for (i = 0; i < oprsz; i += 4) {
3520         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
3521         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
3522         tcg_gen_setcond_i32(cond, t0, t0, t1);
3523         tcg_gen_neg_i32(t0, t0);
3524         tcg_gen_st_i32(t0, cpu_env, dofs + i);
3525     }
3526     tcg_temp_free_i32(t1);
3527     tcg_temp_free_i32(t0);
3528 }
3529
3530 static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3531                            uint32_t oprsz, TCGCond cond)
3532 {
3533     TCGv_i64 t0 = tcg_temp_new_i64();
3534     TCGv_i64 t1 = tcg_temp_new_i64();
3535     uint32_t i;
3536
3537     for (i = 0; i < oprsz; i += 8) {
3538         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
3539         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
3540         tcg_gen_setcond_i64(cond, t0, t0, t1);
3541         tcg_gen_neg_i64(t0, t0);
3542         tcg_gen_st_i64(t0, cpu_env, dofs + i);
3543     }
3544     tcg_temp_free_i64(t1);
3545     tcg_temp_free_i64(t0);
3546 }
3547
3548 static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
3549                            uint32_t bofs, uint32_t oprsz, uint32_t tysz,
3550                            TCGType type, TCGCond cond)
3551 {
3552     TCGv_vec t0 = tcg_temp_new_vec(type);
3553     TCGv_vec t1 = tcg_temp_new_vec(type);
3554     uint32_t i;
3555
3556     for (i = 0; i < oprsz; i += tysz) {
3557         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
3558         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
3559         tcg_gen_cmp_vec(cond, vece, t0, t0, t1);
3560         tcg_gen_st_vec(t0, cpu_env, dofs + i);
3561     }
3562     tcg_temp_free_vec(t1);
3563     tcg_temp_free_vec(t0);
3564 }
3565
3566 void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
3567                       uint32_t aofs, uint32_t bofs,
3568                       uint32_t oprsz, uint32_t maxsz)
3569 {
3570     static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
3571     static gen_helper_gvec_3 * const eq_fn[4] = {
3572         gen_helper_gvec_eq8, gen_helper_gvec_eq16,
3573         gen_helper_gvec_eq32, gen_helper_gvec_eq64
3574     };
3575     static gen_helper_gvec_3 * const ne_fn[4] = {
3576         gen_helper_gvec_ne8, gen_helper_gvec_ne16,
3577         gen_helper_gvec_ne32, gen_helper_gvec_ne64
3578     };
3579     static gen_helper_gvec_3 * const lt_fn[4] = {
3580         gen_helper_gvec_lt8, gen_helper_gvec_lt16,
3581         gen_helper_gvec_lt32, gen_helper_gvec_lt64
3582     };
3583     static gen_helper_gvec_3 * const le_fn[4] = {
3584         gen_helper_gvec_le8, gen_helper_gvec_le16,
3585         gen_helper_gvec_le32, gen_helper_gvec_le64
3586     };
3587     static gen_helper_gvec_3 * const ltu_fn[4] = {
3588         gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
3589         gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
3590     };
3591     static gen_helper_gvec_3 * const leu_fn[4] = {
3592         gen_helper_gvec_leu8, gen_helper_gvec_leu16,
3593         gen_helper_gvec_leu32, gen_helper_gvec_leu64
3594     };
3595     static gen_helper_gvec_3 * const * const fns[16] = {
3596         [TCG_COND_EQ] = eq_fn,
3597         [TCG_COND_NE] = ne_fn,
3598         [TCG_COND_LT] = lt_fn,
3599         [TCG_COND_LE] = le_fn,
3600         [TCG_COND_LTU] = ltu_fn,
3601         [TCG_COND_LEU] = leu_fn,
3602     };
3603
3604     const TCGOpcode *hold_list;
3605     TCGType type;
3606     uint32_t some;
3607
3608     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
3609     check_overlap_3(dofs, aofs, bofs, maxsz);
3610
3611     if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
3612         do_dup(MO_8, dofs, oprsz, maxsz,
3613                NULL, NULL, -(cond == TCG_COND_ALWAYS));
3614         return;
3615     }
3616
3617     /*
3618      * Implement inline with a vector type, if possible.
3619      * Prefer integer when 64-bit host and 64-bit comparison.
3620      */
3621     hold_list = tcg_swap_vecop_list(cmp_list);
3622     type = choose_vector_type(cmp_list, vece, oprsz,
3623                               TCG_TARGET_REG_BITS == 64 && vece == MO_64);
3624     switch (type) {
3625     case TCG_TYPE_V256:
3626         /* Recall that ARM SVE allows vector sizes that are not a
3627          * power of 2, but always a multiple of 16.  The intent is
3628          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
3629          */
3630         some = QEMU_ALIGN_DOWN(oprsz, 32);
3631         expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
3632         if (some == oprsz) {
3633             break;
3634         }
3635         dofs += some;
3636         aofs += some;
3637         bofs += some;
3638         oprsz -= some;
3639         maxsz -= some;
3640         /* fallthru */
3641     case TCG_TYPE_V128:
3642         expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
3643         break;
3644     case TCG_TYPE_V64:
3645         expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
3646         break;
3647
3648     case 0:
3649         if (vece == MO_64 && check_size_impl(oprsz, 8)) {
3650             expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
3651         } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
3652             expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
3653         } else {
3654             gen_helper_gvec_3 * const *fn = fns[cond];
3655
3656             if (fn == NULL) {
3657                 uint32_t tmp;
3658                 tmp = aofs, aofs = bofs, bofs = tmp;
3659                 cond = tcg_swap_cond(cond);
3660                 fn = fns[cond];
3661                 assert(fn != NULL);
3662             }
3663             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
3664             oprsz = maxsz;
3665         }
3666         break;
3667
3668     default:
3669         g_assert_not_reached();
3670     }
3671     tcg_swap_vecop_list(hold_list);
3672
3673     if (oprsz < maxsz) {
3674         expand_clr(dofs + oprsz, maxsz - oprsz);
3675     }
3676 }
3677
3678 static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c)
3679 {
3680     TCGv_i64 t = tcg_temp_new_i64();
3681
3682     tcg_gen_and_i64(t, b, a);
3683     tcg_gen_andc_i64(d, c, a);
3684     tcg_gen_or_i64(d, d, t);
3685     tcg_temp_free_i64(t);
3686 }
3687
3688 void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs,
3689                          uint32_t bofs, uint32_t cofs,
3690                          uint32_t oprsz, uint32_t maxsz)
3691 {
3692     static const GVecGen4 g = {
3693         .fni8 = tcg_gen_bitsel_i64,
3694         .fniv = tcg_gen_bitsel_vec,
3695         .fno = gen_helper_gvec_bitsel,
3696     };
3697
3698     tcg_gen_gvec_4(dofs, aofs, bofs, cofs, oprsz, maxsz, &g);
3699 }