tcg/tcg-op-gvec.c

   1 /*
   2  * Generic vector operation expansion
   3  *
   4  * Copyright (c) 2018 Linaro
   5  *
   6  * This library is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * This library is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with this library; if not, see <http://www.gnu.org/licenses/>.
  18  */
  19
  20 #include "qemu/osdep.h"
  21 #include "tcg/tcg.h"
  22 #include "tcg/tcg-op.h"
  23 #include "tcg/tcg-op-gvec.h"
  24 #include "qemu/main-loop.h"
  25 #include "tcg/tcg-gvec-desc.h"
  26
/*
 * Upper bound on the number of inline operations that
 * check_size_impl() is willing to accept for one expansion.
 */
#define MAX_UNROLL  4

/*
 * An "empty" vector opcode list.  With CONFIG_DEBUG_TCG it is a real
 * one-element array whose only entry is 0; otherwise it is simply NULL.
 */
#ifdef CONFIG_DEBUG_TCG
static const TCGOpcode vecop_list_empty[1] = { 0 };
#else
#define vecop_list_empty NULL
#endif
  34
  35
  36 /* Verify vector size and alignment rules.  OFS should be the OR of all
  37    of the operand offsets so that we can check them all at once.  */
  38 static void check_size_align(uint32_t oprsz, uint32_t maxsz, uint32_t ofs)
  39 {
  40     uint32_t max_align;
  41
  42     switch (oprsz) {
  43     case 8:
  44     case 16:
  45     case 32:
  46         tcg_debug_assert(oprsz <= maxsz);
  47         break;
  48     default:
  49         tcg_debug_assert(oprsz == maxsz);
  50         break;
  51     }
  52     tcg_debug_assert(maxsz <= (8 << SIMD_MAXSZ_BITS));
  53
  54     max_align = maxsz >= 16 ? 15 : 7;
  55     tcg_debug_assert((maxsz & max_align) == 0);
  56     tcg_debug_assert((ofs & max_align) == 0);
  57 }
  58
  59 /* Verify vector overlap rules for two operands.  */
  60 static void check_overlap_2(uint32_t d, uint32_t a, uint32_t s)
  61 {
  62     tcg_debug_assert(d == a || d + s <= a || a + s <= d);
  63 }
  64
  65 /* Verify vector overlap rules for three operands.  */
  66 static void check_overlap_3(uint32_t d, uint32_t a, uint32_t b, uint32_t s)
  67 {
  68     check_overlap_2(d, a, s);
  69     check_overlap_2(d, b, s);
  70     check_overlap_2(a, b, s);
  71 }
  72
  73 /* Verify vector overlap rules for four operands.  */
  74 static void check_overlap_4(uint32_t d, uint32_t a, uint32_t b,
  75                             uint32_t c, uint32_t s)
  76 {
  77     check_overlap_2(d, a, s);
  78     check_overlap_2(d, b, s);
  79     check_overlap_2(d, c, s);
  80     check_overlap_2(a, b, s);
  81     check_overlap_2(a, c, s);
  82     check_overlap_2(b, c, s);
  83 }
  84
  85 /* Create a descriptor from components.  */
  86 uint32_t simd_desc(uint32_t oprsz, uint32_t maxsz, int32_t data)
  87 {
  88     uint32_t desc = 0;
  89
  90     check_size_align(oprsz, maxsz, 0);
  91     tcg_debug_assert(data == sextract32(data, 0, SIMD_DATA_BITS));
  92
  93     oprsz = (oprsz / 8) - 1;
  94     maxsz = (maxsz / 8) - 1;
  95
  96     /*
  97      * We have just asserted in check_size_align that either
  98      * oprsz is {8,16,32} or matches maxsz.  Encode the final
  99      * case with '2', as that would otherwise map to 24.
 100      */
 101     if (oprsz == maxsz) {
 102         oprsz = 2;
 103     }
 104
 105     desc = deposit32(desc, SIMD_OPRSZ_SHIFT, SIMD_OPRSZ_BITS, oprsz);
 106     desc = deposit32(desc, SIMD_MAXSZ_SHIFT, SIMD_MAXSZ_BITS, maxsz);
 107     desc = deposit32(desc, SIMD_DATA_SHIFT, SIMD_DATA_BITS, data);
 108
 109     return desc;
 110 }
 111
 112 /* Generate a call to a gvec-style helper with two vector operands.  */
 113 void tcg_gen_gvec_2_ool(uint32_t dofs, uint32_t aofs,
 114                         uint32_t oprsz, uint32_t maxsz, int32_t data,
 115                         gen_helper_gvec_2 *fn)
 116 {
 117     TCGv_ptr a0, a1;
 118     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 119
 120     a0 = tcg_temp_new_ptr();
 121     a1 = tcg_temp_new_ptr();
 122
 123     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 124     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 125
 126     fn(a0, a1, desc);
 127
 128     tcg_temp_free_ptr(a0);
 129     tcg_temp_free_ptr(a1);
 130 }
 131
 132 /* Generate a call to a gvec-style helper with two vector operands
 133    and one scalar operand.  */
 134 void tcg_gen_gvec_2i_ool(uint32_t dofs, uint32_t aofs, TCGv_i64 c,
 135                          uint32_t oprsz, uint32_t maxsz, int32_t data,
 136                          gen_helper_gvec_2i *fn)
 137 {
 138     TCGv_ptr a0, a1;
 139     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 140
 141     a0 = tcg_temp_new_ptr();
 142     a1 = tcg_temp_new_ptr();
 143
 144     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 145     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 146
 147     fn(a0, a1, c, desc);
 148
 149     tcg_temp_free_ptr(a0);
 150     tcg_temp_free_ptr(a1);
 151 }
 152
 153 /* Generate a call to a gvec-style helper with three vector operands.  */
 154 void tcg_gen_gvec_3_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 155                         uint32_t oprsz, uint32_t maxsz, int32_t data,
 156                         gen_helper_gvec_3 *fn)
 157 {
 158     TCGv_ptr a0, a1, a2;
 159     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 160
 161     a0 = tcg_temp_new_ptr();
 162     a1 = tcg_temp_new_ptr();
 163     a2 = tcg_temp_new_ptr();
 164
 165     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 166     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 167     tcg_gen_addi_ptr(a2, cpu_env, bofs);
 168
 169     fn(a0, a1, a2, desc);
 170
 171     tcg_temp_free_ptr(a0);
 172     tcg_temp_free_ptr(a1);
 173     tcg_temp_free_ptr(a2);
 174 }
 175
 176 /* Generate a call to a gvec-style helper with four vector operands.  */
 177 void tcg_gen_gvec_4_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 178                         uint32_t cofs, uint32_t oprsz, uint32_t maxsz,
 179                         int32_t data, gen_helper_gvec_4 *fn)
 180 {
 181     TCGv_ptr a0, a1, a2, a3;
 182     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 183
 184     a0 = tcg_temp_new_ptr();
 185     a1 = tcg_temp_new_ptr();
 186     a2 = tcg_temp_new_ptr();
 187     a3 = tcg_temp_new_ptr();
 188
 189     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 190     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 191     tcg_gen_addi_ptr(a2, cpu_env, bofs);
 192     tcg_gen_addi_ptr(a3, cpu_env, cofs);
 193
 194     fn(a0, a1, a2, a3, desc);
 195
 196     tcg_temp_free_ptr(a0);
 197     tcg_temp_free_ptr(a1);
 198     tcg_temp_free_ptr(a2);
 199     tcg_temp_free_ptr(a3);
 200 }
 201
 202 /* Generate a call to a gvec-style helper with five vector operands.  */
 203 void tcg_gen_gvec_5_ool(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 204                         uint32_t cofs, uint32_t xofs, uint32_t oprsz,
 205                         uint32_t maxsz, int32_t data, gen_helper_gvec_5 *fn)
 206 {
 207     TCGv_ptr a0, a1, a2, a3, a4;
 208     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 209
 210     a0 = tcg_temp_new_ptr();
 211     a1 = tcg_temp_new_ptr();
 212     a2 = tcg_temp_new_ptr();
 213     a3 = tcg_temp_new_ptr();
 214     a4 = tcg_temp_new_ptr();
 215
 216     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 217     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 218     tcg_gen_addi_ptr(a2, cpu_env, bofs);
 219     tcg_gen_addi_ptr(a3, cpu_env, cofs);
 220     tcg_gen_addi_ptr(a4, cpu_env, xofs);
 221
 222     fn(a0, a1, a2, a3, a4, desc);
 223
 224     tcg_temp_free_ptr(a0);
 225     tcg_temp_free_ptr(a1);
 226     tcg_temp_free_ptr(a2);
 227     tcg_temp_free_ptr(a3);
 228     tcg_temp_free_ptr(a4);
 229 }
 230
 231 /* Generate a call to a gvec-style helper with three vector operands
 232    and an extra pointer operand.  */
 233 void tcg_gen_gvec_2_ptr(uint32_t dofs, uint32_t aofs,
 234                         TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
 235                         int32_t data, gen_helper_gvec_2_ptr *fn)
 236 {
 237     TCGv_ptr a0, a1;
 238     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 239
 240     a0 = tcg_temp_new_ptr();
 241     a1 = tcg_temp_new_ptr();
 242
 243     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 244     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 245
 246     fn(a0, a1, ptr, desc);
 247
 248     tcg_temp_free_ptr(a0);
 249     tcg_temp_free_ptr(a1);
 250 }
 251
 252 /* Generate a call to a gvec-style helper with three vector operands
 253    and an extra pointer operand.  */
 254 void tcg_gen_gvec_3_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 255                         TCGv_ptr ptr, uint32_t oprsz, uint32_t maxsz,
 256                         int32_t data, gen_helper_gvec_3_ptr *fn)
 257 {
 258     TCGv_ptr a0, a1, a2;
 259     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 260
 261     a0 = tcg_temp_new_ptr();
 262     a1 = tcg_temp_new_ptr();
 263     a2 = tcg_temp_new_ptr();
 264
 265     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 266     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 267     tcg_gen_addi_ptr(a2, cpu_env, bofs);
 268
 269     fn(a0, a1, a2, ptr, desc);
 270
 271     tcg_temp_free_ptr(a0);
 272     tcg_temp_free_ptr(a1);
 273     tcg_temp_free_ptr(a2);
 274 }
 275
 276 /* Generate a call to a gvec-style helper with four vector operands
 277    and an extra pointer operand.  */
 278 void tcg_gen_gvec_4_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 279                         uint32_t cofs, TCGv_ptr ptr, uint32_t oprsz,
 280                         uint32_t maxsz, int32_t data,
 281                         gen_helper_gvec_4_ptr *fn)
 282 {
 283     TCGv_ptr a0, a1, a2, a3;
 284     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 285
 286     a0 = tcg_temp_new_ptr();
 287     a1 = tcg_temp_new_ptr();
 288     a2 = tcg_temp_new_ptr();
 289     a3 = tcg_temp_new_ptr();
 290
 291     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 292     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 293     tcg_gen_addi_ptr(a2, cpu_env, bofs);
 294     tcg_gen_addi_ptr(a3, cpu_env, cofs);
 295
 296     fn(a0, a1, a2, a3, ptr, desc);
 297
 298     tcg_temp_free_ptr(a0);
 299     tcg_temp_free_ptr(a1);
 300     tcg_temp_free_ptr(a2);
 301     tcg_temp_free_ptr(a3);
 302 }
 303
 304 /* Generate a call to a gvec-style helper with five vector operands
 305    and an extra pointer operand.  */
 306 void tcg_gen_gvec_5_ptr(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 307                         uint32_t cofs, uint32_t eofs, TCGv_ptr ptr,
 308                         uint32_t oprsz, uint32_t maxsz, int32_t data,
 309                         gen_helper_gvec_5_ptr *fn)
 310 {
 311     TCGv_ptr a0, a1, a2, a3, a4;
 312     TCGv_i32 desc = tcg_constant_i32(simd_desc(oprsz, maxsz, data));
 313
 314     a0 = tcg_temp_new_ptr();
 315     a1 = tcg_temp_new_ptr();
 316     a2 = tcg_temp_new_ptr();
 317     a3 = tcg_temp_new_ptr();
 318     a4 = tcg_temp_new_ptr();
 319
 320     tcg_gen_addi_ptr(a0, cpu_env, dofs);
 321     tcg_gen_addi_ptr(a1, cpu_env, aofs);
 322     tcg_gen_addi_ptr(a2, cpu_env, bofs);
 323     tcg_gen_addi_ptr(a3, cpu_env, cofs);
 324     tcg_gen_addi_ptr(a4, cpu_env, eofs);
 325
 326     fn(a0, a1, a2, a3, a4, ptr, desc);
 327
 328     tcg_temp_free_ptr(a0);
 329     tcg_temp_free_ptr(a1);
 330     tcg_temp_free_ptr(a2);
 331     tcg_temp_free_ptr(a3);
 332     tcg_temp_free_ptr(a4);
 333 }
 334
 335 /* Return true if we want to implement something of OPRSZ bytes
 336    in units of LNSZ.  This limits the expansion of inline code.  */
 337 static inline bool check_size_impl(uint32_t oprsz, uint32_t lnsz)
 338 {
 339     uint32_t q, r;
 340
 341     if (oprsz < lnsz) {
 342         return false;
 343     }
 344
 345     q = oprsz / lnsz;
 346     r = oprsz % lnsz;
 347     tcg_debug_assert((r & 7) == 0);
 348
 349     if (lnsz < 16) {
 350         /* For sizes below 16, accept no remainder. */
 351         if (r != 0) {
 352             return false;
 353         }
 354     } else {
 355         /*
 356          * Recall that ARM SVE allows vector sizes that are not a
 357          * power of 2, but always a multiple of 16.  The intent is
 358          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
 359          * In addition, expand_clr needs to handle a multiple of 8.
 360          * Thus we can handle the tail with one more operation per
 361          * diminishing power of 2.
 362          */
 363         q += ctpop32(r);
 364     }
 365
 366     return q <= MAX_UNROLL;
 367 }
 368
/*
 * Forward declaration: do_dup_store() and do_dup() below call
 * expand_clr(), which is itself defined after do_dup().
 */
static void expand_clr(uint32_t dofs, uint32_t maxsz);
 370
 371 /* Duplicate C as per VECE.  */
 372 uint64_t (dup_const)(unsigned vece, uint64_t c)
 373 {
 374     switch (vece) {
 375     case MO_8:
 376         return 0x0101010101010101ull * (uint8_t)c;
 377     case MO_16:
 378         return 0x0001000100010001ull * (uint16_t)c;
 379     case MO_32:
 380         return 0x0000000100000001ull * (uint32_t)c;
 381     case MO_64:
 382         return c;
 383     default:
 384         g_assert_not_reached();
 385     }
 386 }
 387
 388 /* Duplicate IN into OUT as per VECE.  */
 389 static void gen_dup_i32(unsigned vece, TCGv_i32 out, TCGv_i32 in)
 390 {
 391     switch (vece) {
 392     case MO_8:
 393         tcg_gen_ext8u_i32(out, in);
 394         tcg_gen_muli_i32(out, out, 0x01010101);
 395         break;
 396     case MO_16:
 397         tcg_gen_deposit_i32(out, in, in, 16, 16);
 398         break;
 399     case MO_32:
 400         tcg_gen_mov_i32(out, in);
 401         break;
 402     default:
 403         g_assert_not_reached();
 404     }
 405 }
 406
 407 static void gen_dup_i64(unsigned vece, TCGv_i64 out, TCGv_i64 in)
 408 {
 409     switch (vece) {
 410     case MO_8:
 411         tcg_gen_ext8u_i64(out, in);
 412         tcg_gen_muli_i64(out, out, 0x0101010101010101ull);
 413         break;
 414     case MO_16:
 415         tcg_gen_ext16u_i64(out, in);
 416         tcg_gen_muli_i64(out, out, 0x0001000100010001ull);
 417         break;
 418     case MO_32:
 419         tcg_gen_deposit_i64(out, in, in, 32, 32);
 420         break;
 421     case MO_64:
 422         tcg_gen_mov_i64(out, in);
 423         break;
 424     default:
 425         g_assert_not_reached();
 426     }
 427 }
 428
 429 /* Select a supported vector type for implementing an operation on SIZE
 430  * bytes.  If OP is 0, assume that the real operation to be performed is
 431  * required by all backends.  Otherwise, make sure than OP can be performed
 432  * on elements of size VECE in the selected type.  Do not select V64 if
 433  * PREFER_I64 is true.  Return 0 if no vector type is selected.
 434  */
 435 static TCGType choose_vector_type(const TCGOpcode *list, unsigned vece,
 436                                   uint32_t size, bool prefer_i64)
 437 {
 438     /*
 439      * Recall that ARM SVE allows vector sizes that are not a
 440      * power of 2, but always a multiple of 16.  The intent is
 441      * that e.g. size == 80 would be expanded with 2x32 + 1x16.
 442      * It is hard to imagine a case in which v256 is supported
 443      * but v128 is not, but check anyway.
 444      * In addition, expand_clr needs to handle a multiple of 8.
 445      */
 446     if (TCG_TARGET_HAS_v256 &&
 447         check_size_impl(size, 32) &&
 448         tcg_can_emit_vecop_list(list, TCG_TYPE_V256, vece) &&
 449         (!(size & 16) ||
 450          (TCG_TARGET_HAS_v128 &&
 451           tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece))) &&
 452         (!(size & 8) ||
 453          (TCG_TARGET_HAS_v64 &&
 454           tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
 455         return TCG_TYPE_V256;
 456     }
 457     if (TCG_TARGET_HAS_v128 &&
 458         check_size_impl(size, 16) &&
 459         tcg_can_emit_vecop_list(list, TCG_TYPE_V128, vece) &&
 460         (!(size & 8) ||
 461          (TCG_TARGET_HAS_v64 &&
 462           tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)))) {
 463         return TCG_TYPE_V128;
 464     }
 465     if (TCG_TARGET_HAS_v64 && !prefer_i64 && check_size_impl(size, 8)
 466         && tcg_can_emit_vecop_list(list, TCG_TYPE_V64, vece)) {
 467         return TCG_TYPE_V64;
 468     }
 469     return 0;
 470 }
 471
 472 static void do_dup_store(TCGType type, uint32_t dofs, uint32_t oprsz,
 473                          uint32_t maxsz, TCGv_vec t_vec)
 474 {
 475     uint32_t i = 0;
 476
 477     tcg_debug_assert(oprsz >= 8);
 478
 479     /*
 480      * This may be expand_clr for the tail of an operation, e.g.
 481      * oprsz == 8 && maxsz == 64.  The first 8 bytes of this store
 482      * are misaligned wrt the maximum vector size, so do that first.
 483      */
 484     if (dofs & 8) {
 485         tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
 486         i += 8;
 487     }
 488
 489     switch (type) {
 490     case TCG_TYPE_V256:
 491         /*
 492          * Recall that ARM SVE allows vector sizes that are not a
 493          * power of 2, but always a multiple of 16.  The intent is
 494          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
 495          */
 496         for (; i + 32 <= oprsz; i += 32) {
 497             tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V256);
 498         }
 499         /* fallthru */
 500     case TCG_TYPE_V128:
 501         for (; i + 16 <= oprsz; i += 16) {
 502             tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V128);
 503         }
 504         break;
 505     case TCG_TYPE_V64:
 506         for (; i < oprsz; i += 8) {
 507             tcg_gen_stl_vec(t_vec, cpu_env, dofs + i, TCG_TYPE_V64);
 508         }
 509         break;
 510     default:
 511         g_assert_not_reached();
 512     }
 513
 514     if (oprsz < maxsz) {
 515         expand_clr(dofs + oprsz, maxsz - oprsz);
 516     }
 517 }
 518
/*
 * Set OPRSZ bytes at DOFS to replications of IN_32, IN_64 or IN_C,
 * each taken as an element of size VECE.
 * Only one of IN_32 or IN_64 may be set;
 * IN_C is used if IN_32 and IN_64 are unset.
 *
 * The implementation strategies are tried in this order:
 *   1. inline vector stores, if choose_vector_type() finds a type;
 *   2. inline 32-bit or 64-bit integer stores, if check_size_impl()
 *      accepts the size for the host register width;
 *   3. the memset helper, for byte patterns with oprsz == maxsz;
 *   4. one of the gvec_dup8/16/32/64 helpers.
 * On the inline paths the bytes between OPRSZ and MAXSZ are cleared
 * with expand_clr(); the gvec_dup helpers receive MAXSZ through the
 * descriptor instead.
 */
static void do_dup(unsigned vece, uint32_t dofs, uint32_t oprsz,
                   uint32_t maxsz, TCGv_i32 in_32, TCGv_i64 in_64,
                   uint64_t in_c)
{
    TCGType type;
    TCGv_i64 t_64;
    TCGv_i32 t_32, t_desc;
    TCGv_ptr t_ptr;
    uint32_t i;

    /* A 32-bit variable cannot supply a 64-bit element.  */
    assert(vece <= (in_32 ? MO_32 : MO_64));
    assert(in_32 == NULL || in_64 == NULL);

    /* If we're storing 0, expand oprsz to maxsz.  */
    if (in_32 == NULL && in_64 == NULL) {
        in_c = dup_const(vece, in_c);
        if (in_c == 0) {
            oprsz = maxsz;
            vece = MO_8;
        } else if (in_c == dup_const(MO_8, in_c)) {
            /* Every byte of the pattern is equal: treat it as a byte dup.  */
            vece = MO_8;
        }
    }

    /* Implement inline with a vector type, if possible.
     * Prefer integer when 64-bit host and no variable dup.
     */
    type = choose_vector_type(NULL, vece, oprsz,
                              (TCG_TARGET_REG_BITS == 64 && in_32 == NULL
                               && (in_64 == NULL || vece == MO_64)));
    if (type != 0) {
        TCGv_vec t_vec = tcg_temp_new_vec(type);

        if (in_32) {
            tcg_gen_dup_i32_vec(vece, t_vec, in_32);
        } else if (in_64) {
            tcg_gen_dup_i64_vec(vece, t_vec, in_64);
        } else {
            tcg_gen_dupi_vec(vece, t_vec, in_c);
        }
        /* do_dup_store also clears the tail between oprsz and maxsz.  */
        do_dup_store(type, dofs, oprsz, maxsz, t_vec);
        tcg_temp_free_vec(t_vec);
        return;
    }

    /* Otherwise, inline with an integer type, unless "large".  */
    if (check_size_impl(oprsz, TCG_TARGET_REG_BITS / 8)) {
        /* Exactly one of t_64 / t_32 is set by the selection below.  */
        t_64 = NULL;
        t_32 = NULL;

        if (in_32) {
            /* We are given a 32-bit variable input.  For a 64-bit host,
               use a 64-bit operation unless the 32-bit operation would
               be simple enough.  */
            if (TCG_TARGET_REG_BITS == 64
                && (vece != MO_32 || !check_size_impl(oprsz, 4))) {
                t_64 = tcg_temp_new_i64();
                tcg_gen_extu_i32_i64(t_64, in_32);
                gen_dup_i64(vece, t_64, t_64);
            } else {
                t_32 = tcg_temp_new_i32();
                gen_dup_i32(vece, t_32, in_32);
            }
        } else if (in_64) {
            /* We are given a 64-bit variable input.  */
            t_64 = tcg_temp_new_i64();
            gen_dup_i64(vece, t_64, in_64);
        } else {
            /* We are given a constant input.  */
            /* For 64-bit hosts, use 64-bit constants for "simple" constants
               or when we'd need too many 32-bit stores, or when a 64-bit
               constant is really required.  */
            if (vece == MO_64
                || (TCG_TARGET_REG_BITS == 64
                    && (in_c == 0 || in_c == -1
                        || !check_size_impl(oprsz, 4)))) {
                t_64 = tcg_constant_i64(in_c);
            } else {
                t_32 = tcg_constant_i32(in_c);
            }
        }

        /* Implement inline if we picked an implementation size above.  */
        if (t_32) {
            for (i = 0; i < oprsz; i += 4) {
                tcg_gen_st_i32(t_32, cpu_env, dofs + i);
            }
            tcg_temp_free_i32(t_32);
            goto done;
        }
        if (t_64) {
            for (i = 0; i < oprsz; i += 8) {
                tcg_gen_st_i64(t_64, cpu_env, dofs + i);
            }
            tcg_temp_free_i64(t_64);
            goto done;
        }
    }

    /* Otherwise implement out of line.  */
    t_ptr = tcg_temp_new_ptr();
    tcg_gen_addi_ptr(t_ptr, cpu_env, dofs);

    /*
     * This may be expand_clr for the tail of an operation, e.g.
     * oprsz == 8 && maxsz == 64.  The size of the clear is misaligned
     * wrt simd_desc and will assert.  Simply pass all replicated byte
     * stores through to memset.
     */
    if (oprsz == maxsz && vece == MO_8) {
        TCGv_ptr t_size = tcg_const_ptr(oprsz);
        TCGv_i32 t_val;

        if (in_32) {
            t_val = in_32;
        } else if (in_64) {
            /* Only this branch allocates t_val; it is freed below.  */
            t_val = tcg_temp_new_i32();
            tcg_gen_extrl_i64_i32(t_val, in_64);
        } else {
            t_val = tcg_constant_i32(in_c);
        }
        gen_helper_memset(t_ptr, t_ptr, t_val, t_size);

        if (in_64) {
            tcg_temp_free_i32(t_val);
        }
        tcg_temp_free_ptr(t_size);
        tcg_temp_free_ptr(t_ptr);
        return;
    }

    t_desc = tcg_constant_i32(simd_desc(oprsz, maxsz, 0));

    if (vece == MO_64) {
        if (in_64) {
            gen_helper_gvec_dup64(t_ptr, t_desc, in_64);
        } else {
            t_64 = tcg_constant_i64(in_c);
            gen_helper_gvec_dup64(t_ptr, t_desc, t_64);
        }
    } else {
        typedef void dup_fn(TCGv_ptr, TCGv_i32, TCGv_i32);
        /* Indexed directly by vece, so the order must follow MO_8..MO_32.  */
        static dup_fn * const fns[3] = {
            gen_helper_gvec_dup8,
            gen_helper_gvec_dup16,
            gen_helper_gvec_dup32
        };

        if (in_32) {
            fns[vece](t_ptr, t_desc, in_32);
        } else if (in_64) {
            /* The helpers take a 32-bit value: pass the low half of in_64.  */
            t_32 = tcg_temp_new_i32();
            tcg_gen_extrl_i64_i32(t_32, in_64);
            fns[vece](t_ptr, t_desc, t_32);
            tcg_temp_free_i32(t_32);
        } else {
            /* in_c was replicated above; reduce it to a single element.  */
            if (vece == MO_8) {
                in_c &= 0xff;
            } else if (vece == MO_16) {
                in_c &= 0xffff;
            }
            t_32 = tcg_constant_i32(in_c);
            fns[vece](t_ptr, t_desc, t_32);
        }
    }

    tcg_temp_free_ptr(t_ptr);
    return;

 done:
    /* Reached from the inline integer paths only: clear the tail.  */
    if (oprsz < maxsz) {
        expand_clr(dofs + oprsz, maxsz - oprsz);
    }
}
 697
 698 /* Likewise, but with zero.  */
 699 static void expand_clr(uint32_t dofs, uint32_t maxsz)
 700 {
 701     do_dup(MO_8, dofs, maxsz, maxsz, NULL, NULL, 0);
 702 }
 703
 704 /* Expand OPSZ bytes worth of two-operand operations using i32 elements.  */
 705 static void expand_2_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 706                          bool load_dest, void (*fni)(TCGv_i32, TCGv_i32))
 707 {
 708     TCGv_i32 t0 = tcg_temp_new_i32();
 709     TCGv_i32 t1 = tcg_temp_new_i32();
 710     uint32_t i;
 711
 712     for (i = 0; i < oprsz; i += 4) {
 713         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 714         if (load_dest) {
 715             tcg_gen_ld_i32(t1, cpu_env, dofs + i);
 716         }
 717         fni(t1, t0);
 718         tcg_gen_st_i32(t1, cpu_env, dofs + i);
 719     }
 720     tcg_temp_free_i32(t0);
 721     tcg_temp_free_i32(t1);
 722 }
 723
 724 static void expand_2i_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 725                           int32_t c, bool load_dest,
 726                           void (*fni)(TCGv_i32, TCGv_i32, int32_t))
 727 {
 728     TCGv_i32 t0 = tcg_temp_new_i32();
 729     TCGv_i32 t1 = tcg_temp_new_i32();
 730     uint32_t i;
 731
 732     for (i = 0; i < oprsz; i += 4) {
 733         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 734         if (load_dest) {
 735             tcg_gen_ld_i32(t1, cpu_env, dofs + i);
 736         }
 737         fni(t1, t0, c);
 738         tcg_gen_st_i32(t1, cpu_env, dofs + i);
 739     }
 740     tcg_temp_free_i32(t0);
 741     tcg_temp_free_i32(t1);
 742 }
 743
 744 static void expand_2s_i32(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 745                           TCGv_i32 c, bool scalar_first,
 746                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
 747 {
 748     TCGv_i32 t0 = tcg_temp_new_i32();
 749     TCGv_i32 t1 = tcg_temp_new_i32();
 750     uint32_t i;
 751
 752     for (i = 0; i < oprsz; i += 4) {
 753         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 754         if (scalar_first) {
 755             fni(t1, c, t0);
 756         } else {
 757             fni(t1, t0, c);
 758         }
 759         tcg_gen_st_i32(t1, cpu_env, dofs + i);
 760     }
 761     tcg_temp_free_i32(t0);
 762     tcg_temp_free_i32(t1);
 763 }
 764
 765 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
 766 static void expand_3_i32(uint32_t dofs, uint32_t aofs,
 767                          uint32_t bofs, uint32_t oprsz, bool load_dest,
 768                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32))
 769 {
 770     TCGv_i32 t0 = tcg_temp_new_i32();
 771     TCGv_i32 t1 = tcg_temp_new_i32();
 772     TCGv_i32 t2 = tcg_temp_new_i32();
 773     uint32_t i;
 774
 775     for (i = 0; i < oprsz; i += 4) {
 776         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 777         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
 778         if (load_dest) {
 779             tcg_gen_ld_i32(t2, cpu_env, dofs + i);
 780         }
 781         fni(t2, t0, t1);
 782         tcg_gen_st_i32(t2, cpu_env, dofs + i);
 783     }
 784     tcg_temp_free_i32(t2);
 785     tcg_temp_free_i32(t1);
 786     tcg_temp_free_i32(t0);
 787 }
 788
 789 static void expand_3i_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 790                           uint32_t oprsz, int32_t c, bool load_dest,
 791                           void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, int32_t))
 792 {
 793     TCGv_i32 t0 = tcg_temp_new_i32();
 794     TCGv_i32 t1 = tcg_temp_new_i32();
 795     TCGv_i32 t2 = tcg_temp_new_i32();
 796     uint32_t i;
 797
 798     for (i = 0; i < oprsz; i += 4) {
 799         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
 800         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
 801         if (load_dest) {
 802             tcg_gen_ld_i32(t2, cpu_env, dofs + i);
 803         }
 804         fni(t2, t0, t1, c);
 805         tcg_gen_st_i32(t2, cpu_env, dofs + i);
 806     }
 807     tcg_temp_free_i32(t0);
 808     tcg_temp_free_i32(t1);
 809     tcg_temp_free_i32(t2);
 810 }
 811
 812 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
 813 static void expand_4_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 814                          uint32_t cofs, uint32_t oprsz, bool write_aofs,
 815                          void (*fni)(TCGv_i32, TCGv_i32, TCGv_i32, TCGv_i32))
 816 {
 817     TCGv_i32 t0 = tcg_temp_new_i32();
 818     TCGv_i32 t1 = tcg_temp_new_i32();
 819     TCGv_i32 t2 = tcg_temp_new_i32();
 820     TCGv_i32 t3 = tcg_temp_new_i32();
 821     uint32_t i;
 822
 823     for (i = 0; i < oprsz; i += 4) {
 824         tcg_gen_ld_i32(t1, cpu_env, aofs + i);
 825         tcg_gen_ld_i32(t2, cpu_env, bofs + i);
 826         tcg_gen_ld_i32(t3, cpu_env, cofs + i);
 827         fni(t0, t1, t2, t3);
 828         tcg_gen_st_i32(t0, cpu_env, dofs + i);
 829         if (write_aofs) {
 830             tcg_gen_st_i32(t1, cpu_env, aofs + i);
 831         }
 832     }
 833     tcg_temp_free_i32(t3);
 834     tcg_temp_free_i32(t2);
 835     tcg_temp_free_i32(t1);
 836     tcg_temp_free_i32(t0);
 837 }
 838
 839 /* Expand OPSZ bytes worth of two-operand operations using i64 elements.  */
 840 static void expand_2_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 841                          bool load_dest, void (*fni)(TCGv_i64, TCGv_i64))
 842 {
 843     TCGv_i64 t0 = tcg_temp_new_i64();
 844     TCGv_i64 t1 = tcg_temp_new_i64();
 845     uint32_t i;
 846
 847     for (i = 0; i < oprsz; i += 8) {
 848         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 849         if (load_dest) {
 850             tcg_gen_ld_i64(t1, cpu_env, dofs + i);
 851         }
 852         fni(t1, t0);
 853         tcg_gen_st_i64(t1, cpu_env, dofs + i);
 854     }
 855     tcg_temp_free_i64(t0);
 856     tcg_temp_free_i64(t1);
 857 }
 858
 859 static void expand_2i_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 860                           int64_t c, bool load_dest,
 861                           void (*fni)(TCGv_i64, TCGv_i64, int64_t))
 862 {
 863     TCGv_i64 t0 = tcg_temp_new_i64();
 864     TCGv_i64 t1 = tcg_temp_new_i64();
 865     uint32_t i;
 866
 867     for (i = 0; i < oprsz; i += 8) {
 868         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 869         if (load_dest) {
 870             tcg_gen_ld_i64(t1, cpu_env, dofs + i);
 871         }
 872         fni(t1, t0, c);
 873         tcg_gen_st_i64(t1, cpu_env, dofs + i);
 874     }
 875     tcg_temp_free_i64(t0);
 876     tcg_temp_free_i64(t1);
 877 }
 878
 879 static void expand_2s_i64(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
 880                           TCGv_i64 c, bool scalar_first,
 881                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
 882 {
 883     TCGv_i64 t0 = tcg_temp_new_i64();
 884     TCGv_i64 t1 = tcg_temp_new_i64();
 885     uint32_t i;
 886
 887     for (i = 0; i < oprsz; i += 8) {
 888         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 889         if (scalar_first) {
 890             fni(t1, c, t0);
 891         } else {
 892             fni(t1, t0, c);
 893         }
 894         tcg_gen_st_i64(t1, cpu_env, dofs + i);
 895     }
 896     tcg_temp_free_i64(t0);
 897     tcg_temp_free_i64(t1);
 898 }
 899
 900 /* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
 901 static void expand_3_i64(uint32_t dofs, uint32_t aofs,
 902                          uint32_t bofs, uint32_t oprsz, bool load_dest,
 903                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64))
 904 {
 905     TCGv_i64 t0 = tcg_temp_new_i64();
 906     TCGv_i64 t1 = tcg_temp_new_i64();
 907     TCGv_i64 t2 = tcg_temp_new_i64();
 908     uint32_t i;
 909
 910     for (i = 0; i < oprsz; i += 8) {
 911         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 912         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
 913         if (load_dest) {
 914             tcg_gen_ld_i64(t2, cpu_env, dofs + i);
 915         }
 916         fni(t2, t0, t1);
 917         tcg_gen_st_i64(t2, cpu_env, dofs + i);
 918     }
 919     tcg_temp_free_i64(t2);
 920     tcg_temp_free_i64(t1);
 921     tcg_temp_free_i64(t0);
 922 }
 923
 924 static void expand_3i_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 925                           uint32_t oprsz, int64_t c, bool load_dest,
 926                           void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, int64_t))
 927 {
 928     TCGv_i64 t0 = tcg_temp_new_i64();
 929     TCGv_i64 t1 = tcg_temp_new_i64();
 930     TCGv_i64 t2 = tcg_temp_new_i64();
 931     uint32_t i;
 932
 933     for (i = 0; i < oprsz; i += 8) {
 934         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
 935         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
 936         if (load_dest) {
 937             tcg_gen_ld_i64(t2, cpu_env, dofs + i);
 938         }
 939         fni(t2, t0, t1, c);
 940         tcg_gen_st_i64(t2, cpu_env, dofs + i);
 941     }
 942     tcg_temp_free_i64(t0);
 943     tcg_temp_free_i64(t1);
 944     tcg_temp_free_i64(t2);
 945 }
 946
 947 /* Expand OPSZ bytes worth of three-operand operations using i64 elements.  */
 948 static void expand_4_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
 949                          uint32_t cofs, uint32_t oprsz, bool write_aofs,
 950                          void (*fni)(TCGv_i64, TCGv_i64, TCGv_i64, TCGv_i64))
 951 {
 952     TCGv_i64 t0 = tcg_temp_new_i64();
 953     TCGv_i64 t1 = tcg_temp_new_i64();
 954     TCGv_i64 t2 = tcg_temp_new_i64();
 955     TCGv_i64 t3 = tcg_temp_new_i64();
 956     uint32_t i;
 957
 958     for (i = 0; i < oprsz; i += 8) {
 959         tcg_gen_ld_i64(t1, cpu_env, aofs + i);
 960         tcg_gen_ld_i64(t2, cpu_env, bofs + i);
 961         tcg_gen_ld_i64(t3, cpu_env, cofs + i);
 962         fni(t0, t1, t2, t3);
 963         tcg_gen_st_i64(t0, cpu_env, dofs + i);
 964         if (write_aofs) {
 965             tcg_gen_st_i64(t1, cpu_env, aofs + i);
 966         }
 967     }
 968     tcg_temp_free_i64(t3);
 969     tcg_temp_free_i64(t2);
 970     tcg_temp_free_i64(t1);
 971     tcg_temp_free_i64(t0);
 972 }
 973
 974 /* Expand OPSZ bytes worth of two-operand operations using host vectors.  */
 975 static void expand_2_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
 976                          uint32_t oprsz, uint32_t tysz, TCGType type,
 977                          bool load_dest,
 978                          void (*fni)(unsigned, TCGv_vec, TCGv_vec))
 979 {
 980     TCGv_vec t0 = tcg_temp_new_vec(type);
 981     TCGv_vec t1 = tcg_temp_new_vec(type);
 982     uint32_t i;
 983
 984     for (i = 0; i < oprsz; i += tysz) {
 985         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
 986         if (load_dest) {
 987             tcg_gen_ld_vec(t1, cpu_env, dofs + i);
 988         }
 989         fni(vece, t1, t0);
 990         tcg_gen_st_vec(t1, cpu_env, dofs + i);
 991     }
 992     tcg_temp_free_vec(t0);
 993     tcg_temp_free_vec(t1);
 994 }
 995
 996 /* Expand OPSZ bytes worth of two-vector operands and an immediate operand
 997    using host vectors.  */
 998 static void expand_2i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
 999                           uint32_t oprsz, uint32_t tysz, TCGType type,
1000                           int64_t c, bool load_dest,
1001                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, int64_t))
1002 {
1003     TCGv_vec t0 = tcg_temp_new_vec(type);
1004     TCGv_vec t1 = tcg_temp_new_vec(type);
1005     uint32_t i;
1006
1007     for (i = 0; i < oprsz; i += tysz) {
1008         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1009         if (load_dest) {
1010             tcg_gen_ld_vec(t1, cpu_env, dofs + i);
1011         }
1012         fni(vece, t1, t0, c);
1013         tcg_gen_st_vec(t1, cpu_env, dofs + i);
1014     }
1015     tcg_temp_free_vec(t0);
1016     tcg_temp_free_vec(t1);
1017 }
1018
1019 static void expand_2s_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1020                           uint32_t oprsz, uint32_t tysz, TCGType type,
1021                           TCGv_vec c, bool scalar_first,
1022                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
1023 {
1024     TCGv_vec t0 = tcg_temp_new_vec(type);
1025     TCGv_vec t1 = tcg_temp_new_vec(type);
1026     uint32_t i;
1027
1028     for (i = 0; i < oprsz; i += tysz) {
1029         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1030         if (scalar_first) {
1031             fni(vece, t1, c, t0);
1032         } else {
1033             fni(vece, t1, t0, c);
1034         }
1035         tcg_gen_st_vec(t1, cpu_env, dofs + i);
1036     }
1037     tcg_temp_free_vec(t0);
1038     tcg_temp_free_vec(t1);
1039 }
1040
1041 /* Expand OPSZ bytes worth of three-operand operations using host vectors.  */
1042 static void expand_3_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1043                          uint32_t bofs, uint32_t oprsz,
1044                          uint32_t tysz, TCGType type, bool load_dest,
1045                          void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec))
1046 {
1047     TCGv_vec t0 = tcg_temp_new_vec(type);
1048     TCGv_vec t1 = tcg_temp_new_vec(type);
1049     TCGv_vec t2 = tcg_temp_new_vec(type);
1050     uint32_t i;
1051
1052     for (i = 0; i < oprsz; i += tysz) {
1053         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1054         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
1055         if (load_dest) {
1056             tcg_gen_ld_vec(t2, cpu_env, dofs + i);
1057         }
1058         fni(vece, t2, t0, t1);
1059         tcg_gen_st_vec(t2, cpu_env, dofs + i);
1060     }
1061     tcg_temp_free_vec(t2);
1062     tcg_temp_free_vec(t1);
1063     tcg_temp_free_vec(t0);
1064 }
1065
1066 /*
1067  * Expand OPSZ bytes worth of three-vector operands and an immediate operand
1068  * using host vectors.
1069  */
1070 static void expand_3i_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1071                           uint32_t bofs, uint32_t oprsz, uint32_t tysz,
1072                           TCGType type, int64_t c, bool load_dest,
1073                           void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec,
1074                                       int64_t))
1075 {
1076     TCGv_vec t0 = tcg_temp_new_vec(type);
1077     TCGv_vec t1 = tcg_temp_new_vec(type);
1078     TCGv_vec t2 = tcg_temp_new_vec(type);
1079     uint32_t i;
1080
1081     for (i = 0; i < oprsz; i += tysz) {
1082         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
1083         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
1084         if (load_dest) {
1085             tcg_gen_ld_vec(t2, cpu_env, dofs + i);
1086         }
1087         fni(vece, t2, t0, t1, c);
1088         tcg_gen_st_vec(t2, cpu_env, dofs + i);
1089     }
1090     tcg_temp_free_vec(t0);
1091     tcg_temp_free_vec(t1);
1092     tcg_temp_free_vec(t2);
1093 }
1094
1095 /* Expand OPSZ bytes worth of four-operand operations using host vectors.  */
1096 static void expand_4_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
1097                          uint32_t bofs, uint32_t cofs, uint32_t oprsz,
1098                          uint32_t tysz, TCGType type, bool write_aofs,
1099                          void (*fni)(unsigned, TCGv_vec, TCGv_vec,
1100                                      TCGv_vec, TCGv_vec))
1101 {
1102     TCGv_vec t0 = tcg_temp_new_vec(type);
1103     TCGv_vec t1 = tcg_temp_new_vec(type);
1104     TCGv_vec t2 = tcg_temp_new_vec(type);
1105     TCGv_vec t3 = tcg_temp_new_vec(type);
1106     uint32_t i;
1107
1108     for (i = 0; i < oprsz; i += tysz) {
1109         tcg_gen_ld_vec(t1, cpu_env, aofs + i);
1110         tcg_gen_ld_vec(t2, cpu_env, bofs + i);
1111         tcg_gen_ld_vec(t3, cpu_env, cofs + i);
1112         fni(vece, t0, t1, t2, t3);
1113         tcg_gen_st_vec(t0, cpu_env, dofs + i);
1114         if (write_aofs) {
1115             tcg_gen_st_vec(t1, cpu_env, aofs + i);
1116         }
1117     }
1118     tcg_temp_free_vec(t3);
1119     tcg_temp_free_vec(t2);
1120     tcg_temp_free_vec(t1);
1121     tcg_temp_free_vec(t0);
1122 }
1123
1124 /* Expand a vector two-operand operation.  */
1125 void tcg_gen_gvec_2(uint32_t dofs, uint32_t aofs,
1126                     uint32_t oprsz, uint32_t maxsz, const GVecGen2 *g)
1127 {
1128     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1129     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1130     TCGType type;
1131     uint32_t some;
1132
1133     check_size_align(oprsz, maxsz, dofs | aofs);
1134     check_overlap_2(dofs, aofs, maxsz);
1135
1136     type = 0;
1137     if (g->fniv) {
1138         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1139     }
1140     switch (type) {
1141     case TCG_TYPE_V256:
1142         /* Recall that ARM SVE allows vector sizes that are not a
1143          * power of 2, but always a multiple of 16.  The intent is
1144          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1145          */
1146         some = QEMU_ALIGN_DOWN(oprsz, 32);
1147         expand_2_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1148                      g->load_dest, g->fniv);
1149         if (some == oprsz) {
1150             break;
1151         }
1152         dofs += some;
1153         aofs += some;
1154         oprsz -= some;
1155         maxsz -= some;
1156         /* fallthru */
1157     case TCG_TYPE_V128:
1158         expand_2_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1159                      g->load_dest, g->fniv);
1160         break;
1161     case TCG_TYPE_V64:
1162         expand_2_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1163                      g->load_dest, g->fniv);
1164         break;
1165
1166     case 0:
1167         if (g->fni8 && check_size_impl(oprsz, 8)) {
1168             expand_2_i64(dofs, aofs, oprsz, g->load_dest, g->fni8);
1169         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1170             expand_2_i32(dofs, aofs, oprsz, g->load_dest, g->fni4);
1171         } else {
1172             assert(g->fno != NULL);
1173             tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, g->data, g->fno);
1174             oprsz = maxsz;
1175         }
1176         break;
1177
1178     default:
1179         g_assert_not_reached();
1180     }
1181     tcg_swap_vecop_list(hold_list);
1182
1183     if (oprsz < maxsz) {
1184         expand_clr(dofs + oprsz, maxsz - oprsz);
1185     }
1186 }
1187
1188 /* Expand a vector operation with two vectors and an immediate.  */
1189 void tcg_gen_gvec_2i(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1190                      uint32_t maxsz, int64_t c, const GVecGen2i *g)
1191 {
1192     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1193     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1194     TCGType type;
1195     uint32_t some;
1196
1197     check_size_align(oprsz, maxsz, dofs | aofs);
1198     check_overlap_2(dofs, aofs, maxsz);
1199
1200     type = 0;
1201     if (g->fniv) {
1202         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1203     }
1204     switch (type) {
1205     case TCG_TYPE_V256:
1206         /* Recall that ARM SVE allows vector sizes that are not a
1207          * power of 2, but always a multiple of 16.  The intent is
1208          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1209          */
1210         some = QEMU_ALIGN_DOWN(oprsz, 32);
1211         expand_2i_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1212                       c, g->load_dest, g->fniv);
1213         if (some == oprsz) {
1214             break;
1215         }
1216         dofs += some;
1217         aofs += some;
1218         oprsz -= some;
1219         maxsz -= some;
1220         /* fallthru */
1221     case TCG_TYPE_V128:
1222         expand_2i_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1223                       c, g->load_dest, g->fniv);
1224         break;
1225     case TCG_TYPE_V64:
1226         expand_2i_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1227                       c, g->load_dest, g->fniv);
1228         break;
1229
1230     case 0:
1231         if (g->fni8 && check_size_impl(oprsz, 8)) {
1232             expand_2i_i64(dofs, aofs, oprsz, c, g->load_dest, g->fni8);
1233         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1234             expand_2i_i32(dofs, aofs, oprsz, c, g->load_dest, g->fni4);
1235         } else {
1236             if (g->fno) {
1237                 tcg_gen_gvec_2_ool(dofs, aofs, oprsz, maxsz, c, g->fno);
1238             } else {
1239                 TCGv_i64 tcg_c = tcg_constant_i64(c);
1240                 tcg_gen_gvec_2i_ool(dofs, aofs, tcg_c, oprsz,
1241                                     maxsz, c, g->fnoi);
1242             }
1243             oprsz = maxsz;
1244         }
1245         break;
1246
1247     default:
1248         g_assert_not_reached();
1249     }
1250     tcg_swap_vecop_list(hold_list);
1251
1252     if (oprsz < maxsz) {
1253         expand_clr(dofs + oprsz, maxsz - oprsz);
1254     }
1255 }
1256
1257 /* Expand a vector operation with two vectors and a scalar.  */
1258 void tcg_gen_gvec_2s(uint32_t dofs, uint32_t aofs, uint32_t oprsz,
1259                      uint32_t maxsz, TCGv_i64 c, const GVecGen2s *g)
1260 {
1261     TCGType type;
1262
1263     check_size_align(oprsz, maxsz, dofs | aofs);
1264     check_overlap_2(dofs, aofs, maxsz);
1265
1266     type = 0;
1267     if (g->fniv) {
1268         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1269     }
1270     if (type != 0) {
1271         const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1272         const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1273         TCGv_vec t_vec = tcg_temp_new_vec(type);
1274         uint32_t some;
1275
1276         tcg_gen_dup_i64_vec(g->vece, t_vec, c);
1277
1278         switch (type) {
1279         case TCG_TYPE_V256:
1280             /* Recall that ARM SVE allows vector sizes that are not a
1281              * power of 2, but always a multiple of 16.  The intent is
1282              * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1283              */
1284             some = QEMU_ALIGN_DOWN(oprsz, 32);
1285             expand_2s_vec(g->vece, dofs, aofs, some, 32, TCG_TYPE_V256,
1286                           t_vec, g->scalar_first, g->fniv);
1287             if (some == oprsz) {
1288                 break;
1289             }
1290             dofs += some;
1291             aofs += some;
1292             oprsz -= some;
1293             maxsz -= some;
1294             /* fallthru */
1295
1296         case TCG_TYPE_V128:
1297             expand_2s_vec(g->vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
1298                           t_vec, g->scalar_first, g->fniv);
1299             break;
1300
1301         case TCG_TYPE_V64:
1302             expand_2s_vec(g->vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
1303                           t_vec, g->scalar_first, g->fniv);
1304             break;
1305
1306         default:
1307             g_assert_not_reached();
1308         }
1309         tcg_temp_free_vec(t_vec);
1310         tcg_swap_vecop_list(hold_list);
1311     } else if (g->fni8 && check_size_impl(oprsz, 8)) {
1312         TCGv_i64 t64 = tcg_temp_new_i64();
1313
1314         gen_dup_i64(g->vece, t64, c);
1315         expand_2s_i64(dofs, aofs, oprsz, t64, g->scalar_first, g->fni8);
1316         tcg_temp_free_i64(t64);
1317     } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1318         TCGv_i32 t32 = tcg_temp_new_i32();
1319
1320         tcg_gen_extrl_i64_i32(t32, c);
1321         gen_dup_i32(g->vece, t32, t32);
1322         expand_2s_i32(dofs, aofs, oprsz, t32, g->scalar_first, g->fni4);
1323         tcg_temp_free_i32(t32);
1324     } else {
1325         tcg_gen_gvec_2i_ool(dofs, aofs, c, oprsz, maxsz, 0, g->fno);
1326         return;
1327     }
1328
1329     if (oprsz < maxsz) {
1330         expand_clr(dofs + oprsz, maxsz - oprsz);
1331     }
1332 }
1333
1334 /* Expand a vector three-operand operation.  */
1335 void tcg_gen_gvec_3(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1336                     uint32_t oprsz, uint32_t maxsz, const GVecGen3 *g)
1337 {
1338     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1339     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1340     TCGType type;
1341     uint32_t some;
1342
1343     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1344     check_overlap_3(dofs, aofs, bofs, maxsz);
1345
1346     type = 0;
1347     if (g->fniv) {
1348         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1349     }
1350     switch (type) {
1351     case TCG_TYPE_V256:
1352         /* Recall that ARM SVE allows vector sizes that are not a
1353          * power of 2, but always a multiple of 16.  The intent is
1354          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1355          */
1356         some = QEMU_ALIGN_DOWN(oprsz, 32);
1357         expand_3_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1358                      g->load_dest, g->fniv);
1359         if (some == oprsz) {
1360             break;
1361         }
1362         dofs += some;
1363         aofs += some;
1364         bofs += some;
1365         oprsz -= some;
1366         maxsz -= some;
1367         /* fallthru */
1368     case TCG_TYPE_V128:
1369         expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1370                      g->load_dest, g->fniv);
1371         break;
1372     case TCG_TYPE_V64:
1373         expand_3_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1374                      g->load_dest, g->fniv);
1375         break;
1376
1377     case 0:
1378         if (g->fni8 && check_size_impl(oprsz, 8)) {
1379             expand_3_i64(dofs, aofs, bofs, oprsz, g->load_dest, g->fni8);
1380         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1381             expand_3_i32(dofs, aofs, bofs, oprsz, g->load_dest, g->fni4);
1382         } else {
1383             assert(g->fno != NULL);
1384             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz,
1385                                maxsz, g->data, g->fno);
1386             oprsz = maxsz;
1387         }
1388         break;
1389
1390     default:
1391         g_assert_not_reached();
1392     }
1393     tcg_swap_vecop_list(hold_list);
1394
1395     if (oprsz < maxsz) {
1396         expand_clr(dofs + oprsz, maxsz - oprsz);
1397     }
1398 }
1399
1400 /* Expand a vector operation with three vectors and an immediate.  */
1401 void tcg_gen_gvec_3i(uint32_t dofs, uint32_t aofs, uint32_t bofs,
1402                      uint32_t oprsz, uint32_t maxsz, int64_t c,
1403                      const GVecGen3i *g)
1404 {
1405     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1406     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1407     TCGType type;
1408     uint32_t some;
1409
1410     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
1411     check_overlap_3(dofs, aofs, bofs, maxsz);
1412
1413     type = 0;
1414     if (g->fniv) {
1415         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1416     }
1417     switch (type) {
1418     case TCG_TYPE_V256:
1419         /*
1420          * Recall that ARM SVE allows vector sizes that are not a
1421          * power of 2, but always a multiple of 16.  The intent is
1422          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1423          */
1424         some = QEMU_ALIGN_DOWN(oprsz, 32);
1425         expand_3i_vec(g->vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256,
1426                       c, g->load_dest, g->fniv);
1427         if (some == oprsz) {
1428             break;
1429         }
1430         dofs += some;
1431         aofs += some;
1432         bofs += some;
1433         oprsz -= some;
1434         maxsz -= some;
1435         /* fallthru */
1436     case TCG_TYPE_V128:
1437         expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128,
1438                       c, g->load_dest, g->fniv);
1439         break;
1440     case TCG_TYPE_V64:
1441         expand_3i_vec(g->vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64,
1442                       c, g->load_dest, g->fniv);
1443         break;
1444
1445     case 0:
1446         if (g->fni8 && check_size_impl(oprsz, 8)) {
1447             expand_3i_i64(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni8);
1448         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1449             expand_3i_i32(dofs, aofs, bofs, oprsz, c, g->load_dest, g->fni4);
1450         } else {
1451             assert(g->fno != NULL);
1452             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, c, g->fno);
1453             oprsz = maxsz;
1454         }
1455         break;
1456
1457     default:
1458         g_assert_not_reached();
1459     }
1460     tcg_swap_vecop_list(hold_list);
1461
1462     if (oprsz < maxsz) {
1463         expand_clr(dofs + oprsz, maxsz - oprsz);
1464     }
1465 }
1466
1467 /* Expand a vector four-operand operation.  */
1468 void tcg_gen_gvec_4(uint32_t dofs, uint32_t aofs, uint32_t bofs, uint32_t cofs,
1469                     uint32_t oprsz, uint32_t maxsz, const GVecGen4 *g)
1470 {
1471     const TCGOpcode *this_list = g->opt_opc ? : vecop_list_empty;
1472     const TCGOpcode *hold_list = tcg_swap_vecop_list(this_list);
1473     TCGType type;
1474     uint32_t some;
1475
1476     check_size_align(oprsz, maxsz, dofs | aofs | bofs | cofs);
1477     check_overlap_4(dofs, aofs, bofs, cofs, maxsz);
1478
1479     type = 0;
1480     if (g->fniv) {
1481         type = choose_vector_type(g->opt_opc, g->vece, oprsz, g->prefer_i64);
1482     }
1483     switch (type) {
1484     case TCG_TYPE_V256:
1485         /* Recall that ARM SVE allows vector sizes that are not a
1486          * power of 2, but always a multiple of 16.  The intent is
1487          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
1488          */
1489         some = QEMU_ALIGN_DOWN(oprsz, 32);
1490         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, some,
1491                      32, TCG_TYPE_V256, g->write_aofs, g->fniv);
1492         if (some == oprsz) {
1493             break;
1494         }
1495         dofs += some;
1496         aofs += some;
1497         bofs += some;
1498         cofs += some;
1499         oprsz -= some;
1500         maxsz -= some;
1501         /* fallthru */
1502     case TCG_TYPE_V128:
1503         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1504                      16, TCG_TYPE_V128, g->write_aofs, g->fniv);
1505         break;
1506     case TCG_TYPE_V64:
1507         expand_4_vec(g->vece, dofs, aofs, bofs, cofs, oprsz,
1508                      8, TCG_TYPE_V64, g->write_aofs, g->fniv);
1509         break;
1510
1511     case 0:
1512         if (g->fni8 && check_size_impl(oprsz, 8)) {
1513             expand_4_i64(dofs, aofs, bofs, cofs, oprsz,
1514                          g->write_aofs, g->fni8);
1515         } else if (g->fni4 && check_size_impl(oprsz, 4)) {
1516             expand_4_i32(dofs, aofs, bofs, cofs, oprsz,
1517                          g->write_aofs, g->fni4);
1518         } else {
1519             assert(g->fno != NULL);
1520             tcg_gen_gvec_4_ool(dofs, aofs, bofs, cofs,
1521                                oprsz, maxsz, g->data, g->fno);
1522             oprsz = maxsz;
1523         }
1524         break;
1525
1526     default:
1527         g_assert_not_reached();
1528     }
1529     tcg_swap_vecop_list(hold_list);
1530
1531     if (oprsz < maxsz) {
1532         expand_clr(dofs + oprsz, maxsz - oprsz);
1533     }
1534 }
1535
1536 /*
1537  * Expand specific vector operations.
1538  */
1539
1540 static void vec_mov2(unsigned vece, TCGv_vec a, TCGv_vec b)
1541 {
1542     tcg_gen_mov_vec(a, b);
1543 }
1544
1545 void tcg_gen_gvec_mov(unsigned vece, uint32_t dofs, uint32_t aofs,
1546                       uint32_t oprsz, uint32_t maxsz)
1547 {
1548     static const GVecGen2 g = {
1549         .fni8 = tcg_gen_mov_i64,
1550         .fniv = vec_mov2,
1551         .fno = gen_helper_gvec_mov,
1552         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1553     };
1554     if (dofs != aofs) {
1555         tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1556     } else {
1557         check_size_align(oprsz, maxsz, dofs);
1558         if (oprsz < maxsz) {
1559             expand_clr(dofs + oprsz, maxsz - oprsz);
1560         }
1561     }
1562 }
1563
1564 void tcg_gen_gvec_dup_i32(unsigned vece, uint32_t dofs, uint32_t oprsz,
1565                           uint32_t maxsz, TCGv_i32 in)
1566 {
1567     check_size_align(oprsz, maxsz, dofs);
1568     tcg_debug_assert(vece <= MO_32);
1569     do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
1570 }
1571
1572 void tcg_gen_gvec_dup_i64(unsigned vece, uint32_t dofs, uint32_t oprsz,
1573                           uint32_t maxsz, TCGv_i64 in)
1574 {
1575     check_size_align(oprsz, maxsz, dofs);
1576     tcg_debug_assert(vece <= MO_64);
1577     do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
1578 }
1579
1580 void tcg_gen_gvec_dup_mem(unsigned vece, uint32_t dofs, uint32_t aofs,
1581                           uint32_t oprsz, uint32_t maxsz)
1582 {
1583     check_size_align(oprsz, maxsz, dofs);
1584     if (vece <= MO_64) {
1585         TCGType type = choose_vector_type(NULL, vece, oprsz, 0);
1586         if (type != 0) {
1587             TCGv_vec t_vec = tcg_temp_new_vec(type);
1588             tcg_gen_dup_mem_vec(vece, t_vec, cpu_env, aofs);
1589             do_dup_store(type, dofs, oprsz, maxsz, t_vec);
1590             tcg_temp_free_vec(t_vec);
1591         } else if (vece <= MO_32) {
1592             TCGv_i32 in = tcg_temp_new_i32();
1593             switch (vece) {
1594             case MO_8:
1595                 tcg_gen_ld8u_i32(in, cpu_env, aofs);
1596                 break;
1597             case MO_16:
1598                 tcg_gen_ld16u_i32(in, cpu_env, aofs);
1599                 break;
1600             default:
1601                 tcg_gen_ld_i32(in, cpu_env, aofs);
1602                 break;
1603             }
1604             do_dup(vece, dofs, oprsz, maxsz, in, NULL, 0);
1605             tcg_temp_free_i32(in);
1606         } else {
1607             TCGv_i64 in = tcg_temp_new_i64();
1608             tcg_gen_ld_i64(in, cpu_env, aofs);
1609             do_dup(vece, dofs, oprsz, maxsz, NULL, in, 0);
1610             tcg_temp_free_i64(in);
1611         }
1612     } else if (vece == 4) {
1613         /* 128-bit duplicate.  */
1614         int i;
1615
1616         tcg_debug_assert(oprsz >= 16);
1617         if (TCG_TARGET_HAS_v128) {
1618             TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V128);
1619
1620             tcg_gen_ld_vec(in, cpu_env, aofs);
1621             for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
1622                 tcg_gen_st_vec(in, cpu_env, dofs + i);
1623             }
1624             tcg_temp_free_vec(in);
1625         } else {
1626             TCGv_i64 in0 = tcg_temp_new_i64();
1627             TCGv_i64 in1 = tcg_temp_new_i64();
1628
1629             tcg_gen_ld_i64(in0, cpu_env, aofs);
1630             tcg_gen_ld_i64(in1, cpu_env, aofs + 8);
1631             for (i = (aofs == dofs) * 16; i < oprsz; i += 16) {
1632                 tcg_gen_st_i64(in0, cpu_env, dofs + i);
1633                 tcg_gen_st_i64(in1, cpu_env, dofs + i + 8);
1634             }
1635             tcg_temp_free_i64(in0);
1636             tcg_temp_free_i64(in1);
1637         }
1638         if (oprsz < maxsz) {
1639             expand_clr(dofs + oprsz, maxsz - oprsz);
1640         }
1641     } else if (vece == 5) {
1642         /* 256-bit duplicate.  */
1643         int i;
1644
1645         tcg_debug_assert(oprsz >= 32);
1646         tcg_debug_assert(oprsz % 32 == 0);
1647         if (TCG_TARGET_HAS_v256) {
1648             TCGv_vec in = tcg_temp_new_vec(TCG_TYPE_V256);
1649
1650             tcg_gen_ld_vec(in, cpu_env, aofs);
1651             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1652                 tcg_gen_st_vec(in, cpu_env, dofs + i);
1653             }
1654             tcg_temp_free_vec(in);
1655         } else if (TCG_TARGET_HAS_v128) {
1656             TCGv_vec in0 = tcg_temp_new_vec(TCG_TYPE_V128);
1657             TCGv_vec in1 = tcg_temp_new_vec(TCG_TYPE_V128);
1658
1659             tcg_gen_ld_vec(in0, cpu_env, aofs);
1660             tcg_gen_ld_vec(in1, cpu_env, aofs + 16);
1661             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1662                 tcg_gen_st_vec(in0, cpu_env, dofs + i);
1663                 tcg_gen_st_vec(in1, cpu_env, dofs + i + 16);
1664             }
1665             tcg_temp_free_vec(in0);
1666             tcg_temp_free_vec(in1);
1667         } else {
1668             TCGv_i64 in[4];
1669             int j;
1670
1671             for (j = 0; j < 4; ++j) {
1672                 in[j] = tcg_temp_new_i64();
1673                 tcg_gen_ld_i64(in[j], cpu_env, aofs + j * 8);
1674             }
1675             for (i = (aofs == dofs) * 32; i < oprsz; i += 32) {
1676                 for (j = 0; j < 4; ++j) {
1677                     tcg_gen_st_i64(in[j], cpu_env, dofs + i + j * 8);
1678                 }
1679             }
1680             for (j = 0; j < 4; ++j) {
1681                 tcg_temp_free_i64(in[j]);
1682             }
1683         }
1684         if (oprsz < maxsz) {
1685             expand_clr(dofs + oprsz, maxsz - oprsz);
1686         }
1687     } else {
1688         g_assert_not_reached();
1689     }
1690 }
1691
/*
 * Duplicate the constant X, interpreted with element size VECE, across
 * the vector at DOFS.  The work is delegated to do_dup with no register
 * source, so that the immediate is used.
 */
void tcg_gen_gvec_dup_imm(unsigned vece, uint32_t dofs, uint32_t oprsz,
                          uint32_t maxsz, uint64_t x)
{
    uint64_t imm = x;

    check_size_align(oprsz, maxsz, dofs);
    do_dup(vece, dofs, oprsz, maxsz, NULL, NULL, imm);
}
1698
1699 void tcg_gen_gvec_not(unsigned vece, uint32_t dofs, uint32_t aofs,
1700                       uint32_t oprsz, uint32_t maxsz)
1701 {
1702     static const GVecGen2 g = {
1703         .fni8 = tcg_gen_not_i64,
1704         .fniv = tcg_gen_not_vec,
1705         .fno = gen_helper_gvec_not,
1706         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1707     };
1708     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g);
1709 }
1710
1711 /* Perform a vector addition using normal addition and a mask.  The mask
1712    should be the sign bit of each lane.  This 6-operation form is more
1713    efficient than separate additions when there are 4 or more lanes in
1714    the 64-bit operation.  */
1715 static void gen_addv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1716 {
1717     TCGv_i64 t1 = tcg_temp_new_i64();
1718     TCGv_i64 t2 = tcg_temp_new_i64();
1719     TCGv_i64 t3 = tcg_temp_new_i64();
1720
1721     tcg_gen_andc_i64(t1, a, m);
1722     tcg_gen_andc_i64(t2, b, m);
1723     tcg_gen_xor_i64(t3, a, b);
1724     tcg_gen_add_i64(d, t1, t2);
1725     tcg_gen_and_i64(t3, t3, m);
1726     tcg_gen_xor_i64(d, d, t3);
1727
1728     tcg_temp_free_i64(t1);
1729     tcg_temp_free_i64(t2);
1730     tcg_temp_free_i64(t3);
1731 }
1732
1733 void tcg_gen_vec_add8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1734 {
1735     TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
1736     gen_addv_mask(d, a, b, m);
1737 }
1738
1739 void tcg_gen_vec_add16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1740 {
1741     TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
1742     gen_addv_mask(d, a, b, m);
1743 }
1744
1745 void tcg_gen_vec_add32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1746 {
1747     TCGv_i64 t1 = tcg_temp_new_i64();
1748     TCGv_i64 t2 = tcg_temp_new_i64();
1749
1750     tcg_gen_andi_i64(t1, a, ~0xffffffffull);
1751     tcg_gen_add_i64(t2, a, b);
1752     tcg_gen_add_i64(t1, t1, b);
1753     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1754
1755     tcg_temp_free_i64(t1);
1756     tcg_temp_free_i64(t2);
1757 }
1758
1759 static const TCGOpcode vecop_list_add[] = { INDEX_op_add_vec, 0 };
1760
1761 void tcg_gen_gvec_add(unsigned vece, uint32_t dofs, uint32_t aofs,
1762                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1763 {
1764     static const GVecGen3 g[4] = {
1765         { .fni8 = tcg_gen_vec_add8_i64,
1766           .fniv = tcg_gen_add_vec,
1767           .fno = gen_helper_gvec_add8,
1768           .opt_opc = vecop_list_add,
1769           .vece = MO_8 },
1770         { .fni8 = tcg_gen_vec_add16_i64,
1771           .fniv = tcg_gen_add_vec,
1772           .fno = gen_helper_gvec_add16,
1773           .opt_opc = vecop_list_add,
1774           .vece = MO_16 },
1775         { .fni4 = tcg_gen_add_i32,
1776           .fniv = tcg_gen_add_vec,
1777           .fno = gen_helper_gvec_add32,
1778           .opt_opc = vecop_list_add,
1779           .vece = MO_32 },
1780         { .fni8 = tcg_gen_add_i64,
1781           .fniv = tcg_gen_add_vec,
1782           .fno = gen_helper_gvec_add64,
1783           .opt_opc = vecop_list_add,
1784           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1785           .vece = MO_64 },
1786     };
1787
1788     tcg_debug_assert(vece <= MO_64);
1789     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1790 }
1791
1792 void tcg_gen_gvec_adds(unsigned vece, uint32_t dofs, uint32_t aofs,
1793                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1794 {
1795     static const GVecGen2s g[4] = {
1796         { .fni8 = tcg_gen_vec_add8_i64,
1797           .fniv = tcg_gen_add_vec,
1798           .fno = gen_helper_gvec_adds8,
1799           .opt_opc = vecop_list_add,
1800           .vece = MO_8 },
1801         { .fni8 = tcg_gen_vec_add16_i64,
1802           .fniv = tcg_gen_add_vec,
1803           .fno = gen_helper_gvec_adds16,
1804           .opt_opc = vecop_list_add,
1805           .vece = MO_16 },
1806         { .fni4 = tcg_gen_add_i32,
1807           .fniv = tcg_gen_add_vec,
1808           .fno = gen_helper_gvec_adds32,
1809           .opt_opc = vecop_list_add,
1810           .vece = MO_32 },
1811         { .fni8 = tcg_gen_add_i64,
1812           .fniv = tcg_gen_add_vec,
1813           .fno = gen_helper_gvec_adds64,
1814           .opt_opc = vecop_list_add,
1815           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1816           .vece = MO_64 },
1817     };
1818
1819     tcg_debug_assert(vece <= MO_64);
1820     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1821 }
1822
1823 void tcg_gen_gvec_addi(unsigned vece, uint32_t dofs, uint32_t aofs,
1824                        int64_t c, uint32_t oprsz, uint32_t maxsz)
1825 {
1826     TCGv_i64 tmp = tcg_constant_i64(c);
1827     tcg_gen_gvec_adds(vece, dofs, aofs, tmp, oprsz, maxsz);
1828 }
1829
1830 static const TCGOpcode vecop_list_sub[] = { INDEX_op_sub_vec, 0 };
1831
1832 void tcg_gen_gvec_subs(unsigned vece, uint32_t dofs, uint32_t aofs,
1833                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1834 {
1835     static const GVecGen2s g[4] = {
1836         { .fni8 = tcg_gen_vec_sub8_i64,
1837           .fniv = tcg_gen_sub_vec,
1838           .fno = gen_helper_gvec_subs8,
1839           .opt_opc = vecop_list_sub,
1840           .vece = MO_8 },
1841         { .fni8 = tcg_gen_vec_sub16_i64,
1842           .fniv = tcg_gen_sub_vec,
1843           .fno = gen_helper_gvec_subs16,
1844           .opt_opc = vecop_list_sub,
1845           .vece = MO_16 },
1846         { .fni4 = tcg_gen_sub_i32,
1847           .fniv = tcg_gen_sub_vec,
1848           .fno = gen_helper_gvec_subs32,
1849           .opt_opc = vecop_list_sub,
1850           .vece = MO_32 },
1851         { .fni8 = tcg_gen_sub_i64,
1852           .fniv = tcg_gen_sub_vec,
1853           .fno = gen_helper_gvec_subs64,
1854           .opt_opc = vecop_list_sub,
1855           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1856           .vece = MO_64 },
1857     };
1858
1859     tcg_debug_assert(vece <= MO_64);
1860     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1861 }
1862
1863 /* Perform a vector subtraction using normal subtraction and a mask.
1864    Compare gen_addv_mask above.  */
1865 static void gen_subv_mask(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 m)
1866 {
1867     TCGv_i64 t1 = tcg_temp_new_i64();
1868     TCGv_i64 t2 = tcg_temp_new_i64();
1869     TCGv_i64 t3 = tcg_temp_new_i64();
1870
1871     tcg_gen_or_i64(t1, a, m);
1872     tcg_gen_andc_i64(t2, b, m);
1873     tcg_gen_eqv_i64(t3, a, b);
1874     tcg_gen_sub_i64(d, t1, t2);
1875     tcg_gen_and_i64(t3, t3, m);
1876     tcg_gen_xor_i64(d, d, t3);
1877
1878     tcg_temp_free_i64(t1);
1879     tcg_temp_free_i64(t2);
1880     tcg_temp_free_i64(t3);
1881 }
1882
1883 void tcg_gen_vec_sub8_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1884 {
1885     TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
1886     gen_subv_mask(d, a, b, m);
1887 }
1888
1889 void tcg_gen_vec_sub16_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1890 {
1891     TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
1892     gen_subv_mask(d, a, b, m);
1893 }
1894
1895 void tcg_gen_vec_sub32_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
1896 {
1897     TCGv_i64 t1 = tcg_temp_new_i64();
1898     TCGv_i64 t2 = tcg_temp_new_i64();
1899
1900     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
1901     tcg_gen_sub_i64(t2, a, b);
1902     tcg_gen_sub_i64(t1, a, t1);
1903     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
1904
1905     tcg_temp_free_i64(t1);
1906     tcg_temp_free_i64(t2);
1907 }
1908
1909 void tcg_gen_gvec_sub(unsigned vece, uint32_t dofs, uint32_t aofs,
1910                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1911 {
1912     static const GVecGen3 g[4] = {
1913         { .fni8 = tcg_gen_vec_sub8_i64,
1914           .fniv = tcg_gen_sub_vec,
1915           .fno = gen_helper_gvec_sub8,
1916           .opt_opc = vecop_list_sub,
1917           .vece = MO_8 },
1918         { .fni8 = tcg_gen_vec_sub16_i64,
1919           .fniv = tcg_gen_sub_vec,
1920           .fno = gen_helper_gvec_sub16,
1921           .opt_opc = vecop_list_sub,
1922           .vece = MO_16 },
1923         { .fni4 = tcg_gen_sub_i32,
1924           .fniv = tcg_gen_sub_vec,
1925           .fno = gen_helper_gvec_sub32,
1926           .opt_opc = vecop_list_sub,
1927           .vece = MO_32 },
1928         { .fni8 = tcg_gen_sub_i64,
1929           .fniv = tcg_gen_sub_vec,
1930           .fno = gen_helper_gvec_sub64,
1931           .opt_opc = vecop_list_sub,
1932           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1933           .vece = MO_64 },
1934     };
1935
1936     tcg_debug_assert(vece <= MO_64);
1937     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1938 }
1939
1940 static const TCGOpcode vecop_list_mul[] = { INDEX_op_mul_vec, 0 };
1941
1942 void tcg_gen_gvec_mul(unsigned vece, uint32_t dofs, uint32_t aofs,
1943                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
1944 {
1945     static const GVecGen3 g[4] = {
1946         { .fniv = tcg_gen_mul_vec,
1947           .fno = gen_helper_gvec_mul8,
1948           .opt_opc = vecop_list_mul,
1949           .vece = MO_8 },
1950         { .fniv = tcg_gen_mul_vec,
1951           .fno = gen_helper_gvec_mul16,
1952           .opt_opc = vecop_list_mul,
1953           .vece = MO_16 },
1954         { .fni4 = tcg_gen_mul_i32,
1955           .fniv = tcg_gen_mul_vec,
1956           .fno = gen_helper_gvec_mul32,
1957           .opt_opc = vecop_list_mul,
1958           .vece = MO_32 },
1959         { .fni8 = tcg_gen_mul_i64,
1960           .fniv = tcg_gen_mul_vec,
1961           .fno = gen_helper_gvec_mul64,
1962           .opt_opc = vecop_list_mul,
1963           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1964           .vece = MO_64 },
1965     };
1966
1967     tcg_debug_assert(vece <= MO_64);
1968     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
1969 }
1970
1971 void tcg_gen_gvec_muls(unsigned vece, uint32_t dofs, uint32_t aofs,
1972                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
1973 {
1974     static const GVecGen2s g[4] = {
1975         { .fniv = tcg_gen_mul_vec,
1976           .fno = gen_helper_gvec_muls8,
1977           .opt_opc = vecop_list_mul,
1978           .vece = MO_8 },
1979         { .fniv = tcg_gen_mul_vec,
1980           .fno = gen_helper_gvec_muls16,
1981           .opt_opc = vecop_list_mul,
1982           .vece = MO_16 },
1983         { .fni4 = tcg_gen_mul_i32,
1984           .fniv = tcg_gen_mul_vec,
1985           .fno = gen_helper_gvec_muls32,
1986           .opt_opc = vecop_list_mul,
1987           .vece = MO_32 },
1988         { .fni8 = tcg_gen_mul_i64,
1989           .fniv = tcg_gen_mul_vec,
1990           .fno = gen_helper_gvec_muls64,
1991           .opt_opc = vecop_list_mul,
1992           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
1993           .vece = MO_64 },
1994     };
1995
1996     tcg_debug_assert(vece <= MO_64);
1997     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, c, &g[vece]);
1998 }
1999
2000 void tcg_gen_gvec_muli(unsigned vece, uint32_t dofs, uint32_t aofs,
2001                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2002 {
2003     TCGv_i64 tmp = tcg_constant_i64(c);
2004     tcg_gen_gvec_muls(vece, dofs, aofs, tmp, oprsz, maxsz);
2005 }
2006
2007 void tcg_gen_gvec_ssadd(unsigned vece, uint32_t dofs, uint32_t aofs,
2008                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2009 {
2010     static const TCGOpcode vecop_list[] = { INDEX_op_ssadd_vec, 0 };
2011     static const GVecGen3 g[4] = {
2012         { .fniv = tcg_gen_ssadd_vec,
2013           .fno = gen_helper_gvec_ssadd8,
2014           .opt_opc = vecop_list,
2015           .vece = MO_8 },
2016         { .fniv = tcg_gen_ssadd_vec,
2017           .fno = gen_helper_gvec_ssadd16,
2018           .opt_opc = vecop_list,
2019           .vece = MO_16 },
2020         { .fniv = tcg_gen_ssadd_vec,
2021           .fno = gen_helper_gvec_ssadd32,
2022           .opt_opc = vecop_list,
2023           .vece = MO_32 },
2024         { .fniv = tcg_gen_ssadd_vec,
2025           .fno = gen_helper_gvec_ssadd64,
2026           .opt_opc = vecop_list,
2027           .vece = MO_64 },
2028     };
2029     tcg_debug_assert(vece <= MO_64);
2030     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2031 }
2032
2033 void tcg_gen_gvec_sssub(unsigned vece, uint32_t dofs, uint32_t aofs,
2034                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2035 {
2036     static const TCGOpcode vecop_list[] = { INDEX_op_sssub_vec, 0 };
2037     static const GVecGen3 g[4] = {
2038         { .fniv = tcg_gen_sssub_vec,
2039           .fno = gen_helper_gvec_sssub8,
2040           .opt_opc = vecop_list,
2041           .vece = MO_8 },
2042         { .fniv = tcg_gen_sssub_vec,
2043           .fno = gen_helper_gvec_sssub16,
2044           .opt_opc = vecop_list,
2045           .vece = MO_16 },
2046         { .fniv = tcg_gen_sssub_vec,
2047           .fno = gen_helper_gvec_sssub32,
2048           .opt_opc = vecop_list,
2049           .vece = MO_32 },
2050         { .fniv = tcg_gen_sssub_vec,
2051           .fno = gen_helper_gvec_sssub64,
2052           .opt_opc = vecop_list,
2053           .vece = MO_64 },
2054     };
2055     tcg_debug_assert(vece <= MO_64);
2056     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2057 }
2058
2059 static void tcg_gen_usadd_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2060 {
2061     TCGv_i32 max = tcg_constant_i32(-1);
2062     tcg_gen_add_i32(d, a, b);
2063     tcg_gen_movcond_i32(TCG_COND_LTU, d, d, a, max, d);
2064 }
2065
2066 static void tcg_gen_usadd_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2067 {
2068     TCGv_i64 max = tcg_constant_i64(-1);
2069     tcg_gen_add_i64(d, a, b);
2070     tcg_gen_movcond_i64(TCG_COND_LTU, d, d, a, max, d);
2071 }
2072
2073 void tcg_gen_gvec_usadd(unsigned vece, uint32_t dofs, uint32_t aofs,
2074                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2075 {
2076     static const TCGOpcode vecop_list[] = { INDEX_op_usadd_vec, 0 };
2077     static const GVecGen3 g[4] = {
2078         { .fniv = tcg_gen_usadd_vec,
2079           .fno = gen_helper_gvec_usadd8,
2080           .opt_opc = vecop_list,
2081           .vece = MO_8 },
2082         { .fniv = tcg_gen_usadd_vec,
2083           .fno = gen_helper_gvec_usadd16,
2084           .opt_opc = vecop_list,
2085           .vece = MO_16 },
2086         { .fni4 = tcg_gen_usadd_i32,
2087           .fniv = tcg_gen_usadd_vec,
2088           .fno = gen_helper_gvec_usadd32,
2089           .opt_opc = vecop_list,
2090           .vece = MO_32 },
2091         { .fni8 = tcg_gen_usadd_i64,
2092           .fniv = tcg_gen_usadd_vec,
2093           .fno = gen_helper_gvec_usadd64,
2094           .opt_opc = vecop_list,
2095           .vece = MO_64 }
2096     };
2097     tcg_debug_assert(vece <= MO_64);
2098     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2099 }
2100
2101 static void tcg_gen_ussub_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
2102 {
2103     TCGv_i32 min = tcg_constant_i32(0);
2104     tcg_gen_sub_i32(d, a, b);
2105     tcg_gen_movcond_i32(TCG_COND_LTU, d, a, b, min, d);
2106 }
2107
2108 static void tcg_gen_ussub_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
2109 {
2110     TCGv_i64 min = tcg_constant_i64(0);
2111     tcg_gen_sub_i64(d, a, b);
2112     tcg_gen_movcond_i64(TCG_COND_LTU, d, a, b, min, d);
2113 }
2114
2115 void tcg_gen_gvec_ussub(unsigned vece, uint32_t dofs, uint32_t aofs,
2116                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2117 {
2118     static const TCGOpcode vecop_list[] = { INDEX_op_ussub_vec, 0 };
2119     static const GVecGen3 g[4] = {
2120         { .fniv = tcg_gen_ussub_vec,
2121           .fno = gen_helper_gvec_ussub8,
2122           .opt_opc = vecop_list,
2123           .vece = MO_8 },
2124         { .fniv = tcg_gen_ussub_vec,
2125           .fno = gen_helper_gvec_ussub16,
2126           .opt_opc = vecop_list,
2127           .vece = MO_16 },
2128         { .fni4 = tcg_gen_ussub_i32,
2129           .fniv = tcg_gen_ussub_vec,
2130           .fno = gen_helper_gvec_ussub32,
2131           .opt_opc = vecop_list,
2132           .vece = MO_32 },
2133         { .fni8 = tcg_gen_ussub_i64,
2134           .fniv = tcg_gen_ussub_vec,
2135           .fno = gen_helper_gvec_ussub64,
2136           .opt_opc = vecop_list,
2137           .vece = MO_64 }
2138     };
2139     tcg_debug_assert(vece <= MO_64);
2140     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2141 }
2142
2143 void tcg_gen_gvec_smin(unsigned vece, uint32_t dofs, uint32_t aofs,
2144                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2145 {
2146     static const TCGOpcode vecop_list[] = { INDEX_op_smin_vec, 0 };
2147     static const GVecGen3 g[4] = {
2148         { .fniv = tcg_gen_smin_vec,
2149           .fno = gen_helper_gvec_smin8,
2150           .opt_opc = vecop_list,
2151           .vece = MO_8 },
2152         { .fniv = tcg_gen_smin_vec,
2153           .fno = gen_helper_gvec_smin16,
2154           .opt_opc = vecop_list,
2155           .vece = MO_16 },
2156         { .fni4 = tcg_gen_smin_i32,
2157           .fniv = tcg_gen_smin_vec,
2158           .fno = gen_helper_gvec_smin32,
2159           .opt_opc = vecop_list,
2160           .vece = MO_32 },
2161         { .fni8 = tcg_gen_smin_i64,
2162           .fniv = tcg_gen_smin_vec,
2163           .fno = gen_helper_gvec_smin64,
2164           .opt_opc = vecop_list,
2165           .vece = MO_64 }
2166     };
2167     tcg_debug_assert(vece <= MO_64);
2168     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2169 }
2170
2171 void tcg_gen_gvec_umin(unsigned vece, uint32_t dofs, uint32_t aofs,
2172                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2173 {
2174     static const TCGOpcode vecop_list[] = { INDEX_op_umin_vec, 0 };
2175     static const GVecGen3 g[4] = {
2176         { .fniv = tcg_gen_umin_vec,
2177           .fno = gen_helper_gvec_umin8,
2178           .opt_opc = vecop_list,
2179           .vece = MO_8 },
2180         { .fniv = tcg_gen_umin_vec,
2181           .fno = gen_helper_gvec_umin16,
2182           .opt_opc = vecop_list,
2183           .vece = MO_16 },
2184         { .fni4 = tcg_gen_umin_i32,
2185           .fniv = tcg_gen_umin_vec,
2186           .fno = gen_helper_gvec_umin32,
2187           .opt_opc = vecop_list,
2188           .vece = MO_32 },
2189         { .fni8 = tcg_gen_umin_i64,
2190           .fniv = tcg_gen_umin_vec,
2191           .fno = gen_helper_gvec_umin64,
2192           .opt_opc = vecop_list,
2193           .vece = MO_64 }
2194     };
2195     tcg_debug_assert(vece <= MO_64);
2196     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2197 }
2198
2199 void tcg_gen_gvec_smax(unsigned vece, uint32_t dofs, uint32_t aofs,
2200                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2201 {
2202     static const TCGOpcode vecop_list[] = { INDEX_op_smax_vec, 0 };
2203     static const GVecGen3 g[4] = {
2204         { .fniv = tcg_gen_smax_vec,
2205           .fno = gen_helper_gvec_smax8,
2206           .opt_opc = vecop_list,
2207           .vece = MO_8 },
2208         { .fniv = tcg_gen_smax_vec,
2209           .fno = gen_helper_gvec_smax16,
2210           .opt_opc = vecop_list,
2211           .vece = MO_16 },
2212         { .fni4 = tcg_gen_smax_i32,
2213           .fniv = tcg_gen_smax_vec,
2214           .fno = gen_helper_gvec_smax32,
2215           .opt_opc = vecop_list,
2216           .vece = MO_32 },
2217         { .fni8 = tcg_gen_smax_i64,
2218           .fniv = tcg_gen_smax_vec,
2219           .fno = gen_helper_gvec_smax64,
2220           .opt_opc = vecop_list,
2221           .vece = MO_64 }
2222     };
2223     tcg_debug_assert(vece <= MO_64);
2224     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2225 }
2226
2227 void tcg_gen_gvec_umax(unsigned vece, uint32_t dofs, uint32_t aofs,
2228                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2229 {
2230     static const TCGOpcode vecop_list[] = { INDEX_op_umax_vec, 0 };
2231     static const GVecGen3 g[4] = {
2232         { .fniv = tcg_gen_umax_vec,
2233           .fno = gen_helper_gvec_umax8,
2234           .opt_opc = vecop_list,
2235           .vece = MO_8 },
2236         { .fniv = tcg_gen_umax_vec,
2237           .fno = gen_helper_gvec_umax16,
2238           .opt_opc = vecop_list,
2239           .vece = MO_16 },
2240         { .fni4 = tcg_gen_umax_i32,
2241           .fniv = tcg_gen_umax_vec,
2242           .fno = gen_helper_gvec_umax32,
2243           .opt_opc = vecop_list,
2244           .vece = MO_32 },
2245         { .fni8 = tcg_gen_umax_i64,
2246           .fniv = tcg_gen_umax_vec,
2247           .fno = gen_helper_gvec_umax64,
2248           .opt_opc = vecop_list,
2249           .vece = MO_64 }
2250     };
2251     tcg_debug_assert(vece <= MO_64);
2252     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
2253 }
2254
2255 /* Perform a vector negation using normal negation and a mask.
2256    Compare gen_subv_mask above.  */
2257 static void gen_negv_mask(TCGv_i64 d, TCGv_i64 b, TCGv_i64 m)
2258 {
2259     TCGv_i64 t2 = tcg_temp_new_i64();
2260     TCGv_i64 t3 = tcg_temp_new_i64();
2261
2262     tcg_gen_andc_i64(t3, m, b);
2263     tcg_gen_andc_i64(t2, b, m);
2264     tcg_gen_sub_i64(d, m, t2);
2265     tcg_gen_xor_i64(d, d, t3);
2266
2267     tcg_temp_free_i64(t2);
2268     tcg_temp_free_i64(t3);
2269 }
2270
2271 void tcg_gen_vec_neg8_i64(TCGv_i64 d, TCGv_i64 b)
2272 {
2273     TCGv_i64 m = tcg_constant_i64(dup_const(MO_8, 0x80));
2274     gen_negv_mask(d, b, m);
2275 }
2276
2277 void tcg_gen_vec_neg16_i64(TCGv_i64 d, TCGv_i64 b)
2278 {
2279     TCGv_i64 m = tcg_constant_i64(dup_const(MO_16, 0x8000));
2280     gen_negv_mask(d, b, m);
2281 }
2282
2283 void tcg_gen_vec_neg32_i64(TCGv_i64 d, TCGv_i64 b)
2284 {
2285     TCGv_i64 t1 = tcg_temp_new_i64();
2286     TCGv_i64 t2 = tcg_temp_new_i64();
2287
2288     tcg_gen_andi_i64(t1, b, ~0xffffffffull);
2289     tcg_gen_neg_i64(t2, b);
2290     tcg_gen_neg_i64(t1, t1);
2291     tcg_gen_deposit_i64(d, t1, t2, 0, 32);
2292
2293     tcg_temp_free_i64(t1);
2294     tcg_temp_free_i64(t2);
2295 }
2296
2297 void tcg_gen_gvec_neg(unsigned vece, uint32_t dofs, uint32_t aofs,
2298                       uint32_t oprsz, uint32_t maxsz)
2299 {
2300     static const TCGOpcode vecop_list[] = { INDEX_op_neg_vec, 0 };
2301     static const GVecGen2 g[4] = {
2302         { .fni8 = tcg_gen_vec_neg8_i64,
2303           .fniv = tcg_gen_neg_vec,
2304           .fno = gen_helper_gvec_neg8,
2305           .opt_opc = vecop_list,
2306           .vece = MO_8 },
2307         { .fni8 = tcg_gen_vec_neg16_i64,
2308           .fniv = tcg_gen_neg_vec,
2309           .fno = gen_helper_gvec_neg16,
2310           .opt_opc = vecop_list,
2311           .vece = MO_16 },
2312         { .fni4 = tcg_gen_neg_i32,
2313           .fniv = tcg_gen_neg_vec,
2314           .fno = gen_helper_gvec_neg32,
2315           .opt_opc = vecop_list,
2316           .vece = MO_32 },
2317         { .fni8 = tcg_gen_neg_i64,
2318           .fniv = tcg_gen_neg_vec,
2319           .fno = gen_helper_gvec_neg64,
2320           .opt_opc = vecop_list,
2321           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2322           .vece = MO_64 },
2323     };
2324
2325     tcg_debug_assert(vece <= MO_64);
2326     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2327 }
2328
2329 static void gen_absv_mask(TCGv_i64 d, TCGv_i64 b, unsigned vece)
2330 {
2331     TCGv_i64 t = tcg_temp_new_i64();
2332     int nbit = 8 << vece;
2333
2334     /* Create -1 for each negative element.  */
2335     tcg_gen_shri_i64(t, b, nbit - 1);
2336     tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2337     tcg_gen_muli_i64(t, t, (1 << nbit) - 1);
2338
2339     /*
2340      * Invert (via xor -1) and add one.
2341      * Because of the ordering the msb is cleared,
2342      * so we never have carry into the next element.
2343      */
2344     tcg_gen_xor_i64(d, b, t);
2345     tcg_gen_andi_i64(t, t, dup_const(vece, 1));
2346     tcg_gen_add_i64(d, d, t);
2347
2348     tcg_temp_free_i64(t);
2349 }
2350
2351 static void tcg_gen_vec_abs8_i64(TCGv_i64 d, TCGv_i64 b)
2352 {
2353     gen_absv_mask(d, b, MO_8);
2354 }
2355
2356 static void tcg_gen_vec_abs16_i64(TCGv_i64 d, TCGv_i64 b)
2357 {
2358     gen_absv_mask(d, b, MO_16);
2359 }
2360
2361 void tcg_gen_gvec_abs(unsigned vece, uint32_t dofs, uint32_t aofs,
2362                       uint32_t oprsz, uint32_t maxsz)
2363 {
2364     static const TCGOpcode vecop_list[] = { INDEX_op_abs_vec, 0 };
2365     static const GVecGen2 g[4] = {
2366         { .fni8 = tcg_gen_vec_abs8_i64,
2367           .fniv = tcg_gen_abs_vec,
2368           .fno = gen_helper_gvec_abs8,
2369           .opt_opc = vecop_list,
2370           .vece = MO_8 },
2371         { .fni8 = tcg_gen_vec_abs16_i64,
2372           .fniv = tcg_gen_abs_vec,
2373           .fno = gen_helper_gvec_abs16,
2374           .opt_opc = vecop_list,
2375           .vece = MO_16 },
2376         { .fni4 = tcg_gen_abs_i32,
2377           .fniv = tcg_gen_abs_vec,
2378           .fno = gen_helper_gvec_abs32,
2379           .opt_opc = vecop_list,
2380           .vece = MO_32 },
2381         { .fni8 = tcg_gen_abs_i64,
2382           .fniv = tcg_gen_abs_vec,
2383           .fno = gen_helper_gvec_abs64,
2384           .opt_opc = vecop_list,
2385           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2386           .vece = MO_64 },
2387     };
2388
2389     tcg_debug_assert(vece <= MO_64);
2390     tcg_gen_gvec_2(dofs, aofs, oprsz, maxsz, &g[vece]);
2391 }
2392
2393 void tcg_gen_gvec_and(unsigned vece, uint32_t dofs, uint32_t aofs,
2394                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2395 {
2396     static const GVecGen3 g = {
2397         .fni8 = tcg_gen_and_i64,
2398         .fniv = tcg_gen_and_vec,
2399         .fno = gen_helper_gvec_and,
2400         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2401     };
2402
2403     if (aofs == bofs) {
2404         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2405     } else {
2406         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2407     }
2408 }
2409
2410 void tcg_gen_gvec_or(unsigned vece, uint32_t dofs, uint32_t aofs,
2411                      uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2412 {
2413     static const GVecGen3 g = {
2414         .fni8 = tcg_gen_or_i64,
2415         .fniv = tcg_gen_or_vec,
2416         .fno = gen_helper_gvec_or,
2417         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2418     };
2419
2420     if (aofs == bofs) {
2421         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2422     } else {
2423         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2424     }
2425 }
2426
2427 void tcg_gen_gvec_xor(unsigned vece, uint32_t dofs, uint32_t aofs,
2428                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2429 {
2430     static const GVecGen3 g = {
2431         .fni8 = tcg_gen_xor_i64,
2432         .fniv = tcg_gen_xor_vec,
2433         .fno = gen_helper_gvec_xor,
2434         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2435     };
2436
2437     if (aofs == bofs) {
2438         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
2439     } else {
2440         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2441     }
2442 }
2443
2444 void tcg_gen_gvec_andc(unsigned vece, uint32_t dofs, uint32_t aofs,
2445                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2446 {
2447     static const GVecGen3 g = {
2448         .fni8 = tcg_gen_andc_i64,
2449         .fniv = tcg_gen_andc_vec,
2450         .fno = gen_helper_gvec_andc,
2451         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2452     };
2453
2454     if (aofs == bofs) {
2455         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, 0);
2456     } else {
2457         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2458     }
2459 }
2460
2461 void tcg_gen_gvec_orc(unsigned vece, uint32_t dofs, uint32_t aofs,
2462                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2463 {
2464     static const GVecGen3 g = {
2465         .fni8 = tcg_gen_orc_i64,
2466         .fniv = tcg_gen_orc_vec,
2467         .fno = gen_helper_gvec_orc,
2468         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2469     };
2470
2471     if (aofs == bofs) {
2472         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
2473     } else {
2474         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2475     }
2476 }
2477
2478 void tcg_gen_gvec_nand(unsigned vece, uint32_t dofs, uint32_t aofs,
2479                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2480 {
2481     static const GVecGen3 g = {
2482         .fni8 = tcg_gen_nand_i64,
2483         .fniv = tcg_gen_nand_vec,
2484         .fno = gen_helper_gvec_nand,
2485         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2486     };
2487
2488     if (aofs == bofs) {
2489         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2490     } else {
2491         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2492     }
2493 }
2494
2495 void tcg_gen_gvec_nor(unsigned vece, uint32_t dofs, uint32_t aofs,
2496                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2497 {
2498     static const GVecGen3 g = {
2499         .fni8 = tcg_gen_nor_i64,
2500         .fniv = tcg_gen_nor_vec,
2501         .fno = gen_helper_gvec_nor,
2502         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2503     };
2504
2505     if (aofs == bofs) {
2506         tcg_gen_gvec_not(vece, dofs, aofs, oprsz, maxsz);
2507     } else {
2508         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2509     }
2510 }
2511
2512 void tcg_gen_gvec_eqv(unsigned vece, uint32_t dofs, uint32_t aofs,
2513                       uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
2514 {
2515     static const GVecGen3 g = {
2516         .fni8 = tcg_gen_eqv_i64,
2517         .fniv = tcg_gen_eqv_vec,
2518         .fno = gen_helper_gvec_eqv,
2519         .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2520     };
2521
2522     if (aofs == bofs) {
2523         tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, -1);
2524     } else {
2525         tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g);
2526     }
2527 }
2528
2529 static const GVecGen2s gop_ands = {
2530     .fni8 = tcg_gen_and_i64,
2531     .fniv = tcg_gen_and_vec,
2532     .fno = gen_helper_gvec_ands,
2533     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2534     .vece = MO_64
2535 };
2536
2537 void tcg_gen_gvec_ands(unsigned vece, uint32_t dofs, uint32_t aofs,
2538                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2539 {
2540     TCGv_i64 tmp = tcg_temp_new_i64();
2541     gen_dup_i64(vece, tmp, c);
2542     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2543     tcg_temp_free_i64(tmp);
2544 }
2545
2546 void tcg_gen_gvec_andi(unsigned vece, uint32_t dofs, uint32_t aofs,
2547                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2548 {
2549     TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2550     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ands);
2551 }
2552
2553 static const GVecGen2s gop_xors = {
2554     .fni8 = tcg_gen_xor_i64,
2555     .fniv = tcg_gen_xor_vec,
2556     .fno = gen_helper_gvec_xors,
2557     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2558     .vece = MO_64
2559 };
2560
2561 void tcg_gen_gvec_xors(unsigned vece, uint32_t dofs, uint32_t aofs,
2562                        TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2563 {
2564     TCGv_i64 tmp = tcg_temp_new_i64();
2565     gen_dup_i64(vece, tmp, c);
2566     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2567     tcg_temp_free_i64(tmp);
2568 }
2569
2570 void tcg_gen_gvec_xori(unsigned vece, uint32_t dofs, uint32_t aofs,
2571                        int64_t c, uint32_t oprsz, uint32_t maxsz)
2572 {
2573     TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2574     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_xors);
2575 }
2576
2577 static const GVecGen2s gop_ors = {
2578     .fni8 = tcg_gen_or_i64,
2579     .fniv = tcg_gen_or_vec,
2580     .fno = gen_helper_gvec_ors,
2581     .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2582     .vece = MO_64
2583 };
2584
2585 void tcg_gen_gvec_ors(unsigned vece, uint32_t dofs, uint32_t aofs,
2586                       TCGv_i64 c, uint32_t oprsz, uint32_t maxsz)
2587 {
2588     TCGv_i64 tmp = tcg_temp_new_i64();
2589     gen_dup_i64(vece, tmp, c);
2590     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2591     tcg_temp_free_i64(tmp);
2592 }
2593
2594 void tcg_gen_gvec_ori(unsigned vece, uint32_t dofs, uint32_t aofs,
2595                       int64_t c, uint32_t oprsz, uint32_t maxsz)
2596 {
2597     TCGv_i64 tmp = tcg_constant_i64(dup_const(vece, c));
2598     tcg_gen_gvec_2s(dofs, aofs, oprsz, maxsz, tmp, &gop_ors);
2599 }
2600
2601 void tcg_gen_vec_shl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2602 {
2603     uint64_t mask = dup_const(MO_8, 0xff << c);
2604     tcg_gen_shli_i64(d, a, c);
2605     tcg_gen_andi_i64(d, d, mask);
2606 }
2607
2608 void tcg_gen_vec_shl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2609 {
2610     uint64_t mask = dup_const(MO_16, 0xffff << c);
2611     tcg_gen_shli_i64(d, a, c);
2612     tcg_gen_andi_i64(d, d, mask);
2613 }
2614
2615 void tcg_gen_gvec_shli(unsigned vece, uint32_t dofs, uint32_t aofs,
2616                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2617 {
2618     static const TCGOpcode vecop_list[] = { INDEX_op_shli_vec, 0 };
2619     static const GVecGen2i g[4] = {
2620         { .fni8 = tcg_gen_vec_shl8i_i64,
2621           .fniv = tcg_gen_shli_vec,
2622           .fno = gen_helper_gvec_shl8i,
2623           .opt_opc = vecop_list,
2624           .vece = MO_8 },
2625         { .fni8 = tcg_gen_vec_shl16i_i64,
2626           .fniv = tcg_gen_shli_vec,
2627           .fno = gen_helper_gvec_shl16i,
2628           .opt_opc = vecop_list,
2629           .vece = MO_16 },
2630         { .fni4 = tcg_gen_shli_i32,
2631           .fniv = tcg_gen_shli_vec,
2632           .fno = gen_helper_gvec_shl32i,
2633           .opt_opc = vecop_list,
2634           .vece = MO_32 },
2635         { .fni8 = tcg_gen_shli_i64,
2636           .fniv = tcg_gen_shli_vec,
2637           .fno = gen_helper_gvec_shl64i,
2638           .opt_opc = vecop_list,
2639           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2640           .vece = MO_64 },
2641     };
2642
2643     tcg_debug_assert(vece <= MO_64);
2644     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2645     if (shift == 0) {
2646         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2647     } else {
2648         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2649     }
2650 }
2651
2652 void tcg_gen_vec_shr8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2653 {
2654     uint64_t mask = dup_const(MO_8, 0xff >> c);
2655     tcg_gen_shri_i64(d, a, c);
2656     tcg_gen_andi_i64(d, d, mask);
2657 }
2658
2659 void tcg_gen_vec_shr16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2660 {
2661     uint64_t mask = dup_const(MO_16, 0xffff >> c);
2662     tcg_gen_shri_i64(d, a, c);
2663     tcg_gen_andi_i64(d, d, mask);
2664 }
2665
2666 void tcg_gen_gvec_shri(unsigned vece, uint32_t dofs, uint32_t aofs,
2667                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2668 {
2669     static const TCGOpcode vecop_list[] = { INDEX_op_shri_vec, 0 };
2670     static const GVecGen2i g[4] = {
2671         { .fni8 = tcg_gen_vec_shr8i_i64,
2672           .fniv = tcg_gen_shri_vec,
2673           .fno = gen_helper_gvec_shr8i,
2674           .opt_opc = vecop_list,
2675           .vece = MO_8 },
2676         { .fni8 = tcg_gen_vec_shr16i_i64,
2677           .fniv = tcg_gen_shri_vec,
2678           .fno = gen_helper_gvec_shr16i,
2679           .opt_opc = vecop_list,
2680           .vece = MO_16 },
2681         { .fni4 = tcg_gen_shri_i32,
2682           .fniv = tcg_gen_shri_vec,
2683           .fno = gen_helper_gvec_shr32i,
2684           .opt_opc = vecop_list,
2685           .vece = MO_32 },
2686         { .fni8 = tcg_gen_shri_i64,
2687           .fniv = tcg_gen_shri_vec,
2688           .fno = gen_helper_gvec_shr64i,
2689           .opt_opc = vecop_list,
2690           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2691           .vece = MO_64 },
2692     };
2693
2694     tcg_debug_assert(vece <= MO_64);
2695     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2696     if (shift == 0) {
2697         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2698     } else {
2699         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2700     }
2701 }
2702
2703 void tcg_gen_vec_sar8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2704 {
2705     uint64_t s_mask = dup_const(MO_8, 0x80 >> c);
2706     uint64_t c_mask = dup_const(MO_8, 0xff >> c);
2707     TCGv_i64 s = tcg_temp_new_i64();
2708
2709     tcg_gen_shri_i64(d, a, c);
2710     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2711     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2712     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2713     tcg_gen_or_i64(d, d, s);         /* include sign extension */
2714     tcg_temp_free_i64(s);
2715 }
2716
2717 void tcg_gen_vec_sar16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2718 {
2719     uint64_t s_mask = dup_const(MO_16, 0x8000 >> c);
2720     uint64_t c_mask = dup_const(MO_16, 0xffff >> c);
2721     TCGv_i64 s = tcg_temp_new_i64();
2722
2723     tcg_gen_shri_i64(d, a, c);
2724     tcg_gen_andi_i64(s, d, s_mask);  /* isolate (shifted) sign bit */
2725     tcg_gen_andi_i64(d, d, c_mask);  /* clear out bits above sign  */
2726     tcg_gen_muli_i64(s, s, (2 << c) - 2); /* replicate isolated signs */
2727     tcg_gen_or_i64(d, d, s);         /* include sign extension */
2728     tcg_temp_free_i64(s);
2729 }
2730
2731 void tcg_gen_gvec_sari(unsigned vece, uint32_t dofs, uint32_t aofs,
2732                        int64_t shift, uint32_t oprsz, uint32_t maxsz)
2733 {
2734     static const TCGOpcode vecop_list[] = { INDEX_op_sari_vec, 0 };
2735     static const GVecGen2i g[4] = {
2736         { .fni8 = tcg_gen_vec_sar8i_i64,
2737           .fniv = tcg_gen_sari_vec,
2738           .fno = gen_helper_gvec_sar8i,
2739           .opt_opc = vecop_list,
2740           .vece = MO_8 },
2741         { .fni8 = tcg_gen_vec_sar16i_i64,
2742           .fniv = tcg_gen_sari_vec,
2743           .fno = gen_helper_gvec_sar16i,
2744           .opt_opc = vecop_list,
2745           .vece = MO_16 },
2746         { .fni4 = tcg_gen_sari_i32,
2747           .fniv = tcg_gen_sari_vec,
2748           .fno = gen_helper_gvec_sar32i,
2749           .opt_opc = vecop_list,
2750           .vece = MO_32 },
2751         { .fni8 = tcg_gen_sari_i64,
2752           .fniv = tcg_gen_sari_vec,
2753           .fno = gen_helper_gvec_sar64i,
2754           .opt_opc = vecop_list,
2755           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2756           .vece = MO_64 },
2757     };
2758
2759     tcg_debug_assert(vece <= MO_64);
2760     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2761     if (shift == 0) {
2762         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2763     } else {
2764         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2765     }
2766 }
2767
2768 void tcg_gen_vec_rotl8i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2769 {
2770     uint64_t mask = dup_const(MO_8, 0xff << c);
2771
2772     tcg_gen_shli_i64(d, a, c);
2773     tcg_gen_shri_i64(a, a, 8 - c);
2774     tcg_gen_andi_i64(d, d, mask);
2775     tcg_gen_andi_i64(a, a, ~mask);
2776     tcg_gen_or_i64(d, d, a);
2777 }
2778
2779 void tcg_gen_vec_rotl16i_i64(TCGv_i64 d, TCGv_i64 a, int64_t c)
2780 {
2781     uint64_t mask = dup_const(MO_16, 0xffff << c);
2782
2783     tcg_gen_shli_i64(d, a, c);
2784     tcg_gen_shri_i64(a, a, 16 - c);
2785     tcg_gen_andi_i64(d, d, mask);
2786     tcg_gen_andi_i64(a, a, ~mask);
2787     tcg_gen_or_i64(d, d, a);
2788 }
2789
2790 void tcg_gen_gvec_rotli(unsigned vece, uint32_t dofs, uint32_t aofs,
2791                         int64_t shift, uint32_t oprsz, uint32_t maxsz)
2792 {
2793     static const TCGOpcode vecop_list[] = { INDEX_op_rotli_vec, 0 };
2794     static const GVecGen2i g[4] = {
2795         { .fni8 = tcg_gen_vec_rotl8i_i64,
2796           .fniv = tcg_gen_rotli_vec,
2797           .fno = gen_helper_gvec_rotl8i,
2798           .opt_opc = vecop_list,
2799           .vece = MO_8 },
2800         { .fni8 = tcg_gen_vec_rotl16i_i64,
2801           .fniv = tcg_gen_rotli_vec,
2802           .fno = gen_helper_gvec_rotl16i,
2803           .opt_opc = vecop_list,
2804           .vece = MO_16 },
2805         { .fni4 = tcg_gen_rotli_i32,
2806           .fniv = tcg_gen_rotli_vec,
2807           .fno = gen_helper_gvec_rotl32i,
2808           .opt_opc = vecop_list,
2809           .vece = MO_32 },
2810         { .fni8 = tcg_gen_rotli_i64,
2811           .fniv = tcg_gen_rotli_vec,
2812           .fno = gen_helper_gvec_rotl64i,
2813           .opt_opc = vecop_list,
2814           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
2815           .vece = MO_64 },
2816     };
2817
2818     tcg_debug_assert(vece <= MO_64);
2819     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2820     if (shift == 0) {
2821         tcg_gen_gvec_mov(vece, dofs, aofs, oprsz, maxsz);
2822     } else {
2823         tcg_gen_gvec_2i(dofs, aofs, oprsz, maxsz, shift, &g[vece]);
2824     }
2825 }
2826
2827 void tcg_gen_gvec_rotri(unsigned vece, uint32_t dofs, uint32_t aofs,
2828                         int64_t shift, uint32_t oprsz, uint32_t maxsz)
2829 {
2830     tcg_debug_assert(vece <= MO_64);
2831     tcg_debug_assert(shift >= 0 && shift < (8 << vece));
2832     tcg_gen_gvec_rotli(vece, dofs, aofs, -shift & ((8 << vece) - 1),
2833                        oprsz, maxsz);
2834 }
2835
/*
 * Specialized generation of vector shifts by a non-constant scalar.
 */
2839
/*
 * Set of expanders for one kind of shift-by-scalar operation, consumed
 * by do_gvec_shifts.
 */
typedef struct {
    /* Inline expansion on 32-bit elements (used when vece == MO_32). */
    void (*fni4)(TCGv_i32, TCGv_i32, TCGv_i32);
    /* Inline expansion on 64-bit elements (used when vece == MO_64). */
    void (*fni8)(TCGv_i64, TCGv_i64, TCGv_i64);
    /* Vector expansion taking the shift count as an i32 scalar. */
    void (*fniv_s)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32);
    /* Vector expansion taking the shift count as a vector. */
    void (*fniv_v)(unsigned, TCGv_vec, TCGv_vec, TCGv_vec);
    /* Out-of-line helpers, indexed by vece. */
    gen_helper_gvec_2 *fno[4];
    /* Opcode list handed to choose_vector_type for the fniv_s form. */
    TCGOpcode s_list[2];
    /* Opcode list handed to choose_vector_type for the fniv_v form. */
    TCGOpcode v_list[2];
} GVecGen2sh;
2849
2850 static void expand_2sh_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
2851                            uint32_t oprsz, uint32_t tysz, TCGType type,
2852                            TCGv_i32 shift,
2853                            void (*fni)(unsigned, TCGv_vec, TCGv_vec, TCGv_i32))
2854 {
2855     TCGv_vec t0 = tcg_temp_new_vec(type);
2856     uint32_t i;
2857
2858     for (i = 0; i < oprsz; i += tysz) {
2859         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
2860         fni(vece, t0, t0, shift);
2861         tcg_gen_st_vec(t0, cpu_env, dofs + i);
2862     }
2863     tcg_temp_free_vec(t0);
2864 }
2865
2866 static void
2867 do_gvec_shifts(unsigned vece, uint32_t dofs, uint32_t aofs, TCGv_i32 shift,
2868                uint32_t oprsz, uint32_t maxsz, const GVecGen2sh *g)
2869 {
2870     TCGType type;
2871     uint32_t some;
2872
2873     check_size_align(oprsz, maxsz, dofs | aofs);
2874     check_overlap_2(dofs, aofs, maxsz);
2875
2876     /* If the backend has a scalar expansion, great.  */
2877     type = choose_vector_type(g->s_list, vece, oprsz, vece == MO_64);
2878     if (type) {
2879         const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
2880         switch (type) {
2881         case TCG_TYPE_V256:
2882             some = QEMU_ALIGN_DOWN(oprsz, 32);
2883             expand_2sh_vec(vece, dofs, aofs, some, 32,
2884                            TCG_TYPE_V256, shift, g->fniv_s);
2885             if (some == oprsz) {
2886                 break;
2887             }
2888             dofs += some;
2889             aofs += some;
2890             oprsz -= some;
2891             maxsz -= some;
2892             /* fallthru */
2893         case TCG_TYPE_V128:
2894             expand_2sh_vec(vece, dofs, aofs, oprsz, 16,
2895                            TCG_TYPE_V128, shift, g->fniv_s);
2896             break;
2897         case TCG_TYPE_V64:
2898             expand_2sh_vec(vece, dofs, aofs, oprsz, 8,
2899                            TCG_TYPE_V64, shift, g->fniv_s);
2900             break;
2901         default:
2902             g_assert_not_reached();
2903         }
2904         tcg_swap_vecop_list(hold_list);
2905         goto clear_tail;
2906     }
2907
2908     /* If the backend supports variable vector shifts, also cool.  */
2909     type = choose_vector_type(g->v_list, vece, oprsz, vece == MO_64);
2910     if (type) {
2911         const TCGOpcode *hold_list = tcg_swap_vecop_list(NULL);
2912         TCGv_vec v_shift = tcg_temp_new_vec(type);
2913
2914         if (vece == MO_64) {
2915             TCGv_i64 sh64 = tcg_temp_new_i64();
2916             tcg_gen_extu_i32_i64(sh64, shift);
2917             tcg_gen_dup_i64_vec(MO_64, v_shift, sh64);
2918             tcg_temp_free_i64(sh64);
2919         } else {
2920             tcg_gen_dup_i32_vec(vece, v_shift, shift);
2921         }
2922
2923         switch (type) {
2924         case TCG_TYPE_V256:
2925             some = QEMU_ALIGN_DOWN(oprsz, 32);
2926             expand_2s_vec(vece, dofs, aofs, some, 32, TCG_TYPE_V256,
2927                           v_shift, false, g->fniv_v);
2928             if (some == oprsz) {
2929                 break;
2930             }
2931             dofs += some;
2932             aofs += some;
2933             oprsz -= some;
2934             maxsz -= some;
2935             /* fallthru */
2936         case TCG_TYPE_V128:
2937             expand_2s_vec(vece, dofs, aofs, oprsz, 16, TCG_TYPE_V128,
2938                           v_shift, false, g->fniv_v);
2939             break;
2940         case TCG_TYPE_V64:
2941             expand_2s_vec(vece, dofs, aofs, oprsz, 8, TCG_TYPE_V64,
2942                           v_shift, false, g->fniv_v);
2943             break;
2944         default:
2945             g_assert_not_reached();
2946         }
2947         tcg_temp_free_vec(v_shift);
2948         tcg_swap_vecop_list(hold_list);
2949         goto clear_tail;
2950     }
2951
2952     /* Otherwise fall back to integral... */
2953     if (vece == MO_32 && check_size_impl(oprsz, 4)) {
2954         expand_2s_i32(dofs, aofs, oprsz, shift, false, g->fni4);
2955     } else if (vece == MO_64 && check_size_impl(oprsz, 8)) {
2956         TCGv_i64 sh64 = tcg_temp_new_i64();
2957         tcg_gen_extu_i32_i64(sh64, shift);
2958         expand_2s_i64(dofs, aofs, oprsz, sh64, false, g->fni8);
2959         tcg_temp_free_i64(sh64);
2960     } else {
2961         TCGv_ptr a0 = tcg_temp_new_ptr();
2962         TCGv_ptr a1 = tcg_temp_new_ptr();
2963         TCGv_i32 desc = tcg_temp_new_i32();
2964
2965         tcg_gen_shli_i32(desc, shift, SIMD_DATA_SHIFT);
2966         tcg_gen_ori_i32(desc, desc, simd_desc(oprsz, maxsz, 0));
2967         tcg_gen_addi_ptr(a0, cpu_env, dofs);
2968         tcg_gen_addi_ptr(a1, cpu_env, aofs);
2969
2970         g->fno[vece](a0, a1, desc);
2971
2972         tcg_temp_free_ptr(a0);
2973         tcg_temp_free_ptr(a1);
2974         tcg_temp_free_i32(desc);
2975         return;
2976     }
2977
2978  clear_tail:
2979     if (oprsz < maxsz) {
2980         expand_clr(dofs + oprsz, maxsz - oprsz);
2981     }
2982 }
2983
2984 void tcg_gen_gvec_shls(unsigned vece, uint32_t dofs, uint32_t aofs,
2985                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
2986 {
2987     static const GVecGen2sh g = {
2988         .fni4 = tcg_gen_shl_i32,
2989         .fni8 = tcg_gen_shl_i64,
2990         .fniv_s = tcg_gen_shls_vec,
2991         .fniv_v = tcg_gen_shlv_vec,
2992         .fno = {
2993             gen_helper_gvec_shl8i,
2994             gen_helper_gvec_shl16i,
2995             gen_helper_gvec_shl32i,
2996             gen_helper_gvec_shl64i,
2997         },
2998         .s_list = { INDEX_op_shls_vec, 0 },
2999         .v_list = { INDEX_op_shlv_vec, 0 },
3000     };
3001
3002     tcg_debug_assert(vece <= MO_64);
3003     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3004 }
3005
3006 void tcg_gen_gvec_shrs(unsigned vece, uint32_t dofs, uint32_t aofs,
3007                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3008 {
3009     static const GVecGen2sh g = {
3010         .fni4 = tcg_gen_shr_i32,
3011         .fni8 = tcg_gen_shr_i64,
3012         .fniv_s = tcg_gen_shrs_vec,
3013         .fniv_v = tcg_gen_shrv_vec,
3014         .fno = {
3015             gen_helper_gvec_shr8i,
3016             gen_helper_gvec_shr16i,
3017             gen_helper_gvec_shr32i,
3018             gen_helper_gvec_shr64i,
3019         },
3020         .s_list = { INDEX_op_shrs_vec, 0 },
3021         .v_list = { INDEX_op_shrv_vec, 0 },
3022     };
3023
3024     tcg_debug_assert(vece <= MO_64);
3025     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3026 }
3027
3028 void tcg_gen_gvec_sars(unsigned vece, uint32_t dofs, uint32_t aofs,
3029                        TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3030 {
3031     static const GVecGen2sh g = {
3032         .fni4 = tcg_gen_sar_i32,
3033         .fni8 = tcg_gen_sar_i64,
3034         .fniv_s = tcg_gen_sars_vec,
3035         .fniv_v = tcg_gen_sarv_vec,
3036         .fno = {
3037             gen_helper_gvec_sar8i,
3038             gen_helper_gvec_sar16i,
3039             gen_helper_gvec_sar32i,
3040             gen_helper_gvec_sar64i,
3041         },
3042         .s_list = { INDEX_op_sars_vec, 0 },
3043         .v_list = { INDEX_op_sarv_vec, 0 },
3044     };
3045
3046     tcg_debug_assert(vece <= MO_64);
3047     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3048 }
3049
3050 void tcg_gen_gvec_rotls(unsigned vece, uint32_t dofs, uint32_t aofs,
3051                         TCGv_i32 shift, uint32_t oprsz, uint32_t maxsz)
3052 {
3053     static const GVecGen2sh g = {
3054         .fni4 = tcg_gen_rotl_i32,
3055         .fni8 = tcg_gen_rotl_i64,
3056         .fniv_s = tcg_gen_rotls_vec,
3057         .fniv_v = tcg_gen_rotlv_vec,
3058         .fno = {
3059             gen_helper_gvec_rotl8i,
3060             gen_helper_gvec_rotl16i,
3061             gen_helper_gvec_rotl32i,
3062             gen_helper_gvec_rotl64i,
3063         },
3064         .s_list = { INDEX_op_rotls_vec, 0 },
3065         .v_list = { INDEX_op_rotlv_vec, 0 },
3066     };
3067
3068     tcg_debug_assert(vece <= MO_64);
3069     do_gvec_shifts(vece, dofs, aofs, shift, oprsz, maxsz, &g);
3070 }
3071
/*
 * Expand D = A << (B % element bits)
 *
 * Unlike scalar shifts, where it is easy for the target front end
 * to include the modulo as part of the expansion, vector shifts
 * have the modulo applied here.  If the target naturally includes
 * the modulo as part of the operation, great!  If the target has
 * some other behaviour for out-of-range shifts, then it could not
 * use this function anyway, and would need to do its own expansion
 * with custom functions.
 */
3082 static void tcg_gen_shlv_mod_vec(unsigned vece, TCGv_vec d,
3083                                  TCGv_vec a, TCGv_vec b)
3084 {
3085     TCGv_vec t = tcg_temp_new_vec_matching(d);
3086     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3087
3088     tcg_gen_and_vec(vece, t, b, m);
3089     tcg_gen_shlv_vec(vece, d, a, t);
3090     tcg_temp_free_vec(t);
3091 }
3092
3093 static void tcg_gen_shl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3094 {
3095     TCGv_i32 t = tcg_temp_new_i32();
3096
3097     tcg_gen_andi_i32(t, b, 31);
3098     tcg_gen_shl_i32(d, a, t);
3099     tcg_temp_free_i32(t);
3100 }
3101
3102 static void tcg_gen_shl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3103 {
3104     TCGv_i64 t = tcg_temp_new_i64();
3105
3106     tcg_gen_andi_i64(t, b, 63);
3107     tcg_gen_shl_i64(d, a, t);
3108     tcg_temp_free_i64(t);
3109 }
3110
3111 void tcg_gen_gvec_shlv(unsigned vece, uint32_t dofs, uint32_t aofs,
3112                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3113 {
3114     static const TCGOpcode vecop_list[] = { INDEX_op_shlv_vec, 0 };
3115     static const GVecGen3 g[4] = {
3116         { .fniv = tcg_gen_shlv_mod_vec,
3117           .fno = gen_helper_gvec_shl8v,
3118           .opt_opc = vecop_list,
3119           .vece = MO_8 },
3120         { .fniv = tcg_gen_shlv_mod_vec,
3121           .fno = gen_helper_gvec_shl16v,
3122           .opt_opc = vecop_list,
3123           .vece = MO_16 },
3124         { .fni4 = tcg_gen_shl_mod_i32,
3125           .fniv = tcg_gen_shlv_mod_vec,
3126           .fno = gen_helper_gvec_shl32v,
3127           .opt_opc = vecop_list,
3128           .vece = MO_32 },
3129         { .fni8 = tcg_gen_shl_mod_i64,
3130           .fniv = tcg_gen_shlv_mod_vec,
3131           .fno = gen_helper_gvec_shl64v,
3132           .opt_opc = vecop_list,
3133           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3134           .vece = MO_64 },
3135     };
3136
3137     tcg_debug_assert(vece <= MO_64);
3138     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3139 }
3140
3141 /*
3142  * Similarly for logical right shifts.
3143  */
3144
3145 static void tcg_gen_shrv_mod_vec(unsigned vece, TCGv_vec d,
3146                                  TCGv_vec a, TCGv_vec b)
3147 {
3148     TCGv_vec t = tcg_temp_new_vec_matching(d);
3149     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3150
3151     tcg_gen_and_vec(vece, t, b, m);
3152     tcg_gen_shrv_vec(vece, d, a, t);
3153     tcg_temp_free_vec(t);
3154 }
3155
3156 static void tcg_gen_shr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3157 {
3158     TCGv_i32 t = tcg_temp_new_i32();
3159
3160     tcg_gen_andi_i32(t, b, 31);
3161     tcg_gen_shr_i32(d, a, t);
3162     tcg_temp_free_i32(t);
3163 }
3164
3165 static void tcg_gen_shr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3166 {
3167     TCGv_i64 t = tcg_temp_new_i64();
3168
3169     tcg_gen_andi_i64(t, b, 63);
3170     tcg_gen_shr_i64(d, a, t);
3171     tcg_temp_free_i64(t);
3172 }
3173
3174 void tcg_gen_gvec_shrv(unsigned vece, uint32_t dofs, uint32_t aofs,
3175                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3176 {
3177     static const TCGOpcode vecop_list[] = { INDEX_op_shrv_vec, 0 };
3178     static const GVecGen3 g[4] = {
3179         { .fniv = tcg_gen_shrv_mod_vec,
3180           .fno = gen_helper_gvec_shr8v,
3181           .opt_opc = vecop_list,
3182           .vece = MO_8 },
3183         { .fniv = tcg_gen_shrv_mod_vec,
3184           .fno = gen_helper_gvec_shr16v,
3185           .opt_opc = vecop_list,
3186           .vece = MO_16 },
3187         { .fni4 = tcg_gen_shr_mod_i32,
3188           .fniv = tcg_gen_shrv_mod_vec,
3189           .fno = gen_helper_gvec_shr32v,
3190           .opt_opc = vecop_list,
3191           .vece = MO_32 },
3192         { .fni8 = tcg_gen_shr_mod_i64,
3193           .fniv = tcg_gen_shrv_mod_vec,
3194           .fno = gen_helper_gvec_shr64v,
3195           .opt_opc = vecop_list,
3196           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3197           .vece = MO_64 },
3198     };
3199
3200     tcg_debug_assert(vece <= MO_64);
3201     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3202 }
3203
3204 /*
3205  * Similarly for arithmetic right shifts.
3206  */
3207
3208 static void tcg_gen_sarv_mod_vec(unsigned vece, TCGv_vec d,
3209                                  TCGv_vec a, TCGv_vec b)
3210 {
3211     TCGv_vec t = tcg_temp_new_vec_matching(d);
3212     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3213
3214     tcg_gen_and_vec(vece, t, b, m);
3215     tcg_gen_sarv_vec(vece, d, a, t);
3216     tcg_temp_free_vec(t);
3217 }
3218
3219 static void tcg_gen_sar_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3220 {
3221     TCGv_i32 t = tcg_temp_new_i32();
3222
3223     tcg_gen_andi_i32(t, b, 31);
3224     tcg_gen_sar_i32(d, a, t);
3225     tcg_temp_free_i32(t);
3226 }
3227
3228 static void tcg_gen_sar_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3229 {
3230     TCGv_i64 t = tcg_temp_new_i64();
3231
3232     tcg_gen_andi_i64(t, b, 63);
3233     tcg_gen_sar_i64(d, a, t);
3234     tcg_temp_free_i64(t);
3235 }
3236
3237 void tcg_gen_gvec_sarv(unsigned vece, uint32_t dofs, uint32_t aofs,
3238                        uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3239 {
3240     static const TCGOpcode vecop_list[] = { INDEX_op_sarv_vec, 0 };
3241     static const GVecGen3 g[4] = {
3242         { .fniv = tcg_gen_sarv_mod_vec,
3243           .fno = gen_helper_gvec_sar8v,
3244           .opt_opc = vecop_list,
3245           .vece = MO_8 },
3246         { .fniv = tcg_gen_sarv_mod_vec,
3247           .fno = gen_helper_gvec_sar16v,
3248           .opt_opc = vecop_list,
3249           .vece = MO_16 },
3250         { .fni4 = tcg_gen_sar_mod_i32,
3251           .fniv = tcg_gen_sarv_mod_vec,
3252           .fno = gen_helper_gvec_sar32v,
3253           .opt_opc = vecop_list,
3254           .vece = MO_32 },
3255         { .fni8 = tcg_gen_sar_mod_i64,
3256           .fniv = tcg_gen_sarv_mod_vec,
3257           .fno = gen_helper_gvec_sar64v,
3258           .opt_opc = vecop_list,
3259           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3260           .vece = MO_64 },
3261     };
3262
3263     tcg_debug_assert(vece <= MO_64);
3264     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3265 }
3266
3267 /*
3268  * Similarly for rotates.
3269  */
3270
3271 static void tcg_gen_rotlv_mod_vec(unsigned vece, TCGv_vec d,
3272                                   TCGv_vec a, TCGv_vec b)
3273 {
3274     TCGv_vec t = tcg_temp_new_vec_matching(d);
3275     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3276
3277     tcg_gen_and_vec(vece, t, b, m);
3278     tcg_gen_rotlv_vec(vece, d, a, t);
3279     tcg_temp_free_vec(t);
3280 }
3281
3282 static void tcg_gen_rotl_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3283 {
3284     TCGv_i32 t = tcg_temp_new_i32();
3285
3286     tcg_gen_andi_i32(t, b, 31);
3287     tcg_gen_rotl_i32(d, a, t);
3288     tcg_temp_free_i32(t);
3289 }
3290
3291 static void tcg_gen_rotl_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3292 {
3293     TCGv_i64 t = tcg_temp_new_i64();
3294
3295     tcg_gen_andi_i64(t, b, 63);
3296     tcg_gen_rotl_i64(d, a, t);
3297     tcg_temp_free_i64(t);
3298 }
3299
3300 void tcg_gen_gvec_rotlv(unsigned vece, uint32_t dofs, uint32_t aofs,
3301                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3302 {
3303     static const TCGOpcode vecop_list[] = { INDEX_op_rotlv_vec, 0 };
3304     static const GVecGen3 g[4] = {
3305         { .fniv = tcg_gen_rotlv_mod_vec,
3306           .fno = gen_helper_gvec_rotl8v,
3307           .opt_opc = vecop_list,
3308           .vece = MO_8 },
3309         { .fniv = tcg_gen_rotlv_mod_vec,
3310           .fno = gen_helper_gvec_rotl16v,
3311           .opt_opc = vecop_list,
3312           .vece = MO_16 },
3313         { .fni4 = tcg_gen_rotl_mod_i32,
3314           .fniv = tcg_gen_rotlv_mod_vec,
3315           .fno = gen_helper_gvec_rotl32v,
3316           .opt_opc = vecop_list,
3317           .vece = MO_32 },
3318         { .fni8 = tcg_gen_rotl_mod_i64,
3319           .fniv = tcg_gen_rotlv_mod_vec,
3320           .fno = gen_helper_gvec_rotl64v,
3321           .opt_opc = vecop_list,
3322           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3323           .vece = MO_64 },
3324     };
3325
3326     tcg_debug_assert(vece <= MO_64);
3327     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3328 }
3329
3330 static void tcg_gen_rotrv_mod_vec(unsigned vece, TCGv_vec d,
3331                                   TCGv_vec a, TCGv_vec b)
3332 {
3333     TCGv_vec t = tcg_temp_new_vec_matching(d);
3334     TCGv_vec m = tcg_constant_vec_matching(d, vece, (8 << vece) - 1);
3335
3336     tcg_gen_and_vec(vece, t, b, m);
3337     tcg_gen_rotrv_vec(vece, d, a, t);
3338     tcg_temp_free_vec(t);
3339 }
3340
3341 static void tcg_gen_rotr_mod_i32(TCGv_i32 d, TCGv_i32 a, TCGv_i32 b)
3342 {
3343     TCGv_i32 t = tcg_temp_new_i32();
3344
3345     tcg_gen_andi_i32(t, b, 31);
3346     tcg_gen_rotr_i32(d, a, t);
3347     tcg_temp_free_i32(t);
3348 }
3349
3350 static void tcg_gen_rotr_mod_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b)
3351 {
3352     TCGv_i64 t = tcg_temp_new_i64();
3353
3354     tcg_gen_andi_i64(t, b, 63);
3355     tcg_gen_rotr_i64(d, a, t);
3356     tcg_temp_free_i64(t);
3357 }
3358
3359 void tcg_gen_gvec_rotrv(unsigned vece, uint32_t dofs, uint32_t aofs,
3360                         uint32_t bofs, uint32_t oprsz, uint32_t maxsz)
3361 {
3362     static const TCGOpcode vecop_list[] = { INDEX_op_rotrv_vec, 0 };
3363     static const GVecGen3 g[4] = {
3364         { .fniv = tcg_gen_rotrv_mod_vec,
3365           .fno = gen_helper_gvec_rotr8v,
3366           .opt_opc = vecop_list,
3367           .vece = MO_8 },
3368         { .fniv = tcg_gen_rotrv_mod_vec,
3369           .fno = gen_helper_gvec_rotr16v,
3370           .opt_opc = vecop_list,
3371           .vece = MO_16 },
3372         { .fni4 = tcg_gen_rotr_mod_i32,
3373           .fniv = tcg_gen_rotrv_mod_vec,
3374           .fno = gen_helper_gvec_rotr32v,
3375           .opt_opc = vecop_list,
3376           .vece = MO_32 },
3377         { .fni8 = tcg_gen_rotr_mod_i64,
3378           .fniv = tcg_gen_rotrv_mod_vec,
3379           .fno = gen_helper_gvec_rotr64v,
3380           .opt_opc = vecop_list,
3381           .prefer_i64 = TCG_TARGET_REG_BITS == 64,
3382           .vece = MO_64 },
3383     };
3384
3385     tcg_debug_assert(vece <= MO_64);
3386     tcg_gen_gvec_3(dofs, aofs, bofs, oprsz, maxsz, &g[vece]);
3387 }
3388
3389 /* Expand OPSZ bytes worth of three-operand operations using i32 elements.  */
3390 static void expand_cmp_i32(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3391                            uint32_t oprsz, TCGCond cond)
3392 {
3393     TCGv_i32 t0 = tcg_temp_new_i32();
3394     TCGv_i32 t1 = tcg_temp_new_i32();
3395     uint32_t i;
3396
3397     for (i = 0; i < oprsz; i += 4) {
3398         tcg_gen_ld_i32(t0, cpu_env, aofs + i);
3399         tcg_gen_ld_i32(t1, cpu_env, bofs + i);
3400         tcg_gen_setcond_i32(cond, t0, t0, t1);
3401         tcg_gen_neg_i32(t0, t0);
3402         tcg_gen_st_i32(t0, cpu_env, dofs + i);
3403     }
3404     tcg_temp_free_i32(t1);
3405     tcg_temp_free_i32(t0);
3406 }
3407
3408 static void expand_cmp_i64(uint32_t dofs, uint32_t aofs, uint32_t bofs,
3409                            uint32_t oprsz, TCGCond cond)
3410 {
3411     TCGv_i64 t0 = tcg_temp_new_i64();
3412     TCGv_i64 t1 = tcg_temp_new_i64();
3413     uint32_t i;
3414
3415     for (i = 0; i < oprsz; i += 8) {
3416         tcg_gen_ld_i64(t0, cpu_env, aofs + i);
3417         tcg_gen_ld_i64(t1, cpu_env, bofs + i);
3418         tcg_gen_setcond_i64(cond, t0, t0, t1);
3419         tcg_gen_neg_i64(t0, t0);
3420         tcg_gen_st_i64(t0, cpu_env, dofs + i);
3421     }
3422     tcg_temp_free_i64(t1);
3423     tcg_temp_free_i64(t0);
3424 }
3425
3426 static void expand_cmp_vec(unsigned vece, uint32_t dofs, uint32_t aofs,
3427                            uint32_t bofs, uint32_t oprsz, uint32_t tysz,
3428                            TCGType type, TCGCond cond)
3429 {
3430     TCGv_vec t0 = tcg_temp_new_vec(type);
3431     TCGv_vec t1 = tcg_temp_new_vec(type);
3432     uint32_t i;
3433
3434     for (i = 0; i < oprsz; i += tysz) {
3435         tcg_gen_ld_vec(t0, cpu_env, aofs + i);
3436         tcg_gen_ld_vec(t1, cpu_env, bofs + i);
3437         tcg_gen_cmp_vec(cond, vece, t0, t0, t1);
3438         tcg_gen_st_vec(t0, cpu_env, dofs + i);
3439     }
3440     tcg_temp_free_vec(t1);
3441     tcg_temp_free_vec(t0);
3442 }
3443
3444 void tcg_gen_gvec_cmp(TCGCond cond, unsigned vece, uint32_t dofs,
3445                       uint32_t aofs, uint32_t bofs,
3446                       uint32_t oprsz, uint32_t maxsz)
3447 {
3448     static const TCGOpcode cmp_list[] = { INDEX_op_cmp_vec, 0 };
3449     static gen_helper_gvec_3 * const eq_fn[4] = {
3450         gen_helper_gvec_eq8, gen_helper_gvec_eq16,
3451         gen_helper_gvec_eq32, gen_helper_gvec_eq64
3452     };
3453     static gen_helper_gvec_3 * const ne_fn[4] = {
3454         gen_helper_gvec_ne8, gen_helper_gvec_ne16,
3455         gen_helper_gvec_ne32, gen_helper_gvec_ne64
3456     };
3457     static gen_helper_gvec_3 * const lt_fn[4] = {
3458         gen_helper_gvec_lt8, gen_helper_gvec_lt16,
3459         gen_helper_gvec_lt32, gen_helper_gvec_lt64
3460     };
3461     static gen_helper_gvec_3 * const le_fn[4] = {
3462         gen_helper_gvec_le8, gen_helper_gvec_le16,
3463         gen_helper_gvec_le32, gen_helper_gvec_le64
3464     };
3465     static gen_helper_gvec_3 * const ltu_fn[4] = {
3466         gen_helper_gvec_ltu8, gen_helper_gvec_ltu16,
3467         gen_helper_gvec_ltu32, gen_helper_gvec_ltu64
3468     };
3469     static gen_helper_gvec_3 * const leu_fn[4] = {
3470         gen_helper_gvec_leu8, gen_helper_gvec_leu16,
3471         gen_helper_gvec_leu32, gen_helper_gvec_leu64
3472     };
3473     static gen_helper_gvec_3 * const * const fns[16] = {
3474         [TCG_COND_EQ] = eq_fn,
3475         [TCG_COND_NE] = ne_fn,
3476         [TCG_COND_LT] = lt_fn,
3477         [TCG_COND_LE] = le_fn,
3478         [TCG_COND_LTU] = ltu_fn,
3479         [TCG_COND_LEU] = leu_fn,
3480     };
3481
3482     const TCGOpcode *hold_list;
3483     TCGType type;
3484     uint32_t some;
3485
3486     check_size_align(oprsz, maxsz, dofs | aofs | bofs);
3487     check_overlap_3(dofs, aofs, bofs, maxsz);
3488
3489     if (cond == TCG_COND_NEVER || cond == TCG_COND_ALWAYS) {
3490         do_dup(MO_8, dofs, oprsz, maxsz,
3491                NULL, NULL, -(cond == TCG_COND_ALWAYS));
3492         return;
3493     }
3494
3495     /*
3496      * Implement inline with a vector type, if possible.
3497      * Prefer integer when 64-bit host and 64-bit comparison.
3498      */
3499     hold_list = tcg_swap_vecop_list(cmp_list);
3500     type = choose_vector_type(cmp_list, vece, oprsz,
3501                               TCG_TARGET_REG_BITS == 64 && vece == MO_64);
3502     switch (type) {
3503     case TCG_TYPE_V256:
3504         /* Recall that ARM SVE allows vector sizes that are not a
3505          * power of 2, but always a multiple of 16.  The intent is
3506          * that e.g. size == 80 would be expanded with 2x32 + 1x16.
3507          */
3508         some = QEMU_ALIGN_DOWN(oprsz, 32);
3509         expand_cmp_vec(vece, dofs, aofs, bofs, some, 32, TCG_TYPE_V256, cond);
3510         if (some == oprsz) {
3511             break;
3512         }
3513         dofs += some;
3514         aofs += some;
3515         bofs += some;
3516         oprsz -= some;
3517         maxsz -= some;
3518         /* fallthru */
3519     case TCG_TYPE_V128:
3520         expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 16, TCG_TYPE_V128, cond);
3521         break;
3522     case TCG_TYPE_V64:
3523         expand_cmp_vec(vece, dofs, aofs, bofs, oprsz, 8, TCG_TYPE_V64, cond);
3524         break;
3525
3526     case 0:
3527         if (vece == MO_64 && check_size_impl(oprsz, 8)) {
3528             expand_cmp_i64(dofs, aofs, bofs, oprsz, cond);
3529         } else if (vece == MO_32 && check_size_impl(oprsz, 4)) {
3530             expand_cmp_i32(dofs, aofs, bofs, oprsz, cond);
3531         } else {
3532             gen_helper_gvec_3 * const *fn = fns[cond];
3533
3534             if (fn == NULL) {
3535                 uint32_t tmp;
3536                 tmp = aofs, aofs = bofs, bofs = tmp;
3537                 cond = tcg_swap_cond(cond);
3538                 fn = fns[cond];
3539                 assert(fn != NULL);
3540             }
3541             tcg_gen_gvec_3_ool(dofs, aofs, bofs, oprsz, maxsz, 0, fn[vece]);
3542             oprsz = maxsz;
3543         }
3544         break;
3545
3546     default:
3547         g_assert_not_reached();
3548     }
3549     tcg_swap_vecop_list(hold_list);
3550
3551     if (oprsz < maxsz) {
3552         expand_clr(dofs + oprsz, maxsz - oprsz);
3553     }
3554 }
3555
3556 static void tcg_gen_bitsel_i64(TCGv_i64 d, TCGv_i64 a, TCGv_i64 b, TCGv_i64 c)
3557 {
3558     TCGv_i64 t = tcg_temp_new_i64();
3559
3560     tcg_gen_and_i64(t, b, a);
3561     tcg_gen_andc_i64(d, c, a);
3562     tcg_gen_or_i64(d, d, t);
3563     tcg_temp_free_i64(t);
3564 }
3565
3566 void tcg_gen_gvec_bitsel(unsigned vece, uint32_t dofs, uint32_t aofs,
3567                          uint32_t bofs, uint32_t cofs,
3568                          uint32_t oprsz, uint32_t maxsz)
3569 {
3570     static const GVecGen4 g = {
3571         .fni8 = tcg_gen_bitsel_i64,
3572         .fniv = tcg_gen_bitsel_vec,
3573         .fno = gen_helper_gvec_bitsel,
3574     };
3575
3576     tcg_gen_gvec_4(dofs, aofs, bofs, cofs, oprsz, maxsz, &g);
3577 }