1 /*
2 * ARM translation: AArch32 Neon instructions
3 *
4 * Copyright (c) 2003 Fabrice Bellard
5 * Copyright (c) 2005-2007 CodeSourcery
6 * Copyright (c) 2007 OpenedHand, Ltd.
7 * Copyright (c) 2020 Linaro, Ltd.
8 *
9 * This library is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
13 *
14 * This library is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
18 *
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with this library; if not, see <http://www.gnu.org/licenses/>.
21 */
22
23 #include "qemu/osdep.h"
24 #include "tcg/tcg-op.h"
25 #include "tcg/tcg-op-gvec.h"
26 #include "exec/exec-all.h"
27 #include "translate.h"
28 #include "translate-a32.h"
29
30 /* Include the generated Neon decoder */
31 #include "decode-neon-dp.c.inc"
32 #include "decode-neon-ls.c.inc"
33 #include "decode-neon-shared.c.inc"
34
35 static TCGv_ptr vfp_reg_ptr(bool dp, int reg)
36 {
37 TCGv_ptr ret = tcg_temp_new_ptr();
38 tcg_gen_addi_ptr(ret, cpu_env, vfp_reg_offset(dp, reg));
39 return ret;
40 }
41
42 static void neon_load_element(TCGv_i32 var, int reg, int ele, MemOp mop)
43 {
44 long offset = neon_element_offset(reg, ele, mop & MO_SIZE);
45
46 switch (mop) {
47 case MO_UB:
48 tcg_gen_ld8u_i32(var, cpu_env, offset);
49 break;
50 case MO_UW:
51 tcg_gen_ld16u_i32(var, cpu_env, offset);
52 break;
53 case MO_UL:
54 tcg_gen_ld_i32(var, cpu_env, offset);
55 break;
56 default:
57 g_assert_not_reached();
58 }
59 }
60
61 static void neon_load_element64(TCGv_i64 var, int reg, int ele, MemOp mop)
62 {
63 long offset = neon_element_offset(reg, ele, mop & MO_SIZE);
64
65 switch (mop) {
66 case MO_UB:
67 tcg_gen_ld8u_i64(var, cpu_env, offset);
68 break;
69 case MO_UW:
70 tcg_gen_ld16u_i64(var, cpu_env, offset);
71 break;
72 case MO_UL:
73 tcg_gen_ld32u_i64(var, cpu_env, offset);
74 break;
75 case MO_UQ:
76 tcg_gen_ld_i64(var, cpu_env, offset);
77 break;
78 default:
79 g_assert_not_reached();
80 }
81 }
82
83 static void neon_store_element(int reg, int ele, MemOp size, TCGv_i32 var)
84 {
85 long offset = neon_element_offset(reg, ele, size);
86
87 switch (size) {
88 case MO_8:
89 tcg_gen_st8_i32(var, cpu_env, offset);
90 break;
91 case MO_16:
92 tcg_gen_st16_i32(var, cpu_env, offset);
93 break;
94 case MO_32:
95 tcg_gen_st_i32(var, cpu_env, offset);
96 break;
97 default:
98 g_assert_not_reached();
99 }
100 }
101
102 static void neon_store_element64(int reg, int ele, MemOp size, TCGv_i64 var)
103 {
104 long offset = neon_element_offset(reg, ele, size);
105
106 switch (size) {
107 case MO_8:
108 tcg_gen_st8_i64(var, cpu_env, offset);
109 break;
110 case MO_16:
111 tcg_gen_st16_i64(var, cpu_env, offset);
112 break;
113 case MO_32:
114 tcg_gen_st32_i64(var, cpu_env, offset);
115 break;
116 case MO_64:
117 tcg_gen_st_i64(var, cpu_env, offset);
118 break;
119 default:
120 g_assert_not_reached();
121 }
122 }
123
124 static bool do_neon_ddda(DisasContext *s, int q, int vd, int vn, int vm,
125 int data, gen_helper_gvec_4 *fn_gvec)
126 {
127 /* UNDEF accesses to D16-D31 if they don't exist. */
128 if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) {
129 return false;
130 }
131
132 /*
133 * UNDEF accesses to odd registers for each set bit of Q.
134 * Q is 0b111 for all-Q-reg insns; otherwise each set bit marks an
135 * operand that is a Q reg when we have mixed Q- and D-reg inputs.
136 */
137 if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) {
138 return false;
139 }
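/*
 * For example: the callers below pass q == 0b111 for a fully Q-form
 * insn, so with vd == 5 (an odd D index, hence not a valid Q register)
 * we get (5 & 1) * 4 == 4, which ANDed with 0b111 is non-zero and the
 * insn UNDEFs.  The indexed forms pass q == 0b110, so an odd vm (the
 * scalar D register) is accepted while an odd vd or vn still UNDEFs.
 */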
140
141 if (!vfp_access_check(s)) {
142 return true;
143 }
144
145 int opr_sz = q ? 16 : 8;
146 tcg_gen_gvec_4_ool(vfp_reg_offset(1, vd),
147 vfp_reg_offset(1, vn),
148 vfp_reg_offset(1, vm),
149 vfp_reg_offset(1, vd),
150 opr_sz, opr_sz, data, fn_gvec);
151 return true;
152 }
153
154 static bool do_neon_ddda_fpst(DisasContext *s, int q, int vd, int vn, int vm,
155 int data, ARMFPStatusFlavour fp_flavour,
156 gen_helper_gvec_4_ptr *fn_gvec_ptr)
157 {
158 /* UNDEF accesses to D16-D31 if they don't exist. */
159 if (((vd | vn | vm) & 0x10) && !dc_isar_feature(aa32_simd_r32, s)) {
160 return false;
161 }
162
163 /*
164 * UNDEF accesses to odd registers for each set bit of Q.
165 * Q is 0b111 for all-Q-reg insns; otherwise each set bit marks an
166 * operand that is a Q reg when we have mixed Q- and D-reg inputs.
167 */
168 if (((vd & 1) * 4 | (vn & 1) * 2 | (vm & 1)) & q) {
169 return false;
170 }
171
172 if (!vfp_access_check(s)) {
173 return true;
174 }
175
176 int opr_sz = q ? 16 : 8;
177 TCGv_ptr fpst = fpstatus_ptr(fp_flavour);
178
179 tcg_gen_gvec_4_ptr(vfp_reg_offset(1, vd),
180 vfp_reg_offset(1, vn),
181 vfp_reg_offset(1, vm),
182 vfp_reg_offset(1, vd),
183 fpst, opr_sz, opr_sz, data, fn_gvec_ptr);
184 return true;
185 }
186
187 static bool trans_VCMLA(DisasContext *s, arg_VCMLA *a)
188 {
189 if (!dc_isar_feature(aa32_vcma, s)) {
190 return false;
191 }
192 if (a->size == MO_16) {
193 if (!dc_isar_feature(aa32_fp16_arith, s)) {
194 return false;
195 }
196 return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot,
197 FPST_STD_F16, gen_helper_gvec_fcmlah);
198 }
199 return do_neon_ddda_fpst(s, a->q * 7, a->vd, a->vn, a->vm, a->rot,
200 FPST_STD, gen_helper_gvec_fcmlas);
201 }
202
203 static bool trans_VCADD(DisasContext *s, arg_VCADD *a)
204 {
205 int opr_sz;
206 TCGv_ptr fpst;
207 gen_helper_gvec_3_ptr *fn_gvec_ptr;
208
209 if (!dc_isar_feature(aa32_vcma, s)
210 || (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s))) {
211 return false;
212 }
213
214 /* UNDEF accesses to D16-D31 if they don't exist. */
215 if (!dc_isar_feature(aa32_simd_r32, s) &&
216 ((a->vd | a->vn | a->vm) & 0x10)) {
217 return false;
218 }
219
220 if ((a->vn | a->vm | a->vd) & a->q) {
221 return false;
222 }
223
224 if (!vfp_access_check(s)) {
225 return true;
226 }
227
228 opr_sz = (1 + a->q) * 8;
229 fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
230 fn_gvec_ptr = (a->size == MO_16) ?
231 gen_helper_gvec_fcaddh : gen_helper_gvec_fcadds;
232 tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
233 vfp_reg_offset(1, a->vn),
234 vfp_reg_offset(1, a->vm),
235 fpst, opr_sz, opr_sz, a->rot,
236 fn_gvec_ptr);
237 return true;
238 }
239
240 static bool trans_VSDOT(DisasContext *s, arg_VSDOT *a)
241 {
242 if (!dc_isar_feature(aa32_dp, s)) {
243 return false;
244 }
245 return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
246 gen_helper_gvec_sdot_b);
247 }
248
249 static bool trans_VUDOT(DisasContext *s, arg_VUDOT *a)
250 {
251 if (!dc_isar_feature(aa32_dp, s)) {
252 return false;
253 }
254 return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
255 gen_helper_gvec_udot_b);
256 }
257
258 static bool trans_VUSDOT(DisasContext *s, arg_VUSDOT *a)
259 {
260 if (!dc_isar_feature(aa32_i8mm, s)) {
261 return false;
262 }
263 return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
264 gen_helper_gvec_usdot_b);
265 }
266
267 static bool trans_VDOT_b16(DisasContext *s, arg_VDOT_b16 *a)
268 {
269 if (!dc_isar_feature(aa32_bf16, s)) {
270 return false;
271 }
272 return do_neon_ddda(s, a->q * 7, a->vd, a->vn, a->vm, 0,
273 gen_helper_gvec_bfdot);
274 }
275
276 static bool trans_VFML(DisasContext *s, arg_VFML *a)
277 {
278 int opr_sz;
279
280 if (!dc_isar_feature(aa32_fhm, s)) {
281 return false;
282 }
283
284 /* UNDEF accesses to D16-D31 if they don't exist. */
285 if (!dc_isar_feature(aa32_simd_r32, s) &&
286 (a->vd & 0x10)) {
287 return false;
288 }
289
290 if (a->vd & a->q) {
291 return false;
292 }
293
294 if (!vfp_access_check(s)) {
295 return true;
296 }
297
298 opr_sz = (1 + a->q) * 8;
299 tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
300 vfp_reg_offset(a->q, a->vn),
301 vfp_reg_offset(a->q, a->vm),
302 cpu_env, opr_sz, opr_sz, a->s, /* is_2 == 0 */
303 gen_helper_gvec_fmlal_a32);
304 return true;
305 }
306
307 static bool trans_VCMLA_scalar(DisasContext *s, arg_VCMLA_scalar *a)
308 {
309 int data = (a->index << 2) | a->rot;
310
311 if (!dc_isar_feature(aa32_vcma, s)) {
312 return false;
313 }
314 if (a->size == MO_16) {
315 if (!dc_isar_feature(aa32_fp16_arith, s)) {
316 return false;
317 }
318 return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data,
319 FPST_STD_F16, gen_helper_gvec_fcmlah_idx);
320 }
321 return do_neon_ddda_fpst(s, a->q * 6, a->vd, a->vn, a->vm, data,
322 FPST_STD, gen_helper_gvec_fcmlas_idx);
323 }
324
325 static bool trans_VSDOT_scalar(DisasContext *s, arg_VSDOT_scalar *a)
326 {
327 if (!dc_isar_feature(aa32_dp, s)) {
328 return false;
329 }
330 return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
331 gen_helper_gvec_sdot_idx_b);
332 }
333
334 static bool trans_VUDOT_scalar(DisasContext *s, arg_VUDOT_scalar *a)
335 {
336 if (!dc_isar_feature(aa32_dp, s)) {
337 return false;
338 }
339 return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
340 gen_helper_gvec_udot_idx_b);
341 }
342
343 static bool trans_VUSDOT_scalar(DisasContext *s, arg_VUSDOT_scalar *a)
344 {
345 if (!dc_isar_feature(aa32_i8mm, s)) {
346 return false;
347 }
348 return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
349 gen_helper_gvec_usdot_idx_b);
350 }
351
352 static bool trans_VSUDOT_scalar(DisasContext *s, arg_VSUDOT_scalar *a)
353 {
354 if (!dc_isar_feature(aa32_i8mm, s)) {
355 return false;
356 }
357 return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
358 gen_helper_gvec_sudot_idx_b);
359 }
360
361 static bool trans_VDOT_b16_scal(DisasContext *s, arg_VDOT_b16_scal *a)
362 {
363 if (!dc_isar_feature(aa32_bf16, s)) {
364 return false;
365 }
366 return do_neon_ddda(s, a->q * 6, a->vd, a->vn, a->vm, a->index,
367 gen_helper_gvec_bfdot_idx);
368 }
369
370 static bool trans_VFML_scalar(DisasContext *s, arg_VFML_scalar *a)
371 {
372 int opr_sz;
373
374 if (!dc_isar_feature(aa32_fhm, s)) {
375 return false;
376 }
377
378 /* UNDEF accesses to D16-D31 if they don't exist. */
379 if (!dc_isar_feature(aa32_simd_r32, s) &&
380 ((a->vd & 0x10) || (a->q && (a->vn & 0x10)))) {
381 return false;
382 }
383
384 if (a->vd & a->q) {
385 return false;
386 }
387
388 if (!vfp_access_check(s)) {
389 return true;
390 }
391
392 opr_sz = (1 + a->q) * 8;
393 tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
394 vfp_reg_offset(a->q, a->vn),
395 vfp_reg_offset(a->q, a->rm),
396 cpu_env, opr_sz, opr_sz,
397 (a->index << 2) | a->s, /* is_2 == 0 */
398 gen_helper_gvec_fmlal_idx_a32);
399 return true;
400 }
401
402 static struct {
403 int nregs;
404 int interleave;
405 int spacing;
406 } const neon_ls_element_type[11] = {
407 {1, 4, 1},
408 {1, 4, 2},
409 {4, 1, 1},
410 {2, 2, 2},
411 {1, 3, 1},
412 {1, 3, 2},
413 {3, 1, 1},
414 {1, 1, 1},
415 {1, 2, 1},
416 {1, 2, 2},
417 {2, 1, 1}
418 };
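/*
 * Reading the table (a sketch): the index is the A32 "type" field of
 * the VLDn/VSTn multiple-structures encodings.  E.g. itype 0 is
 * {1, 4, 1}: VLD4/VST4 with the elements of d[vd]..d[vd+3] interleaved
 * in memory; itype 7 is {1, 1, 1}: a plain single-register VLD1/VST1.
 * The writeback below advances the base by nregs * interleave * 8 bytes.
 */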
419
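/*
 * Base register writeback for the element load/store insns: rm == 15
 * means no writeback, rm == 13 requests post-increment by the fixed
 * transfer size ("[<Rn>]!"), and any other rm post-indexes by that
 * register ("[<Rn>], <Rm>").
 */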
420 static void gen_neon_ldst_base_update(DisasContext *s, int rm, int rn,
421 int stride)
422 {
423 if (rm != 15) {
424 TCGv_i32 base;
425
426 base = load_reg(s, rn);
427 if (rm == 13) {
428 tcg_gen_addi_i32(base, base, stride);
429 } else {
430 TCGv_i32 index;
431 index = load_reg(s, rm);
432 tcg_gen_add_i32(base, base, index);
433 }
434 store_reg(s, rn, base);
435 }
436 }
437
438 static bool trans_VLDST_multiple(DisasContext *s, arg_VLDST_multiple *a)
439 {
440 /* Neon load/store multiple structures */
441 int nregs, interleave, spacing, reg, n;
442 MemOp mop, align, endian;
443 int mmu_idx = get_mem_index(s);
444 int size = a->size;
445 TCGv_i64 tmp64;
446 TCGv_i32 addr;
447
448 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
449 return false;
450 }
451
452 /* UNDEF accesses to D16-D31 if they don't exist */
453 if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
454 return false;
455 }
456 if (a->itype > 10) {
457 return false;
458 }
459 /* Catch UNDEF cases for bad values of align field */
460 switch (a->itype & 0xc) {
461 case 4:
462 if (a->align >= 2) {
463 return false;
464 }
465 break;
466 case 8:
467 if (a->align == 3) {
468 return false;
469 }
470 break;
471 default:
472 break;
473 }
474 nregs = neon_ls_element_type[a->itype].nregs;
475 interleave = neon_ls_element_type[a->itype].interleave;
476 spacing = neon_ls_element_type[a->itype].spacing;
477 if (size == 3 && (interleave | spacing) != 1) {
478 return false;
479 }
480
481 if (!vfp_access_check(s)) {
482 return true;
483 }
484
485 /* For our purposes, bytes are always little-endian. */
486 endian = s->be_data;
487 if (size == 0) {
488 endian = MO_LE;
489 }
490
491 /* Enforce alignment requested by the instruction */
492 if (a->align) {
493 align = pow2_align(a->align + 2); /* 4 << a->align, i.e. 8, 16 or 32 bytes */
494 } else {
495 align = s->align_mem ? MO_ALIGN : 0;
496 }
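/*
 * E.g. a->align == 1 requests 8-byte (64-bit) alignment, 2 requests
 * 16 bytes and 3 requests 32 bytes.
 */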
497
498 /*
499 * Consecutive little-endian elements from a single register
500 * can be promoted to a larger little-endian operation.
501 */
502 if (interleave == 1 && endian == MO_LE) {
503 /* Retain any natural alignment. */
504 if (align == MO_ALIGN) {
505 align = pow2_align(size);
506 }
507 size = 3;
508 }
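/*
 * For example, a VLD1.16 of a whole D register (interleave == 1) with
 * little-endian data is performed below as a single 64-bit access
 * rather than four 16-bit ones; the result is the same because the
 * elements are consecutive in both the register and memory.
 */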
509
510 tmp64 = tcg_temp_new_i64();
511 addr = tcg_temp_new_i32();
512 load_reg_var(s, addr, a->rn);
513
514 mop = endian | size | align;
515 for (reg = 0; reg < nregs; reg++) {
516 for (n = 0; n < 8 >> size; n++) {
517 int xs;
518 for (xs = 0; xs < interleave; xs++) {
519 int tt = a->vd + reg + spacing * xs;
520
521 if (a->l) {
522 gen_aa32_ld_internal_i64(s, tmp64, addr, mmu_idx, mop);
523 neon_store_element64(tt, n, size, tmp64);
524 } else {
525 neon_load_element64(tmp64, tt, n, size);
526 gen_aa32_st_internal_i64(s, tmp64, addr, mmu_idx, mop);
527 }
528 tcg_gen_addi_i32(addr, addr, 1 << size);
529
530 /* Subsequent memory operations inherit alignment */
531 mop &= ~MO_AMASK;
532 }
533 }
534 }
535
536 gen_neon_ldst_base_update(s, a->rm, a->rn, nregs * interleave * 8);
537 return true;
538 }
539
540 static bool trans_VLD_all_lanes(DisasContext *s, arg_VLD_all_lanes *a)
541 {
542 /* Neon load single structure to all lanes */
543 int reg, stride, vec_size;
544 int vd = a->vd;
545 int size = a->size;
546 int nregs = a->n + 1;
547 TCGv_i32 addr, tmp;
548 MemOp mop, align;
549
550 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
551 return false;
552 }
553
554 /* UNDEF accesses to D16-D31 if they don't exist */
555 if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
556 return false;
557 }
558
559 align = 0;
560 if (size == 3) {
561 if (nregs != 4 || a->a == 0) {
562 return false;
563 }
564 /* For VLD4 size == 3 a == 1 means 32 bits at 16 byte alignment */
565 size = MO_32;
566 align = MO_ALIGN_16;
567 } else if (a->a) {
568 switch (nregs) {
569 case 1:
570 if (size == 0) {
571 return false;
572 }
573 align = MO_ALIGN;
574 break;
575 case 2:
576 align = pow2_align(size + 1);
577 break;
578 case 3:
579 return false;
580 case 4:
581 if (size == 2) {
582 align = pow2_align(3);
583 } else {
584 align = pow2_align(size + 2);
585 }
586 break;
587 default:
588 g_assert_not_reached();
589 }
590 }
591
592 if (!vfp_access_check(s)) {
593 return true;
594 }
595
596 /*
597 * VLD1 to all lanes: T bit indicates how many Dregs to write.
598 * VLD2/3/4 to all lanes: T bit indicates register stride.
599 */
600 stride = a->t ? 2 : 1;
601 vec_size = nregs == 1 ? stride * 8 : 8;
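/*
 * Example: VLD1.8 {d0[], d1[]} has nregs == 1 and t == 1, so stride is
 * 2 and vec_size is 16, and the single loop iteration below duplicates
 * the loaded byte into both d0 and d1.  For VLD2/3/4, t == 1 instead
 * selects the alternate-register stride (d0, d2, ... rather than
 * d0, d1, ...).
 */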
602 mop = size | align;
603 tmp = tcg_temp_new_i32();
604 addr = tcg_temp_new_i32();
605 load_reg_var(s, addr, a->rn);
606 for (reg = 0; reg < nregs; reg++) {
607 gen_aa32_ld_i32(s, tmp, addr, get_mem_index(s), mop);
608 if ((vd & 1) && vec_size == 16) {
609 /*
610 * We cannot write 16 bytes at once because the
611 * destination is unaligned.
612 */
613 tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
614 8, 8, tmp);
615 tcg_gen_gvec_mov(0, neon_full_reg_offset(vd + 1),
616 neon_full_reg_offset(vd), 8, 8);
617 } else {
618 tcg_gen_gvec_dup_i32(size, neon_full_reg_offset(vd),
619 vec_size, vec_size, tmp);
620 }
621 tcg_gen_addi_i32(addr, addr, 1 << size);
622 vd += stride;
623
624 /* Subsequent memory operations inherit alignment */
625 mop &= ~MO_AMASK;
626 }
627
628 gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << size) * nregs);
629
630 return true;
631 }
632
633 static bool trans_VLDST_single(DisasContext *s, arg_VLDST_single *a)
634 {
635 /* Neon load/store single structure to one lane */
636 int reg;
637 int nregs = a->n + 1;
638 int vd = a->vd;
639 TCGv_i32 addr, tmp;
640 MemOp mop;
641
642 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
643 return false;
644 }
645
646 /* UNDEF accesses to D16-D31 if they don't exist */
647 if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
648 return false;
649 }
650
651 /* Catch the UNDEF cases. This is unavoidably a bit messy. */
652 switch (nregs) {
653 case 1:
654 if (a->stride != 1) {
655 return false;
656 }
657 if (((a->align & (1 << a->size)) != 0) ||
658 (a->size == 2 && (a->align == 1 || a->align == 2))) {
659 return false;
660 }
661 break;
662 case 2:
663 if (a->size == 2 && (a->align & 2) != 0) {
664 return false;
665 }
666 break;
667 case 3:
668 if (a->align != 0) {
669 return false;
670 }
671 break;
672 case 4:
673 if (a->size == 2 && a->align == 3) {
674 return false;
675 }
676 break;
677 default:
678 g_assert_not_reached();
679 }
680 if ((vd + a->stride * (nregs - 1)) > 31) {
681 /*
682 * Attempts to write off the end of the register file are
683 * UNPREDICTABLE; we choose to UNDEF because otherwise we would
684 * access off the end of the array that holds the register data.
685 */
686 return false;
687 }
688
689 if (!vfp_access_check(s)) {
690 return true;
691 }
692
693 /* Pick up SCTLR settings */
694 mop = finalize_memop(s, a->size);
695
696 if (a->align) {
697 MemOp align_op;
698
699 switch (nregs) {
700 case 1:
701 /* For VLD1, use natural alignment. */
702 align_op = MO_ALIGN;
703 break;
704 case 2:
705 /* For VLD2, use double alignment. */
706 align_op = pow2_align(a->size + 1);
707 break;
708 case 4:
709 if (a->size == MO_32) {
710 /*
711 * For VLD4.32, align = 1 is double alignment, align = 2 is
712 * quad alignment; align = 3 is rejected above.
713 */
714 align_op = pow2_align(a->size + a->align);
715 } else {
716 /* For VLD4.8 and VLD4.16, we want quad alignment. */
717 align_op = pow2_align(a->size + 2);
718 }
719 break;
720 default:
721 /* For VLD3, the alignment field is zero and rejected above. */
722 g_assert_not_reached();
723 }
724
725 mop = (mop & ~MO_AMASK) | align_op;
726 }
727
728 tmp = tcg_temp_new_i32();
729 addr = tcg_temp_new_i32();
730 load_reg_var(s, addr, a->rn);
731
732 for (reg = 0; reg < nregs; reg++) {
733 if (a->l) {
734 gen_aa32_ld_internal_i32(s, tmp, addr, get_mem_index(s), mop);
735 neon_store_element(vd, a->reg_idx, a->size, tmp);
736 } else { /* Store */
737 neon_load_element(tmp, vd, a->reg_idx, a->size);
738 gen_aa32_st_internal_i32(s, tmp, addr, get_mem_index(s), mop);
739 }
740 vd += a->stride;
741 tcg_gen_addi_i32(addr, addr, 1 << a->size);
742
743 /* Subsequent memory operations inherit alignment */
744 mop &= ~MO_AMASK;
745 }
746
747 gen_neon_ldst_base_update(s, a->rm, a->rn, (1 << a->size) * nregs);
748
749 return true;
750 }
751
752 static bool do_3same(DisasContext *s, arg_3same *a, GVecGen3Fn fn)
753 {
754 int vec_size = a->q ? 16 : 8;
755 int rd_ofs = neon_full_reg_offset(a->vd);
756 int rn_ofs = neon_full_reg_offset(a->vn);
757 int rm_ofs = neon_full_reg_offset(a->vm);
758
759 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
760 return false;
761 }
762
763 /* UNDEF accesses to D16-D31 if they don't exist. */
764 if (!dc_isar_feature(aa32_simd_r32, s) &&
765 ((a->vd | a->vn | a->vm) & 0x10)) {
766 return false;
767 }
768
769 if ((a->vn | a->vm | a->vd) & a->q) {
770 return false;
771 }
772
773 if (!vfp_access_check(s)) {
774 return true;
775 }
776
777 fn(a->size, rd_ofs, rn_ofs, rm_ofs, vec_size, vec_size);
778 return true;
779 }
780
781 #define DO_3SAME(INSN, FUNC) \
782 static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \
783 { \
784 return do_3same(s, a, FUNC); \
785 }
786
787 DO_3SAME(VADD, tcg_gen_gvec_add)
788 DO_3SAME(VSUB, tcg_gen_gvec_sub)
789 DO_3SAME(VAND, tcg_gen_gvec_and)
790 DO_3SAME(VBIC, tcg_gen_gvec_andc)
791 DO_3SAME(VORR, tcg_gen_gvec_or)
792 DO_3SAME(VORN, tcg_gen_gvec_orc)
793 DO_3SAME(VEOR, tcg_gen_gvec_xor)
794 DO_3SAME(VSHL_S, gen_gvec_sshl)
795 DO_3SAME(VSHL_U, gen_gvec_ushl)
796 DO_3SAME(VQADD_S, gen_gvec_sqadd_qc)
797 DO_3SAME(VQADD_U, gen_gvec_uqadd_qc)
798 DO_3SAME(VQSUB_S, gen_gvec_sqsub_qc)
799 DO_3SAME(VQSUB_U, gen_gvec_uqsub_qc)
800
801 /* These insns are all gvec_bitsel but with the inputs in various orders. */
802 #define DO_3SAME_BITSEL(INSN, O1, O2, O3) \
803 static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \
804 uint32_t rn_ofs, uint32_t rm_ofs, \
805 uint32_t oprsz, uint32_t maxsz) \
806 { \
807 tcg_gen_gvec_bitsel(vece, rd_ofs, O1, O2, O3, oprsz, maxsz); \
808 } \
809 DO_3SAME(INSN, gen_##INSN##_3s)
810
811 DO_3SAME_BITSEL(VBSL, rd_ofs, rn_ofs, rm_ofs)
812 DO_3SAME_BITSEL(VBIT, rm_ofs, rn_ofs, rd_ofs)
813 DO_3SAME_BITSEL(VBIF, rm_ofs, rd_ofs, rn_ofs)
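/*
 * tcg_gen_gvec_bitsel(vece, d, a, b, c, ...) computes d = (b & a) | (c & ~a),
 * with the first source acting as the select mask.  Hence VBSL selects
 * using the destination, VBIT inserts rn bits where rm is 1, and VBIF
 * inserts rn bits where rm is 0; only the operand order differs above.
 */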
814
815 #define DO_3SAME_NO_SZ_3(INSN, FUNC) \
816 static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \
817 { \
818 if (a->size == 3) { \
819 return false; \
820 } \
821 return do_3same(s, a, FUNC); \
822 }
823
824 DO_3SAME_NO_SZ_3(VMAX_S, tcg_gen_gvec_smax)
825 DO_3SAME_NO_SZ_3(VMAX_U, tcg_gen_gvec_umax)
826 DO_3SAME_NO_SZ_3(VMIN_S, tcg_gen_gvec_smin)
827 DO_3SAME_NO_SZ_3(VMIN_U, tcg_gen_gvec_umin)
828 DO_3SAME_NO_SZ_3(VMUL, tcg_gen_gvec_mul)
829 DO_3SAME_NO_SZ_3(VMLA, gen_gvec_mla)
830 DO_3SAME_NO_SZ_3(VMLS, gen_gvec_mls)
831 DO_3SAME_NO_SZ_3(VTST, gen_gvec_cmtst)
832 DO_3SAME_NO_SZ_3(VABD_S, gen_gvec_sabd)
833 DO_3SAME_NO_SZ_3(VABA_S, gen_gvec_saba)
834 DO_3SAME_NO_SZ_3(VABD_U, gen_gvec_uabd)
835 DO_3SAME_NO_SZ_3(VABA_U, gen_gvec_uaba)
836
837 #define DO_3SAME_CMP(INSN, COND) \
838 static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \
839 uint32_t rn_ofs, uint32_t rm_ofs, \
840 uint32_t oprsz, uint32_t maxsz) \
841 { \
842 tcg_gen_gvec_cmp(COND, vece, rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz); \
843 } \
844 DO_3SAME_NO_SZ_3(INSN, gen_##INSN##_3s)
845
846 DO_3SAME_CMP(VCGT_S, TCG_COND_GT)
847 DO_3SAME_CMP(VCGT_U, TCG_COND_GTU)
848 DO_3SAME_CMP(VCGE_S, TCG_COND_GE)
849 DO_3SAME_CMP(VCGE_U, TCG_COND_GEU)
850 DO_3SAME_CMP(VCEQ, TCG_COND_EQ)
851
852 #define WRAP_OOL_FN(WRAPNAME, FUNC) \
853 static void WRAPNAME(unsigned vece, uint32_t rd_ofs, uint32_t rn_ofs, \
854 uint32_t rm_ofs, uint32_t oprsz, uint32_t maxsz) \
855 { \
856 tcg_gen_gvec_3_ool(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, 0, FUNC); \
857 }
858
859 WRAP_OOL_FN(gen_VMUL_p_3s, gen_helper_gvec_pmul_b)
860
861 static bool trans_VMUL_p_3s(DisasContext *s, arg_3same *a)
862 {
863 if (a->size != 0) {
864 return false;
865 }
866 return do_3same(s, a, gen_VMUL_p_3s);
867 }
868
869 #define DO_VQRDMLAH(INSN, FUNC) \
870 static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \
871 { \
872 if (!dc_isar_feature(aa32_rdm, s)) { \
873 return false; \
874 } \
875 if (a->size != 1 && a->size != 2) { \
876 return false; \
877 } \
878 return do_3same(s, a, FUNC); \
879 }
880
881 DO_VQRDMLAH(VQRDMLAH, gen_gvec_sqrdmlah_qc)
882 DO_VQRDMLAH(VQRDMLSH, gen_gvec_sqrdmlsh_qc)
883
884 #define DO_SHA1(NAME, FUNC) \
885 WRAP_OOL_FN(gen_##NAME##_3s, FUNC) \
886 static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a) \
887 { \
888 if (!dc_isar_feature(aa32_sha1, s)) { \
889 return false; \
890 } \
891 return do_3same(s, a, gen_##NAME##_3s); \
892 }
893
894 DO_SHA1(SHA1C, gen_helper_crypto_sha1c)
895 DO_SHA1(SHA1P, gen_helper_crypto_sha1p)
896 DO_SHA1(SHA1M, gen_helper_crypto_sha1m)
897 DO_SHA1(SHA1SU0, gen_helper_crypto_sha1su0)
898
899 #define DO_SHA2(NAME, FUNC) \
900 WRAP_OOL_FN(gen_##NAME##_3s, FUNC) \
901 static bool trans_##NAME##_3s(DisasContext *s, arg_3same *a) \
902 { \
903 if (!dc_isar_feature(aa32_sha2, s)) { \
904 return false; \
905 } \
906 return do_3same(s, a, gen_##NAME##_3s); \
907 }
908
909 DO_SHA2(SHA256H, gen_helper_crypto_sha256h)
910 DO_SHA2(SHA256H2, gen_helper_crypto_sha256h2)
911 DO_SHA2(SHA256SU1, gen_helper_crypto_sha256su1)
912
913 #define DO_3SAME_64(INSN, FUNC) \
914 static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \
915 uint32_t rn_ofs, uint32_t rm_ofs, \
916 uint32_t oprsz, uint32_t maxsz) \
917 { \
918 static const GVecGen3 op = { .fni8 = FUNC }; \
919 tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &op); \
920 } \
921 DO_3SAME(INSN, gen_##INSN##_3s)
922
923 #define DO_3SAME_64_ENV(INSN, FUNC) \
924 static void gen_##INSN##_elt(TCGv_i64 d, TCGv_i64 n, TCGv_i64 m) \
925 { \
926 FUNC(d, cpu_env, n, m); \
927 } \
928 DO_3SAME_64(INSN, gen_##INSN##_elt)
929
930 DO_3SAME_64(VRSHL_S64, gen_helper_neon_rshl_s64)
931 DO_3SAME_64(VRSHL_U64, gen_helper_neon_rshl_u64)
932 DO_3SAME_64_ENV(VQSHL_S64, gen_helper_neon_qshl_s64)
933 DO_3SAME_64_ENV(VQSHL_U64, gen_helper_neon_qshl_u64)
934 DO_3SAME_64_ENV(VQRSHL_S64, gen_helper_neon_qrshl_s64)
935 DO_3SAME_64_ENV(VQRSHL_U64, gen_helper_neon_qrshl_u64)
936
937 #define DO_3SAME_32(INSN, FUNC) \
938 static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \
939 uint32_t rn_ofs, uint32_t rm_ofs, \
940 uint32_t oprsz, uint32_t maxsz) \
941 { \
942 static const GVecGen3 ops[4] = { \
943 { .fni4 = gen_helper_neon_##FUNC##8 }, \
944 { .fni4 = gen_helper_neon_##FUNC##16 }, \
945 { .fni4 = gen_helper_neon_##FUNC##32 }, \
946 { 0 }, \
947 }; \
948 tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
949 } \
950 static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \
951 { \
952 if (a->size > 2) { \
953 return false; \
954 } \
955 return do_3same(s, a, gen_##INSN##_3s); \
956 }
957
958 /*
959 * Some helper functions need to be passed the cpu_env. In order
960 * to use those with the gvec APIs like tcg_gen_gvec_3() we need
961 * to create wrapper functions whose prototype is a NeonGenTwoOpFn()
962 * and which call a NeonGenTwoOpEnvFn().
963 */
964 #define WRAP_ENV_FN(WRAPNAME, FUNC) \
965 static void WRAPNAME(TCGv_i32 d, TCGv_i32 n, TCGv_i32 m) \
966 { \
967 FUNC(d, cpu_env, n, m); \
968 }
969
970 #define DO_3SAME_32_ENV(INSN, FUNC) \
971 WRAP_ENV_FN(gen_##INSN##_tramp8, gen_helper_neon_##FUNC##8); \
972 WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##16); \
973 WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##32); \
974 static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \
975 uint32_t rn_ofs, uint32_t rm_ofs, \
976 uint32_t oprsz, uint32_t maxsz) \
977 { \
978 static const GVecGen3 ops[4] = { \
979 { .fni4 = gen_##INSN##_tramp8 }, \
980 { .fni4 = gen_##INSN##_tramp16 }, \
981 { .fni4 = gen_##INSN##_tramp32 }, \
982 { 0 }, \
983 }; \
984 tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece]); \
985 } \
986 static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \
987 { \
988 if (a->size > 2) { \
989 return false; \
990 } \
991 return do_3same(s, a, gen_##INSN##_3s); \
992 }
993
994 DO_3SAME_32(VHADD_S, hadd_s)
995 DO_3SAME_32(VHADD_U, hadd_u)
996 DO_3SAME_32(VHSUB_S, hsub_s)
997 DO_3SAME_32(VHSUB_U, hsub_u)
998 DO_3SAME_32(VRHADD_S, rhadd_s)
999 DO_3SAME_32(VRHADD_U, rhadd_u)
1000 DO_3SAME_32(VRSHL_S, rshl_s)
1001 DO_3SAME_32(VRSHL_U, rshl_u)
1002
1003 DO_3SAME_32_ENV(VQSHL_S, qshl_s)
1004 DO_3SAME_32_ENV(VQSHL_U, qshl_u)
1005 DO_3SAME_32_ENV(VQRSHL_S, qrshl_s)
1006 DO_3SAME_32_ENV(VQRSHL_U, qrshl_u)
1007
1008 static bool do_3same_pair(DisasContext *s, arg_3same *a, NeonGenTwoOpFn *fn)
1009 {
1010 /* Operations handled pairwise 32 bits at a time */
1011 TCGv_i32 tmp, tmp2, tmp3;
1012
1013 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1014 return false;
1015 }
1016
1017 /* UNDEF accesses to D16-D31 if they don't exist. */
1018 if (!dc_isar_feature(aa32_simd_r32, s) &&
1019 ((a->vd | a->vn | a->vm) & 0x10)) {
1020 return false;
1021 }
1022
1023 if (a->size == 3) {
1024 return false;
1025 }
1026
1027 if (!vfp_access_check(s)) {
1028 return true;
1029 }
1030
1031 assert(a->q == 0); /* enforced by decode patterns */
1032
1033 /*
1034 * Note that we have to be careful not to clobber the source operands
1035 * in the "vm == vd" case by storing the result of the first pass too
1036 * early. Since Q is 0 there are always just two passes, so instead
1037 * of a complicated loop over each pass we just unroll.
1038 */
1039 tmp = tcg_temp_new_i32();
1040 tmp2 = tcg_temp_new_i32();
1041 tmp3 = tcg_temp_new_i32();
1042
1043 read_neon_element32(tmp, a->vn, 0, MO_32);
1044 read_neon_element32(tmp2, a->vn, 1, MO_32);
1045 fn(tmp, tmp, tmp2);
1046
1047 read_neon_element32(tmp3, a->vm, 0, MO_32);
1048 read_neon_element32(tmp2, a->vm, 1, MO_32);
1049 fn(tmp3, tmp3, tmp2);
1050
1051 write_neon_element32(tmp, a->vd, 0, MO_32);
1052 write_neon_element32(tmp3, a->vd, 1, MO_32);
1053
1054 return true;
1055 }
1056
1057 #define DO_3SAME_PAIR(INSN, func) \
1058 static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \
1059 { \
1060 static NeonGenTwoOpFn * const fns[] = { \
1061 gen_helper_neon_##func##8, \
1062 gen_helper_neon_##func##16, \
1063 gen_helper_neon_##func##32, \
1064 }; \
1065 if (a->size > 2) { \
1066 return false; \
1067 } \
1068 return do_3same_pair(s, a, fns[a->size]); \
1069 }
1070
1071 /* 32-bit pairwise ops end up the same as the elementwise versions. */
1072 #define gen_helper_neon_pmax_s32 tcg_gen_smax_i32
1073 #define gen_helper_neon_pmax_u32 tcg_gen_umax_i32
1074 #define gen_helper_neon_pmin_s32 tcg_gen_smin_i32
1075 #define gen_helper_neon_pmin_u32 tcg_gen_umin_i32
1076 #define gen_helper_neon_padd_u32 tcg_gen_add_i32
1077
1078 DO_3SAME_PAIR(VPMAX_S, pmax_s)
1079 DO_3SAME_PAIR(VPMIN_S, pmin_s)
1080 DO_3SAME_PAIR(VPMAX_U, pmax_u)
1081 DO_3SAME_PAIR(VPMIN_U, pmin_u)
1082 DO_3SAME_PAIR(VPADD, padd_u)
1083
1084 #define DO_3SAME_VQDMULH(INSN, FUNC) \
1085 WRAP_ENV_FN(gen_##INSN##_tramp16, gen_helper_neon_##FUNC##_s16); \
1086 WRAP_ENV_FN(gen_##INSN##_tramp32, gen_helper_neon_##FUNC##_s32); \
1087 static void gen_##INSN##_3s(unsigned vece, uint32_t rd_ofs, \
1088 uint32_t rn_ofs, uint32_t rm_ofs, \
1089 uint32_t oprsz, uint32_t maxsz) \
1090 { \
1091 static const GVecGen3 ops[2] = { \
1092 { .fni4 = gen_##INSN##_tramp16 }, \
1093 { .fni4 = gen_##INSN##_tramp32 }, \
1094 }; \
1095 tcg_gen_gvec_3(rd_ofs, rn_ofs, rm_ofs, oprsz, maxsz, &ops[vece - 1]); \
1096 } \
1097 static bool trans_##INSN##_3s(DisasContext *s, arg_3same *a) \
1098 { \
1099 if (a->size != 1 && a->size != 2) { \
1100 return false; \
1101 } \
1102 return do_3same(s, a, gen_##INSN##_3s); \
1103 }
1104
1105 DO_3SAME_VQDMULH(VQDMULH, qdmulh)
1106 DO_3SAME_VQDMULH(VQRDMULH, qrdmulh)
1107
1108 #define WRAP_FP_GVEC(WRAPNAME, FPST, FUNC) \
1109 static void WRAPNAME(unsigned vece, uint32_t rd_ofs, \
1110 uint32_t rn_ofs, uint32_t rm_ofs, \
1111 uint32_t oprsz, uint32_t maxsz) \
1112 { \
1113 TCGv_ptr fpst = fpstatus_ptr(FPST); \
1114 tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpst, \
1115 oprsz, maxsz, 0, FUNC); \
1116 }
1117
1118 #define DO_3S_FP_GVEC(INSN,SFUNC,HFUNC) \
1119 WRAP_FP_GVEC(gen_##INSN##_fp32_3s, FPST_STD, SFUNC) \
1120 WRAP_FP_GVEC(gen_##INSN##_fp16_3s, FPST_STD_F16, HFUNC) \
1121 static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
1122 { \
1123 if (a->size == MO_16) { \
1124 if (!dc_isar_feature(aa32_fp16_arith, s)) { \
1125 return false; \
1126 } \
1127 return do_3same(s, a, gen_##INSN##_fp16_3s); \
1128 } \
1129 return do_3same(s, a, gen_##INSN##_fp32_3s); \
1130 }
1131
1132
1133 DO_3S_FP_GVEC(VADD, gen_helper_gvec_fadd_s, gen_helper_gvec_fadd_h)
1134 DO_3S_FP_GVEC(VSUB, gen_helper_gvec_fsub_s, gen_helper_gvec_fsub_h)
1135 DO_3S_FP_GVEC(VABD, gen_helper_gvec_fabd_s, gen_helper_gvec_fabd_h)
1136 DO_3S_FP_GVEC(VMUL, gen_helper_gvec_fmul_s, gen_helper_gvec_fmul_h)
1137 DO_3S_FP_GVEC(VCEQ, gen_helper_gvec_fceq_s, gen_helper_gvec_fceq_h)
1138 DO_3S_FP_GVEC(VCGE, gen_helper_gvec_fcge_s, gen_helper_gvec_fcge_h)
1139 DO_3S_FP_GVEC(VCGT, gen_helper_gvec_fcgt_s, gen_helper_gvec_fcgt_h)
1140 DO_3S_FP_GVEC(VACGE, gen_helper_gvec_facge_s, gen_helper_gvec_facge_h)
1141 DO_3S_FP_GVEC(VACGT, gen_helper_gvec_facgt_s, gen_helper_gvec_facgt_h)
1142 DO_3S_FP_GVEC(VMAX, gen_helper_gvec_fmax_s, gen_helper_gvec_fmax_h)
1143 DO_3S_FP_GVEC(VMIN, gen_helper_gvec_fmin_s, gen_helper_gvec_fmin_h)
1144 DO_3S_FP_GVEC(VMLA, gen_helper_gvec_fmla_s, gen_helper_gvec_fmla_h)
1145 DO_3S_FP_GVEC(VMLS, gen_helper_gvec_fmls_s, gen_helper_gvec_fmls_h)
1146 DO_3S_FP_GVEC(VFMA, gen_helper_gvec_vfma_s, gen_helper_gvec_vfma_h)
1147 DO_3S_FP_GVEC(VFMS, gen_helper_gvec_vfms_s, gen_helper_gvec_vfms_h)
1148 DO_3S_FP_GVEC(VRECPS, gen_helper_gvec_recps_nf_s, gen_helper_gvec_recps_nf_h)
1149 DO_3S_FP_GVEC(VRSQRTS, gen_helper_gvec_rsqrts_nf_s, gen_helper_gvec_rsqrts_nf_h)
1150
1151 WRAP_FP_GVEC(gen_VMAXNM_fp32_3s, FPST_STD, gen_helper_gvec_fmaxnum_s)
1152 WRAP_FP_GVEC(gen_VMAXNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fmaxnum_h)
1153 WRAP_FP_GVEC(gen_VMINNM_fp32_3s, FPST_STD, gen_helper_gvec_fminnum_s)
1154 WRAP_FP_GVEC(gen_VMINNM_fp16_3s, FPST_STD_F16, gen_helper_gvec_fminnum_h)
1155
1156 static bool trans_VMAXNM_fp_3s(DisasContext *s, arg_3same *a)
1157 {
1158 if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
1159 return false;
1160 }
1161
1162 if (a->size == MO_16) {
1163 if (!dc_isar_feature(aa32_fp16_arith, s)) {
1164 return false;
1165 }
1166 return do_3same(s, a, gen_VMAXNM_fp16_3s);
1167 }
1168 return do_3same(s, a, gen_VMAXNM_fp32_3s);
1169 }
1170
1171 static bool trans_VMINNM_fp_3s(DisasContext *s, arg_3same *a)
1172 {
1173 if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
1174 return false;
1175 }
1176
1177 if (a->size == MO_16) {
1178 if (!dc_isar_feature(aa32_fp16_arith, s)) {
1179 return false;
1180 }
1181 return do_3same(s, a, gen_VMINNM_fp16_3s);
1182 }
1183 return do_3same(s, a, gen_VMINNM_fp32_3s);
1184 }
1185
1186 static bool do_3same_fp_pair(DisasContext *s, arg_3same *a,
1187 gen_helper_gvec_3_ptr *fn)
1188 {
1189 /* FP pairwise operations */
1190 TCGv_ptr fpstatus;
1191
1192 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1193 return false;
1194 }
1195
1196 /* UNDEF accesses to D16-D31 if they don't exist. */
1197 if (!dc_isar_feature(aa32_simd_r32, s) &&
1198 ((a->vd | a->vn | a->vm) & 0x10)) {
1199 return false;
1200 }
1201
1202 if (!vfp_access_check(s)) {
1203 return true;
1204 }
1205
1206 assert(a->q == 0); /* enforced by decode patterns */
1207
1208
1209 fpstatus = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
1210 tcg_gen_gvec_3_ptr(vfp_reg_offset(1, a->vd),
1211 vfp_reg_offset(1, a->vn),
1212 vfp_reg_offset(1, a->vm),
1213 fpstatus, 8, 8, 0, fn);
1214
1215 return true;
1216 }
1217
1218 /*
1219 * For all the functions using this macro, size == 1 means fp16,
1220 * which is only valid when the fp16 arithmetic feature is present.
1221 */
1222 #define DO_3S_FP_PAIR(INSN,FUNC) \
1223 static bool trans_##INSN##_fp_3s(DisasContext *s, arg_3same *a) \
1224 { \
1225 if (a->size == MO_16) { \
1226 if (!dc_isar_feature(aa32_fp16_arith, s)) { \
1227 return false; \
1228 } \
1229 return do_3same_fp_pair(s, a, FUNC##h); \
1230 } \
1231 return do_3same_fp_pair(s, a, FUNC##s); \
1232 }
1233
1234 DO_3S_FP_PAIR(VPADD, gen_helper_neon_padd)
1235 DO_3S_FP_PAIR(VPMAX, gen_helper_neon_pmax)
1236 DO_3S_FP_PAIR(VPMIN, gen_helper_neon_pmin)
1237
1238 static bool do_vector_2sh(DisasContext *s, arg_2reg_shift *a, GVecGen2iFn *fn)
1239 {
1240 /* Handle a 2-reg-shift insn which can be vectorized. */
1241 int vec_size = a->q ? 16 : 8;
1242 int rd_ofs = neon_full_reg_offset(a->vd);
1243 int rm_ofs = neon_full_reg_offset(a->vm);
1244
1245 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1246 return false;
1247 }
1248
1249 /* UNDEF accesses to D16-D31 if they don't exist. */
1250 if (!dc_isar_feature(aa32_simd_r32, s) &&
1251 ((a->vd | a->vm) & 0x10)) {
1252 return false;
1253 }
1254
1255 if ((a->vm | a->vd) & a->q) {
1256 return false;
1257 }
1258
1259 if (!vfp_access_check(s)) {
1260 return true;
1261 }
1262
1263 fn(a->size, rd_ofs, rm_ofs, a->shift, vec_size, vec_size);
1264 return true;
1265 }
1266
1267 #define DO_2SH(INSN, FUNC) \
1268 static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a) \
1269 { \
1270 return do_vector_2sh(s, a, FUNC); \
1271 } \
1272
1273 DO_2SH(VSHL, tcg_gen_gvec_shli)
1274 DO_2SH(VSLI, gen_gvec_sli)
1275 DO_2SH(VSRI, gen_gvec_sri)
1276 DO_2SH(VSRA_S, gen_gvec_ssra)
1277 DO_2SH(VSRA_U, gen_gvec_usra)
1278 DO_2SH(VRSHR_S, gen_gvec_srshr)
1279 DO_2SH(VRSHR_U, gen_gvec_urshr)
1280 DO_2SH(VRSRA_S, gen_gvec_srsra)
1281 DO_2SH(VRSRA_U, gen_gvec_ursra)
1282
1283 static bool trans_VSHR_S_2sh(DisasContext *s, arg_2reg_shift *a)
1284 {
1285 /* Signed shift out of range results in all-sign-bits */
1286 a->shift = MIN(a->shift, (8 << a->size) - 1);
1287 return do_vector_2sh(s, a, tcg_gen_gvec_sari);
1288 }
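/*
 * E.g. VSHR.S8 with the maximum shift of 8 is clamped to 7 here; an
 * arithmetic shift by 7 already fills each byte with its sign bit, so
 * the architectural result is unchanged.
 */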
1289
1290 static void gen_zero_rd_2sh(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
1291 int64_t shift, uint32_t oprsz, uint32_t maxsz)
1292 {
1293 tcg_gen_gvec_dup_imm(vece, rd_ofs, oprsz, maxsz, 0);
1294 }
1295
1296 static bool trans_VSHR_U_2sh(DisasContext *s, arg_2reg_shift *a)
1297 {
1298 /* Shift out of range is architecturally valid and results in zero. */
1299 if (a->shift >= (8 << a->size)) {
1300 return do_vector_2sh(s, a, gen_zero_rd_2sh);
1301 } else {
1302 return do_vector_2sh(s, a, tcg_gen_gvec_shri);
1303 }
1304 }
1305
1306 static bool do_2shift_env_64(DisasContext *s, arg_2reg_shift *a,
1307 NeonGenTwo64OpEnvFn *fn)
1308 {
1309 /*
1310 * 2-reg-and-shift operations, size == 3 case, where the
1311 * function needs to be passed cpu_env.
1312 */
1313 TCGv_i64 constimm;
1314 int pass;
1315
1316 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1317 return false;
1318 }
1319
1320 /* UNDEF accesses to D16-D31 if they don't exist. */
1321 if (!dc_isar_feature(aa32_simd_r32, s) &&
1322 ((a->vd | a->vm) & 0x10)) {
1323 return false;
1324 }
1325
1326 if ((a->vm | a->vd) & a->q) {
1327 return false;
1328 }
1329
1330 if (!vfp_access_check(s)) {
1331 return true;
1332 }
1333
1334 /*
1335 * To avoid excessive duplication of ops we implement shift
1336 * by immediate using the variable shift operations.
1337 */
1338 constimm = tcg_constant_i64(dup_const(a->size, a->shift));
1339
1340 for (pass = 0; pass < a->q + 1; pass++) {
1341 TCGv_i64 tmp = tcg_temp_new_i64();
1342
1343 read_neon_element64(tmp, a->vm, pass, MO_64);
1344 fn(tmp, cpu_env, tmp, constimm);
1345 write_neon_element64(tmp, a->vd, pass, MO_64);
1346 }
1347 return true;
1348 }
1349
1350 static bool do_2shift_env_32(DisasContext *s, arg_2reg_shift *a,
1351 NeonGenTwoOpEnvFn *fn)
1352 {
1353 /*
1354 * 2-reg-and-shift operations, size < 3 case, where the
1355 * helper needs to be passed cpu_env.
1356 */
1357 TCGv_i32 constimm, tmp;
1358 int pass;
1359
1360 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1361 return false;
1362 }
1363
1364 /* UNDEF accesses to D16-D31 if they don't exist. */
1365 if (!dc_isar_feature(aa32_simd_r32, s) &&
1366 ((a->vd | a->vm) & 0x10)) {
1367 return false;
1368 }
1369
1370 if ((a->vm | a->vd) & a->q) {
1371 return false;
1372 }
1373
1374 if (!vfp_access_check(s)) {
1375 return true;
1376 }
1377
1378 /*
1379 * To avoid excessive duplication of ops we implement shift
1380 * by immediate using the variable shift operations.
1381 */
1382 constimm = tcg_constant_i32(dup_const(a->size, a->shift));
1383 tmp = tcg_temp_new_i32();
1384
1385 for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
1386 read_neon_element32(tmp, a->vm, pass, MO_32);
1387 fn(tmp, cpu_env, tmp, constimm);
1388 write_neon_element32(tmp, a->vd, pass, MO_32);
1389 }
1390 return true;
1391 }
1392
1393 #define DO_2SHIFT_ENV(INSN, FUNC) \
1394 static bool trans_##INSN##_64_2sh(DisasContext *s, arg_2reg_shift *a) \
1395 { \
1396 return do_2shift_env_64(s, a, gen_helper_neon_##FUNC##64); \
1397 } \
1398 static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a) \
1399 { \
1400 static NeonGenTwoOpEnvFn * const fns[] = { \
1401 gen_helper_neon_##FUNC##8, \
1402 gen_helper_neon_##FUNC##16, \
1403 gen_helper_neon_##FUNC##32, \
1404 }; \
1405 assert(a->size < ARRAY_SIZE(fns)); \
1406 return do_2shift_env_32(s, a, fns[a->size]); \
1407 }
1408
1409 DO_2SHIFT_ENV(VQSHLU, qshlu_s)
1410 DO_2SHIFT_ENV(VQSHL_U, qshl_u)
1411 DO_2SHIFT_ENV(VQSHL_S, qshl_s)
1412
1413 static bool do_2shift_narrow_64(DisasContext *s, arg_2reg_shift *a,
1414 NeonGenTwo64OpFn *shiftfn,
1415 NeonGenNarrowEnvFn *narrowfn)
1416 {
1417 /* 2-reg-and-shift narrowing-shift operations, size == 3 case */
1418 TCGv_i64 constimm, rm1, rm2;
1419 TCGv_i32 rd;
1420
1421 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1422 return false;
1423 }
1424
1425 /* UNDEF accesses to D16-D31 if they don't exist. */
1426 if (!dc_isar_feature(aa32_simd_r32, s) &&
1427 ((a->vd | a->vm) & 0x10)) {
1428 return false;
1429 }
1430
1431 if (a->vm & 1) {
1432 return false;
1433 }
1434
1435 if (!vfp_access_check(s)) {
1436 return true;
1437 }
1438
1439 /*
1440 * This is always a right shift, and the shiftfn is always a
1441 * left-shift helper, which thus needs the negated shift count.
1442 */
1443 constimm = tcg_constant_i64(-a->shift);
1444 rm1 = tcg_temp_new_i64();
1445 rm2 = tcg_temp_new_i64();
1446 rd = tcg_temp_new_i32();
1447
1448 /* Load both inputs first to avoid potential overwrite if rm == rd */
1449 read_neon_element64(rm1, a->vm, 0, MO_64);
1450 read_neon_element64(rm2, a->vm, 1, MO_64);
1451
1452 shiftfn(rm1, rm1, constimm);
1453 narrowfn(rd, cpu_env, rm1);
1454 write_neon_element32(rd, a->vd, 0, MO_32);
1455
1456 shiftfn(rm2, rm2, constimm);
1457 narrowfn(rd, cpu_env, rm2);
1458 write_neon_element32(rd, a->vd, 1, MO_32);
1459
1460 return true;
1461 }
1462
1463 static bool do_2shift_narrow_32(DisasContext *s, arg_2reg_shift *a,
1464 NeonGenTwoOpFn *shiftfn,
1465 NeonGenNarrowEnvFn *narrowfn)
1466 {
1467 /* 2-reg-and-shift narrowing-shift operations, size < 3 case */
1468 TCGv_i32 constimm, rm1, rm2, rm3, rm4;
1469 TCGv_i64 rtmp;
1470 uint32_t imm;
1471
1472 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1473 return false;
1474 }
1475
1476 /* UNDEF accesses to D16-D31 if they don't exist. */
1477 if (!dc_isar_feature(aa32_simd_r32, s) &&
1478 ((a->vd | a->vm) & 0x10)) {
1479 return false;
1480 }
1481
1482 if (a->vm & 1) {
1483 return false;
1484 }
1485
1486 if (!vfp_access_check(s)) {
1487 return true;
1488 }
1489
1490 /*
1491 * This is always a right shift, and the shiftfn is always a
1492 * left-shift helper, which thus needs the negated shift count
1493 * duplicated into each lane of the immediate value.
1494 */
1495 if (a->size == 1) {
1496 imm = (uint16_t)(-a->shift);
1497 imm |= imm << 16;
1498 } else {
1499 /* size == 2 */
1500 imm = -a->shift;
1501 }
1502 constimm = tcg_constant_i32(imm);
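/*
 * Example: for 16-bit elements (a->size == 1) and a shift of 5, imm is
 * (uint16_t)-5 = 0xfffb replicated to 0xfffbfffb, so the variable-shift
 * helper sees a shift count of -5 in each 16-bit lane, i.e. a right
 * shift by 5.
 */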
1503
1504 /* Load all inputs first to avoid potential overwrite */
1505 rm1 = tcg_temp_new_i32();
1506 rm2 = tcg_temp_new_i32();
1507 rm3 = tcg_temp_new_i32();
1508 rm4 = tcg_temp_new_i32();
1509 read_neon_element32(rm1, a->vm, 0, MO_32);
1510 read_neon_element32(rm2, a->vm, 1, MO_32);
1511 read_neon_element32(rm3, a->vm, 2, MO_32);
1512 read_neon_element32(rm4, a->vm, 3, MO_32);
1513 rtmp = tcg_temp_new_i64();
1514
1515 shiftfn(rm1, rm1, constimm);
1516 shiftfn(rm2, rm2, constimm);
1517
1518 tcg_gen_concat_i32_i64(rtmp, rm1, rm2);
1519
1520 narrowfn(rm1, cpu_env, rtmp);
1521 write_neon_element32(rm1, a->vd, 0, MO_32);
1522
1523 shiftfn(rm3, rm3, constimm);
1524 shiftfn(rm4, rm4, constimm);
1525
1526 tcg_gen_concat_i32_i64(rtmp, rm3, rm4);
1527
1528 narrowfn(rm3, cpu_env, rtmp);
1529 write_neon_element32(rm3, a->vd, 1, MO_32);
1530 return true;
1531 }
1532
1533 #define DO_2SN_64(INSN, FUNC, NARROWFUNC) \
1534 static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a) \
1535 { \
1536 return do_2shift_narrow_64(s, a, FUNC, NARROWFUNC); \
1537 }
1538 #define DO_2SN_32(INSN, FUNC, NARROWFUNC) \
1539 static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a) \
1540 { \
1541 return do_2shift_narrow_32(s, a, FUNC, NARROWFUNC); \
1542 }
1543
1544 static void gen_neon_narrow_u32(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1545 {
1546 tcg_gen_extrl_i64_i32(dest, src);
1547 }
1548
1549 static void gen_neon_narrow_u16(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1550 {
1551 gen_helper_neon_narrow_u16(dest, src);
1552 }
1553
1554 static void gen_neon_narrow_u8(TCGv_i32 dest, TCGv_ptr env, TCGv_i64 src)
1555 {
1556 gen_helper_neon_narrow_u8(dest, src);
1557 }
1558
1559 DO_2SN_64(VSHRN_64, gen_ushl_i64, gen_neon_narrow_u32)
1560 DO_2SN_32(VSHRN_32, gen_ushl_i32, gen_neon_narrow_u16)
1561 DO_2SN_32(VSHRN_16, gen_helper_neon_shl_u16, gen_neon_narrow_u8)
1562
1563 DO_2SN_64(VRSHRN_64, gen_helper_neon_rshl_u64, gen_neon_narrow_u32)
1564 DO_2SN_32(VRSHRN_32, gen_helper_neon_rshl_u32, gen_neon_narrow_u16)
1565 DO_2SN_32(VRSHRN_16, gen_helper_neon_rshl_u16, gen_neon_narrow_u8)
1566
1567 DO_2SN_64(VQSHRUN_64, gen_sshl_i64, gen_helper_neon_unarrow_sat32)
1568 DO_2SN_32(VQSHRUN_32, gen_sshl_i32, gen_helper_neon_unarrow_sat16)
1569 DO_2SN_32(VQSHRUN_16, gen_helper_neon_shl_s16, gen_helper_neon_unarrow_sat8)
1570
1571 DO_2SN_64(VQRSHRUN_64, gen_helper_neon_rshl_s64, gen_helper_neon_unarrow_sat32)
1572 DO_2SN_32(VQRSHRUN_32, gen_helper_neon_rshl_s32, gen_helper_neon_unarrow_sat16)
1573 DO_2SN_32(VQRSHRUN_16, gen_helper_neon_rshl_s16, gen_helper_neon_unarrow_sat8)
1574 DO_2SN_64(VQSHRN_S64, gen_sshl_i64, gen_helper_neon_narrow_sat_s32)
1575 DO_2SN_32(VQSHRN_S32, gen_sshl_i32, gen_helper_neon_narrow_sat_s16)
1576 DO_2SN_32(VQSHRN_S16, gen_helper_neon_shl_s16, gen_helper_neon_narrow_sat_s8)
1577
1578 DO_2SN_64(VQRSHRN_S64, gen_helper_neon_rshl_s64, gen_helper_neon_narrow_sat_s32)
1579 DO_2SN_32(VQRSHRN_S32, gen_helper_neon_rshl_s32, gen_helper_neon_narrow_sat_s16)
1580 DO_2SN_32(VQRSHRN_S16, gen_helper_neon_rshl_s16, gen_helper_neon_narrow_sat_s8)
1581
1582 DO_2SN_64(VQSHRN_U64, gen_ushl_i64, gen_helper_neon_narrow_sat_u32)
1583 DO_2SN_32(VQSHRN_U32, gen_ushl_i32, gen_helper_neon_narrow_sat_u16)
1584 DO_2SN_32(VQSHRN_U16, gen_helper_neon_shl_u16, gen_helper_neon_narrow_sat_u8)
1585
1586 DO_2SN_64(VQRSHRN_U64, gen_helper_neon_rshl_u64, gen_helper_neon_narrow_sat_u32)
1587 DO_2SN_32(VQRSHRN_U32, gen_helper_neon_rshl_u32, gen_helper_neon_narrow_sat_u16)
1588 DO_2SN_32(VQRSHRN_U16, gen_helper_neon_rshl_u16, gen_helper_neon_narrow_sat_u8)
1589
1590 static bool do_vshll_2sh(DisasContext *s, arg_2reg_shift *a,
1591 NeonGenWidenFn *widenfn, bool u)
1592 {
1593 TCGv_i64 tmp;
1594 TCGv_i32 rm0, rm1;
1595 uint64_t widen_mask = 0;
1596
1597 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1598 return false;
1599 }
1600
1601 /* UNDEF accesses to D16-D31 if they don't exist. */
1602 if (!dc_isar_feature(aa32_simd_r32, s) &&
1603 ((a->vd | a->vm) & 0x10)) {
1604 return false;
1605 }
1606
1607 if (a->vd & 1) {
1608 return false;
1609 }
1610
1611 if (!vfp_access_check(s)) {
1612 return true;
1613 }
1614
1615 /*
1616 * This is a widen-and-shift operation. The shift is always less
1617 * than the width of the source type, so after widening the input
1618 * vector we can simply shift the whole 64-bit widened register,
1619 * and then clear the potential overflow bits resulting from left
1620 * bits of the narrow input appearing as right bits of the left
1621 * neighbour narrow input. Calculate a mask of bits to clear.
1622 */
1623 if ((a->shift != 0) && (a->size < 2 || u)) {
1624 int esize = 8 << a->size;
1625 widen_mask = MAKE_64BIT_MASK(0, esize);
1626 widen_mask >>= esize - a->shift;
1627 widen_mask = dup_const(a->size + 1, widen_mask);
1628 }
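/*
 * Worked example: for VSHLL.S8 by 3 (a->size == 0, a->shift == 3),
 * esize is 8, so widen_mask is 0xff >> 5 == 0x07, replicated across
 * the 16-bit lanes to 0x0007000700070007.  A negative byte in one lane
 * sign-extends into its top bits, and after the 64-bit left shift those
 * bits land in the low 3 bits of the lane above; the mask clears them.
 */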
1629
1630 rm0 = tcg_temp_new_i32();
1631 rm1 = tcg_temp_new_i32();
1632 read_neon_element32(rm0, a->vm, 0, MO_32);
1633 read_neon_element32(rm1, a->vm, 1, MO_32);
1634 tmp = tcg_temp_new_i64();
1635
1636 widenfn(tmp, rm0);
1637 if (a->shift != 0) {
1638 tcg_gen_shli_i64(tmp, tmp, a->shift);
1639 tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1640 }
1641 write_neon_element64(tmp, a->vd, 0, MO_64);
1642
1643 widenfn(tmp, rm1);
1644 if (a->shift != 0) {
1645 tcg_gen_shli_i64(tmp, tmp, a->shift);
1646 tcg_gen_andi_i64(tmp, tmp, ~widen_mask);
1647 }
1648 write_neon_element64(tmp, a->vd, 1, MO_64);
1649 return true;
1650 }
1651
1652 static bool trans_VSHLL_S_2sh(DisasContext *s, arg_2reg_shift *a)
1653 {
1654 static NeonGenWidenFn * const widenfn[] = {
1655 gen_helper_neon_widen_s8,
1656 gen_helper_neon_widen_s16,
1657 tcg_gen_ext_i32_i64,
1658 };
1659 return do_vshll_2sh(s, a, widenfn[a->size], false);
1660 }
1661
1662 static bool trans_VSHLL_U_2sh(DisasContext *s, arg_2reg_shift *a)
1663 {
1664 static NeonGenWidenFn * const widenfn[] = {
1665 gen_helper_neon_widen_u8,
1666 gen_helper_neon_widen_u16,
1667 tcg_gen_extu_i32_i64,
1668 };
1669 return do_vshll_2sh(s, a, widenfn[a->size], true);
1670 }
1671
1672 static bool do_fp_2sh(DisasContext *s, arg_2reg_shift *a,
1673 gen_helper_gvec_2_ptr *fn)
1674 {
1675 /* FP operations in 2-reg-and-shift group */
1676 int vec_size = a->q ? 16 : 8;
1677 int rd_ofs = neon_full_reg_offset(a->vd);
1678 int rm_ofs = neon_full_reg_offset(a->vm);
1679 TCGv_ptr fpst;
1680
1681 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1682 return false;
1683 }
1684
1685 if (a->size == MO_16) {
1686 if (!dc_isar_feature(aa32_fp16_arith, s)) {
1687 return false;
1688 }
1689 }
1690
1691 /* UNDEF accesses to D16-D31 if they don't exist. */
1692 if (!dc_isar_feature(aa32_simd_r32, s) &&
1693 ((a->vd | a->vm) & 0x10)) {
1694 return false;
1695 }
1696
1697 if ((a->vm | a->vd) & a->q) {
1698 return false;
1699 }
1700
1701 if (!vfp_access_check(s)) {
1702 return true;
1703 }
1704
1705 fpst = fpstatus_ptr(a->size == MO_16 ? FPST_STD_F16 : FPST_STD);
1706 tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, vec_size, vec_size, a->shift, fn);
1707 return true;
1708 }
1709
1710 #define DO_FP_2SH(INSN, FUNC) \
1711 static bool trans_##INSN##_2sh(DisasContext *s, arg_2reg_shift *a) \
1712 { \
1713 return do_fp_2sh(s, a, FUNC); \
1714 }
1715
1716 DO_FP_2SH(VCVT_SF, gen_helper_gvec_vcvt_sf)
1717 DO_FP_2SH(VCVT_UF, gen_helper_gvec_vcvt_uf)
1718 DO_FP_2SH(VCVT_FS, gen_helper_gvec_vcvt_fs)
1719 DO_FP_2SH(VCVT_FU, gen_helper_gvec_vcvt_fu)
1720
1721 DO_FP_2SH(VCVT_SH, gen_helper_gvec_vcvt_sh)
1722 DO_FP_2SH(VCVT_UH, gen_helper_gvec_vcvt_uh)
1723 DO_FP_2SH(VCVT_HS, gen_helper_gvec_vcvt_hs)
1724 DO_FP_2SH(VCVT_HU, gen_helper_gvec_vcvt_hu)
1725
1726 static bool do_1reg_imm(DisasContext *s, arg_1reg_imm *a,
1727 GVecGen2iFn *fn)
1728 {
1729 uint64_t imm;
1730 int reg_ofs, vec_size;
1731
1732 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1733 return false;
1734 }
1735
1736 /* UNDEF accesses to D16-D31 if they don't exist. */
1737 if (!dc_isar_feature(aa32_simd_r32, s) && (a->vd & 0x10)) {
1738 return false;
1739 }
1740
1741 if (a->vd & a->q) {
1742 return false;
1743 }
1744
1745 if (!vfp_access_check(s)) {
1746 return true;
1747 }
1748
1749 reg_ofs = neon_full_reg_offset(a->vd);
1750 vec_size = a->q ? 16 : 8;
1751 imm = asimd_imm_const(a->imm, a->cmode, a->op);
1752
1753 fn(MO_64, reg_ofs, reg_ofs, imm, vec_size, vec_size);
1754 return true;
1755 }
1756
1757 static void gen_VMOV_1r(unsigned vece, uint32_t dofs, uint32_t aofs,
1758 int64_t c, uint32_t oprsz, uint32_t maxsz)
1759 {
1760 tcg_gen_gvec_dup_imm(MO_64, dofs, oprsz, maxsz, c);
1761 }
1762
1763 static bool trans_Vimm_1r(DisasContext *s, arg_1reg_imm *a)
1764 {
1765 /* Handle decode of cmode/op here between VORR/VBIC/VMOV */
1766 GVecGen2iFn *fn;
1767
1768 if ((a->cmode & 1) && a->cmode < 12) {
1769 /* for op=1, the imm will be inverted, so BIC becomes AND. */
1770 fn = a->op ? tcg_gen_gvec_andi : tcg_gen_gvec_ori;
1771 } else {
1772 /* There is one unallocated cmode/op combination in this space */
1773 if (a->cmode == 15 && a->op == 1) {
1774 return false;
1775 }
1776 fn = gen_VMOV_1r;
1777 }
1778 return do_1reg_imm(s, a, fn);
1779 }
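/*
 * Example: VBIC.I32 #0xff00 arrives here with op == 1 and an odd cmode
 * below 12; the expanded immediate is already inverted, so the insn is
 * implemented as an AND with ~0x0000ff00 in each 32-bit lane.
 */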
1780
1781 static bool do_prewiden_3d(DisasContext *s, arg_3diff *a,
1782 NeonGenWidenFn *widenfn,
1783 NeonGenTwo64OpFn *opfn,
1784 int src1_mop, int src2_mop)
1785 {
1786 /* 3-regs different lengths, prewidening case (VADDL/VSUBL/VADDW/VSUBW) */
1787 TCGv_i64 rn0_64, rn1_64, rm_64;
1788
1789 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1790 return false;
1791 }
1792
1793 /* UNDEF accesses to D16-D31 if they don't exist. */
1794 if (!dc_isar_feature(aa32_simd_r32, s) &&
1795 ((a->vd | a->vn | a->vm) & 0x10)) {
1796 return false;
1797 }
1798
1799 if (!opfn) {
1800 /* size == 3 case, which is an entirely different insn group */
1801 return false;
1802 }
1803
1804 if ((a->vd & 1) || (src1_mop == MO_UQ && (a->vn & 1))) {
1805 return false;
1806 }
1807
1808 if (!vfp_access_check(s)) {
1809 return true;
1810 }
1811
1812 rn0_64 = tcg_temp_new_i64();
1813 rn1_64 = tcg_temp_new_i64();
1814 rm_64 = tcg_temp_new_i64();
1815
1816 if (src1_mop >= 0) {
1817 read_neon_element64(rn0_64, a->vn, 0, src1_mop);
1818 } else {
1819 TCGv_i32 tmp = tcg_temp_new_i32();
1820 read_neon_element32(tmp, a->vn, 0, MO_32);
1821 widenfn(rn0_64, tmp);
1822 }
1823 if (src2_mop >= 0) {
1824 read_neon_element64(rm_64, a->vm, 0, src2_mop);
1825 } else {
1826 TCGv_i32 tmp = tcg_temp_new_i32();
1827 read_neon_element32(tmp, a->vm, 0, MO_32);
1828 widenfn(rm_64, tmp);
1829 }
1830
1831 opfn(rn0_64, rn0_64, rm_64);
1832
1833 /*
1834 * Load second pass inputs before storing the first pass result, to
1835 * avoid incorrect results if a narrow input overlaps with the result.
1836 */
1837 if (src1_mop >= 0) {
1838 read_neon_element64(rn1_64, a->vn, 1, src1_mop);
1839 } else {
1840 TCGv_i32 tmp = tcg_temp_new_i32();
1841 read_neon_element32(tmp, a->vn, 1, MO_32);
1842 widenfn(rn1_64, tmp);
1843 }
1844 if (src2_mop >= 0) {
1845 read_neon_element64(rm_64, a->vm, 1, src2_mop);
1846 } else {
1847 TCGv_i32 tmp = tcg_temp_new_i32();
1848 read_neon_element32(tmp, a->vm, 1, MO_32);
1849 widenfn(rm_64, tmp);
1850 }
1851
1852 write_neon_element64(rn0_64, a->vd, 0, MO_64);
1853
1854 opfn(rn1_64, rn1_64, rm_64);
1855 write_neon_element64(rn1_64, a->vd, 1, MO_64);
1856
1857 return true;
1858 }
1859
1860 #define DO_PREWIDEN(INSN, S, OP, SRC1WIDE, SIGN) \
1861 static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a) \
1862 { \
1863 static NeonGenWidenFn * const widenfn[] = { \
1864 gen_helper_neon_widen_##S##8, \
1865 gen_helper_neon_widen_##S##16, \
1866 NULL, NULL, \
1867 }; \
1868 static NeonGenTwo64OpFn * const addfn[] = { \
1869 gen_helper_neon_##OP##l_u16, \
1870 gen_helper_neon_##OP##l_u32, \
1871 tcg_gen_##OP##_i64, \
1872 NULL, \
1873 }; \
1874 int narrow_mop = a->size == MO_32 ? MO_32 | SIGN : -1; \
1875 return do_prewiden_3d(s, a, widenfn[a->size], addfn[a->size], \
1876 SRC1WIDE ? MO_UQ : narrow_mop, \
1877 narrow_mop); \
1878 }
1879
1880 DO_PREWIDEN(VADDL_S, s, add, false, MO_SIGN)
1881 DO_PREWIDEN(VADDL_U, u, add, false, 0)
1882 DO_PREWIDEN(VSUBL_S, s, sub, false, MO_SIGN)
1883 DO_PREWIDEN(VSUBL_U, u, sub, false, 0)
1884 DO_PREWIDEN(VADDW_S, s, add, true, MO_SIGN)
1885 DO_PREWIDEN(VADDW_U, u, add, true, 0)
1886 DO_PREWIDEN(VSUBW_S, s, sub, true, MO_SIGN)
1887 DO_PREWIDEN(VSUBW_U, u, sub, true, 0)
1888
1889 static bool do_narrow_3d(DisasContext *s, arg_3diff *a,
1890 NeonGenTwo64OpFn *opfn, NeonGenNarrowFn *narrowfn)
1891 {
1892 /* 3-regs different lengths, narrowing (VADDHN/VSUBHN/VRADDHN/VRSUBHN) */
1893 TCGv_i64 rn_64, rm_64;
1894 TCGv_i32 rd0, rd1;
1895
1896 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1897 return false;
1898 }
1899
1900 /* UNDEF accesses to D16-D31 if they don't exist. */
1901 if (!dc_isar_feature(aa32_simd_r32, s) &&
1902 ((a->vd | a->vn | a->vm) & 0x10)) {
1903 return false;
1904 }
1905
1906 if (!opfn || !narrowfn) {
1907 /* size == 3 case, which is an entirely different insn group */
1908 return false;
1909 }
1910
1911 if ((a->vn | a->vm) & 1) {
1912 return false;
1913 }
1914
1915 if (!vfp_access_check(s)) {
1916 return true;
1917 }
1918
1919 rn_64 = tcg_temp_new_i64();
1920 rm_64 = tcg_temp_new_i64();
1921 rd0 = tcg_temp_new_i32();
1922 rd1 = tcg_temp_new_i32();
1923
1924 read_neon_element64(rn_64, a->vn, 0, MO_64);
1925 read_neon_element64(rm_64, a->vm, 0, MO_64);
1926
1927 opfn(rn_64, rn_64, rm_64);
1928
1929 narrowfn(rd0, rn_64);
1930
1931 read_neon_element64(rn_64, a->vn, 1, MO_64);
1932 read_neon_element64(rm_64, a->vm, 1, MO_64);
1933
1934 opfn(rn_64, rn_64, rm_64);
1935
1936 narrowfn(rd1, rn_64);
1937
1938 write_neon_element32(rd0, a->vd, 0, MO_32);
1939 write_neon_element32(rd1, a->vd, 1, MO_32);
1940
1941 return true;
1942 }
1943
1944 #define DO_NARROW_3D(INSN, OP, NARROWTYPE, EXTOP) \
1945 static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a) \
1946 { \
1947 static NeonGenTwo64OpFn * const addfn[] = { \
1948 gen_helper_neon_##OP##l_u16, \
1949 gen_helper_neon_##OP##l_u32, \
1950 tcg_gen_##OP##_i64, \
1951 NULL, \
1952 }; \
1953 static NeonGenNarrowFn * const narrowfn[] = { \
1954 gen_helper_neon_##NARROWTYPE##_high_u8, \
1955 gen_helper_neon_##NARROWTYPE##_high_u16, \
1956 EXTOP, \
1957 NULL, \
1958 }; \
1959 return do_narrow_3d(s, a, addfn[a->size], narrowfn[a->size]); \
1960 }
1961
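/*
 * Rounding variant of "take the high half": adding 1 << 31 before
 * extracting bits [63:32] rounds to nearest (ties rounded up), as
 * VRADDHN/VRSUBHN require.
 */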
1962 static void gen_narrow_round_high_u32(TCGv_i32 rd, TCGv_i64 rn)
1963 {
1964 tcg_gen_addi_i64(rn, rn, 1u << 31);
1965 tcg_gen_extrh_i64_i32(rd, rn);
1966 }
1967
1968 DO_NARROW_3D(VADDHN, add, narrow, tcg_gen_extrh_i64_i32)
1969 DO_NARROW_3D(VSUBHN, sub, narrow, tcg_gen_extrh_i64_i32)
1970 DO_NARROW_3D(VRADDHN, add, narrow_round, gen_narrow_round_high_u32)
1971 DO_NARROW_3D(VRSUBHN, sub, narrow_round, gen_narrow_round_high_u32)
1972
1973 static bool do_long_3d(DisasContext *s, arg_3diff *a,
1974 NeonGenTwoOpWidenFn *opfn,
1975 NeonGenTwo64OpFn *accfn)
1976 {
1977 /*
1978 * 3-regs different lengths, long operations.
1979 * These perform an operation on two inputs that returns a double-width
1980 * result, and then possibly perform an accumulation operation of
1981 * that result into the double-width destination.
1982 */
1983 TCGv_i64 rd0, rd1, tmp;
1984 TCGv_i32 rn, rm;
1985
1986 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
1987 return false;
1988 }
1989
1990 /* UNDEF accesses to D16-D31 if they don't exist. */
1991 if (!dc_isar_feature(aa32_simd_r32, s) &&
1992 ((a->vd | a->vn | a->vm) & 0x10)) {
1993 return false;
1994 }
1995
1996 if (!opfn) {
1997 /* size == 3 case, which is an entirely different insn group */
1998 return false;
1999 }
2000
2001 if (a->vd & 1) {
2002 return false;
2003 }
2004
2005 if (!vfp_access_check(s)) {
2006 return true;
2007 }
2008
2009 rd0 = tcg_temp_new_i64();
2010 rd1 = tcg_temp_new_i64();
2011
2012 rn = tcg_temp_new_i32();
2013 rm = tcg_temp_new_i32();
2014 read_neon_element32(rn, a->vn, 0, MO_32);
2015 read_neon_element32(rm, a->vm, 0, MO_32);
2016 opfn(rd0, rn, rm);
2017
2018 read_neon_element32(rn, a->vn, 1, MO_32);
2019 read_neon_element32(rm, a->vm, 1, MO_32);
2020 opfn(rd1, rn, rm);
2021
2022 /* Don't store results until after all loads: they might overlap */
2023 if (accfn) {
2024 tmp = tcg_temp_new_i64();
2025 read_neon_element64(tmp, a->vd, 0, MO_64);
2026 accfn(rd0, tmp, rd0);
2027 read_neon_element64(tmp, a->vd, 1, MO_64);
2028 accfn(rd1, tmp, rd1);
2029 }
2030
2031 write_neon_element64(rd0, a->vd, 0, MO_64);
2032 write_neon_element64(rd1, a->vd, 1, MO_64);
2033
2034 return true;
2035 }
2036
2037 static bool trans_VABDL_S_3d(DisasContext *s, arg_3diff *a)
2038 {
2039 static NeonGenTwoOpWidenFn * const opfn[] = {
2040 gen_helper_neon_abdl_s16,
2041 gen_helper_neon_abdl_s32,
2042 gen_helper_neon_abdl_s64,
2043 NULL,
2044 };
2045
2046 return do_long_3d(s, a, opfn[a->size], NULL);
2047 }
2048
2049 static bool trans_VABDL_U_3d(DisasContext *s, arg_3diff *a)
2050 {
2051 static NeonGenTwoOpWidenFn * const opfn[] = {
2052 gen_helper_neon_abdl_u16,
2053 gen_helper_neon_abdl_u32,
2054 gen_helper_neon_abdl_u64,
2055 NULL,
2056 };
2057
2058 return do_long_3d(s, a, opfn[a->size], NULL);
2059 }
2060
2061 static bool trans_VABAL_S_3d(DisasContext *s, arg_3diff *a)
2062 {
2063 static NeonGenTwoOpWidenFn * const opfn[] = {
2064 gen_helper_neon_abdl_s16,
2065 gen_helper_neon_abdl_s32,
2066 gen_helper_neon_abdl_s64,
2067 NULL,
2068 };
2069 static NeonGenTwo64OpFn * const addfn[] = {
2070 gen_helper_neon_addl_u16,
2071 gen_helper_neon_addl_u32,
2072 tcg_gen_add_i64,
2073 NULL,
2074 };
2075
2076 return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2077 }
2078
2079 static bool trans_VABAL_U_3d(DisasContext *s, arg_3diff *a)
2080 {
2081 static NeonGenTwoOpWidenFn * const opfn[] = {
2082 gen_helper_neon_abdl_u16,
2083 gen_helper_neon_abdl_u32,
2084 gen_helper_neon_abdl_u64,
2085 NULL,
2086 };
2087 static NeonGenTwo64OpFn * const addfn[] = {
2088 gen_helper_neon_addl_u16,
2089 gen_helper_neon_addl_u32,
2090 tcg_gen_add_i64,
2091 NULL,
2092 };
2093
2094 return do_long_3d(s, a, opfn[a->size], addfn[a->size]);
2095 }
2096
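/*
 * The 32x32->64 widening multiplies are done inline rather than via a
 * Neon helper: the TCG double-width multiply produces the low and high
 * 32 bits separately, and the two halves are then glued back together
 * into a single i64 result.
 */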
2097 static void gen_mull_s32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2098 {
2099 TCGv_i32 lo = tcg_temp_new_i32();
2100 TCGv_i32 hi = tcg_temp_new_i32();
2101
2102 tcg_gen_muls2_i32(lo, hi, rn, rm);
2103 tcg_gen_concat_i32_i64(rd, lo, hi);
2104 }
2105
2106 static void gen_mull_u32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2107 {
2108 TCGv_i32 lo = tcg_temp_new_i32();
2109 TCGv_i32 hi = tcg_temp_new_i32();
2110
2111 tcg_gen_mulu2_i32(lo, hi, rn, rm);
2112 tcg_gen_concat_i32_i64(rd, lo, hi);
2113 }
2114
2115 static bool trans_VMULL_S_3d(DisasContext *s, arg_3diff *a)
2116 {
2117 static NeonGenTwoOpWidenFn * const opfn[] = {
2118 gen_helper_neon_mull_s8,
2119 gen_helper_neon_mull_s16,
2120 gen_mull_s32,
2121 NULL,
2122 };
2123
2124 return do_long_3d(s, a, opfn[a->size], NULL);
2125 }
2126
2127 static bool trans_VMULL_U_3d(DisasContext *s, arg_3diff *a)
2128 {
2129 static NeonGenTwoOpWidenFn * const opfn[] = {
2130 gen_helper_neon_mull_u8,
2131 gen_helper_neon_mull_u16,
2132 gen_mull_u32,
2133 NULL,
2134 };
2135
2136 return do_long_3d(s, a, opfn[a->size], NULL);
2137 }
2138
2139 #define DO_VMLAL(INSN,MULL,ACC) \
2140 static bool trans_##INSN##_3d(DisasContext *s, arg_3diff *a) \
2141 { \
2142 static NeonGenTwoOpWidenFn * const opfn[] = { \
2143 gen_helper_neon_##MULL##8, \
2144 gen_helper_neon_##MULL##16, \
2145 gen_##MULL##32, \
2146 NULL, \
2147 }; \
2148 static NeonGenTwo64OpFn * const accfn[] = { \
2149 gen_helper_neon_##ACC##l_u16, \
2150 gen_helper_neon_##ACC##l_u32, \
2151 tcg_gen_##ACC##_i64, \
2152 NULL, \
2153 }; \
2154 return do_long_3d(s, a, opfn[a->size], accfn[a->size]); \
2155 }
2156
2157 DO_VMLAL(VMLAL_S,mull_s,add)
2158 DO_VMLAL(VMLAL_U,mull_u,add)
2159 DO_VMLAL(VMLSL_S,mull_s,sub)
2160 DO_VMLAL(VMLSL_U,mull_u,sub)
2161
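/*
 * VQDMULL is the widening multiply with saturating doubling: form the
 * double-width product, then use the saturating add helper to add it
 * to itself, which both doubles it and sets QC on overflow
 * (e.g. 0x8000 * 0x8000 doubled saturates to 0x7fffffff).
 */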
2162 static void gen_VQDMULL_16(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2163 {
2164 gen_helper_neon_mull_s16(rd, rn, rm);
2165 gen_helper_neon_addl_saturate_s32(rd, cpu_env, rd, rd);
2166 }
2167
2168 static void gen_VQDMULL_32(TCGv_i64 rd, TCGv_i32 rn, TCGv_i32 rm)
2169 {
2170 gen_mull_s32(rd, rn, rm);
2171 gen_helper_neon_addl_saturate_s64(rd, cpu_env, rd, rd);
2172 }
2173
2174 static bool trans_VQDMULL_3d(DisasContext *s, arg_3diff *a)
2175 {
2176 static NeonGenTwoOpWidenFn * const opfn[] = {
2177 NULL,
2178 gen_VQDMULL_16,
2179 gen_VQDMULL_32,
2180 NULL,
2181 };
2182
2183 return do_long_3d(s, a, opfn[a->size], NULL);
2184 }
2185
2186 static void gen_VQDMLAL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2187 {
2188 gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
2189 }
2190
2191 static void gen_VQDMLAL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2192 {
2193 gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
2194 }
2195
2196 static bool trans_VQDMLAL_3d(DisasContext *s, arg_3diff *a)
2197 {
2198 static NeonGenTwoOpWidenFn * const opfn[] = {
2199 NULL,
2200 gen_VQDMULL_16,
2201 gen_VQDMULL_32,
2202 NULL,
2203 };
2204 static NeonGenTwo64OpFn * const accfn[] = {
2205 NULL,
2206 gen_VQDMLAL_acc_16,
2207 gen_VQDMLAL_acc_32,
2208 NULL,
2209 };
2210
2211 return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2212 }
2213
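/*
 * VQDMLSL has no dedicated helper: the doubled products from VQDMULL
 * are negated per lane and then fed through the same saturating
 * accumulate step as VQDMLAL.
 */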
2214 static void gen_VQDMLSL_acc_16(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2215 {
2216 gen_helper_neon_negl_u32(rm, rm);
2217 gen_helper_neon_addl_saturate_s32(rd, cpu_env, rn, rm);
2218 }
2219
2220 static void gen_VQDMLSL_acc_32(TCGv_i64 rd, TCGv_i64 rn, TCGv_i64 rm)
2221 {
2222 tcg_gen_neg_i64(rm, rm);
2223 gen_helper_neon_addl_saturate_s64(rd, cpu_env, rn, rm);
2224 }
2225
2226 static bool trans_VQDMLSL_3d(DisasContext *s, arg_3diff *a)
2227 {
2228 static NeonGenTwoOpWidenFn * const opfn[] = {
2229 NULL,
2230 gen_VQDMULL_16,
2231 gen_VQDMULL_32,
2232 NULL,
2233 };
2234 static NeonGenTwo64OpFn * const accfn[] = {
2235 NULL,
2236 gen_VQDMLSL_acc_16,
2237 gen_VQDMLSL_acc_32,
2238 NULL,
2239 };
2240
2241 return do_long_3d(s, a, opfn[a->size], accfn[a->size]);
2242 }
2243
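/*
 * VMULL.P8 / VMULL.P64: polynomial (carry-less) multiply.  The 8-bit
 * form is always present in Neon; the 64-bit form is the crypto PMULL
 * and is gated on the aa32_pmull feature.  size == 1 and size == 3 are
 * reserved encodings.
 */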
2244 static bool trans_VMULL_P_3d(DisasContext *s, arg_3diff *a)
2245 {
2246 gen_helper_gvec_3 *fn_gvec;
2247
2248 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2249 return false;
2250 }
2251
2252 /* UNDEF accesses to D16-D31 if they don't exist. */
2253 if (!dc_isar_feature(aa32_simd_r32, s) &&
2254 ((a->vd | a->vn | a->vm) & 0x10)) {
2255 return false;
2256 }
2257
2258 if (a->vd & 1) {
2259 return false;
2260 }
2261
2262 switch (a->size) {
2263 case 0:
2264 fn_gvec = gen_helper_neon_pmull_h;
2265 break;
2266 case 2:
2267 if (!dc_isar_feature(aa32_pmull, s)) {
2268 return false;
2269 }
2270 fn_gvec = gen_helper_gvec_pmull_q;
2271 break;
2272 default:
2273 return false;
2274 }
2275
2276 if (!vfp_access_check(s)) {
2277 return true;
2278 }
2279
2280 tcg_gen_gvec_3_ool(neon_full_reg_offset(a->vd),
2281 neon_full_reg_offset(a->vn),
2282 neon_full_reg_offset(a->vm),
2283 16, 16, 0, fn_gvec);
2284 return true;
2285 }
2286
2287 static void gen_neon_dup_low16(TCGv_i32 var)
2288 {
2289 TCGv_i32 tmp = tcg_temp_new_i32();
2290 tcg_gen_ext16u_i32(var, var);
2291 tcg_gen_shli_i32(tmp, var, 16);
2292 tcg_gen_or_i32(var, var, tmp);
2293 }
2294
2295 static void gen_neon_dup_high16(TCGv_i32 var)
2296 {
2297 TCGv_i32 tmp = tcg_temp_new_i32();
2298 tcg_gen_andi_i32(var, var, 0xffff0000);
2299 tcg_gen_shri_i32(tmp, var, 16);
2300 tcg_gen_or_i32(var, var, tmp);
2301 }
2302
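/*
 * For the two-reg-and-scalar insns the scalar operand is encoded in
 * M:Vm: the low bits select the D register and the high bits the
 * element index.  For 16-bit scalars the containing 32-bit element is
 * loaded and the wanted half duplicated into both halves, so that the
 * packed two-lane 16-bit helpers see the scalar in every lane.
 */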
2303 static inline TCGv_i32 neon_get_scalar(int size, int reg)
2304 {
2305 TCGv_i32 tmp = tcg_temp_new_i32();
2306 if (size == MO_16) {
2307 read_neon_element32(tmp, reg & 7, reg >> 4, MO_32);
2308 if (reg & 8) {
2309 gen_neon_dup_high16(tmp);
2310 } else {
2311 gen_neon_dup_low16(tmp);
2312 }
2313 } else {
2314 read_neon_element32(tmp, reg & 15, reg >> 4, MO_32);
2315 }
2316 return tmp;
2317 }
2318
2319 static bool do_2scalar(DisasContext *s, arg_2scalar *a,
2320 NeonGenTwoOpFn *opfn, NeonGenTwoOpFn *accfn)
2321 {
2322 /*
2323 * Two registers and a scalar: perform an operation between
2324 * the input elements and the scalar, and then possibly
2325 * perform an accumulation operation of that result into the
2326 * destination.
2327 */
2328 TCGv_i32 scalar, tmp;
2329 int pass;
2330
2331 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2332 return false;
2333 }
2334
2335 /* UNDEF accesses to D16-D31 if they don't exist. */
2336 if (!dc_isar_feature(aa32_simd_r32, s) &&
2337 ((a->vd | a->vn | a->vm) & 0x10)) {
2338 return false;
2339 }
2340
2341 if (!opfn) {
2342 /* Bad size (including size == 3, which is a different insn group) */
2343 return false;
2344 }
2345
2346 if (a->q && ((a->vd | a->vn) & 1)) {
2347 return false;
2348 }
2349
2350 if (!vfp_access_check(s)) {
2351 return true;
2352 }
2353
2354 scalar = neon_get_scalar(a->size, a->vm);
2355 tmp = tcg_temp_new_i32();
2356
2357 for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2358 read_neon_element32(tmp, a->vn, pass, MO_32);
2359 opfn(tmp, tmp, scalar);
2360 if (accfn) {
2361 TCGv_i32 rd = tcg_temp_new_i32();
2362 read_neon_element32(rd, a->vd, pass, MO_32);
2363 accfn(tmp, rd, tmp);
2364 }
2365 write_neon_element32(tmp, a->vd, pass, MO_32);
2366 }
2367 return true;
2368 }
2369
2370 static bool trans_VMUL_2sc(DisasContext *s, arg_2scalar *a)
2371 {
2372 static NeonGenTwoOpFn * const opfn[] = {
2373 NULL,
2374 gen_helper_neon_mul_u16,
2375 tcg_gen_mul_i32,
2376 NULL,
2377 };
2378
2379 return do_2scalar(s, a, opfn[a->size], NULL);
2380 }
2381
2382 static bool trans_VMLA_2sc(DisasContext *s, arg_2scalar *a)
2383 {
2384 static NeonGenTwoOpFn * const opfn[] = {
2385 NULL,
2386 gen_helper_neon_mul_u16,
2387 tcg_gen_mul_i32,
2388 NULL,
2389 };
2390 static NeonGenTwoOpFn * const accfn[] = {
2391 NULL,
2392 gen_helper_neon_add_u16,
2393 tcg_gen_add_i32,
2394 NULL,
2395 };
2396
2397 return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2398 }
2399
2400 static bool trans_VMLS_2sc(DisasContext *s, arg_2scalar *a)
2401 {
2402 static NeonGenTwoOpFn * const opfn[] = {
2403 NULL,
2404 gen_helper_neon_mul_u16,
2405 tcg_gen_mul_i32,
2406 NULL,
2407 };
2408 static NeonGenTwoOpFn * const accfn[] = {
2409 NULL,
2410 gen_helper_neon_sub_u16,
2411 tcg_gen_sub_i32,
2412 NULL,
2413 };
2414
2415 return do_2scalar(s, a, opfn[a->size], accfn[a->size]);
2416 }
2417
2418 static bool do_2scalar_fp_vec(DisasContext *s, arg_2scalar *a,
2419 gen_helper_gvec_3_ptr *fn)
2420 {
2421 /* Two registers and a scalar, using gvec */
2422 int vec_size = a->q ? 16 : 8;
2423 int rd_ofs = neon_full_reg_offset(a->vd);
2424 int rn_ofs = neon_full_reg_offset(a->vn);
2425 int rm_ofs;
2426 int idx;
2427 TCGv_ptr fpstatus;
2428
2429 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2430 return false;
2431 }
2432
2433 /* UNDEF accesses to D16-D31 if they don't exist. */
2434 if (!dc_isar_feature(aa32_simd_r32, s) &&
2435 ((a->vd | a->vn | a->vm) & 0x10)) {
2436 return false;
2437 }
2438
2439 if (!fn) {
2440 /* Bad size (including size == 3, which is a different insn group) */
2441 return false;
2442 }
2443
2444 if (a->q && ((a->vd | a->vn) & 1)) {
2445 return false;
2446 }
2447
2448 if (!vfp_access_check(s)) {
2449 return true;
2450 }
2451
2452 /* a->vm is M:Vm, which encodes both register and index */
2453 idx = extract32(a->vm, a->size + 2, 2);
2454 a->vm = extract32(a->vm, 0, a->size + 2);
2455 rm_ofs = neon_full_reg_offset(a->vm);
2456
2457 fpstatus = fpstatus_ptr(a->size == 1 ? FPST_STD_F16 : FPST_STD);
2458 tcg_gen_gvec_3_ptr(rd_ofs, rn_ofs, rm_ofs, fpstatus,
2459 vec_size, vec_size, idx, fn);
2460 return true;
2461 }
2462
2463 #define DO_VMUL_F_2sc(NAME, FUNC) \
2464 static bool trans_##NAME##_F_2sc(DisasContext *s, arg_2scalar *a) \
2465 { \
2466 static gen_helper_gvec_3_ptr * const opfn[] = { \
2467 NULL, \
2468 gen_helper_##FUNC##_h, \
2469 gen_helper_##FUNC##_s, \
2470 NULL, \
2471 }; \
2472 if (a->size == MO_16 && !dc_isar_feature(aa32_fp16_arith, s)) { \
2473 return false; \
2474 } \
2475 return do_2scalar_fp_vec(s, a, opfn[a->size]); \
2476 }
2477
2478 DO_VMUL_F_2sc(VMUL, gvec_fmul_idx)
2479 DO_VMUL_F_2sc(VMLA, gvec_fmla_nf_idx)
2480 DO_VMUL_F_2sc(VMLS, gvec_fmls_nf_idx)
2481
2482 WRAP_ENV_FN(gen_VQDMULH_16, gen_helper_neon_qdmulh_s16)
2483 WRAP_ENV_FN(gen_VQDMULH_32, gen_helper_neon_qdmulh_s32)
2484 WRAP_ENV_FN(gen_VQRDMULH_16, gen_helper_neon_qrdmulh_s16)
2485 WRAP_ENV_FN(gen_VQRDMULH_32, gen_helper_neon_qrdmulh_s32)
2486
2487 static bool trans_VQDMULH_2sc(DisasContext *s, arg_2scalar *a)
2488 {
2489 static NeonGenTwoOpFn * const opfn[] = {
2490 NULL,
2491 gen_VQDMULH_16,
2492 gen_VQDMULH_32,
2493 NULL,
2494 };
2495
2496 return do_2scalar(s, a, opfn[a->size], NULL);
2497 }
2498
2499 static bool trans_VQRDMULH_2sc(DisasContext *s, arg_2scalar *a)
2500 {
2501 static NeonGenTwoOpFn * const opfn[] = {
2502 NULL,
2503 gen_VQRDMULH_16,
2504 gen_VQRDMULH_32,
2505 NULL,
2506 };
2507
2508 return do_2scalar(s, a, opfn[a->size], NULL);
2509 }
2510
2511 static bool do_vqrdmlah_2sc(DisasContext *s, arg_2scalar *a,
2512 NeonGenThreeOpEnvFn *opfn)
2513 {
2514 /*
2515 * VQRDMLAH/VQRDMLSH: this is like do_2scalar, but the opfn
2516 * performs a kind of fused op-then-accumulate using a helper
2517 * function that takes all of rd, rn and the scalar at once.
2518 */
2519 TCGv_i32 scalar, rn, rd;
2520 int pass;
2521
2522 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2523 return false;
2524 }
2525
2526 if (!dc_isar_feature(aa32_rdm, s)) {
2527 return false;
2528 }
2529
2530 /* UNDEF accesses to D16-D31 if they don't exist. */
2531 if (!dc_isar_feature(aa32_simd_r32, s) &&
2532 ((a->vd | a->vn | a->vm) & 0x10)) {
2533 return false;
2534 }
2535
2536 if (!opfn) {
2537 /* Bad size (including size == 3, which is a different insn group) */
2538 return false;
2539 }
2540
2541 if (a->q && ((a->vd | a->vn) & 1)) {
2542 return false;
2543 }
2544
2545 if (!vfp_access_check(s)) {
2546 return true;
2547 }
2548
2549 scalar = neon_get_scalar(a->size, a->vm);
2550 rn = tcg_temp_new_i32();
2551 rd = tcg_temp_new_i32();
2552
2553 for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
2554 read_neon_element32(rn, a->vn, pass, MO_32);
2555 read_neon_element32(rd, a->vd, pass, MO_32);
2556 opfn(rd, cpu_env, rn, scalar, rd);
2557 write_neon_element32(rd, a->vd, pass, MO_32);
2558 }
2559 return true;
2560 }
2561
2562 static bool trans_VQRDMLAH_2sc(DisasContext *s, arg_2scalar *a)
2563 {
2564 static NeonGenThreeOpEnvFn *opfn[] = {
2565 NULL,
2566 gen_helper_neon_qrdmlah_s16,
2567 gen_helper_neon_qrdmlah_s32,
2568 NULL,
2569 };
2570 return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2571 }
2572
2573 static bool trans_VQRDMLSH_2sc(DisasContext *s, arg_2scalar *a)
2574 {
2575 static NeonGenThreeOpEnvFn *opfn[] = {
2576 NULL,
2577 gen_helper_neon_qrdmlsh_s16,
2578 gen_helper_neon_qrdmlsh_s32,
2579 NULL,
2580 };
2581 return do_vqrdmlah_2sc(s, a, opfn[a->size]);
2582 }
2583
2584 static bool do_2scalar_long(DisasContext *s, arg_2scalar *a,
2585 NeonGenTwoOpWidenFn *opfn,
2586 NeonGenTwo64OpFn *accfn)
2587 {
2588 /*
2589 * Two registers and a scalar, long operations: perform an
2590 * operation on the input elements and the scalar which produces
2591 * a double-width result, and then possibly perform an accumulation
2592 * operation of that result into the destination.
2593 */
2594 TCGv_i32 scalar, rn;
2595 TCGv_i64 rn0_64, rn1_64;
2596
2597 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2598 return false;
2599 }
2600
2601 /* UNDEF accesses to D16-D31 if they don't exist. */
2602 if (!dc_isar_feature(aa32_simd_r32, s) &&
2603 ((a->vd | a->vn | a->vm) & 0x10)) {
2604 return false;
2605 }
2606
2607 if (!opfn) {
2608 /* Bad size (including size == 3, which is a different insn group) */
2609 return false;
2610 }
2611
2612 if (a->vd & 1) {
2613 return false;
2614 }
2615
2616 if (!vfp_access_check(s)) {
2617 return true;
2618 }
2619
2620 scalar = neon_get_scalar(a->size, a->vm);
2621
2622 /* Load all inputs before writing any outputs, in case of overlap */
2623 rn = tcg_temp_new_i32();
2624 read_neon_element32(rn, a->vn, 0, MO_32);
2625 rn0_64 = tcg_temp_new_i64();
2626 opfn(rn0_64, rn, scalar);
2627
2628 read_neon_element32(rn, a->vn, 1, MO_32);
2629 rn1_64 = tcg_temp_new_i64();
2630 opfn(rn1_64, rn, scalar);
2631
2632 if (accfn) {
2633 TCGv_i64 t64 = tcg_temp_new_i64();
2634 read_neon_element64(t64, a->vd, 0, MO_64);
2635 accfn(rn0_64, t64, rn0_64);
2636 read_neon_element64(t64, a->vd, 1, MO_64);
2637 accfn(rn1_64, t64, rn1_64);
2638 }
2639
2640 write_neon_element64(rn0_64, a->vd, 0, MO_64);
2641 write_neon_element64(rn1_64, a->vd, 1, MO_64);
2642 return true;
2643 }
2644
2645 static bool trans_VMULL_S_2sc(DisasContext *s, arg_2scalar *a)
2646 {
2647 static NeonGenTwoOpWidenFn * const opfn[] = {
2648 NULL,
2649 gen_helper_neon_mull_s16,
2650 gen_mull_s32,
2651 NULL,
2652 };
2653
2654 return do_2scalar_long(s, a, opfn[a->size], NULL);
2655 }
2656
2657 static bool trans_VMULL_U_2sc(DisasContext *s, arg_2scalar *a)
2658 {
2659 static NeonGenTwoOpWidenFn * const opfn[] = {
2660 NULL,
2661 gen_helper_neon_mull_u16,
2662 gen_mull_u32,
2663 NULL,
2664 };
2665
2666 return do_2scalar_long(s, a, opfn[a->size], NULL);
2667 }
2668
2669 #define DO_VMLAL_2SC(INSN, MULL, ACC) \
2670 static bool trans_##INSN##_2sc(DisasContext *s, arg_2scalar *a) \
2671 { \
2672 static NeonGenTwoOpWidenFn * const opfn[] = { \
2673 NULL, \
2674 gen_helper_neon_##MULL##16, \
2675 gen_##MULL##32, \
2676 NULL, \
2677 }; \
2678 static NeonGenTwo64OpFn * const accfn[] = { \
2679 NULL, \
2680 gen_helper_neon_##ACC##l_u32, \
2681 tcg_gen_##ACC##_i64, \
2682 NULL, \
2683 }; \
2684 return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]); \
2685 }
2686
2687 DO_VMLAL_2SC(VMLAL_S, mull_s, add)
2688 DO_VMLAL_2SC(VMLAL_U, mull_u, add)
2689 DO_VMLAL_2SC(VMLSL_S, mull_s, sub)
2690 DO_VMLAL_2SC(VMLSL_U, mull_u, sub)
2691
2692 static bool trans_VQDMULL_2sc(DisasContext *s, arg_2scalar *a)
2693 {
2694 static NeonGenTwoOpWidenFn * const opfn[] = {
2695 NULL,
2696 gen_VQDMULL_16,
2697 gen_VQDMULL_32,
2698 NULL,
2699 };
2700
2701 return do_2scalar_long(s, a, opfn[a->size], NULL);
2702 }
2703
2704 static bool trans_VQDMLAL_2sc(DisasContext *s, arg_2scalar *a)
2705 {
2706 static NeonGenTwoOpWidenFn * const opfn[] = {
2707 NULL,
2708 gen_VQDMULL_16,
2709 gen_VQDMULL_32,
2710 NULL,
2711 };
2712 static NeonGenTwo64OpFn * const accfn[] = {
2713 NULL,
2714 gen_VQDMLAL_acc_16,
2715 gen_VQDMLAL_acc_32,
2716 NULL,
2717 };
2718
2719 return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2720 }
2721
2722 static bool trans_VQDMLSL_2sc(DisasContext *s, arg_2scalar *a)
2723 {
2724 static NeonGenTwoOpWidenFn * const opfn[] = {
2725 NULL,
2726 gen_VQDMULL_16,
2727 gen_VQDMULL_32,
2728 NULL,
2729 };
2730 static NeonGenTwo64OpFn * const accfn[] = {
2731 NULL,
2732 gen_VQDMLSL_acc_16,
2733 gen_VQDMLSL_acc_32,
2734 NULL,
2735 };
2736
2737 return do_2scalar_long(s, a, opfn[a->size], accfn[a->size]);
2738 }
2739
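/*
 * VEXT extracts a contiguous block of bytes from the concatenation
 * <Vm:Vn>, starting at byte index imm: in effect Vd = (Vm:Vn) >> (imm * 8).
 * tcg_gen_extract2_i64() yields exactly such a 64-bit window from a
 * (high:low) pair, so the D-reg form needs one extract2 and the Q-reg
 * form two, with the source doublewords chosen according to whether
 * imm crosses the 8-byte boundary.
 */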
2740 static bool trans_VEXT(DisasContext *s, arg_VEXT *a)
2741 {
2742 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2743 return false;
2744 }
2745
2746 /* UNDEF accesses to D16-D31 if they don't exist. */
2747 if (!dc_isar_feature(aa32_simd_r32, s) &&
2748 ((a->vd | a->vn | a->vm) & 0x10)) {
2749 return false;
2750 }
2751
2752 if ((a->vn | a->vm | a->vd) & a->q) {
2753 return false;
2754 }
2755
2756 if (a->imm > 7 && !a->q) {
2757 return false;
2758 }
2759
2760 if (!vfp_access_check(s)) {
2761 return true;
2762 }
2763
2764 if (!a->q) {
2765 /* Extract 64 bits from <Vm:Vn> */
2766 TCGv_i64 left, right, dest;
2767
2768 left = tcg_temp_new_i64();
2769 right = tcg_temp_new_i64();
2770 dest = tcg_temp_new_i64();
2771
2772 read_neon_element64(right, a->vn, 0, MO_64);
2773 read_neon_element64(left, a->vm, 0, MO_64);
2774 tcg_gen_extract2_i64(dest, right, left, a->imm * 8);
2775 write_neon_element64(dest, a->vd, 0, MO_64);
2776 } else {
2777 /* Extract 128 bits from <Vm+1:Vm:Vn+1:Vn> */
2778 TCGv_i64 left, middle, right, destleft, destright;
2779
2780 left = tcg_temp_new_i64();
2781 middle = tcg_temp_new_i64();
2782 right = tcg_temp_new_i64();
2783 destleft = tcg_temp_new_i64();
2784 destright = tcg_temp_new_i64();
2785
2786 if (a->imm < 8) {
2787 read_neon_element64(right, a->vn, 0, MO_64);
2788 read_neon_element64(middle, a->vn, 1, MO_64);
2789 tcg_gen_extract2_i64(destright, right, middle, a->imm * 8);
2790 read_neon_element64(left, a->vm, 0, MO_64);
2791 tcg_gen_extract2_i64(destleft, middle, left, a->imm * 8);
2792 } else {
2793 read_neon_element64(right, a->vn, 1, MO_64);
2794 read_neon_element64(middle, a->vm, 0, MO_64);
2795 tcg_gen_extract2_i64(destright, right, middle, (a->imm - 8) * 8);
2796 read_neon_element64(left, a->vm, 1, MO_64);
2797 tcg_gen_extract2_i64(destleft, middle, left, (a->imm - 8) * 8);
2798 }
2799
2800 write_neon_element64(destright, a->vd, 0, MO_64);
2801 write_neon_element64(destleft, a->vd, 1, MO_64);
2802 }
2803 return true;
2804 }
2805
2806 static bool trans_VTBL(DisasContext *s, arg_VTBL *a)
2807 {
2808 TCGv_i64 val, def;
2809 TCGv_i32 desc;
2810
2811 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2812 return false;
2813 }
2814
2815 /* UNDEF accesses to D16-D31 if they don't exist. */
2816 if (!dc_isar_feature(aa32_simd_r32, s) &&
2817 ((a->vd | a->vn | a->vm) & 0x10)) {
2818 return false;
2819 }
2820
2821 if ((a->vn + a->len + 1) > 32) {
2822 /*
2823 * This is UNPREDICTABLE; we choose to UNDEF to avoid the
2824 * helper function running off the end of the register file.
2825 */
2826 return false;
2827 }
2828
2829 if (!vfp_access_check(s)) {
2830 return true;
2831 }
2832
2833 desc = tcg_constant_i32((a->vn << 2) | a->len);
2834 def = tcg_temp_new_i64();
2835 if (a->op) {
2836 read_neon_element64(def, a->vd, 0, MO_64);
2837 } else {
2838 tcg_gen_movi_i64(def, 0);
2839 }
2840 val = tcg_temp_new_i64();
2841 read_neon_element64(val, a->vm, 0, MO_64);
2842
2843 gen_helper_neon_tbl(val, cpu_env, desc, val, def);
2844 write_neon_element64(val, a->vd, 0, MO_64);
2845 return true;
2846 }
2847
2848 static bool trans_VDUP_scalar(DisasContext *s, arg_VDUP_scalar *a)
2849 {
2850 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2851 return false;
2852 }
2853
2854 /* UNDEF accesses to D16-D31 if they don't exist. */
2855 if (!dc_isar_feature(aa32_simd_r32, s) &&
2856 ((a->vd | a->vm) & 0x10)) {
2857 return false;
2858 }
2859
2860 if (a->vd & a->q) {
2861 return false;
2862 }
2863
2864 if (!vfp_access_check(s)) {
2865 return true;
2866 }
2867
2868 tcg_gen_gvec_dup_mem(a->size, neon_full_reg_offset(a->vd),
2869 neon_element_offset(a->vm, a->index, a->size),
2870 a->q ? 16 : 8, a->q ? 16 : 8);
2871 return true;
2872 }
2873
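/*
 * VREV64 reverses the order of the elements within each doubleword.
 * Working 32 bits at a time, that is a swap of the two words of each
 * doubleword plus, for the smaller element sizes, a byte or halfword
 * reversal within each word (size 0 and 1 below); for 32-bit elements
 * the word swap alone is enough.
 */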
2874 static bool trans_VREV64(DisasContext *s, arg_VREV64 *a)
2875 {
2876 int pass, half;
2877 TCGv_i32 tmp[2];
2878
2879 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2880 return false;
2881 }
2882
2883 /* UNDEF accesses to D16-D31 if they don't exist. */
2884 if (!dc_isar_feature(aa32_simd_r32, s) &&
2885 ((a->vd | a->vm) & 0x10)) {
2886 return false;
2887 }
2888
2889 if ((a->vd | a->vm) & a->q) {
2890 return false;
2891 }
2892
2893 if (a->size == 3) {
2894 return false;
2895 }
2896
2897 if (!vfp_access_check(s)) {
2898 return true;
2899 }
2900
2901 tmp[0] = tcg_temp_new_i32();
2902 tmp[1] = tcg_temp_new_i32();
2903
2904 for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
2905 for (half = 0; half < 2; half++) {
2906 read_neon_element32(tmp[half], a->vm, pass * 2 + half, MO_32);
2907 switch (a->size) {
2908 case 0:
2909 tcg_gen_bswap32_i32(tmp[half], tmp[half]);
2910 break;
2911 case 1:
2912 gen_swap_half(tmp[half], tmp[half]);
2913 break;
2914 case 2:
2915 break;
2916 default:
2917 g_assert_not_reached();
2918 }
2919 }
2920 write_neon_element32(tmp[1], a->vd, pass * 2, MO_32);
2921 write_neon_element32(tmp[0], a->vd, pass * 2 + 1, MO_32);
2922 }
2923 return true;
2924 }
2925
2926 static bool do_2misc_pairwise(DisasContext *s, arg_2misc *a,
2927 NeonGenWidenFn *widenfn,
2928 NeonGenTwo64OpFn *opfn,
2929 NeonGenTwo64OpFn *accfn)
2930 {
2931 /*
2932 * Pairwise long operations: widen both halves of the pair,
2933 * combine the pairs with the opfn, and then possibly accumulate
2934 * into the destination with the accfn.
2935 */
2936 int pass;
2937
2938 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
2939 return false;
2940 }
2941
2942 /* UNDEF accesses to D16-D31 if they don't exist. */
2943 if (!dc_isar_feature(aa32_simd_r32, s) &&
2944 ((a->vd | a->vm) & 0x10)) {
2945 return false;
2946 }
2947
2948 if ((a->vd | a->vm) & a->q) {
2949 return false;
2950 }
2951
2952 if (!widenfn) {
2953 return false;
2954 }
2955
2956 if (!vfp_access_check(s)) {
2957 return true;
2958 }
2959
2960 for (pass = 0; pass < a->q + 1; pass++) {
2961 TCGv_i32 tmp;
2962 TCGv_i64 rm0_64, rm1_64, rd_64;
2963
2964 rm0_64 = tcg_temp_new_i64();
2965 rm1_64 = tcg_temp_new_i64();
2966 rd_64 = tcg_temp_new_i64();
2967
2968 tmp = tcg_temp_new_i32();
2969 read_neon_element32(tmp, a->vm, pass * 2, MO_32);
2970 widenfn(rm0_64, tmp);
2971 read_neon_element32(tmp, a->vm, pass * 2 + 1, MO_32);
2972 widenfn(rm1_64, tmp);
2973
2974 opfn(rd_64, rm0_64, rm1_64);
2975
2976 if (accfn) {
2977 TCGv_i64 tmp64 = tcg_temp_new_i64();
2978 read_neon_element64(tmp64, a->vd, pass, MO_64);
2979 accfn(rd_64, tmp64, rd_64);
2980 }
2981 write_neon_element64(rd_64, a->vd, pass, MO_64);
2982 }
2983 return true;
2984 }
2985
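/*
 * VPADDL: e.g. VPADDL.S8 sign-extends each byte and sums adjacent
 * pairs into halfword results; VPADAL additionally accumulates those
 * sums into the existing destination elements.
 */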
2986 static bool trans_VPADDL_S(DisasContext *s, arg_2misc *a)
2987 {
2988 static NeonGenWidenFn * const widenfn[] = {
2989 gen_helper_neon_widen_s8,
2990 gen_helper_neon_widen_s16,
2991 tcg_gen_ext_i32_i64,
2992 NULL,
2993 };
2994 static NeonGenTwo64OpFn * const opfn[] = {
2995 gen_helper_neon_paddl_u16,
2996 gen_helper_neon_paddl_u32,
2997 tcg_gen_add_i64,
2998 NULL,
2999 };
3000
3001 return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3002 }
3003
3004 static bool trans_VPADDL_U(DisasContext *s, arg_2misc *a)
3005 {
3006 static NeonGenWidenFn * const widenfn[] = {
3007 gen_helper_neon_widen_u8,
3008 gen_helper_neon_widen_u16,
3009 tcg_gen_extu_i32_i64,
3010 NULL,
3011 };
3012 static NeonGenTwo64OpFn * const opfn[] = {
3013 gen_helper_neon_paddl_u16,
3014 gen_helper_neon_paddl_u32,
3015 tcg_gen_add_i64,
3016 NULL,
3017 };
3018
3019 return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size], NULL);
3020 }
3021
3022 static bool trans_VPADAL_S(DisasContext *s, arg_2misc *a)
3023 {
3024 static NeonGenWidenFn * const widenfn[] = {
3025 gen_helper_neon_widen_s8,
3026 gen_helper_neon_widen_s16,
3027 tcg_gen_ext_i32_i64,
3028 NULL,
3029 };
3030 static NeonGenTwo64OpFn * const opfn[] = {
3031 gen_helper_neon_paddl_u16,
3032 gen_helper_neon_paddl_u32,
3033 tcg_gen_add_i64,
3034 NULL,
3035 };
3036 static NeonGenTwo64OpFn * const accfn[] = {
3037 gen_helper_neon_addl_u16,
3038 gen_helper_neon_addl_u32,
3039 tcg_gen_add_i64,
3040 NULL,
3041 };
3042
3043 return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3044 accfn[a->size]);
3045 }
3046
3047 static bool trans_VPADAL_U(DisasContext *s, arg_2misc *a)
3048 {
3049 static NeonGenWidenFn * const widenfn[] = {
3050 gen_helper_neon_widen_u8,
3051 gen_helper_neon_widen_u16,
3052 tcg_gen_extu_i32_i64,
3053 NULL,
3054 };
3055 static NeonGenTwo64OpFn * const opfn[] = {
3056 gen_helper_neon_paddl_u16,
3057 gen_helper_neon_paddl_u32,
3058 tcg_gen_add_i64,
3059 NULL,
3060 };
3061 static NeonGenTwo64OpFn * const accfn[] = {
3062 gen_helper_neon_addl_u16,
3063 gen_helper_neon_addl_u32,
3064 tcg_gen_add_i64,
3065 NULL,
3066 };
3067
3068 return do_2misc_pairwise(s, a, widenfn[a->size], opfn[a->size],
3069 accfn[a->size]);
3070 }
3071
3072 typedef void ZipFn(TCGv_ptr, TCGv_ptr);
3073
3074 static bool do_zip_uzp(DisasContext *s, arg_2misc *a,
3075 ZipFn *fn)
3076 {
3077 TCGv_ptr pd, pm;
3078
3079 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3080 return false;
3081 }
3082
3083 /* UNDEF accesses to D16-D31 if they don't exist. */
3084 if (!dc_isar_feature(aa32_simd_r32, s) &&
3085 ((a->vd | a->vm) & 0x10)) {
3086 return false;
3087 }
3088
3089 if ((a->vd | a->vm) & a->q) {
3090 return false;
3091 }
3092
3093 if (!fn) {
3094 /* Bad size or size/q combination */
3095 return false;
3096 }
3097
3098 if (!vfp_access_check(s)) {
3099 return true;
3100 }
3101
3102 pd = vfp_reg_ptr(true, a->vd);
3103 pm = vfp_reg_ptr(true, a->vm);
3104 fn(pd, pm);
3105 return true;
3106 }
3107
3108 static bool trans_VUZP(DisasContext *s, arg_2misc *a)
3109 {
3110 static ZipFn * const fn[2][4] = {
3111 {
3112 gen_helper_neon_unzip8,
3113 gen_helper_neon_unzip16,
3114 NULL,
3115 NULL,
3116 }, {
3117 gen_helper_neon_qunzip8,
3118 gen_helper_neon_qunzip16,
3119 gen_helper_neon_qunzip32,
3120 NULL,
3121 }
3122 };
3123 return do_zip_uzp(s, a, fn[a->q][a->size]);
3124 }
3125
3126 static bool trans_VZIP(DisasContext *s, arg_2misc *a)
3127 {
3128 static ZipFn * const fn[2][4] = {
3129 {
3130 gen_helper_neon_zip8,
3131 gen_helper_neon_zip16,
3132 NULL,
3133 NULL,
3134 }, {
3135 gen_helper_neon_qzip8,
3136 gen_helper_neon_qzip16,
3137 gen_helper_neon_qzip32,
3138 NULL,
3139 }
3140 };
3141 return do_zip_uzp(s, a, fn[a->q][a->size]);
3142 }
3143
3144 static bool do_vmovn(DisasContext *s, arg_2misc *a,
3145 NeonGenNarrowEnvFn *narrowfn)
3146 {
3147 TCGv_i64 rm;
3148 TCGv_i32 rd0, rd1;
3149
3150 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3151 return false;
3152 }
3153
3154 /* UNDEF accesses to D16-D31 if they don't exist. */
3155 if (!dc_isar_feature(aa32_simd_r32, s) &&
3156 ((a->vd | a->vm) & 0x10)) {
3157 return false;
3158 }
3159
3160 if (a->vm & 1) {
3161 return false;
3162 }
3163
3164 if (!narrowfn) {
3165 return false;
3166 }
3167
3168 if (!vfp_access_check(s)) {
3169 return true;
3170 }
3171
3172 rm = tcg_temp_new_i64();
3173 rd0 = tcg_temp_new_i32();
3174 rd1 = tcg_temp_new_i32();
3175
3176 read_neon_element64(rm, a->vm, 0, MO_64);
3177 narrowfn(rd0, cpu_env, rm);
3178 read_neon_element64(rm, a->vm, 1, MO_64);
3179 narrowfn(rd1, cpu_env, rm);
3180 write_neon_element32(rd0, a->vd, 0, MO_32);
3181 write_neon_element32(rd1, a->vd, 1, MO_32);
3182 return true;
3183 }
3184
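/*
 * Narrowing moves: plain VMOVN simply truncates each element, while
 * the VQMOVN/VQMOVUN forms saturate and may set the cumulative
 * saturation flag FPSCR.QC, which is why the narrowing functions use
 * the env-taking signature.
 */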
3185 #define DO_VMOVN(INSN, FUNC) \
3186 static bool trans_##INSN(DisasContext *s, arg_2misc *a) \
3187 { \
3188 static NeonGenNarrowEnvFn * const narrowfn[] = { \
3189 FUNC##8, \
3190 FUNC##16, \
3191 FUNC##32, \
3192 NULL, \
3193 }; \
3194 return do_vmovn(s, a, narrowfn[a->size]); \
3195 }
3196
3197 DO_VMOVN(VMOVN, gen_neon_narrow_u)
3198 DO_VMOVN(VQMOVUN, gen_helper_neon_unarrow_sat)
3199 DO_VMOVN(VQMOVN_S, gen_helper_neon_narrow_sat_s)
3200 DO_VMOVN(VQMOVN_U, gen_helper_neon_narrow_sat_u)
3201
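/*
 * This is the 2-reg-misc VSHLL, where the shift count is implicitly
 * the source element width: each element is widened and then shifted
 * left by 8 << size bits, so the extension bits are shifted out
 * entirely and the unsigned widen functions can be used for both
 * signed and unsigned datatypes.
 */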
3202 static bool trans_VSHLL(DisasContext *s, arg_2misc *a)
3203 {
3204 TCGv_i32 rm0, rm1;
3205 TCGv_i64 rd;
3206 static NeonGenWidenFn * const widenfns[] = {
3207 gen_helper_neon_widen_u8,
3208 gen_helper_neon_widen_u16,
3209 tcg_gen_extu_i32_i64,
3210 NULL,
3211 };
3212 NeonGenWidenFn *widenfn = widenfns[a->size];
3213
3214 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3215 return false;
3216 }
3217
3218 /* UNDEF accesses to D16-D31 if they don't exist. */
3219 if (!dc_isar_feature(aa32_simd_r32, s) &&
3220 ((a->vd | a->vm) & 0x10)) {
3221 return false;
3222 }
3223
3224 if (a->vd & 1) {
3225 return false;
3226 }
3227
3228 if (!widenfn) {
3229 return false;
3230 }
3231
3232 if (!vfp_access_check(s)) {
3233 return true;
3234 }
3235
3236 rd = tcg_temp_new_i64();
3237 rm0 = tcg_temp_new_i32();
3238 rm1 = tcg_temp_new_i32();
3239
3240 read_neon_element32(rm0, a->vm, 0, MO_32);
3241 read_neon_element32(rm1, a->vm, 1, MO_32);
3242
3243 widenfn(rd, rm0);
3244 tcg_gen_shli_i64(rd, rd, 8 << a->size);
3245 write_neon_element64(rd, a->vd, 0, MO_64);
3246 widenfn(rd, rm1);
3247 tcg_gen_shli_i64(rd, rd, 8 << a->size);
3248 write_neon_element64(rd, a->vd, 1, MO_64);
3249 return true;
3250 }
3251
3252 static bool trans_VCVT_B16_F32(DisasContext *s, arg_2misc *a)
3253 {
3254 TCGv_ptr fpst;
3255 TCGv_i64 tmp;
3256 TCGv_i32 dst0, dst1;
3257
3258 if (!dc_isar_feature(aa32_bf16, s)) {
3259 return false;
3260 }
3261
3262 /* UNDEF accesses to D16-D31 if they don't exist. */
3263 if (!dc_isar_feature(aa32_simd_r32, s) &&
3264 ((a->vd | a->vm) & 0x10)) {
3265 return false;
3266 }
3267
3268 if ((a->vm & 1) || (a->size != 1)) {
3269 return false;
3270 }
3271
3272 if (!vfp_access_check(s)) {
3273 return true;
3274 }
3275
3276 fpst = fpstatus_ptr(FPST_STD);
3277 tmp = tcg_temp_new_i64();
3278 dst0 = tcg_temp_new_i32();
3279 dst1 = tcg_temp_new_i32();
3280
3281 read_neon_element64(tmp, a->vm, 0, MO_64);
3282 gen_helper_bfcvt_pair(dst0, tmp, fpst);
3283
3284 read_neon_element64(tmp, a->vm, 1, MO_64);
3285 gen_helper_bfcvt_pair(dst1, tmp, fpst);
3286
3287 write_neon_element32(dst0, a->vd, 0, MO_32);
3288 write_neon_element32(dst1, a->vd, 1, MO_32);
3289 return true;
3290 }
3291
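/*
 * VCVT between half and single precision converts four lanes between a
 * D register of f16 values and a Q register of f32 values.  The reads
 * and writes below are ordered so that all source elements are read
 * before the first destination element is written, since the
 * destination may overlap the source; the AHP flag is passed to the
 * conversion helpers so the alternative half-precision format is
 * honoured.
 */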
3292 static bool trans_VCVT_F16_F32(DisasContext *s, arg_2misc *a)
3293 {
3294 TCGv_ptr fpst;
3295 TCGv_i32 ahp, tmp, tmp2, tmp3;
3296
3297 if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
3298 !dc_isar_feature(aa32_fp16_spconv, s)) {
3299 return false;
3300 }
3301
3302 /* UNDEF accesses to D16-D31 if they don't exist. */
3303 if (!dc_isar_feature(aa32_simd_r32, s) &&
3304 ((a->vd | a->vm) & 0x10)) {
3305 return false;
3306 }
3307
3308 if ((a->vm & 1) || (a->size != 1)) {
3309 return false;
3310 }
3311
3312 if (!vfp_access_check(s)) {
3313 return true;
3314 }
3315
3316 fpst = fpstatus_ptr(FPST_STD);
3317 ahp = get_ahp_flag();
3318 tmp = tcg_temp_new_i32();
3319 read_neon_element32(tmp, a->vm, 0, MO_32);
3320 gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
3321 tmp2 = tcg_temp_new_i32();
3322 read_neon_element32(tmp2, a->vm, 1, MO_32);
3323 gen_helper_vfp_fcvt_f32_to_f16(tmp2, tmp2, fpst, ahp);
3324 tcg_gen_shli_i32(tmp2, tmp2, 16);
3325 tcg_gen_or_i32(tmp2, tmp2, tmp);
3326 read_neon_element32(tmp, a->vm, 2, MO_32);
3327 gen_helper_vfp_fcvt_f32_to_f16(tmp, tmp, fpst, ahp);
3328 tmp3 = tcg_temp_new_i32();
3329 read_neon_element32(tmp3, a->vm, 3, MO_32);
3330 write_neon_element32(tmp2, a->vd, 0, MO_32);
3331 gen_helper_vfp_fcvt_f32_to_f16(tmp3, tmp3, fpst, ahp);
3332 tcg_gen_shli_i32(tmp3, tmp3, 16);
3333 tcg_gen_or_i32(tmp3, tmp3, tmp);
3334 write_neon_element32(tmp3, a->vd, 1, MO_32);
3335 return true;
3336 }
3337
3338 static bool trans_VCVT_F32_F16(DisasContext *s, arg_2misc *a)
3339 {
3340 TCGv_ptr fpst;
3341 TCGv_i32 ahp, tmp, tmp2, tmp3;
3342
3343 if (!arm_dc_feature(s, ARM_FEATURE_NEON) ||
3344 !dc_isar_feature(aa32_fp16_spconv, s)) {
3345 return false;
3346 }
3347
3348 /* UNDEF accesses to D16-D31 if they don't exist. */
3349 if (!dc_isar_feature(aa32_simd_r32, s) &&
3350 ((a->vd | a->vm) & 0x10)) {
3351 return false;
3352 }
3353
3354 if ((a->vd & 1) || (a->size != 1)) {
3355 return false;
3356 }
3357
3358 if (!vfp_access_check(s)) {
3359 return true;
3360 }
3361
3362 fpst = fpstatus_ptr(FPST_STD);
3363 ahp = get_ahp_flag();
3364 tmp3 = tcg_temp_new_i32();
3365 tmp2 = tcg_temp_new_i32();
3366 tmp = tcg_temp_new_i32();
3367 read_neon_element32(tmp, a->vm, 0, MO_32);
3368 read_neon_element32(tmp2, a->vm, 1, MO_32);
3369 tcg_gen_ext16u_i32(tmp3, tmp);
3370 gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
3371 write_neon_element32(tmp3, a->vd, 0, MO_32);
3372 tcg_gen_shri_i32(tmp, tmp, 16);
3373 gen_helper_vfp_fcvt_f16_to_f32(tmp, tmp, fpst, ahp);
3374 write_neon_element32(tmp, a->vd, 1, MO_32);
3375 tcg_gen_ext16u_i32(tmp3, tmp2);
3376 gen_helper_vfp_fcvt_f16_to_f32(tmp3, tmp3, fpst, ahp);
3377 write_neon_element32(tmp3, a->vd, 2, MO_32);
3378 tcg_gen_shri_i32(tmp2, tmp2, 16);
3379 gen_helper_vfp_fcvt_f16_to_f32(tmp2, tmp2, fpst, ahp);
3380 write_neon_element32(tmp2, a->vd, 3, MO_32);
3381 return true;
3382 }
3383
3384 static bool do_2misc_vec(DisasContext *s, arg_2misc *a, GVecGen2Fn *fn)
3385 {
3386 int vec_size = a->q ? 16 : 8;
3387 int rd_ofs = neon_full_reg_offset(a->vd);
3388 int rm_ofs = neon_full_reg_offset(a->vm);
3389
3390 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3391 return false;
3392 }
3393
3394 /* UNDEF accesses to D16-D31 if they don't exist. */
3395 if (!dc_isar_feature(aa32_simd_r32, s) &&
3396 ((a->vd | a->vm) & 0x10)) {
3397 return false;
3398 }
3399
3400 if (a->size == 3) {
3401 return false;
3402 }
3403
3404 if ((a->vd | a->vm) & a->q) {
3405 return false;
3406 }
3407
3408 if (!vfp_access_check(s)) {
3409 return true;
3410 }
3411
3412 fn(a->size, rd_ofs, rm_ofs, vec_size, vec_size);
3413
3414 return true;
3415 }
3416
3417 #define DO_2MISC_VEC(INSN, FN) \
3418 static bool trans_##INSN(DisasContext *s, arg_2misc *a) \
3419 { \
3420 return do_2misc_vec(s, a, FN); \
3421 }
3422
3423 DO_2MISC_VEC(VNEG, tcg_gen_gvec_neg)
3424 DO_2MISC_VEC(VABS, tcg_gen_gvec_abs)
3425 DO_2MISC_VEC(VCEQ0, gen_gvec_ceq0)
3426 DO_2MISC_VEC(VCGT0, gen_gvec_cgt0)
3427 DO_2MISC_VEC(VCLE0, gen_gvec_cle0)
3428 DO_2MISC_VEC(VCGE0, gen_gvec_cge0)
3429 DO_2MISC_VEC(VCLT0, gen_gvec_clt0)
3430
3431 static bool trans_VMVN(DisasContext *s, arg_2misc *a)
3432 {
3433 if (a->size != 0) {
3434 return false;
3435 }
3436 return do_2misc_vec(s, a, tcg_gen_gvec_not);
3437 }
3438
3439 #define WRAP_2M_3_OOL_FN(WRAPNAME, FUNC, DATA) \
3440 static void WRAPNAME(unsigned vece, uint32_t rd_ofs, \
3441 uint32_t rm_ofs, uint32_t oprsz, \
3442 uint32_t maxsz) \
3443 { \
3444 tcg_gen_gvec_3_ool(rd_ofs, rd_ofs, rm_ofs, oprsz, maxsz, \
3445 DATA, FUNC); \
3446 }
3447
3448 #define WRAP_2M_2_OOL_FN(WRAPNAME, FUNC, DATA) \
3449 static void WRAPNAME(unsigned vece, uint32_t rd_ofs, \
3450 uint32_t rm_ofs, uint32_t oprsz, \
3451 uint32_t maxsz) \
3452 { \
3453 tcg_gen_gvec_2_ool(rd_ofs, rm_ofs, oprsz, maxsz, DATA, FUNC); \
3454 }
3455
3456 WRAP_2M_3_OOL_FN(gen_AESE, gen_helper_crypto_aese, 0)
3457 WRAP_2M_3_OOL_FN(gen_AESD, gen_helper_crypto_aese, 1)
3458 WRAP_2M_2_OOL_FN(gen_AESMC, gen_helper_crypto_aesmc, 0)
3459 WRAP_2M_2_OOL_FN(gen_AESIMC, gen_helper_crypto_aesmc, 1)
3460 WRAP_2M_2_OOL_FN(gen_SHA1H, gen_helper_crypto_sha1h, 0)
3461 WRAP_2M_2_OOL_FN(gen_SHA1SU1, gen_helper_crypto_sha1su1, 0)
3462 WRAP_2M_2_OOL_FN(gen_SHA256SU0, gen_helper_crypto_sha256su0, 0)
3463
3464 #define DO_2M_CRYPTO(INSN, FEATURE, SIZE) \
3465 static bool trans_##INSN(DisasContext *s, arg_2misc *a) \
3466 { \
3467 if (!dc_isar_feature(FEATURE, s) || a->size != SIZE) { \
3468 return false; \
3469 } \
3470 return do_2misc_vec(s, a, gen_##INSN); \
3471 }
3472
3473 DO_2M_CRYPTO(AESE, aa32_aes, 0)
3474 DO_2M_CRYPTO(AESD, aa32_aes, 0)
3475 DO_2M_CRYPTO(AESMC, aa32_aes, 0)
3476 DO_2M_CRYPTO(AESIMC, aa32_aes, 0)
3477 DO_2M_CRYPTO(SHA1H, aa32_sha1, 2)
3478 DO_2M_CRYPTO(SHA1SU1, aa32_sha1, 2)
3479 DO_2M_CRYPTO(SHA256SU0, aa32_sha2, 2)
3480
3481 static bool do_2misc(DisasContext *s, arg_2misc *a, NeonGenOneOpFn *fn)
3482 {
3483 TCGv_i32 tmp;
3484 int pass;
3485
3486 /* Handle a 2-reg-misc operation by iterating 32 bits at a time */
3487 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3488 return false;
3489 }
3490
3491 /* UNDEF accesses to D16-D31 if they don't exist. */
3492 if (!dc_isar_feature(aa32_simd_r32, s) &&
3493 ((a->vd | a->vm) & 0x10)) {
3494 return false;
3495 }
3496
3497 if (!fn) {
3498 return false;
3499 }
3500
3501 if ((a->vd | a->vm) & a->q) {
3502 return false;
3503 }
3504
3505 if (!vfp_access_check(s)) {
3506 return true;
3507 }
3508
3509 tmp = tcg_temp_new_i32();
3510 for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
3511 read_neon_element32(tmp, a->vm, pass, MO_32);
3512 fn(tmp, tmp);
3513 write_neon_element32(tmp, a->vd, pass, MO_32);
3514 }
3515 return true;
3516 }
3517
3518 static bool trans_VREV32(DisasContext *s, arg_2misc *a)
3519 {
3520 static NeonGenOneOpFn * const fn[] = {
3521 tcg_gen_bswap32_i32,
3522 gen_swap_half,
3523 NULL,
3524 NULL,
3525 };
3526 return do_2misc(s, a, fn[a->size]);
3527 }
3528
3529 static bool trans_VREV16(DisasContext *s, arg_2misc *a)
3530 {
3531 if (a->size != 0) {
3532 return false;
3533 }
3534 return do_2misc(s, a, gen_rev16);
3535 }
3536
3537 static bool trans_VCLS(DisasContext *s, arg_2misc *a)
3538 {
3539 static NeonGenOneOpFn * const fn[] = {
3540 gen_helper_neon_cls_s8,
3541 gen_helper_neon_cls_s16,
3542 gen_helper_neon_cls_s32,
3543 NULL,
3544 };
3545 return do_2misc(s, a, fn[a->size]);
3546 }
3547
3548 static void do_VCLZ_32(TCGv_i32 rd, TCGv_i32 rm)
3549 {
3550 tcg_gen_clzi_i32(rd, rm, 32);
3551 }
3552
3553 static bool trans_VCLZ(DisasContext *s, arg_2misc *a)
3554 {
3555 static NeonGenOneOpFn * const fn[] = {
3556 gen_helper_neon_clz_u8,
3557 gen_helper_neon_clz_u16,
3558 do_VCLZ_32,
3559 NULL,
3560 };
3561 return do_2misc(s, a, fn[a->size]);
3562 }
3563
3564 static bool trans_VCNT(DisasContext *s, arg_2misc *a)
3565 {
3566 if (a->size != 0) {
3567 return false;
3568 }
3569 return do_2misc(s, a, gen_helper_neon_cnt_u8);
3570 }
3571
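/*
 * Float VABS and VNEG only touch the sign bit, so they can be done as
 * plain vector AND/XOR with the appropriate mask and need no fpstatus
 * or helper call.
 */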
3572 static void gen_VABS_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
3573 uint32_t oprsz, uint32_t maxsz)
3574 {
3575 tcg_gen_gvec_andi(vece, rd_ofs, rm_ofs,
3576 vece == MO_16 ? 0x7fff : 0x7fffffff,
3577 oprsz, maxsz);
3578 }
3579
3580 static bool trans_VABS_F(DisasContext *s, arg_2misc *a)
3581 {
3582 if (a->size == MO_16) {
3583 if (!dc_isar_feature(aa32_fp16_arith, s)) {
3584 return false;
3585 }
3586 } else if (a->size != MO_32) {
3587 return false;
3588 }
3589 return do_2misc_vec(s, a, gen_VABS_F);
3590 }
3591
3592 static void gen_VNEG_F(unsigned vece, uint32_t rd_ofs, uint32_t rm_ofs,
3593 uint32_t oprsz, uint32_t maxsz)
3594 {
3595 tcg_gen_gvec_xori(vece, rd_ofs, rm_ofs,
3596 vece == MO_16 ? 0x8000 : 0x80000000,
3597 oprsz, maxsz);
3598 }
3599
3600 static bool trans_VNEG_F(DisasContext *s, arg_2misc *a)
3601 {
3602 if (a->size == MO_16) {
3603 if (!dc_isar_feature(aa32_fp16_arith, s)) {
3604 return false;
3605 }
3606 } else if (a->size != MO_32) {
3607 return false;
3608 }
3609 return do_2misc_vec(s, a, gen_VNEG_F);
3610 }
3611
3612 static bool trans_VRECPE(DisasContext *s, arg_2misc *a)
3613 {
3614 if (a->size != 2) {
3615 return false;
3616 }
3617 return do_2misc(s, a, gen_helper_recpe_u32);
3618 }
3619
3620 static bool trans_VRSQRTE(DisasContext *s, arg_2misc *a)
3621 {
3622 if (a->size != 2) {
3623 return false;
3624 }
3625 return do_2misc(s, a, gen_helper_rsqrte_u32);
3626 }
3627
3628 #define WRAP_1OP_ENV_FN(WRAPNAME, FUNC) \
3629 static void WRAPNAME(TCGv_i32 d, TCGv_i32 m) \
3630 { \
3631 FUNC(d, cpu_env, m); \
3632 }
3633
3634 WRAP_1OP_ENV_FN(gen_VQABS_s8, gen_helper_neon_qabs_s8)
3635 WRAP_1OP_ENV_FN(gen_VQABS_s16, gen_helper_neon_qabs_s16)
3636 WRAP_1OP_ENV_FN(gen_VQABS_s32, gen_helper_neon_qabs_s32)
3637 WRAP_1OP_ENV_FN(gen_VQNEG_s8, gen_helper_neon_qneg_s8)
3638 WRAP_1OP_ENV_FN(gen_VQNEG_s16, gen_helper_neon_qneg_s16)
3639 WRAP_1OP_ENV_FN(gen_VQNEG_s32, gen_helper_neon_qneg_s32)
3640
3641 static bool trans_VQABS(DisasContext *s, arg_2misc *a)
3642 {
3643 static NeonGenOneOpFn * const fn[] = {
3644 gen_VQABS_s8,
3645 gen_VQABS_s16,
3646 gen_VQABS_s32,
3647 NULL,
3648 };
3649 return do_2misc(s, a, fn[a->size]);
3650 }
3651
3652 static bool trans_VQNEG(DisasContext *s, arg_2misc *a)
3653 {
3654 static NeonGenOneOpFn * const fn[] = {
3655 gen_VQNEG_s8,
3656 gen_VQNEG_s16,
3657 gen_VQNEG_s32,
3658 NULL,
3659 };
3660 return do_2misc(s, a, fn[a->size]);
3661 }
3662
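/*
 * FP two-reg-misc ops act on the whole vector via gvec helpers.  Neon
 * arithmetic uses the "standard FPSCR value", so the fpstatus passed
 * down is FPST_STD (or FPST_STD_F16 for fp16 elements) rather than the
 * status governed by the current FPSCR controls.
 */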
3663 #define DO_2MISC_FP_VEC(INSN, HFUNC, SFUNC) \
3664 static void gen_##INSN(unsigned vece, uint32_t rd_ofs, \
3665 uint32_t rm_ofs, \
3666 uint32_t oprsz, uint32_t maxsz) \
3667 { \
3668 static gen_helper_gvec_2_ptr * const fns[4] = { \
3669 NULL, HFUNC, SFUNC, NULL, \
3670 }; \
3671 TCGv_ptr fpst; \
3672 fpst = fpstatus_ptr(vece == MO_16 ? FPST_STD_F16 : FPST_STD); \
3673 tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz, 0, \
3674 fns[vece]); \
3675 } \
3676 static bool trans_##INSN(DisasContext *s, arg_2misc *a) \
3677 { \
3678 if (a->size == MO_16) { \
3679 if (!dc_isar_feature(aa32_fp16_arith, s)) { \
3680 return false; \
3681 } \
3682 } else if (a->size != MO_32) { \
3683 return false; \
3684 } \
3685 return do_2misc_vec(s, a, gen_##INSN); \
3686 }
3687
3688 DO_2MISC_FP_VEC(VRECPE_F, gen_helper_gvec_frecpe_h, gen_helper_gvec_frecpe_s)
3689 DO_2MISC_FP_VEC(VRSQRTE_F, gen_helper_gvec_frsqrte_h, gen_helper_gvec_frsqrte_s)
3690 DO_2MISC_FP_VEC(VCGT0_F, gen_helper_gvec_fcgt0_h, gen_helper_gvec_fcgt0_s)
3691 DO_2MISC_FP_VEC(VCGE0_F, gen_helper_gvec_fcge0_h, gen_helper_gvec_fcge0_s)
3692 DO_2MISC_FP_VEC(VCEQ0_F, gen_helper_gvec_fceq0_h, gen_helper_gvec_fceq0_s)
3693 DO_2MISC_FP_VEC(VCLT0_F, gen_helper_gvec_fclt0_h, gen_helper_gvec_fclt0_s)
3694 DO_2MISC_FP_VEC(VCLE0_F, gen_helper_gvec_fcle0_h, gen_helper_gvec_fcle0_s)
3695 DO_2MISC_FP_VEC(VCVT_FS, gen_helper_gvec_sstoh, gen_helper_gvec_sitos)
3696 DO_2MISC_FP_VEC(VCVT_FU, gen_helper_gvec_ustoh, gen_helper_gvec_uitos)
3697 DO_2MISC_FP_VEC(VCVT_SF, gen_helper_gvec_tosszh, gen_helper_gvec_tosizs)
3698 DO_2MISC_FP_VEC(VCVT_UF, gen_helper_gvec_touszh, gen_helper_gvec_touizs)
3699
3700 DO_2MISC_FP_VEC(VRINTX_impl, gen_helper_gvec_vrintx_h, gen_helper_gvec_vrintx_s)
3701
3702 static bool trans_VRINTX(DisasContext *s, arg_2misc *a)
3703 {
3704 if (!arm_dc_feature(s, ARM_FEATURE_V8)) {
3705 return false;
3706 }
3707 return trans_VRINTX_impl(s, a);
3708 }
3709
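/*
 * The v8 directed-rounding conversions and roundings (VCVTA/N/P/M,
 * VRINTN/A/Z/M/P) use a fixed rounding mode rather than the one in the
 * FPSCR; it is translated to its softfloat encoding with
 * arm_rmode_to_sf() and handed to the helper as the gvec data value.
 */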
3710 #define DO_VEC_RMODE(INSN, RMODE, OP) \
3711 static void gen_##INSN(unsigned vece, uint32_t rd_ofs, \
3712 uint32_t rm_ofs, \
3713 uint32_t oprsz, uint32_t maxsz) \
3714 { \
3715 static gen_helper_gvec_2_ptr * const fns[4] = { \
3716 NULL, \
3717 gen_helper_gvec_##OP##h, \
3718 gen_helper_gvec_##OP##s, \
3719 NULL, \
3720 }; \
3721 TCGv_ptr fpst; \
3722         fpst = fpstatus_ptr(vece == MO_16 ? FPST_STD_F16 : FPST_STD); \
3723 tcg_gen_gvec_2_ptr(rd_ofs, rm_ofs, fpst, oprsz, maxsz, \
3724 arm_rmode_to_sf(RMODE), fns[vece]); \
3725 } \
3726 static bool trans_##INSN(DisasContext *s, arg_2misc *a) \
3727 { \
3728 if (!arm_dc_feature(s, ARM_FEATURE_V8)) { \
3729 return false; \
3730 } \
3731 if (a->size == MO_16) { \
3732 if (!dc_isar_feature(aa32_fp16_arith, s)) { \
3733 return false; \
3734 } \
3735 } else if (a->size != MO_32) { \
3736 return false; \
3737 } \
3738 return do_2misc_vec(s, a, gen_##INSN); \
3739 }
3740
3741 DO_VEC_RMODE(VCVTAU, FPROUNDING_TIEAWAY, vcvt_rm_u)
3742 DO_VEC_RMODE(VCVTAS, FPROUNDING_TIEAWAY, vcvt_rm_s)
3743 DO_VEC_RMODE(VCVTNU, FPROUNDING_TIEEVEN, vcvt_rm_u)
3744 DO_VEC_RMODE(VCVTNS, FPROUNDING_TIEEVEN, vcvt_rm_s)
3745 DO_VEC_RMODE(VCVTPU, FPROUNDING_POSINF, vcvt_rm_u)
3746 DO_VEC_RMODE(VCVTPS, FPROUNDING_POSINF, vcvt_rm_s)
3747 DO_VEC_RMODE(VCVTMU, FPROUNDING_NEGINF, vcvt_rm_u)
3748 DO_VEC_RMODE(VCVTMS, FPROUNDING_NEGINF, vcvt_rm_s)
3749
3750 DO_VEC_RMODE(VRINTN, FPROUNDING_TIEEVEN, vrint_rm_)
3751 DO_VEC_RMODE(VRINTA, FPROUNDING_TIEAWAY, vrint_rm_)
3752 DO_VEC_RMODE(VRINTZ, FPROUNDING_ZERO, vrint_rm_)
3753 DO_VEC_RMODE(VRINTM, FPROUNDING_NEGINF, vrint_rm_)
3754 DO_VEC_RMODE(VRINTP, FPROUNDING_POSINF, vrint_rm_)
3755
3756 static bool trans_VSWP(DisasContext *s, arg_2misc *a)
3757 {
3758 TCGv_i64 rm, rd;
3759 int pass;
3760
3761 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3762 return false;
3763 }
3764
3765 /* UNDEF accesses to D16-D31 if they don't exist. */
3766 if (!dc_isar_feature(aa32_simd_r32, s) &&
3767 ((a->vd | a->vm) & 0x10)) {
3768 return false;
3769 }
3770
3771 if (a->size != 0) {
3772 return false;
3773 }
3774
3775 if ((a->vd | a->vm) & a->q) {
3776 return false;
3777 }
3778
3779 if (!vfp_access_check(s)) {
3780 return true;
3781 }
3782
3783 rm = tcg_temp_new_i64();
3784 rd = tcg_temp_new_i64();
3785 for (pass = 0; pass < (a->q ? 2 : 1); pass++) {
3786 read_neon_element64(rm, a->vm, pass, MO_64);
3787 read_neon_element64(rd, a->vd, pass, MO_64);
3788 write_neon_element64(rm, a->vd, pass, MO_64);
3789 write_neon_element64(rd, a->vm, pass, MO_64);
3790 }
3791 return true;
3792 }
3793
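/*
 * VTRN treats corresponding pairs of elements in Vd and Vm as 2x2
 * matrices and transposes them, i.e. the odd-numbered elements of Vd
 * are exchanged with the even-numbered elements of Vm.  For 8- and
 * 16-bit elements this is done 32 bits at a time with the shift/mask
 * helpers below; for 32-bit elements it is a straight element exchange.
 */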
3794 static void gen_neon_trn_u8(TCGv_i32 t0, TCGv_i32 t1)
3795 {
3796 TCGv_i32 rd, tmp;
3797
3798 rd = tcg_temp_new_i32();
3799 tmp = tcg_temp_new_i32();
3800
3801 tcg_gen_shli_i32(rd, t0, 8);
3802 tcg_gen_andi_i32(rd, rd, 0xff00ff00);
3803 tcg_gen_andi_i32(tmp, t1, 0x00ff00ff);
3804 tcg_gen_or_i32(rd, rd, tmp);
3805
3806 tcg_gen_shri_i32(t1, t1, 8);
3807 tcg_gen_andi_i32(t1, t1, 0x00ff00ff);
3808 tcg_gen_andi_i32(tmp, t0, 0xff00ff00);
3809 tcg_gen_or_i32(t1, t1, tmp);
3810 tcg_gen_mov_i32(t0, rd);
3811 }
3812
3813 static void gen_neon_trn_u16(TCGv_i32 t0, TCGv_i32 t1)
3814 {
3815 TCGv_i32 rd, tmp;
3816
3817 rd = tcg_temp_new_i32();
3818 tmp = tcg_temp_new_i32();
3819
3820 tcg_gen_shli_i32(rd, t0, 16);
3821 tcg_gen_andi_i32(tmp, t1, 0xffff);
3822 tcg_gen_or_i32(rd, rd, tmp);
3823 tcg_gen_shri_i32(t1, t1, 16);
3824 tcg_gen_andi_i32(tmp, t0, 0xffff0000);
3825 tcg_gen_or_i32(t1, t1, tmp);
3826 tcg_gen_mov_i32(t0, rd);
3827 }
3828
3829 static bool trans_VTRN(DisasContext *s, arg_2misc *a)
3830 {
3831 TCGv_i32 tmp, tmp2;
3832 int pass;
3833
3834 if (!arm_dc_feature(s, ARM_FEATURE_NEON)) {
3835 return false;
3836 }
3837
3838 /* UNDEF accesses to D16-D31 if they don't exist. */
3839 if (!dc_isar_feature(aa32_simd_r32, s) &&
3840 ((a->vd | a->vm) & 0x10)) {
3841 return false;
3842 }
3843
3844 if ((a->vd | a->vm) & a->q) {
3845 return false;
3846 }
3847
3848 if (a->size == 3) {
3849 return false;
3850 }
3851
3852 if (!vfp_access_check(s)) {
3853 return true;
3854 }
3855
3856 tmp = tcg_temp_new_i32();
3857 tmp2 = tcg_temp_new_i32();
3858 if (a->size == MO_32) {
3859 for (pass = 0; pass < (a->q ? 4 : 2); pass += 2) {
3860 read_neon_element32(tmp, a->vm, pass, MO_32);
3861 read_neon_element32(tmp2, a->vd, pass + 1, MO_32);
3862 write_neon_element32(tmp2, a->vm, pass, MO_32);
3863 write_neon_element32(tmp, a->vd, pass + 1, MO_32);
3864 }
3865 } else {
3866 for (pass = 0; pass < (a->q ? 4 : 2); pass++) {
3867 read_neon_element32(tmp, a->vm, pass, MO_32);
3868 read_neon_element32(tmp2, a->vd, pass, MO_32);
3869 if (a->size == MO_8) {
3870 gen_neon_trn_u8(tmp, tmp2);
3871 } else {
3872 gen_neon_trn_u16(tmp, tmp2);
3873 }
3874 write_neon_element32(tmp2, a->vm, pass, MO_32);
3875 write_neon_element32(tmp, a->vd, pass, MO_32);
3876 }
3877 }
3878 return true;
3879 }
3880
3881 static bool trans_VSMMLA(DisasContext *s, arg_VSMMLA *a)
3882 {
3883 if (!dc_isar_feature(aa32_i8mm, s)) {
3884 return false;
3885 }
3886 return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
3887 gen_helper_gvec_smmla_b);
3888 }
3889
3890 static bool trans_VUMMLA(DisasContext *s, arg_VUMMLA *a)
3891 {
3892 if (!dc_isar_feature(aa32_i8mm, s)) {
3893 return false;
3894 }
3895 return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
3896 gen_helper_gvec_ummla_b);
3897 }
3898
3899 static bool trans_VUSMMLA(DisasContext *s, arg_VUSMMLA *a)
3900 {
3901 if (!dc_isar_feature(aa32_i8mm, s)) {
3902 return false;
3903 }
3904 return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
3905 gen_helper_gvec_usmmla_b);
3906 }
3907
3908 static bool trans_VMMLA_b16(DisasContext *s, arg_VMMLA_b16 *a)
3909 {
3910 if (!dc_isar_feature(aa32_bf16, s)) {
3911 return false;
3912 }
3913 return do_neon_ddda(s, 7, a->vd, a->vn, a->vm, 0,
3914 gen_helper_gvec_bfmmla);
3915 }
3916
3917 static bool trans_VFMA_b16(DisasContext *s, arg_VFMA_b16 *a)
3918 {
3919 if (!dc_isar_feature(aa32_bf16, s)) {
3920 return false;
3921 }
3922 return do_neon_ddda_fpst(s, 7, a->vd, a->vn, a->vm, a->q, FPST_STD,
3923 gen_helper_gvec_bfmlal);
3924 }
3925
3926 static bool trans_VFMA_b16_scal(DisasContext *s, arg_VFMA_b16_scal *a)
3927 {
3928 if (!dc_isar_feature(aa32_bf16, s)) {
3929 return false;
3930 }
3931 return do_neon_ddda_fpst(s, 6, a->vd, a->vn, a->vm,
3932 (a->index << 1) | a->q, FPST_STD,
3933 gen_helper_gvec_bfmlal_idx);
3934 }