tcg/aarch64/tcg-target.c.inc

   1 /*
   2  * Initial TCG Implementation for aarch64
   3  *
   4  * Copyright (c) 2013 Huawei Technologies Duesseldorf GmbH
   5  * Written by Claudio Fontana
   6  *
   7  * This work is licensed under the terms of the GNU GPL, version 2 or
   8  * (at your option) any later version.
   9  *
  10  * See the COPYING file in the top-level directory for details.
  11  */
  12
  13 #include "../tcg-ldst.c.inc"
  14 #include "../tcg-pool.c.inc"
  15 #include "qemu/bitops.h"
  16
  17 /* We're going to re-use TCGType in setting of the SF bit, which controls
  18    the size of the operation performed.  If we know the values match, it
  19    makes things much cleaner.  */
  20 QEMU_BUILD_BUG_ON(TCG_TYPE_I32 != 0 || TCG_TYPE_I64 != 1);
  21
  22 #ifdef CONFIG_DEBUG_TCG
  23 static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
  24     "x0", "x1", "x2", "x3", "x4", "x5", "x6", "x7",
  25     "x8", "x9", "x10", "x11", "x12", "x13", "x14", "x15",
  26     "x16", "x17", "x18", "x19", "x20", "x21", "x22", "x23",
  27     "x24", "x25", "x26", "x27", "x28", "fp", "x30", "sp",
  28
  29     "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7",
  30     "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15",
  31     "v16", "v17", "v18", "v19", "v20", "v21", "v22", "v23",
  32     "v24", "v25", "v26", "v27", "v28", "fp", "v30", "v31",
  33 };
  34 #endif /* CONFIG_DEBUG_TCG */
  35
  36 static const int tcg_target_reg_alloc_order[] = {
  37     TCG_REG_X20, TCG_REG_X21, TCG_REG_X22, TCG_REG_X23,
  38     TCG_REG_X24, TCG_REG_X25, TCG_REG_X26, TCG_REG_X27,
  39     TCG_REG_X28, /* we will reserve this for guest_base if configured */
  40
  41     TCG_REG_X8, TCG_REG_X9, TCG_REG_X10, TCG_REG_X11,
  42     TCG_REG_X12, TCG_REG_X13, TCG_REG_X14, TCG_REG_X15,
  43
  44     TCG_REG_X0, TCG_REG_X1, TCG_REG_X2, TCG_REG_X3,
  45     TCG_REG_X4, TCG_REG_X5, TCG_REG_X6, TCG_REG_X7,
  46
  47     /* X16 reserved as temporary */
  48     /* X17 reserved as temporary */
  49     /* X18 reserved by system */
  50     /* X19 reserved for AREG0 */
  51     /* X29 reserved as fp */
  52     /* X30 reserved as temporary */
  53
  54     TCG_REG_V0, TCG_REG_V1, TCG_REG_V2, TCG_REG_V3,
  55     TCG_REG_V4, TCG_REG_V5, TCG_REG_V6, TCG_REG_V7,
  56     /* V8 - V15 are call-saved, and skipped.  */
  57     TCG_REG_V16, TCG_REG_V17, TCG_REG_V18, TCG_REG_V19,
  58     TCG_REG_V20, TCG_REG_V21, TCG_REG_V22, TCG_REG_V23,
  59     TCG_REG_V24, TCG_REG_V25, TCG_REG_V26, TCG_REG_V27,
  60     TCG_REG_V28, TCG_REG_V29, TCG_REG_V30, TCG_REG_V31,
  61 };
  62
  63 static const int tcg_target_call_iarg_regs[8] = {
  64     TCG_REG_X0, TCG_REG_X1, TCG_REG_X2, TCG_REG_X3,
  65     TCG_REG_X4, TCG_REG_X5, TCG_REG_X6, TCG_REG_X7
  66 };
  67
  68 static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
  69 {
  70     tcg_debug_assert(kind == TCG_CALL_RET_NORMAL);
  71     tcg_debug_assert(slot >= 0 && slot <= 1);
  72     return TCG_REG_X0 + slot;
  73 }
  74
  75 #define TCG_REG_TMP0 TCG_REG_X16
  76 #define TCG_REG_TMP1 TCG_REG_X17
  77 #define TCG_REG_TMP2 TCG_REG_X30
  78 #define TCG_VEC_TMP0 TCG_REG_V31
  79
  80 #ifndef CONFIG_SOFTMMU
  81 #define TCG_REG_GUEST_BASE TCG_REG_X28
  82 #endif
  83
  84 static bool reloc_pc26(tcg_insn_unit *src_rw, const tcg_insn_unit *target)
  85 {
  86     const tcg_insn_unit *src_rx = tcg_splitwx_to_rx(src_rw);
  87     ptrdiff_t offset = target - src_rx;
  88
  89     if (offset == sextract64(offset, 0, 26)) {
  90         /* read instruction, mask away previous PC_REL26 parameter contents,
  91            set the proper offset, then write back the instruction. */
  92         *src_rw = deposit32(*src_rw, 0, 26, offset);
  93         return true;
  94     }
  95     return false;
  96 }
  97
  98 static bool reloc_pc19(tcg_insn_unit *src_rw, const tcg_insn_unit *target)
  99 {
 100     const tcg_insn_unit *src_rx = tcg_splitwx_to_rx(src_rw);
 101     ptrdiff_t offset = target - src_rx;
 102
 103     if (offset == sextract64(offset, 0, 19)) {
 104         *src_rw = deposit32(*src_rw, 5, 19, offset);
 105         return true;
 106     }
 107     return false;
 108 }
 109
 110 static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
 111                         intptr_t value, intptr_t addend)
 112 {
 113     tcg_debug_assert(addend == 0);
 114     switch (type) {
 115     case R_AARCH64_JUMP26:
 116     case R_AARCH64_CALL26:
 117         return reloc_pc26(code_ptr, (const tcg_insn_unit *)value);
 118     case R_AARCH64_CONDBR19:
 119         return reloc_pc19(code_ptr, (const tcg_insn_unit *)value);
 120     default:
 121         g_assert_not_reached();
 122     }
 123 }
 124
 125 #define TCG_CT_CONST_AIMM 0x100
 126 #define TCG_CT_CONST_LIMM 0x200
 127 #define TCG_CT_CONST_ZERO 0x400
 128 #define TCG_CT_CONST_MONE 0x800
 129 #define TCG_CT_CONST_ORRI 0x1000
 130 #define TCG_CT_CONST_ANDI 0x2000
 131
 132 #define ALL_GENERAL_REGS  0xffffffffu
 133 #define ALL_VECTOR_REGS   0xffffffff00000000ull
 134
 135 /* Match a constant valid for addition (12-bit, optionally shifted).  */
 136 static inline bool is_aimm(uint64_t val)
 137 {
 138     return (val & ~0xfff) == 0 || (val & ~0xfff000) == 0;
 139 }
 140
 141 /* Match a constant valid for logical operations.  */
 142 static inline bool is_limm(uint64_t val)
 143 {
 144     /* Taking a simplified view of the logical immediates for now, ignoring
 145        the replication that can happen across the field.  Match bit patterns
 146        of the forms
 147            0....01....1
 148            0..01..10..0
 149        and their inverses.  */
 150
 151     /* Make things easier below, by testing the form with msb clear. */
 152     if ((int64_t)val < 0) {
 153         val = ~val;
 154     }
 155     if (val == 0) {
 156         return false;
 157     }
 158     val += val & -val;
 159     return (val & (val - 1)) == 0;
 160 }
 161
 162 /* Return true if v16 is a valid 16-bit shifted immediate.  */
 163 static bool is_shimm16(uint16_t v16, int *cmode, int *imm8)
 164 {
 165     if (v16 == (v16 & 0xff)) {
 166         *cmode = 0x8;
 167         *imm8 = v16 & 0xff;
 168         return true;
 169     } else if (v16 == (v16 & 0xff00)) {
 170         *cmode = 0xa;
 171         *imm8 = v16 >> 8;
 172         return true;
 173     }
 174     return false;
 175 }
 176
 177 /* Return true if v32 is a valid 32-bit shifted immediate.  */
 178 static bool is_shimm32(uint32_t v32, int *cmode, int *imm8)
 179 {
 180     if (v32 == (v32 & 0xff)) {
 181         *cmode = 0x0;
 182         *imm8 = v32 & 0xff;
 183         return true;
 184     } else if (v32 == (v32 & 0xff00)) {
 185         *cmode = 0x2;
 186         *imm8 = (v32 >> 8) & 0xff;
 187         return true;
 188     } else if (v32 == (v32 & 0xff0000)) {
 189         *cmode = 0x4;
 190         *imm8 = (v32 >> 16) & 0xff;
 191         return true;
 192     } else if (v32 == (v32 & 0xff000000)) {
 193         *cmode = 0x6;
 194         *imm8 = v32 >> 24;
 195         return true;
 196     }
 197     return false;
 198 }
 199
 200 /* Return true if v32 is a valid 32-bit shifting ones immediate.  */
 201 static bool is_soimm32(uint32_t v32, int *cmode, int *imm8)
 202 {
 203     if ((v32 & 0xffff00ff) == 0xff) {
 204         *cmode = 0xc;
 205         *imm8 = (v32 >> 8) & 0xff;
 206         return true;
 207     } else if ((v32 & 0xff00ffff) == 0xffff) {
 208         *cmode = 0xd;
 209         *imm8 = (v32 >> 16) & 0xff;
 210         return true;
 211     }
 212     return false;
 213 }
 214
 215 /* Return true if v32 is a valid float32 immediate.  */
 216 static bool is_fimm32(uint32_t v32, int *cmode, int *imm8)
 217 {
 218     if (extract32(v32, 0, 19) == 0
 219         && (extract32(v32, 25, 6) == 0x20
 220             || extract32(v32, 25, 6) == 0x1f)) {
 221         *cmode = 0xf;
 222         *imm8 = (extract32(v32, 31, 1) << 7)
 223               | (extract32(v32, 25, 1) << 6)
 224               | extract32(v32, 19, 6);
 225         return true;
 226     }
 227     return false;
 228 }
 229
 230 /* Return true if v64 is a valid float64 immediate.  */
 231 static bool is_fimm64(uint64_t v64, int *cmode, int *imm8)
 232 {
 233     if (extract64(v64, 0, 48) == 0
 234         && (extract64(v64, 54, 9) == 0x100
 235             || extract64(v64, 54, 9) == 0x0ff)) {
 236         *cmode = 0xf;
 237         *imm8 = (extract64(v64, 63, 1) << 7)
 238               | (extract64(v64, 54, 1) << 6)
 239               | extract64(v64, 48, 6);
 240         return true;
 241     }
 242     return false;
 243 }
 244
 245 /*
 246  * Return non-zero if v32 can be formed by MOVI+ORR.
 247  * Place the parameters for MOVI in (cmode, imm8).
 248  * Return the cmode for ORR; the imm8 can be had via extraction from v32.
 249  */
 250 static int is_shimm32_pair(uint32_t v32, int *cmode, int *imm8)
 251 {
 252     int i;
 253
 254     for (i = 6; i > 0; i -= 2) {
 255         /* Mask out one byte we can add with ORR.  */
 256         uint32_t tmp = v32 & ~(0xffu << (i * 4));
 257         if (is_shimm32(tmp, cmode, imm8) ||
 258             is_soimm32(tmp, cmode, imm8)) {
 259             break;
 260         }
 261     }
 262     return i;
 263 }
 264
 265 /* Return true if V is a valid 16-bit or 32-bit shifted immediate.  */
 266 static bool is_shimm1632(uint32_t v32, int *cmode, int *imm8)
 267 {
 268     if (v32 == deposit32(v32, 16, 16, v32)) {
 269         return is_shimm16(v32, cmode, imm8);
 270     } else {
 271         return is_shimm32(v32, cmode, imm8);
 272     }
 273 }
 274
 275 static bool tcg_target_const_match(int64_t val, TCGType type, int ct, int vece)
 276 {
 277     if (ct & TCG_CT_CONST) {
 278         return 1;
 279     }
 280     if (type == TCG_TYPE_I32) {
 281         val = (int32_t)val;
 282     }
 283     if ((ct & TCG_CT_CONST_AIMM) && (is_aimm(val) || is_aimm(-val))) {
 284         return 1;
 285     }
 286     if ((ct & TCG_CT_CONST_LIMM) && is_limm(val)) {
 287         return 1;
 288     }
 289     if ((ct & TCG_CT_CONST_ZERO) && val == 0) {
 290         return 1;
 291     }
 292     if ((ct & TCG_CT_CONST_MONE) && val == -1) {
 293         return 1;
 294     }
 295
 296     switch (ct & (TCG_CT_CONST_ORRI | TCG_CT_CONST_ANDI)) {
 297     case 0:
 298         break;
 299     case TCG_CT_CONST_ANDI:
 300         val = ~val;
 301         /* fallthru */
 302     case TCG_CT_CONST_ORRI:
 303         if (val == deposit64(val, 32, 32, val)) {
 304             int cmode, imm8;
 305             return is_shimm1632(val, &cmode, &imm8);
 306         }
 307         break;
 308     default:
 309         /* Both bits should not be set for the same insn.  */
 310         g_assert_not_reached();
 311     }
 312
 313     return 0;
 314 }
 315
 316 enum aarch64_cond_code {
 317     COND_EQ = 0x0,
 318     COND_NE = 0x1,
 319     COND_CS = 0x2,     /* Unsigned greater or equal */
 320     COND_HS = COND_CS, /* ALIAS greater or equal */
 321     COND_CC = 0x3,     /* Unsigned less than */
 322     COND_LO = COND_CC, /* ALIAS Lower */
 323     COND_MI = 0x4,     /* Negative */
 324     COND_PL = 0x5,     /* Zero or greater */
 325     COND_VS = 0x6,     /* Overflow */
 326     COND_VC = 0x7,     /* No overflow */
 327     COND_HI = 0x8,     /* Unsigned greater than */
 328     COND_LS = 0x9,     /* Unsigned less or equal */
 329     COND_GE = 0xa,
 330     COND_LT = 0xb,
 331     COND_GT = 0xc,
 332     COND_LE = 0xd,
 333     COND_AL = 0xe,
 334     COND_NV = 0xf, /* behaves like COND_AL here */
 335 };
 336
 337 static const enum aarch64_cond_code tcg_cond_to_aarch64[] = {
 338     [TCG_COND_EQ] = COND_EQ,
 339     [TCG_COND_NE] = COND_NE,
 340     [TCG_COND_LT] = COND_LT,
 341     [TCG_COND_GE] = COND_GE,
 342     [TCG_COND_LE] = COND_LE,
 343     [TCG_COND_GT] = COND_GT,
 344     /* unsigned */
 345     [TCG_COND_LTU] = COND_LO,
 346     [TCG_COND_GTU] = COND_HI,
 347     [TCG_COND_GEU] = COND_HS,
 348     [TCG_COND_LEU] = COND_LS,
 349 };
 350
 351 typedef enum {
 352     LDST_ST = 0,    /* store */
 353     LDST_LD = 1,    /* load */
 354     LDST_LD_S_X = 2,  /* load and sign-extend into Xt */
 355     LDST_LD_S_W = 3,  /* load and sign-extend into Wt */
 356 } AArch64LdstType;
 357
 358 /* We encode the format of the insn into the beginning of the name, so that
 359    we can have the preprocessor help "typecheck" the insn vs the output
 360    function.  Arm didn't provide us with nice names for the formats, so we
 361    use the section number of the architecture reference manual in which the
 362    instruction group is described.  */
 363 typedef enum {
 364     /* Compare and branch (immediate).  */
 365     I3201_CBZ       = 0x34000000,
 366     I3201_CBNZ      = 0x35000000,
 367
 368     /* Conditional branch (immediate).  */
 369     I3202_B_C       = 0x54000000,
 370
 371     /* Unconditional branch (immediate).  */
 372     I3206_B         = 0x14000000,
 373     I3206_BL        = 0x94000000,
 374
 375     /* Unconditional branch (register).  */
 376     I3207_BR        = 0xd61f0000,
 377     I3207_BLR       = 0xd63f0000,
 378     I3207_RET       = 0xd65f0000,
 379
 380     /* AdvSIMD load/store single structure.  */
 381     I3303_LD1R      = 0x0d40c000,
 382
 383     /* Load literal for loading the address at pc-relative offset */
 384     I3305_LDR       = 0x58000000,
 385     I3305_LDR_v64   = 0x5c000000,
 386     I3305_LDR_v128  = 0x9c000000,
 387
 388     /* Load/store exclusive. */
 389     I3306_LDXP      = 0xc8600000,
 390     I3306_STXP      = 0xc8200000,
 391
 392     /* Load/store register.  Described here as 3.3.12, but the helper
 393        that emits them can transform to 3.3.10 or 3.3.13.  */
 394     I3312_STRB      = 0x38000000 | LDST_ST << 22 | MO_8 << 30,
 395     I3312_STRH      = 0x38000000 | LDST_ST << 22 | MO_16 << 30,
 396     I3312_STRW      = 0x38000000 | LDST_ST << 22 | MO_32 << 30,
 397     I3312_STRX      = 0x38000000 | LDST_ST << 22 | MO_64 << 30,
 398
 399     I3312_LDRB      = 0x38000000 | LDST_LD << 22 | MO_8 << 30,
 400     I3312_LDRH      = 0x38000000 | LDST_LD << 22 | MO_16 << 30,
 401     I3312_LDRW      = 0x38000000 | LDST_LD << 22 | MO_32 << 30,
 402     I3312_LDRX      = 0x38000000 | LDST_LD << 22 | MO_64 << 30,
 403
 404     I3312_LDRSBW    = 0x38000000 | LDST_LD_S_W << 22 | MO_8 << 30,
 405     I3312_LDRSHW    = 0x38000000 | LDST_LD_S_W << 22 | MO_16 << 30,
 406
 407     I3312_LDRSBX    = 0x38000000 | LDST_LD_S_X << 22 | MO_8 << 30,
 408     I3312_LDRSHX    = 0x38000000 | LDST_LD_S_X << 22 | MO_16 << 30,
 409     I3312_LDRSWX    = 0x38000000 | LDST_LD_S_X << 22 | MO_32 << 30,
 410
 411     I3312_LDRVS     = 0x3c000000 | LDST_LD << 22 | MO_32 << 30,
 412     I3312_STRVS     = 0x3c000000 | LDST_ST << 22 | MO_32 << 30,
 413
 414     I3312_LDRVD     = 0x3c000000 | LDST_LD << 22 | MO_64 << 30,
 415     I3312_STRVD     = 0x3c000000 | LDST_ST << 22 | MO_64 << 30,
 416
 417     I3312_LDRVQ     = 0x3c000000 | 3 << 22 | 0 << 30,
 418     I3312_STRVQ     = 0x3c000000 | 2 << 22 | 0 << 30,
 419
 420     I3312_TO_I3310  = 0x00200800,
 421     I3312_TO_I3313  = 0x01000000,
 422
 423     /* Load/store register pair instructions.  */
 424     I3314_LDP       = 0x28400000,
 425     I3314_STP       = 0x28000000,
 426
 427     /* Add/subtract immediate instructions.  */
 428     I3401_ADDI      = 0x11000000,
 429     I3401_ADDSI     = 0x31000000,
 430     I3401_SUBI      = 0x51000000,
 431     I3401_SUBSI     = 0x71000000,
 432
 433     /* Bitfield instructions.  */
 434     I3402_BFM       = 0x33000000,
 435     I3402_SBFM      = 0x13000000,
 436     I3402_UBFM      = 0x53000000,
 437
 438     /* Extract instruction.  */
 439     I3403_EXTR      = 0x13800000,
 440
 441     /* Logical immediate instructions.  */
 442     I3404_ANDI      = 0x12000000,
 443     I3404_ORRI      = 0x32000000,
 444     I3404_EORI      = 0x52000000,
 445     I3404_ANDSI     = 0x72000000,
 446
 447     /* Move wide immediate instructions.  */
 448     I3405_MOVN      = 0x12800000,
 449     I3405_MOVZ      = 0x52800000,
 450     I3405_MOVK      = 0x72800000,
 451
 452     /* PC relative addressing instructions.  */
 453     I3406_ADR       = 0x10000000,
 454     I3406_ADRP      = 0x90000000,
 455
 456     /* Add/subtract extended register instructions. */
 457     I3501_ADD       = 0x0b200000,
 458
 459     /* Add/subtract shifted register instructions (without a shift).  */
 460     I3502_ADD       = 0x0b000000,
 461     I3502_ADDS      = 0x2b000000,
 462     I3502_SUB       = 0x4b000000,
 463     I3502_SUBS      = 0x6b000000,
 464
 465     /* Add/subtract shifted register instructions (with a shift).  */
 466     I3502S_ADD_LSL  = I3502_ADD,
 467
 468     /* Add/subtract with carry instructions.  */
 469     I3503_ADC       = 0x1a000000,
 470     I3503_SBC       = 0x5a000000,
 471
 472     /* Conditional select instructions.  */
 473     I3506_CSEL      = 0x1a800000,
 474     I3506_CSINC     = 0x1a800400,
 475     I3506_CSINV     = 0x5a800000,
 476     I3506_CSNEG     = 0x5a800400,
 477
 478     /* Data-processing (1 source) instructions.  */
 479     I3507_CLZ       = 0x5ac01000,
 480     I3507_RBIT      = 0x5ac00000,
 481     I3507_REV       = 0x5ac00000, /* + size << 10 */
 482
 483     /* Data-processing (2 source) instructions.  */
 484     I3508_LSLV      = 0x1ac02000,
 485     I3508_LSRV      = 0x1ac02400,
 486     I3508_ASRV      = 0x1ac02800,
 487     I3508_RORV      = 0x1ac02c00,
 488     I3508_SMULH     = 0x9b407c00,
 489     I3508_UMULH     = 0x9bc07c00,
 490     I3508_UDIV      = 0x1ac00800,
 491     I3508_SDIV      = 0x1ac00c00,
 492
 493     /* Data-processing (3 source) instructions.  */
 494     I3509_MADD      = 0x1b000000,
 495     I3509_MSUB      = 0x1b008000,
 496
 497     /* Logical shifted register instructions (without a shift).  */
 498     I3510_AND       = 0x0a000000,
 499     I3510_BIC       = 0x0a200000,
 500     I3510_ORR       = 0x2a000000,
 501     I3510_ORN       = 0x2a200000,
 502     I3510_EOR       = 0x4a000000,
 503     I3510_EON       = 0x4a200000,
 504     I3510_ANDS      = 0x6a000000,
 505
 506     /* Logical shifted register instructions (with a shift).  */
 507     I3502S_AND_LSR  = I3510_AND | (1 << 22),
 508
 509     /* AdvSIMD copy */
 510     I3605_DUP      = 0x0e000400,
 511     I3605_INS      = 0x4e001c00,
 512     I3605_UMOV     = 0x0e003c00,
 513
 514     /* AdvSIMD modified immediate */
 515     I3606_MOVI      = 0x0f000400,
 516     I3606_MVNI      = 0x2f000400,
 517     I3606_BIC       = 0x2f001400,
 518     I3606_ORR       = 0x0f001400,
 519
 520     /* AdvSIMD scalar shift by immediate */
 521     I3609_SSHR      = 0x5f000400,
 522     I3609_SSRA      = 0x5f001400,
 523     I3609_SHL       = 0x5f005400,
 524     I3609_USHR      = 0x7f000400,
 525     I3609_USRA      = 0x7f001400,
 526     I3609_SLI       = 0x7f005400,
 527
 528     /* AdvSIMD scalar three same */
 529     I3611_SQADD     = 0x5e200c00,
 530     I3611_SQSUB     = 0x5e202c00,
 531     I3611_CMGT      = 0x5e203400,
 532     I3611_CMGE      = 0x5e203c00,
 533     I3611_SSHL      = 0x5e204400,
 534     I3611_ADD       = 0x5e208400,
 535     I3611_CMTST     = 0x5e208c00,
 536     I3611_UQADD     = 0x7e200c00,
 537     I3611_UQSUB     = 0x7e202c00,
 538     I3611_CMHI      = 0x7e203400,
 539     I3611_CMHS      = 0x7e203c00,
 540     I3611_USHL      = 0x7e204400,
 541     I3611_SUB       = 0x7e208400,
 542     I3611_CMEQ      = 0x7e208c00,
 543
 544     /* AdvSIMD scalar two-reg misc */
 545     I3612_CMGT0     = 0x5e208800,
 546     I3612_CMEQ0     = 0x5e209800,
 547     I3612_CMLT0     = 0x5e20a800,
 548     I3612_ABS       = 0x5e20b800,
 549     I3612_CMGE0     = 0x7e208800,
 550     I3612_CMLE0     = 0x7e209800,
 551     I3612_NEG       = 0x7e20b800,
 552
 553     /* AdvSIMD shift by immediate */
 554     I3614_SSHR      = 0x0f000400,
 555     I3614_SSRA      = 0x0f001400,
 556     I3614_SHL       = 0x0f005400,
 557     I3614_SLI       = 0x2f005400,
 558     I3614_USHR      = 0x2f000400,
 559     I3614_USRA      = 0x2f001400,
 560
 561     /* AdvSIMD three same.  */
 562     I3616_ADD       = 0x0e208400,
 563     I3616_AND       = 0x0e201c00,
 564     I3616_BIC       = 0x0e601c00,
 565     I3616_BIF       = 0x2ee01c00,
 566     I3616_BIT       = 0x2ea01c00,
 567     I3616_BSL       = 0x2e601c00,
 568     I3616_EOR       = 0x2e201c00,
 569     I3616_MUL       = 0x0e209c00,
 570     I3616_ORR       = 0x0ea01c00,
 571     I3616_ORN       = 0x0ee01c00,
 572     I3616_SUB       = 0x2e208400,
 573     I3616_CMGT      = 0x0e203400,
 574     I3616_CMGE      = 0x0e203c00,
 575     I3616_CMTST     = 0x0e208c00,
 576     I3616_CMHI      = 0x2e203400,
 577     I3616_CMHS      = 0x2e203c00,
 578     I3616_CMEQ      = 0x2e208c00,
 579     I3616_SMAX      = 0x0e206400,
 580     I3616_SMIN      = 0x0e206c00,
 581     I3616_SSHL      = 0x0e204400,
 582     I3616_SQADD     = 0x0e200c00,
 583     I3616_SQSUB     = 0x0e202c00,
 584     I3616_UMAX      = 0x2e206400,
 585     I3616_UMIN      = 0x2e206c00,
 586     I3616_UQADD     = 0x2e200c00,
 587     I3616_UQSUB     = 0x2e202c00,
 588     I3616_USHL      = 0x2e204400,
 589
 590     /* AdvSIMD two-reg misc.  */
 591     I3617_CMGT0     = 0x0e208800,
 592     I3617_CMEQ0     = 0x0e209800,
 593     I3617_CMLT0     = 0x0e20a800,
 594     I3617_CMGE0     = 0x2e208800,
 595     I3617_CMLE0     = 0x2e209800,
 596     I3617_NOT       = 0x2e205800,
 597     I3617_ABS       = 0x0e20b800,
 598     I3617_NEG       = 0x2e20b800,
 599
 600     /* System instructions.  */
 601     NOP             = 0xd503201f,
 602     DMB_ISH         = 0xd50338bf,
 603     DMB_LD          = 0x00000100,
 604     DMB_ST          = 0x00000200,
 605
 606     BTI_C           = 0xd503245f,
 607     BTI_J           = 0xd503249f,
 608     BTI_JC          = 0xd50324df,
 609 } AArch64Insn;
 610
 611 static inline uint32_t tcg_in32(TCGContext *s)
 612 {
 613     uint32_t v = *(uint32_t *)s->code_ptr;
 614     return v;
 615 }
 616
 617 /* Emit an opcode with "type-checking" of the format.  */
 618 #define tcg_out_insn(S, FMT, OP, ...) \
 619     glue(tcg_out_insn_,FMT)(S, glue(glue(glue(I,FMT),_),OP), ## __VA_ARGS__)
 620
 621 static void tcg_out_insn_3303(TCGContext *s, AArch64Insn insn, bool q,
 622                               TCGReg rt, TCGReg rn, unsigned size)
 623 {
 624     tcg_out32(s, insn | (rt & 0x1f) | (rn << 5) | (size << 10) | (q << 30));
 625 }
 626
 627 static void tcg_out_insn_3305(TCGContext *s, AArch64Insn insn,
 628                               int imm19, TCGReg rt)
 629 {
 630     tcg_out32(s, insn | (imm19 & 0x7ffff) << 5 | rt);
 631 }
 632
 633 static void tcg_out_insn_3306(TCGContext *s, AArch64Insn insn, TCGReg rs,
 634                               TCGReg rt, TCGReg rt2, TCGReg rn)
 635 {
 636     tcg_out32(s, insn | rs << 16 | rt2 << 10 | rn << 5 | rt);
 637 }
 638
 639 static void tcg_out_insn_3201(TCGContext *s, AArch64Insn insn, TCGType ext,
 640                               TCGReg rt, int imm19)
 641 {
 642     tcg_out32(s, insn | ext << 31 | (imm19 & 0x7ffff) << 5 | rt);
 643 }
 644
 645 static void tcg_out_insn_3202(TCGContext *s, AArch64Insn insn,
 646                               TCGCond c, int imm19)
 647 {
 648     tcg_out32(s, insn | tcg_cond_to_aarch64[c] | (imm19 & 0x7ffff) << 5);
 649 }
 650
 651 static void tcg_out_insn_3206(TCGContext *s, AArch64Insn insn, int imm26)
 652 {
 653     tcg_out32(s, insn | (imm26 & 0x03ffffff));
 654 }
 655
 656 static void tcg_out_insn_3207(TCGContext *s, AArch64Insn insn, TCGReg rn)
 657 {
 658     tcg_out32(s, insn | rn << 5);
 659 }
 660
 661 static void tcg_out_insn_3314(TCGContext *s, AArch64Insn insn,
 662                               TCGReg r1, TCGReg r2, TCGReg rn,
 663                               tcg_target_long ofs, bool pre, bool w)
 664 {
 665     insn |= 1u << 31; /* ext */
 666     insn |= pre << 24;
 667     insn |= w << 23;
 668
 669     tcg_debug_assert(ofs >= -0x200 && ofs < 0x200 && (ofs & 7) == 0);
 670     insn |= (ofs & (0x7f << 3)) << (15 - 3);
 671
 672     tcg_out32(s, insn | r2 << 10 | rn << 5 | r1);
 673 }
 674
 675 static void tcg_out_insn_3401(TCGContext *s, AArch64Insn insn, TCGType ext,
 676                               TCGReg rd, TCGReg rn, uint64_t aimm)
 677 {
 678     if (aimm > 0xfff) {
 679         tcg_debug_assert((aimm & 0xfff) == 0);
 680         aimm >>= 12;
 681         tcg_debug_assert(aimm <= 0xfff);
 682         aimm |= 1 << 12;  /* apply LSL 12 */
 683     }
 684     tcg_out32(s, insn | ext << 31 | aimm << 10 | rn << 5 | rd);
 685 }
 686
 687 /* This function can be used for both 3.4.2 (Bitfield) and 3.4.4
 688    (Logical immediate).  Both insn groups have N, IMMR and IMMS fields
 689    that feed the DecodeBitMasks pseudo function.  */
 690 static void tcg_out_insn_3402(TCGContext *s, AArch64Insn insn, TCGType ext,
 691                               TCGReg rd, TCGReg rn, int n, int immr, int imms)
 692 {
 693     tcg_out32(s, insn | ext << 31 | n << 22 | immr << 16 | imms << 10
 694               | rn << 5 | rd);
 695 }
 696
 697 #define tcg_out_insn_3404  tcg_out_insn_3402
 698
 699 static void tcg_out_insn_3403(TCGContext *s, AArch64Insn insn, TCGType ext,
 700                               TCGReg rd, TCGReg rn, TCGReg rm, int imms)
 701 {
 702     tcg_out32(s, insn | ext << 31 | ext << 22 | rm << 16 | imms << 10
 703               | rn << 5 | rd);
 704 }
 705
 706 /* This function is used for the Move (wide immediate) instruction group.
 707    Note that SHIFT is a full shift count, not the 2 bit HW field. */
 708 static void tcg_out_insn_3405(TCGContext *s, AArch64Insn insn, TCGType ext,
 709                               TCGReg rd, uint16_t half, unsigned shift)
 710 {
 711     tcg_debug_assert((shift & ~0x30) == 0);
 712     tcg_out32(s, insn | ext << 31 | shift << (21 - 4) | half << 5 | rd);
 713 }
 714
 715 static void tcg_out_insn_3406(TCGContext *s, AArch64Insn insn,
 716                               TCGReg rd, int64_t disp)
 717 {
 718     tcg_out32(s, insn | (disp & 3) << 29 | (disp & 0x1ffffc) << (5 - 2) | rd);
 719 }
 720
 721 static inline void tcg_out_insn_3501(TCGContext *s, AArch64Insn insn,
 722                                      TCGType sf, TCGReg rd, TCGReg rn,
 723                                      TCGReg rm, int opt, int imm3)
 724 {
 725     tcg_out32(s, insn | sf << 31 | rm << 16 | opt << 13 |
 726               imm3 << 10 | rn << 5 | rd);
 727 }
 728
 729 /* This function is for both 3.5.2 (Add/Subtract shifted register), for
 730    the rare occasion when we actually want to supply a shift amount.  */
 731 static inline void tcg_out_insn_3502S(TCGContext *s, AArch64Insn insn,
 732                                       TCGType ext, TCGReg rd, TCGReg rn,
 733                                       TCGReg rm, int imm6)
 734 {
 735     tcg_out32(s, insn | ext << 31 | rm << 16 | imm6 << 10 | rn << 5 | rd);
 736 }
 737
 738 /* This function is for 3.5.2 (Add/subtract shifted register),
 739    and 3.5.10 (Logical shifted register), for the vast majorty of cases
 740    when we don't want to apply a shift.  Thus it can also be used for
 741    3.5.3 (Add/subtract with carry) and 3.5.8 (Data processing 2 source).  */
 742 static void tcg_out_insn_3502(TCGContext *s, AArch64Insn insn, TCGType ext,
 743                               TCGReg rd, TCGReg rn, TCGReg rm)
 744 {
 745     tcg_out32(s, insn | ext << 31 | rm << 16 | rn << 5 | rd);
 746 }
 747
 748 #define tcg_out_insn_3503  tcg_out_insn_3502
 749 #define tcg_out_insn_3508  tcg_out_insn_3502
 750 #define tcg_out_insn_3510  tcg_out_insn_3502
 751
 752 static void tcg_out_insn_3506(TCGContext *s, AArch64Insn insn, TCGType ext,
 753                               TCGReg rd, TCGReg rn, TCGReg rm, TCGCond c)
 754 {
 755     tcg_out32(s, insn | ext << 31 | rm << 16 | rn << 5 | rd
 756               | tcg_cond_to_aarch64[c] << 12);
 757 }
 758
 759 static void tcg_out_insn_3507(TCGContext *s, AArch64Insn insn, TCGType ext,
 760                               TCGReg rd, TCGReg rn)
 761 {
 762     tcg_out32(s, insn | ext << 31 | rn << 5 | rd);
 763 }
 764
 765 static void tcg_out_insn_3509(TCGContext *s, AArch64Insn insn, TCGType ext,
 766                               TCGReg rd, TCGReg rn, TCGReg rm, TCGReg ra)
 767 {
 768     tcg_out32(s, insn | ext << 31 | rm << 16 | ra << 10 | rn << 5 | rd);
 769 }
 770
 771 static void tcg_out_insn_3605(TCGContext *s, AArch64Insn insn, bool q,
 772                               TCGReg rd, TCGReg rn, int dst_idx, int src_idx)
 773 {
 774     /* Note that bit 11 set means general register input.  Therefore
 775        we can handle both register sets with one function.  */
 776     tcg_out32(s, insn | q << 30 | (dst_idx << 16) | (src_idx << 11)
 777               | (rd & 0x1f) | (~rn & 0x20) << 6 | (rn & 0x1f) << 5);
 778 }
 779
 780 static void tcg_out_insn_3606(TCGContext *s, AArch64Insn insn, bool q,
 781                               TCGReg rd, bool op, int cmode, uint8_t imm8)
 782 {
 783     tcg_out32(s, insn | q << 30 | op << 29 | cmode << 12 | (rd & 0x1f)
 784               | (imm8 & 0xe0) << (16 - 5) | (imm8 & 0x1f) << 5);
 785 }
 786
 787 static void tcg_out_insn_3609(TCGContext *s, AArch64Insn insn,
 788                               TCGReg rd, TCGReg rn, unsigned immhb)
 789 {
 790     tcg_out32(s, insn | immhb << 16 | (rn & 0x1f) << 5 | (rd & 0x1f));
 791 }
 792
 793 static void tcg_out_insn_3611(TCGContext *s, AArch64Insn insn,
 794                               unsigned size, TCGReg rd, TCGReg rn, TCGReg rm)
 795 {
 796     tcg_out32(s, insn | (size << 22) | (rm & 0x1f) << 16
 797               | (rn & 0x1f) << 5 | (rd & 0x1f));
 798 }
 799
 800 static void tcg_out_insn_3612(TCGContext *s, AArch64Insn insn,
 801                               unsigned size, TCGReg rd, TCGReg rn)
 802 {
 803     tcg_out32(s, insn | (size << 22) | (rn & 0x1f) << 5 | (rd & 0x1f));
 804 }
 805
 806 static void tcg_out_insn_3614(TCGContext *s, AArch64Insn insn, bool q,
 807                               TCGReg rd, TCGReg rn, unsigned immhb)
 808 {
 809     tcg_out32(s, insn | q << 30 | immhb << 16
 810               | (rn & 0x1f) << 5 | (rd & 0x1f));
 811 }
 812
 813 static void tcg_out_insn_3616(TCGContext *s, AArch64Insn insn, bool q,
 814                               unsigned size, TCGReg rd, TCGReg rn, TCGReg rm)
 815 {
 816     tcg_out32(s, insn | q << 30 | (size << 22) | (rm & 0x1f) << 16
 817               | (rn & 0x1f) << 5 | (rd & 0x1f));
 818 }
 819
 820 static void tcg_out_insn_3617(TCGContext *s, AArch64Insn insn, bool q,
 821                               unsigned size, TCGReg rd, TCGReg rn)
 822 {
 823     tcg_out32(s, insn | q << 30 | (size << 22)
 824               | (rn & 0x1f) << 5 | (rd & 0x1f));
 825 }
 826
 827 static void tcg_out_insn_3310(TCGContext *s, AArch64Insn insn,
 828                               TCGReg rd, TCGReg base, TCGType ext,
 829                               TCGReg regoff)
 830 {
 831     /* Note the AArch64Insn constants above are for C3.3.12.  Adjust.  */
 832     tcg_out32(s, insn | I3312_TO_I3310 | regoff << 16 |
 833               0x4000 | ext << 13 | base << 5 | (rd & 0x1f));
 834 }
 835
 836 static void tcg_out_insn_3312(TCGContext *s, AArch64Insn insn,
 837                               TCGReg rd, TCGReg rn, intptr_t offset)
 838 {
 839     tcg_out32(s, insn | (offset & 0x1ff) << 12 | rn << 5 | (rd & 0x1f));
 840 }
 841
 842 static void tcg_out_insn_3313(TCGContext *s, AArch64Insn insn,
 843                               TCGReg rd, TCGReg rn, uintptr_t scaled_uimm)
 844 {
 845     /* Note the AArch64Insn constants above are for C3.3.12.  Adjust.  */
 846     tcg_out32(s, insn | I3312_TO_I3313 | scaled_uimm << 10
 847               | rn << 5 | (rd & 0x1f));
 848 }
 849
 850 static void tcg_out_bti(TCGContext *s, AArch64Insn insn)
 851 {
 852     /*
 853      * While BTI insns are nops on hosts without FEAT_BTI,
 854      * there is no point in emitting them in that case either.
 855      */
 856     if (cpuinfo & CPUINFO_BTI) {
 857         tcg_out32(s, insn);
 858     }
 859 }
 860
 861 /* Register to register move using ORR (shifted register with no shift). */
 862 static void tcg_out_movr(TCGContext *s, TCGType ext, TCGReg rd, TCGReg rm)
 863 {
 864     tcg_out_insn(s, 3510, ORR, ext, rd, TCG_REG_XZR, rm);
 865 }
 866
 867 /* Register to register move using ADDI (move to/from SP).  */
 868 static void tcg_out_movr_sp(TCGContext *s, TCGType ext, TCGReg rd, TCGReg rn)
 869 {
 870     tcg_out_insn(s, 3401, ADDI, ext, rd, rn, 0);
 871 }
 872
 873 /* This function is used for the Logical (immediate) instruction group.
 874    The value of LIMM must satisfy IS_LIMM.  See the comment above about
 875    only supporting simplified logical immediates.  */
 876 static void tcg_out_logicali(TCGContext *s, AArch64Insn insn, TCGType ext,
 877                              TCGReg rd, TCGReg rn, uint64_t limm)
 878 {
 879     unsigned h, l, r, c;
 880
 881     tcg_debug_assert(is_limm(limm));
 882
 883     h = clz64(limm);
 884     l = ctz64(limm);
 885     if (l == 0) {
 886         r = 0;                  /* form 0....01....1 */
 887         c = ctz64(~limm) - 1;
 888         if (h == 0) {
 889             r = clz64(~limm);   /* form 1..10..01..1 */
 890             c += r;
 891         }
 892     } else {
 893         r = 64 - l;             /* form 1....10....0 or 0..01..10..0 */
 894         c = r - h - 1;
 895     }
 896     if (ext == TCG_TYPE_I32) {
 897         r &= 31;
 898         c &= 31;
 899     }
 900
 901     tcg_out_insn_3404(s, insn, ext, rd, rn, ext, r, c);
 902 }
 903
 904 static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
 905                              TCGReg rd, int64_t v64)
 906 {
 907     bool q = type == TCG_TYPE_V128;
 908     int cmode, imm8, i;
 909
 910     /* Test all bytes equal first.  */
 911     if (vece == MO_8) {
 912         imm8 = (uint8_t)v64;
 913         tcg_out_insn(s, 3606, MOVI, q, rd, 0, 0xe, imm8);
 914         return;
 915     }
 916
 917     /*
 918      * Test all bytes 0x00 or 0xff second.  This can match cases that
 919      * might otherwise take 2 or 3 insns for MO_16 or MO_32 below.
 920      */
 921     for (i = imm8 = 0; i < 8; i++) {
 922         uint8_t byte = v64 >> (i * 8);
 923         if (byte == 0xff) {
 924             imm8 |= 1 << i;
 925         } else if (byte != 0) {
 926             goto fail_bytes;
 927         }
 928     }
 929     tcg_out_insn(s, 3606, MOVI, q, rd, 1, 0xe, imm8);
 930     return;
 931  fail_bytes:
 932
 933     /*
 934      * Tests for various replications.  For each element width, if we
 935      * cannot find an expansion there's no point checking a larger
 936      * width because we already know by replication it cannot match.
 937      */
 938     if (vece == MO_16) {
 939         uint16_t v16 = v64;
 940
 941         if (is_shimm16(v16, &cmode, &imm8)) {
 942             tcg_out_insn(s, 3606, MOVI, q, rd, 0, cmode, imm8);
 943             return;
 944         }
 945         if (is_shimm16(~v16, &cmode, &imm8)) {
 946             tcg_out_insn(s, 3606, MVNI, q, rd, 0, cmode, imm8);
 947             return;
 948         }
 949
 950         /*
 951          * Otherwise, all remaining constants can be loaded in two insns:
 952          * rd = v16 & 0xff, rd |= v16 & 0xff00.
 953          */
 954         tcg_out_insn(s, 3606, MOVI, q, rd, 0, 0x8, v16 & 0xff);
 955         tcg_out_insn(s, 3606, ORR, q, rd, 0, 0xa, v16 >> 8);
 956         return;
 957     } else if (vece == MO_32) {
 958         uint32_t v32 = v64;
 959         uint32_t n32 = ~v32;
 960
 961         if (is_shimm32(v32, &cmode, &imm8) ||
 962             is_soimm32(v32, &cmode, &imm8) ||
 963             is_fimm32(v32, &cmode, &imm8)) {
 964             tcg_out_insn(s, 3606, MOVI, q, rd, 0, cmode, imm8);
 965             return;
 966         }
 967         if (is_shimm32(n32, &cmode, &imm8) ||
 968             is_soimm32(n32, &cmode, &imm8)) {
 969             tcg_out_insn(s, 3606, MVNI, q, rd, 0, cmode, imm8);
 970             return;
 971         }
 972
 973         /*
 974          * Restrict the set of constants to those we can load with
 975          * two instructions.  Others we load from the pool.
 976          */
 977         i = is_shimm32_pair(v32, &cmode, &imm8);
 978         if (i) {
 979             tcg_out_insn(s, 3606, MOVI, q, rd, 0, cmode, imm8);
 980             tcg_out_insn(s, 3606, ORR, q, rd, 0, i, extract32(v32, i * 4, 8));
 981             return;
 982         }
 983         i = is_shimm32_pair(n32, &cmode, &imm8);
 984         if (i) {
 985             tcg_out_insn(s, 3606, MVNI, q, rd, 0, cmode, imm8);
 986             tcg_out_insn(s, 3606, BIC, q, rd, 0, i, extract32(n32, i * 4, 8));
 987             return;
 988         }
 989     } else if (is_fimm64(v64, &cmode, &imm8)) {
 990         tcg_out_insn(s, 3606, MOVI, q, rd, 1, cmode, imm8);
 991         return;
 992     }
 993
 994     /*
 995      * As a last resort, load from the constant pool.  Sadly there
 996      * is no LD1R (literal), so store the full 16-byte vector.
 997      */
 998     if (type == TCG_TYPE_V128) {
 999         new_pool_l2(s, R_AARCH64_CONDBR19, s->code_ptr, 0, v64, v64);
1000         tcg_out_insn(s, 3305, LDR_v128, 0, rd);
1001     } else {
1002         new_pool_label(s, v64, R_AARCH64_CONDBR19, s->code_ptr, 0);
1003         tcg_out_insn(s, 3305, LDR_v64, 0, rd);
1004     }
1005 }
1006
1007 static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
1008                             TCGReg rd, TCGReg rs)
1009 {
1010     int is_q = type - TCG_TYPE_V64;
1011     tcg_out_insn(s, 3605, DUP, is_q, rd, rs, 1 << vece, 0);
1012     return true;
1013 }
1014
1015 static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
1016                              TCGReg r, TCGReg base, intptr_t offset)
1017 {
1018     TCGReg temp = TCG_REG_TMP0;
1019
1020     if (offset < -0xffffff || offset > 0xffffff) {
1021         tcg_out_movi(s, TCG_TYPE_PTR, temp, offset);
1022         tcg_out_insn(s, 3502, ADD, 1, temp, temp, base);
1023         base = temp;
1024     } else {
1025         AArch64Insn add_insn = I3401_ADDI;
1026
1027         if (offset < 0) {
1028             add_insn = I3401_SUBI;
1029             offset = -offset;
1030         }
1031         if (offset & 0xfff000) {
1032             tcg_out_insn_3401(s, add_insn, 1, temp, base, offset & 0xfff000);
1033             base = temp;
1034         }
1035         if (offset & 0xfff) {
1036             tcg_out_insn_3401(s, add_insn, 1, temp, base, offset & 0xfff);
1037             base = temp;
1038         }
1039     }
1040     tcg_out_insn(s, 3303, LD1R, type == TCG_TYPE_V128, r, base, vece);
1041     return true;
1042 }
1043
1044 static void tcg_out_movi(TCGContext *s, TCGType type, TCGReg rd,
1045                          tcg_target_long value)
1046 {
1047     tcg_target_long svalue = value;
1048     tcg_target_long ivalue = ~value;
1049     tcg_target_long t0, t1, t2;
1050     int s0, s1;
1051     AArch64Insn opc;
1052
1053     switch (type) {
1054     case TCG_TYPE_I32:
1055     case TCG_TYPE_I64:
1056         tcg_debug_assert(rd < 32);
1057         break;
1058     default:
1059         g_assert_not_reached();
1060     }
1061
1062     /* For 32-bit values, discard potential garbage in value.  For 64-bit
1063        values within [2**31, 2**32-1], we can create smaller sequences by
1064        interpreting this as a negative 32-bit number, while ensuring that
1065        the high 32 bits are cleared by setting SF=0.  */
1066     if (type == TCG_TYPE_I32 || (value & ~0xffffffffull) == 0) {
1067         svalue = (int32_t)value;
1068         value = (uint32_t)value;
1069         ivalue = (uint32_t)ivalue;
1070         type = TCG_TYPE_I32;
1071     }
1072
1073     /* Speed things up by handling the common case of small positive
1074        and negative values specially.  */
1075     if ((value & ~0xffffull) == 0) {
1076         tcg_out_insn(s, 3405, MOVZ, type, rd, value, 0);
1077         return;
1078     } else if ((ivalue & ~0xffffull) == 0) {
1079         tcg_out_insn(s, 3405, MOVN, type, rd, ivalue, 0);
1080         return;
1081     }
1082
1083     /* Check for bitfield immediates.  For the benefit of 32-bit quantities,
1084        use the sign-extended value.  That lets us match rotated values such
1085        as 0xff0000ff with the same 64-bit logic matching 0xffffffffff0000ff. */
1086     if (is_limm(svalue)) {
1087         tcg_out_logicali(s, I3404_ORRI, type, rd, TCG_REG_XZR, svalue);
1088         return;
1089     }
1090
1091     /* Look for host pointer values within 4G of the PC.  This happens
1092        often when loading pointers to QEMU's own data structures.  */
1093     if (type == TCG_TYPE_I64) {
1094         intptr_t src_rx = (intptr_t)tcg_splitwx_to_rx(s->code_ptr);
1095         tcg_target_long disp = value - src_rx;
1096         if (disp == sextract64(disp, 0, 21)) {
1097             tcg_out_insn(s, 3406, ADR, rd, disp);
1098             return;
1099         }
1100         disp = (value >> 12) - (src_rx >> 12);
1101         if (disp == sextract64(disp, 0, 21)) {
1102             tcg_out_insn(s, 3406, ADRP, rd, disp);
1103             if (value & 0xfff) {
1104                 tcg_out_insn(s, 3401, ADDI, type, rd, rd, value & 0xfff);
1105             }
1106             return;
1107         }
1108     }
1109
1110     /* Would it take fewer insns to begin with MOVN?  */
1111     if (ctpop64(value) >= 32) {
1112         t0 = ivalue;
1113         opc = I3405_MOVN;
1114     } else {
1115         t0 = value;
1116         opc = I3405_MOVZ;
1117     }
1118     s0 = ctz64(t0) & (63 & -16);
1119     t1 = t0 & ~(0xffffull << s0);
1120     s1 = ctz64(t1) & (63 & -16);
1121     t2 = t1 & ~(0xffffull << s1);
1122     if (t2 == 0) {
1123         tcg_out_insn_3405(s, opc, type, rd, t0 >> s0, s0);
1124         if (t1 != 0) {
1125             tcg_out_insn(s, 3405, MOVK, type, rd, value >> s1, s1);
1126         }
1127         return;
1128     }
1129
1130     /* For more than 2 insns, dump it into the constant pool.  */
1131     new_pool_label(s, value, R_AARCH64_CONDBR19, s->code_ptr, 0);
1132     tcg_out_insn(s, 3305, LDR, 0, rd);
1133 }
1134
1135 static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
1136 {
1137     return false;
1138 }
1139
1140 static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs,
1141                              tcg_target_long imm)
1142 {
1143     /* This function is only used for passing structs by reference. */
1144     g_assert_not_reached();
1145 }
1146
1147 /* Define something more legible for general use.  */
1148 #define tcg_out_ldst_r  tcg_out_insn_3310
1149
1150 static void tcg_out_ldst(TCGContext *s, AArch64Insn insn, TCGReg rd,
1151                          TCGReg rn, intptr_t offset, int lgsize)
1152 {
1153     /* If the offset is naturally aligned and in range, then we can
1154        use the scaled uimm12 encoding */
1155     if (offset >= 0 && !(offset & ((1 << lgsize) - 1))) {
1156         uintptr_t scaled_uimm = offset >> lgsize;
1157         if (scaled_uimm <= 0xfff) {
1158             tcg_out_insn_3313(s, insn, rd, rn, scaled_uimm);
1159             return;
1160         }
1161     }
1162
1163     /* Small signed offsets can use the unscaled encoding.  */
1164     if (offset >= -256 && offset < 256) {
1165         tcg_out_insn_3312(s, insn, rd, rn, offset);
1166         return;
1167     }
1168
1169     /* Worst-case scenario, move offset to temp register, use reg offset.  */
1170     tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP0, offset);
1171     tcg_out_ldst_r(s, insn, rd, rn, TCG_TYPE_I64, TCG_REG_TMP0);
1172 }
1173
1174 static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
1175 {
1176     if (ret == arg) {
1177         return true;
1178     }
1179     switch (type) {
1180     case TCG_TYPE_I32:
1181     case TCG_TYPE_I64:
1182         if (ret < 32 && arg < 32) {
1183             tcg_out_movr(s, type, ret, arg);
1184             break;
1185         } else if (ret < 32) {
1186             tcg_out_insn(s, 3605, UMOV, type, ret, arg, 0, 0);
1187             break;
1188         } else if (arg < 32) {
1189             tcg_out_insn(s, 3605, INS, 0, ret, arg, 4 << type, 0);
1190             break;
1191         }
1192         /* FALLTHRU */
1193
1194     case TCG_TYPE_V64:
1195         tcg_debug_assert(ret >= 32 && arg >= 32);
1196         tcg_out_insn(s, 3616, ORR, 0, 0, ret, arg, arg);
1197         break;
1198     case TCG_TYPE_V128:
1199         tcg_debug_assert(ret >= 32 && arg >= 32);
1200         tcg_out_insn(s, 3616, ORR, 1, 0, ret, arg, arg);
1201         break;
1202
1203     default:
1204         g_assert_not_reached();
1205     }
1206     return true;
1207 }
1208
1209 static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1210                        TCGReg base, intptr_t ofs)
1211 {
1212     AArch64Insn insn;
1213     int lgsz;
1214
1215     switch (type) {
1216     case TCG_TYPE_I32:
1217         insn = (ret < 32 ? I3312_LDRW : I3312_LDRVS);
1218         lgsz = 2;
1219         break;
1220     case TCG_TYPE_I64:
1221         insn = (ret < 32 ? I3312_LDRX : I3312_LDRVD);
1222         lgsz = 3;
1223         break;
1224     case TCG_TYPE_V64:
1225         insn = I3312_LDRVD;
1226         lgsz = 3;
1227         break;
1228     case TCG_TYPE_V128:
1229         insn = I3312_LDRVQ;
1230         lgsz = 4;
1231         break;
1232     default:
1233         g_assert_not_reached();
1234     }
1235     tcg_out_ldst(s, insn, ret, base, ofs, lgsz);
1236 }
1237
1238 static void tcg_out_st(TCGContext *s, TCGType type, TCGReg src,
1239                        TCGReg base, intptr_t ofs)
1240 {
1241     AArch64Insn insn;
1242     int lgsz;
1243
1244     switch (type) {
1245     case TCG_TYPE_I32:
1246         insn = (src < 32 ? I3312_STRW : I3312_STRVS);
1247         lgsz = 2;
1248         break;
1249     case TCG_TYPE_I64:
1250         insn = (src < 32 ? I3312_STRX : I3312_STRVD);
1251         lgsz = 3;
1252         break;
1253     case TCG_TYPE_V64:
1254         insn = I3312_STRVD;
1255         lgsz = 3;
1256         break;
1257     case TCG_TYPE_V128:
1258         insn = I3312_STRVQ;
1259         lgsz = 4;
1260         break;
1261     default:
1262         g_assert_not_reached();
1263     }
1264     tcg_out_ldst(s, insn, src, base, ofs, lgsz);
1265 }
1266
1267 static inline bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1268                                TCGReg base, intptr_t ofs)
1269 {
1270     if (type <= TCG_TYPE_I64 && val == 0) {
1271         tcg_out_st(s, type, TCG_REG_XZR, base, ofs);
1272         return true;
1273     }
1274     return false;
1275 }
1276
1277 static inline void tcg_out_bfm(TCGContext *s, TCGType ext, TCGReg rd,
1278                                TCGReg rn, unsigned int a, unsigned int b)
1279 {
1280     tcg_out_insn(s, 3402, BFM, ext, rd, rn, ext, a, b);
1281 }
1282
1283 static inline void tcg_out_ubfm(TCGContext *s, TCGType ext, TCGReg rd,
1284                                 TCGReg rn, unsigned int a, unsigned int b)
1285 {
1286     tcg_out_insn(s, 3402, UBFM, ext, rd, rn, ext, a, b);
1287 }
1288
1289 static inline void tcg_out_sbfm(TCGContext *s, TCGType ext, TCGReg rd,
1290                                 TCGReg rn, unsigned int a, unsigned int b)
1291 {
1292     tcg_out_insn(s, 3402, SBFM, ext, rd, rn, ext, a, b);
1293 }
1294
1295 static inline void tcg_out_extr(TCGContext *s, TCGType ext, TCGReg rd,
1296                                 TCGReg rn, TCGReg rm, unsigned int a)
1297 {
1298     tcg_out_insn(s, 3403, EXTR, ext, rd, rn, rm, a);
1299 }
1300
1301 static inline void tcg_out_shl(TCGContext *s, TCGType ext,
1302                                TCGReg rd, TCGReg rn, unsigned int m)
1303 {
1304     int bits = ext ? 64 : 32;
1305     int max = bits - 1;
1306     tcg_out_ubfm(s, ext, rd, rn, (bits - m) & max, (max - m) & max);
1307 }
1308
1309 static inline void tcg_out_shr(TCGContext *s, TCGType ext,
1310                                TCGReg rd, TCGReg rn, unsigned int m)
1311 {
1312     int max = ext ? 63 : 31;
1313     tcg_out_ubfm(s, ext, rd, rn, m & max, max);
1314 }
1315
1316 static inline void tcg_out_sar(TCGContext *s, TCGType ext,
1317                                TCGReg rd, TCGReg rn, unsigned int m)
1318 {
1319     int max = ext ? 63 : 31;
1320     tcg_out_sbfm(s, ext, rd, rn, m & max, max);
1321 }
1322
1323 static inline void tcg_out_rotr(TCGContext *s, TCGType ext,
1324                                 TCGReg rd, TCGReg rn, unsigned int m)
1325 {
1326     int max = ext ? 63 : 31;
1327     tcg_out_extr(s, ext, rd, rn, rn, m & max);
1328 }
1329
1330 static inline void tcg_out_rotl(TCGContext *s, TCGType ext,
1331                                 TCGReg rd, TCGReg rn, unsigned int m)
1332 {
1333     int max = ext ? 63 : 31;
1334     tcg_out_extr(s, ext, rd, rn, rn, -m & max);
1335 }
1336
1337 static inline void tcg_out_dep(TCGContext *s, TCGType ext, TCGReg rd,
1338                                TCGReg rn, unsigned lsb, unsigned width)
1339 {
1340     unsigned size = ext ? 64 : 32;
1341     unsigned a = (size - lsb) & (size - 1);
1342     unsigned b = width - 1;
1343     tcg_out_bfm(s, ext, rd, rn, a, b);
1344 }
1345
1346 static void tcg_out_cmp(TCGContext *s, TCGType ext, TCGReg a,
1347                         tcg_target_long b, bool const_b)
1348 {
1349     if (const_b) {
1350         /* Using CMP or CMN aliases.  */
1351         if (b >= 0) {
1352             tcg_out_insn(s, 3401, SUBSI, ext, TCG_REG_XZR, a, b);
1353         } else {
1354             tcg_out_insn(s, 3401, ADDSI, ext, TCG_REG_XZR, a, -b);
1355         }
1356     } else {
1357         /* Using CMP alias SUBS wzr, Wn, Wm */
1358         tcg_out_insn(s, 3502, SUBS, ext, TCG_REG_XZR, a, b);
1359     }
1360 }
1361
1362 static void tcg_out_goto(TCGContext *s, const tcg_insn_unit *target)
1363 {
1364     ptrdiff_t offset = tcg_pcrel_diff(s, target) >> 2;
1365     tcg_debug_assert(offset == sextract64(offset, 0, 26));
1366     tcg_out_insn(s, 3206, B, offset);
1367 }
1368
1369 static void tcg_out_call_int(TCGContext *s, const tcg_insn_unit *target)
1370 {
1371     ptrdiff_t offset = tcg_pcrel_diff(s, target) >> 2;
1372     if (offset == sextract64(offset, 0, 26)) {
1373         tcg_out_insn(s, 3206, BL, offset);
1374     } else {
1375         tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP0, (intptr_t)target);
1376         tcg_out_insn(s, 3207, BLR, TCG_REG_TMP0);
1377     }
1378 }
1379
1380 static void tcg_out_call(TCGContext *s, const tcg_insn_unit *target,
1381                          const TCGHelperInfo *info)
1382 {
1383     tcg_out_call_int(s, target);
1384 }
1385
1386 static inline void tcg_out_goto_label(TCGContext *s, TCGLabel *l)
1387 {
1388     if (!l->has_value) {
1389         tcg_out_reloc(s, s->code_ptr, R_AARCH64_JUMP26, l, 0);
1390         tcg_out_insn(s, 3206, B, 0);
1391     } else {
1392         tcg_out_goto(s, l->u.value_ptr);
1393     }
1394 }
1395
1396 static void tcg_out_brcond(TCGContext *s, TCGType ext, TCGCond c, TCGArg a,
1397                            TCGArg b, bool b_const, TCGLabel *l)
1398 {
1399     intptr_t offset;
1400     bool need_cmp;
1401
1402     if (b_const && b == 0 && (c == TCG_COND_EQ || c == TCG_COND_NE)) {
1403         need_cmp = false;
1404     } else {
1405         need_cmp = true;
1406         tcg_out_cmp(s, ext, a, b, b_const);
1407     }
1408
1409     if (!l->has_value) {
1410         tcg_out_reloc(s, s->code_ptr, R_AARCH64_CONDBR19, l, 0);
1411         offset = tcg_in32(s) >> 5;
1412     } else {
1413         offset = tcg_pcrel_diff(s, l->u.value_ptr) >> 2;
1414         tcg_debug_assert(offset == sextract64(offset, 0, 19));
1415     }
1416
1417     if (need_cmp) {
1418         tcg_out_insn(s, 3202, B_C, c, offset);
1419     } else if (c == TCG_COND_EQ) {
1420         tcg_out_insn(s, 3201, CBZ, ext, a, offset);
1421     } else {
1422         tcg_out_insn(s, 3201, CBNZ, ext, a, offset);
1423     }
1424 }
1425
1426 static inline void tcg_out_rev(TCGContext *s, int ext, MemOp s_bits,
1427                                TCGReg rd, TCGReg rn)
1428 {
1429     /* REV, REV16, REV32 */
1430     tcg_out_insn_3507(s, I3507_REV | (s_bits << 10), ext, rd, rn);
1431 }
1432
1433 static inline void tcg_out_sxt(TCGContext *s, TCGType ext, MemOp s_bits,
1434                                TCGReg rd, TCGReg rn)
1435 {
1436     /* Using ALIASes SXTB, SXTH, SXTW, of SBFM Xd, Xn, #0, #7|15|31 */
1437     int bits = (8 << s_bits) - 1;
1438     tcg_out_sbfm(s, ext, rd, rn, 0, bits);
1439 }
1440
1441 static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg rd, TCGReg rn)
1442 {
1443     tcg_out_sxt(s, type, MO_8, rd, rn);
1444 }
1445
1446 static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg rd, TCGReg rn)
1447 {
1448     tcg_out_sxt(s, type, MO_16, rd, rn);
1449 }
1450
1451 static void tcg_out_ext32s(TCGContext *s, TCGReg rd, TCGReg rn)
1452 {
1453     tcg_out_sxt(s, TCG_TYPE_I64, MO_32, rd, rn);
1454 }
1455
1456 static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg rd, TCGReg rn)
1457 {
1458     tcg_out_ext32s(s, rd, rn);
1459 }
1460
1461 static inline void tcg_out_uxt(TCGContext *s, MemOp s_bits,
1462                                TCGReg rd, TCGReg rn)
1463 {
1464     /* Using ALIASes UXTB, UXTH of UBFM Wd, Wn, #0, #7|15 */
1465     int bits = (8 << s_bits) - 1;
1466     tcg_out_ubfm(s, 0, rd, rn, 0, bits);
1467 }
1468
1469 static void tcg_out_ext8u(TCGContext *s, TCGReg rd, TCGReg rn)
1470 {
1471     tcg_out_uxt(s, MO_8, rd, rn);
1472 }
1473
1474 static void tcg_out_ext16u(TCGContext *s, TCGReg rd, TCGReg rn)
1475 {
1476     tcg_out_uxt(s, MO_16, rd, rn);
1477 }
1478
1479 static void tcg_out_ext32u(TCGContext *s, TCGReg rd, TCGReg rn)
1480 {
1481     tcg_out_movr(s, TCG_TYPE_I32, rd, rn);
1482 }
1483
1484 static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg rd, TCGReg rn)
1485 {
1486     tcg_out_ext32u(s, rd, rn);
1487 }
1488
1489 static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg rd, TCGReg rn)
1490 {
1491     tcg_out_mov(s, TCG_TYPE_I32, rd, rn);
1492 }
1493
1494 static void tcg_out_addsubi(TCGContext *s, int ext, TCGReg rd,
1495                             TCGReg rn, int64_t aimm)
1496 {
1497     if (aimm >= 0) {
1498         tcg_out_insn(s, 3401, ADDI, ext, rd, rn, aimm);
1499     } else {
1500         tcg_out_insn(s, 3401, SUBI, ext, rd, rn, -aimm);
1501     }
1502 }
1503
1504 static void tcg_out_addsub2(TCGContext *s, TCGType ext, TCGReg rl,
1505                             TCGReg rh, TCGReg al, TCGReg ah,
1506                             tcg_target_long bl, tcg_target_long bh,
1507                             bool const_bl, bool const_bh, bool sub)
1508 {
1509     TCGReg orig_rl = rl;
1510     AArch64Insn insn;
1511
1512     if (rl == ah || (!const_bh && rl == bh)) {
1513         rl = TCG_REG_TMP0;
1514     }
1515
1516     if (const_bl) {
1517         if (bl < 0) {
1518             bl = -bl;
1519             insn = sub ? I3401_ADDSI : I3401_SUBSI;
1520         } else {
1521             insn = sub ? I3401_SUBSI : I3401_ADDSI;
1522         }
1523
1524         if (unlikely(al == TCG_REG_XZR)) {
1525             /* ??? We want to allow al to be zero for the benefit of
1526                negation via subtraction.  However, that leaves open the
1527                possibility of adding 0+const in the low part, and the
1528                immediate add instructions encode XSP not XZR.  Don't try
1529                anything more elaborate here than loading another zero.  */
1530             al = TCG_REG_TMP0;
1531             tcg_out_movi(s, ext, al, 0);
1532         }
1533         tcg_out_insn_3401(s, insn, ext, rl, al, bl);
1534     } else {
1535         tcg_out_insn_3502(s, sub ? I3502_SUBS : I3502_ADDS, ext, rl, al, bl);
1536     }
1537
1538     insn = I3503_ADC;
1539     if (const_bh) {
1540         /* Note that the only two constants we support are 0 and -1, and
1541            that SBC = rn + ~rm + c, so adc -1 is sbc 0, and vice-versa.  */
1542         if ((bh != 0) ^ sub) {
1543             insn = I3503_SBC;
1544         }
1545         bh = TCG_REG_XZR;
1546     } else if (sub) {
1547         insn = I3503_SBC;
1548     }
1549     tcg_out_insn_3503(s, insn, ext, rh, ah, bh);
1550
1551     tcg_out_mov(s, ext, orig_rl, rl);
1552 }
1553
1554 static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1555 {
1556     static const uint32_t sync[] = {
1557         [0 ... TCG_MO_ALL]            = DMB_ISH | DMB_LD | DMB_ST,
1558         [TCG_MO_ST_ST]                = DMB_ISH | DMB_ST,
1559         [TCG_MO_LD_LD]                = DMB_ISH | DMB_LD,
1560         [TCG_MO_LD_ST]                = DMB_ISH | DMB_LD,
1561         [TCG_MO_LD_ST | TCG_MO_LD_LD] = DMB_ISH | DMB_LD,
1562     };
1563     tcg_out32(s, sync[a0 & TCG_MO_ALL]);
1564 }
1565
1566 static void tcg_out_cltz(TCGContext *s, TCGType ext, TCGReg d,
1567                          TCGReg a0, TCGArg b, bool const_b, bool is_ctz)
1568 {
1569     TCGReg a1 = a0;
1570     if (is_ctz) {
1571         a1 = TCG_REG_TMP0;
1572         tcg_out_insn(s, 3507, RBIT, ext, a1, a0);
1573     }
1574     if (const_b && b == (ext ? 64 : 32)) {
1575         tcg_out_insn(s, 3507, CLZ, ext, d, a1);
1576     } else {
1577         AArch64Insn sel = I3506_CSEL;
1578
1579         tcg_out_cmp(s, ext, a0, 0, 1);
1580         tcg_out_insn(s, 3507, CLZ, ext, TCG_REG_TMP0, a1);
1581
1582         if (const_b) {
1583             if (b == -1) {
1584                 b = TCG_REG_XZR;
1585                 sel = I3506_CSINV;
1586             } else if (b == 0) {
1587                 b = TCG_REG_XZR;
1588             } else {
1589                 tcg_out_movi(s, ext, d, b);
1590                 b = d;
1591             }
1592         }
1593         tcg_out_insn_3506(s, sel, ext, d, TCG_REG_TMP0, b, TCG_COND_NE);
1594     }
1595 }
1596
1597 typedef struct {
1598     TCGReg base;
1599     TCGReg index;
1600     TCGType index_ext;
1601     TCGAtomAlign aa;
1602 } HostAddress;
1603
1604 bool tcg_target_has_memory_bswap(MemOp memop)
1605 {
1606     return false;
1607 }
1608
1609 static const TCGLdstHelperParam ldst_helper_param = {
1610     .ntmp = 1, .tmp = { TCG_REG_TMP0 }
1611 };
1612
1613 static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
1614 {
1615     MemOp opc = get_memop(lb->oi);
1616
1617     if (!reloc_pc19(lb->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) {
1618         return false;
1619     }
1620
1621     tcg_out_ld_helper_args(s, lb, &ldst_helper_param);
1622     tcg_out_call_int(s, qemu_ld_helpers[opc & MO_SIZE]);
1623     tcg_out_ld_helper_ret(s, lb, false, &ldst_helper_param);
1624     tcg_out_goto(s, lb->raddr);
1625     return true;
1626 }
1627
1628 static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *lb)
1629 {
1630     MemOp opc = get_memop(lb->oi);
1631
1632     if (!reloc_pc19(lb->label_ptr[0], tcg_splitwx_to_rx(s->code_ptr))) {
1633         return false;
1634     }
1635
1636     tcg_out_st_helper_args(s, lb, &ldst_helper_param);
1637     tcg_out_call_int(s, qemu_st_helpers[opc & MO_SIZE]);
1638     tcg_out_goto(s, lb->raddr);
1639     return true;
1640 }
1641
1642 /* We expect to use a 7-bit scaled negative offset from ENV.  */
1643 #define MIN_TLB_MASK_TABLE_OFS  -512
1644
1645 /*
1646  * For system-mode, perform the TLB load and compare.
1647  * For user-mode, perform any required alignment tests.
1648  * In both cases, return a TCGLabelQemuLdst structure if the slow path
1649  * is required and fill in @h with the host address for the fast path.
1650  */
1651 static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
1652                                            TCGReg addr_reg, MemOpIdx oi,
1653                                            bool is_ld)
1654 {
1655     TCGType addr_type = s->addr_type;
1656     TCGLabelQemuLdst *ldst = NULL;
1657     MemOp opc = get_memop(oi);
1658     MemOp s_bits = opc & MO_SIZE;
1659     unsigned a_mask;
1660
1661     h->aa = atom_and_align_for_opc(s, opc,
1662                                    have_lse2 ? MO_ATOM_WITHIN16
1663                                              : MO_ATOM_IFALIGN,
1664                                    s_bits == MO_128);
1665     a_mask = (1 << h->aa.align) - 1;
1666
1667 #ifdef CONFIG_SOFTMMU
1668     unsigned s_mask = (1u << s_bits) - 1;
1669     unsigned mem_index = get_mmuidx(oi);
1670     TCGReg addr_adj;
1671     TCGType mask_type;
1672     uint64_t compare_mask;
1673
1674     ldst = new_ldst_label(s);
1675     ldst->is_ld = is_ld;
1676     ldst->oi = oi;
1677     ldst->addrlo_reg = addr_reg;
1678
1679     mask_type = (s->page_bits + s->tlb_dyn_max_bits > 32
1680                  ? TCG_TYPE_I64 : TCG_TYPE_I32);
1681
1682     /* Load cpu->neg.tlb.f[mmu_idx].{mask,table} into {tmp0,tmp1}. */
1683     QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, mask) != 0);
1684     QEMU_BUILD_BUG_ON(offsetof(CPUTLBDescFast, table) != 8);
1685     tcg_out_insn(s, 3314, LDP, TCG_REG_TMP0, TCG_REG_TMP1, TCG_AREG0,
1686                  tlb_mask_table_ofs(s, mem_index), 1, 0);
1687
1688     /* Extract the TLB index from the address into X0.  */
1689     tcg_out_insn(s, 3502S, AND_LSR, mask_type == TCG_TYPE_I64,
1690                  TCG_REG_TMP0, TCG_REG_TMP0, addr_reg,
1691                  s->page_bits - CPU_TLB_ENTRY_BITS);
1692
1693     /* Add the tlb_table pointer, forming the CPUTLBEntry address in TMP1. */
1694     tcg_out_insn(s, 3502, ADD, 1, TCG_REG_TMP1, TCG_REG_TMP1, TCG_REG_TMP0);
1695
1696     /* Load the tlb comparator into TMP0, and the fast path addend into TMP1. */
1697     QEMU_BUILD_BUG_ON(HOST_BIG_ENDIAN);
1698     tcg_out_ld(s, addr_type, TCG_REG_TMP0, TCG_REG_TMP1,
1699                is_ld ? offsetof(CPUTLBEntry, addr_read)
1700                      : offsetof(CPUTLBEntry, addr_write));
1701     tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_TMP1, TCG_REG_TMP1,
1702                offsetof(CPUTLBEntry, addend));
1703
1704     /*
1705      * For aligned accesses, we check the first byte and include the alignment
1706      * bits within the address.  For unaligned access, we check that we don't
1707      * cross pages using the address of the last byte of the access.
1708      */
1709     if (a_mask >= s_mask) {
1710         addr_adj = addr_reg;
1711     } else {
1712         addr_adj = TCG_REG_TMP2;
1713         tcg_out_insn(s, 3401, ADDI, addr_type,
1714                      addr_adj, addr_reg, s_mask - a_mask);
1715     }
1716     compare_mask = (uint64_t)s->page_mask | a_mask;
1717
1718     /* Store the page mask part of the address into TMP2.  */
1719     tcg_out_logicali(s, I3404_ANDI, addr_type, TCG_REG_TMP2,
1720                      addr_adj, compare_mask);
1721
1722     /* Perform the address comparison. */
1723     tcg_out_cmp(s, addr_type, TCG_REG_TMP0, TCG_REG_TMP2, 0);
1724
1725     /* If not equal, we jump to the slow path. */
1726     ldst->label_ptr[0] = s->code_ptr;
1727     tcg_out_insn(s, 3202, B_C, TCG_COND_NE, 0);
1728
1729     h->base = TCG_REG_TMP1;
1730     h->index = addr_reg;
1731     h->index_ext = addr_type;
1732 #else
1733     if (a_mask) {
1734         ldst = new_ldst_label(s);
1735
1736         ldst->is_ld = is_ld;
1737         ldst->oi = oi;
1738         ldst->addrlo_reg = addr_reg;
1739
1740         /* tst addr, #mask */
1741         tcg_out_logicali(s, I3404_ANDSI, 0, TCG_REG_XZR, addr_reg, a_mask);
1742
1743         /* b.ne slow_path */
1744         ldst->label_ptr[0] = s->code_ptr;
1745         tcg_out_insn(s, 3202, B_C, TCG_COND_NE, 0);
1746     }
1747
1748     if (guest_base || addr_type == TCG_TYPE_I32) {
1749         h->base = TCG_REG_GUEST_BASE;
1750         h->index = addr_reg;
1751         h->index_ext = addr_type;
1752     } else {
1753         h->base = addr_reg;
1754         h->index = TCG_REG_XZR;
1755         h->index_ext = TCG_TYPE_I64;
1756     }
1757 #endif
1758
1759     return ldst;
1760 }
1761
1762 static void tcg_out_qemu_ld_direct(TCGContext *s, MemOp memop, TCGType ext,
1763                                    TCGReg data_r, HostAddress h)
1764 {
1765     switch (memop & MO_SSIZE) {
1766     case MO_UB:
1767         tcg_out_ldst_r(s, I3312_LDRB, data_r, h.base, h.index_ext, h.index);
1768         break;
1769     case MO_SB:
1770         tcg_out_ldst_r(s, ext ? I3312_LDRSBX : I3312_LDRSBW,
1771                        data_r, h.base, h.index_ext, h.index);
1772         break;
1773     case MO_UW:
1774         tcg_out_ldst_r(s, I3312_LDRH, data_r, h.base, h.index_ext, h.index);
1775         break;
1776     case MO_SW:
1777         tcg_out_ldst_r(s, (ext ? I3312_LDRSHX : I3312_LDRSHW),
1778                        data_r, h.base, h.index_ext, h.index);
1779         break;
1780     case MO_UL:
1781         tcg_out_ldst_r(s, I3312_LDRW, data_r, h.base, h.index_ext, h.index);
1782         break;
1783     case MO_SL:
1784         tcg_out_ldst_r(s, I3312_LDRSWX, data_r, h.base, h.index_ext, h.index);
1785         break;
1786     case MO_UQ:
1787         tcg_out_ldst_r(s, I3312_LDRX, data_r, h.base, h.index_ext, h.index);
1788         break;
1789     default:
1790         g_assert_not_reached();
1791     }
1792 }
1793
1794 static void tcg_out_qemu_st_direct(TCGContext *s, MemOp memop,
1795                                    TCGReg data_r, HostAddress h)
1796 {
1797     switch (memop & MO_SIZE) {
1798     case MO_8:
1799         tcg_out_ldst_r(s, I3312_STRB, data_r, h.base, h.index_ext, h.index);
1800         break;
1801     case MO_16:
1802         tcg_out_ldst_r(s, I3312_STRH, data_r, h.base, h.index_ext, h.index);
1803         break;
1804     case MO_32:
1805         tcg_out_ldst_r(s, I3312_STRW, data_r, h.base, h.index_ext, h.index);
1806         break;
1807     case MO_64:
1808         tcg_out_ldst_r(s, I3312_STRX, data_r, h.base, h.index_ext, h.index);
1809         break;
1810     default:
1811         g_assert_not_reached();
1812     }
1813 }
1814
1815 static void tcg_out_qemu_ld(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
1816                             MemOpIdx oi, TCGType data_type)
1817 {
1818     TCGLabelQemuLdst *ldst;
1819     HostAddress h;
1820
1821     ldst = prepare_host_addr(s, &h, addr_reg, oi, true);
1822     tcg_out_qemu_ld_direct(s, get_memop(oi), data_type, data_reg, h);
1823
1824     if (ldst) {
1825         ldst->type = data_type;
1826         ldst->datalo_reg = data_reg;
1827         ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
1828     }
1829 }
1830
1831 static void tcg_out_qemu_st(TCGContext *s, TCGReg data_reg, TCGReg addr_reg,
1832                             MemOpIdx oi, TCGType data_type)
1833 {
1834     TCGLabelQemuLdst *ldst;
1835     HostAddress h;
1836
1837     ldst = prepare_host_addr(s, &h, addr_reg, oi, false);
1838     tcg_out_qemu_st_direct(s, get_memop(oi), data_reg, h);
1839
1840     if (ldst) {
1841         ldst->type = data_type;
1842         ldst->datalo_reg = data_reg;
1843         ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
1844     }
1845 }
1846
1847 static void tcg_out_qemu_ldst_i128(TCGContext *s, TCGReg datalo, TCGReg datahi,
1848                                    TCGReg addr_reg, MemOpIdx oi, bool is_ld)
1849 {
1850     TCGLabelQemuLdst *ldst;
1851     HostAddress h;
1852     TCGReg base;
1853     bool use_pair;
1854
1855     ldst = prepare_host_addr(s, &h, addr_reg, oi, is_ld);
1856
1857     /* Compose the final address, as LDP/STP have no indexing. */
1858     if (h.index == TCG_REG_XZR) {
1859         base = h.base;
1860     } else {
1861         base = TCG_REG_TMP2;
1862         if (h.index_ext == TCG_TYPE_I32) {
1863             /* add base, base, index, uxtw */
1864             tcg_out_insn(s, 3501, ADD, TCG_TYPE_I64, base,
1865                          h.base, h.index, MO_32, 0);
1866         } else {
1867             /* add base, base, index */
1868             tcg_out_insn(s, 3502, ADD, 1, base, h.base, h.index);
1869         }
1870     }
1871
1872     use_pair = h.aa.atom < MO_128 || have_lse2;
1873
1874     if (!use_pair) {
1875         tcg_insn_unit *branch = NULL;
1876         TCGReg ll, lh, sl, sh;
1877
1878         /*
1879          * If we have already checked for 16-byte alignment, that's all
1880          * we need. Otherwise we have determined that misaligned atomicity
1881          * may be handled with two 8-byte loads.
1882          */
1883         if (h.aa.align < MO_128) {
1884             /*
1885              * TODO: align should be MO_64, so we only need test bit 3,
1886              * which means we could use TBNZ instead of ANDS+B_C.
1887              */
1888             tcg_out_logicali(s, I3404_ANDSI, 0, TCG_REG_XZR, addr_reg, 15);
1889             branch = s->code_ptr;
1890             tcg_out_insn(s, 3202, B_C, TCG_COND_NE, 0);
1891             use_pair = true;
1892         }
1893
1894         if (is_ld) {
1895             /*
1896              * 16-byte atomicity without LSE2 requires LDXP+STXP loop:
1897              *    ldxp lo, hi, [base]
1898              *    stxp t0, lo, hi, [base]
1899              *    cbnz t0, .-8
1900              * Require no overlap between data{lo,hi} and base.
1901              */
1902             if (datalo == base || datahi == base) {
1903                 tcg_out_mov(s, TCG_TYPE_REG, TCG_REG_TMP2, base);
1904                 base = TCG_REG_TMP2;
1905             }
1906             ll = sl = datalo;
1907             lh = sh = datahi;
1908         } else {
1909             /*
1910              * 16-byte atomicity without LSE2 requires LDXP+STXP loop:
1911              * 1: ldxp t0, t1, [base]
1912              *    stxp t0, lo, hi, [base]
1913              *    cbnz t0, 1b
1914              */
1915             tcg_debug_assert(base != TCG_REG_TMP0 && base != TCG_REG_TMP1);
1916             ll = TCG_REG_TMP0;
1917             lh = TCG_REG_TMP1;
1918             sl = datalo;
1919             sh = datahi;
1920         }
1921
1922         tcg_out_insn(s, 3306, LDXP, TCG_REG_XZR, ll, lh, base);
1923         tcg_out_insn(s, 3306, STXP, TCG_REG_TMP0, sl, sh, base);
1924         tcg_out_insn(s, 3201, CBNZ, 0, TCG_REG_TMP0, -2);
1925
1926         if (use_pair) {
1927             /* "b .+8", branching across the one insn of use_pair. */
1928             tcg_out_insn(s, 3206, B, 2);
1929             reloc_pc19(branch, tcg_splitwx_to_rx(s->code_ptr));
1930         }
1931     }
1932
1933     if (use_pair) {
1934         if (is_ld) {
1935             tcg_out_insn(s, 3314, LDP, datalo, datahi, base, 0, 1, 0);
1936         } else {
1937             tcg_out_insn(s, 3314, STP, datalo, datahi, base, 0, 1, 0);
1938         }
1939     }
1940
1941     if (ldst) {
1942         ldst->type = TCG_TYPE_I128;
1943         ldst->datalo_reg = datalo;
1944         ldst->datahi_reg = datahi;
1945         ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
1946     }
1947 }
1948
1949 static const tcg_insn_unit *tb_ret_addr;
1950
1951 static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
1952 {
1953     const tcg_insn_unit *target;
1954     ptrdiff_t offset;
1955
1956     /* Reuse the zeroing that exists for goto_ptr.  */
1957     if (a0 == 0) {
1958         target = tcg_code_gen_epilogue;
1959     } else {
1960         tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_X0, a0);
1961         target = tb_ret_addr;
1962     }
1963
1964     offset = tcg_pcrel_diff(s, target) >> 2;
1965     if (offset == sextract64(offset, 0, 26)) {
1966         tcg_out_insn(s, 3206, B, offset);
1967     } else {
1968         /*
1969          * Only x16/x17 generate BTI type Jump (2),
1970          * other registers generate BTI type Jump|Call (3).
1971          */
1972         QEMU_BUILD_BUG_ON(TCG_REG_TMP0 != TCG_REG_X16);
1973         tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_TMP0, (intptr_t)target);
1974         tcg_out_insn(s, 3207, BR, TCG_REG_TMP0);
1975     }
1976 }
1977
1978 static void tcg_out_goto_tb(TCGContext *s, int which)
1979 {
1980     /*
1981      * Direct branch, or indirect address load, will be patched
1982      * by tb_target_set_jmp_target.  Assert indirect load offset
1983      * in range early, regardless of direct branch distance.
1984      */
1985     intptr_t i_off = tcg_pcrel_diff(s, (void *)get_jmp_target_addr(s, which));
1986     tcg_debug_assert(i_off == sextract64(i_off, 0, 21));
1987
1988     set_jmp_insn_offset(s, which);
1989     tcg_out32(s, I3206_B);
1990     tcg_out_insn(s, 3207, BR, TCG_REG_TMP0);
1991     set_jmp_reset_offset(s, which);
1992     tcg_out_bti(s, BTI_J);
1993 }
1994
1995 void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
1996                               uintptr_t jmp_rx, uintptr_t jmp_rw)
1997 {
1998     uintptr_t d_addr = tb->jmp_target_addr[n];
1999     ptrdiff_t d_offset = d_addr - jmp_rx;
2000     tcg_insn_unit insn;
2001
2002     /* Either directly branch, or indirect branch load. */
2003     if (d_offset == sextract64(d_offset, 0, 28)) {
2004         insn = deposit32(I3206_B, 0, 26, d_offset >> 2);
2005     } else {
2006         uintptr_t i_addr = (uintptr_t)&tb->jmp_target_addr[n];
2007         ptrdiff_t i_offset = i_addr - jmp_rx;
2008
2009         /* Note that we asserted this in range in tcg_out_goto_tb. */
2010         insn = deposit32(I3305_LDR | TCG_REG_TMP0, 5, 19, i_offset >> 2);
2011     }
2012     qatomic_set((uint32_t *)jmp_rw, insn);
2013     flush_idcache_range(jmp_rx, jmp_rw, 4);
2014 }
2015
2016 static void tcg_out_op(TCGContext *s, TCGOpcode opc,
2017                        const TCGArg args[TCG_MAX_OP_ARGS],
2018                        const int const_args[TCG_MAX_OP_ARGS])
2019 {
2020     /* 99% of the time, we can signal the use of extension registers
2021        by looking to see if the opcode handles 64-bit data.  */
2022     TCGType ext = (tcg_op_defs[opc].flags & TCG_OPF_64BIT) != 0;
2023
2024     /* Hoist the loads of the most common arguments.  */
2025     TCGArg a0 = args[0];
2026     TCGArg a1 = args[1];
2027     TCGArg a2 = args[2];
2028     int c2 = const_args[2];
2029
2030     /* Some operands are defined with "rZ" constraint, a register or
2031        the zero register.  These need not actually test args[I] == 0.  */
2032 #define REG0(I)  (const_args[I] ? TCG_REG_XZR : (TCGReg)args[I])
2033
2034     switch (opc) {
2035     case INDEX_op_goto_ptr:
2036         tcg_out_insn(s, 3207, BR, a0);
2037         break;
2038
2039     case INDEX_op_br:
2040         tcg_out_goto_label(s, arg_label(a0));
2041         break;
2042
2043     case INDEX_op_ld8u_i32:
2044     case INDEX_op_ld8u_i64:
2045         tcg_out_ldst(s, I3312_LDRB, a0, a1, a2, 0);
2046         break;
2047     case INDEX_op_ld8s_i32:
2048         tcg_out_ldst(s, I3312_LDRSBW, a0, a1, a2, 0);
2049         break;
2050     case INDEX_op_ld8s_i64:
2051         tcg_out_ldst(s, I3312_LDRSBX, a0, a1, a2, 0);
2052         break;
2053     case INDEX_op_ld16u_i32:
2054     case INDEX_op_ld16u_i64:
2055         tcg_out_ldst(s, I3312_LDRH, a0, a1, a2, 1);
2056         break;
2057     case INDEX_op_ld16s_i32:
2058         tcg_out_ldst(s, I3312_LDRSHW, a0, a1, a2, 1);
2059         break;
2060     case INDEX_op_ld16s_i64:
2061         tcg_out_ldst(s, I3312_LDRSHX, a0, a1, a2, 1);
2062         break;
2063     case INDEX_op_ld_i32:
2064     case INDEX_op_ld32u_i64:
2065         tcg_out_ldst(s, I3312_LDRW, a0, a1, a2, 2);
2066         break;
2067     case INDEX_op_ld32s_i64:
2068         tcg_out_ldst(s, I3312_LDRSWX, a0, a1, a2, 2);
2069         break;
2070     case INDEX_op_ld_i64:
2071         tcg_out_ldst(s, I3312_LDRX, a0, a1, a2, 3);
2072         break;
2073
2074     case INDEX_op_st8_i32:
2075     case INDEX_op_st8_i64:
2076         tcg_out_ldst(s, I3312_STRB, REG0(0), a1, a2, 0);
2077         break;
2078     case INDEX_op_st16_i32:
2079     case INDEX_op_st16_i64:
2080         tcg_out_ldst(s, I3312_STRH, REG0(0), a1, a2, 1);
2081         break;
2082     case INDEX_op_st_i32:
2083     case INDEX_op_st32_i64:
2084         tcg_out_ldst(s, I3312_STRW, REG0(0), a1, a2, 2);
2085         break;
2086     case INDEX_op_st_i64:
2087         tcg_out_ldst(s, I3312_STRX, REG0(0), a1, a2, 3);
2088         break;
2089
2090     case INDEX_op_add_i32:
2091         a2 = (int32_t)a2;
2092         /* FALLTHRU */
2093     case INDEX_op_add_i64:
2094         if (c2) {
2095             tcg_out_addsubi(s, ext, a0, a1, a2);
2096         } else {
2097             tcg_out_insn(s, 3502, ADD, ext, a0, a1, a2);
2098         }
2099         break;
2100
2101     case INDEX_op_sub_i32:
2102         a2 = (int32_t)a2;
2103         /* FALLTHRU */
2104     case INDEX_op_sub_i64:
2105         if (c2) {
2106             tcg_out_addsubi(s, ext, a0, a1, -a2);
2107         } else {
2108             tcg_out_insn(s, 3502, SUB, ext, a0, a1, a2);
2109         }
2110         break;
2111
2112     case INDEX_op_neg_i64:
2113     case INDEX_op_neg_i32:
2114         tcg_out_insn(s, 3502, SUB, ext, a0, TCG_REG_XZR, a1);
2115         break;
2116
2117     case INDEX_op_and_i32:
2118         a2 = (int32_t)a2;
2119         /* FALLTHRU */
2120     case INDEX_op_and_i64:
2121         if (c2) {
2122             tcg_out_logicali(s, I3404_ANDI, ext, a0, a1, a2);
2123         } else {
2124             tcg_out_insn(s, 3510, AND, ext, a0, a1, a2);
2125         }
2126         break;
2127
2128     case INDEX_op_andc_i32:
2129         a2 = (int32_t)a2;
2130         /* FALLTHRU */
2131     case INDEX_op_andc_i64:
2132         if (c2) {
2133             tcg_out_logicali(s, I3404_ANDI, ext, a0, a1, ~a2);
2134         } else {
2135             tcg_out_insn(s, 3510, BIC, ext, a0, a1, a2);
2136         }
2137         break;
2138
2139     case INDEX_op_or_i32:
2140         a2 = (int32_t)a2;
2141         /* FALLTHRU */
2142     case INDEX_op_or_i64:
2143         if (c2) {
2144             tcg_out_logicali(s, I3404_ORRI, ext, a0, a1, a2);
2145         } else {
2146             tcg_out_insn(s, 3510, ORR, ext, a0, a1, a2);
2147         }
2148         break;
2149
2150     case INDEX_op_orc_i32:
2151         a2 = (int32_t)a2;
2152         /* FALLTHRU */
2153     case INDEX_op_orc_i64:
2154         if (c2) {
2155             tcg_out_logicali(s, I3404_ORRI, ext, a0, a1, ~a2);
2156         } else {
2157             tcg_out_insn(s, 3510, ORN, ext, a0, a1, a2);
2158         }
2159         break;
2160
2161     case INDEX_op_xor_i32:
2162         a2 = (int32_t)a2;
2163         /* FALLTHRU */
2164     case INDEX_op_xor_i64:
2165         if (c2) {
2166             tcg_out_logicali(s, I3404_EORI, ext, a0, a1, a2);
2167         } else {
2168             tcg_out_insn(s, 3510, EOR, ext, a0, a1, a2);
2169         }
2170         break;
2171
2172     case INDEX_op_eqv_i32:
2173         a2 = (int32_t)a2;
2174         /* FALLTHRU */
2175     case INDEX_op_eqv_i64:
2176         if (c2) {
2177             tcg_out_logicali(s, I3404_EORI, ext, a0, a1, ~a2);
2178         } else {
2179             tcg_out_insn(s, 3510, EON, ext, a0, a1, a2);
2180         }
2181         break;
2182
2183     case INDEX_op_not_i64:
2184     case INDEX_op_not_i32:
2185         tcg_out_insn(s, 3510, ORN, ext, a0, TCG_REG_XZR, a1);
2186         break;
2187
2188     case INDEX_op_mul_i64:
2189     case INDEX_op_mul_i32:
2190         tcg_out_insn(s, 3509, MADD, ext, a0, a1, a2, TCG_REG_XZR);
2191         break;
2192
2193     case INDEX_op_div_i64:
2194     case INDEX_op_div_i32:
2195         tcg_out_insn(s, 3508, SDIV, ext, a0, a1, a2);
2196         break;
2197     case INDEX_op_divu_i64:
2198     case INDEX_op_divu_i32:
2199         tcg_out_insn(s, 3508, UDIV, ext, a0, a1, a2);
2200         break;
2201
2202     case INDEX_op_rem_i64:
2203     case INDEX_op_rem_i32:
2204         tcg_out_insn(s, 3508, SDIV, ext, TCG_REG_TMP0, a1, a2);
2205         tcg_out_insn(s, 3509, MSUB, ext, a0, TCG_REG_TMP0, a2, a1);
2206         break;
2207     case INDEX_op_remu_i64:
2208     case INDEX_op_remu_i32:
2209         tcg_out_insn(s, 3508, UDIV, ext, TCG_REG_TMP0, a1, a2);
2210         tcg_out_insn(s, 3509, MSUB, ext, a0, TCG_REG_TMP0, a2, a1);
2211         break;
2212
2213     case INDEX_op_shl_i64:
2214     case INDEX_op_shl_i32:
2215         if (c2) {
2216             tcg_out_shl(s, ext, a0, a1, a2);
2217         } else {
2218             tcg_out_insn(s, 3508, LSLV, ext, a0, a1, a2);
2219         }
2220         break;
2221
2222     case INDEX_op_shr_i64:
2223     case INDEX_op_shr_i32:
2224         if (c2) {
2225             tcg_out_shr(s, ext, a0, a1, a2);
2226         } else {
2227             tcg_out_insn(s, 3508, LSRV, ext, a0, a1, a2);
2228         }
2229         break;
2230
2231     case INDEX_op_sar_i64:
2232     case INDEX_op_sar_i32:
2233         if (c2) {
2234             tcg_out_sar(s, ext, a0, a1, a2);
2235         } else {
2236             tcg_out_insn(s, 3508, ASRV, ext, a0, a1, a2);
2237         }
2238         break;
2239
2240     case INDEX_op_rotr_i64:
2241     case INDEX_op_rotr_i32:
2242         if (c2) {
2243             tcg_out_rotr(s, ext, a0, a1, a2);
2244         } else {
2245             tcg_out_insn(s, 3508, RORV, ext, a0, a1, a2);
2246         }
2247         break;
2248
2249     case INDEX_op_rotl_i64:
2250     case INDEX_op_rotl_i32:
2251         if (c2) {
2252             tcg_out_rotl(s, ext, a0, a1, a2);
2253         } else {
2254             tcg_out_insn(s, 3502, SUB, 0, TCG_REG_TMP0, TCG_REG_XZR, a2);
2255             tcg_out_insn(s, 3508, RORV, ext, a0, a1, TCG_REG_TMP0);
2256         }
2257         break;
2258
2259     case INDEX_op_clz_i64:
2260     case INDEX_op_clz_i32:
2261         tcg_out_cltz(s, ext, a0, a1, a2, c2, false);
2262         break;
2263     case INDEX_op_ctz_i64:
2264     case INDEX_op_ctz_i32:
2265         tcg_out_cltz(s, ext, a0, a1, a2, c2, true);
2266         break;
2267
2268     case INDEX_op_brcond_i32:
2269         a1 = (int32_t)a1;
2270         /* FALLTHRU */
2271     case INDEX_op_brcond_i64:
2272         tcg_out_brcond(s, ext, a2, a0, a1, const_args[1], arg_label(args[3]));
2273         break;
2274
2275     case INDEX_op_setcond_i32:
2276         a2 = (int32_t)a2;
2277         /* FALLTHRU */
2278     case INDEX_op_setcond_i64:
2279         tcg_out_cmp(s, ext, a1, a2, c2);
2280         /* Use CSET alias of CSINC Wd, WZR, WZR, invert(cond).  */
2281         tcg_out_insn(s, 3506, CSINC, TCG_TYPE_I32, a0, TCG_REG_XZR,
2282                      TCG_REG_XZR, tcg_invert_cond(args[3]));
2283         break;
2284
2285     case INDEX_op_negsetcond_i32:
2286         a2 = (int32_t)a2;
2287         /* FALLTHRU */
2288     case INDEX_op_negsetcond_i64:
2289         tcg_out_cmp(s, ext, a1, a2, c2);
2290         /* Use CSETM alias of CSINV Wd, WZR, WZR, invert(cond).  */
2291         tcg_out_insn(s, 3506, CSINV, ext, a0, TCG_REG_XZR,
2292                      TCG_REG_XZR, tcg_invert_cond(args[3]));
2293         break;
2294
2295     case INDEX_op_movcond_i32:
2296         a2 = (int32_t)a2;
2297         /* FALLTHRU */
2298     case INDEX_op_movcond_i64:
2299         tcg_out_cmp(s, ext, a1, a2, c2);
2300         tcg_out_insn(s, 3506, CSEL, ext, a0, REG0(3), REG0(4), args[5]);
2301         break;
2302
2303     case INDEX_op_qemu_ld_a32_i32:
2304     case INDEX_op_qemu_ld_a64_i32:
2305     case INDEX_op_qemu_ld_a32_i64:
2306     case INDEX_op_qemu_ld_a64_i64:
2307         tcg_out_qemu_ld(s, a0, a1, a2, ext);
2308         break;
2309     case INDEX_op_qemu_st_a32_i32:
2310     case INDEX_op_qemu_st_a64_i32:
2311     case INDEX_op_qemu_st_a32_i64:
2312     case INDEX_op_qemu_st_a64_i64:
2313         tcg_out_qemu_st(s, REG0(0), a1, a2, ext);
2314         break;
2315     case INDEX_op_qemu_ld_a32_i128:
2316     case INDEX_op_qemu_ld_a64_i128:
2317         tcg_out_qemu_ldst_i128(s, a0, a1, a2, args[3], true);
2318         break;
2319     case INDEX_op_qemu_st_a32_i128:
2320     case INDEX_op_qemu_st_a64_i128:
2321         tcg_out_qemu_ldst_i128(s, REG0(0), REG0(1), a2, args[3], false);
2322         break;
2323
2324     case INDEX_op_bswap64_i64:
2325         tcg_out_rev(s, TCG_TYPE_I64, MO_64, a0, a1);
2326         break;
2327     case INDEX_op_bswap32_i64:
2328         tcg_out_rev(s, TCG_TYPE_I32, MO_32, a0, a1);
2329         if (a2 & TCG_BSWAP_OS) {
2330             tcg_out_ext32s(s, a0, a0);
2331         }
2332         break;
2333     case INDEX_op_bswap32_i32:
2334         tcg_out_rev(s, TCG_TYPE_I32, MO_32, a0, a1);
2335         break;
2336     case INDEX_op_bswap16_i64:
2337     case INDEX_op_bswap16_i32:
2338         tcg_out_rev(s, TCG_TYPE_I32, MO_16, a0, a1);
2339         if (a2 & TCG_BSWAP_OS) {
2340             /* Output must be sign-extended. */
2341             tcg_out_ext16s(s, ext, a0, a0);
2342         } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
2343             /* Output must be zero-extended, but input isn't. */
2344             tcg_out_ext16u(s, a0, a0);
2345         }
2346         break;
2347
2348     case INDEX_op_deposit_i64:
2349     case INDEX_op_deposit_i32:
2350         tcg_out_dep(s, ext, a0, REG0(2), args[3], args[4]);
2351         break;
2352
2353     case INDEX_op_extract_i64:
2354     case INDEX_op_extract_i32:
2355         tcg_out_ubfm(s, ext, a0, a1, a2, a2 + args[3] - 1);
2356         break;
2357
2358     case INDEX_op_sextract_i64:
2359     case INDEX_op_sextract_i32:
2360         tcg_out_sbfm(s, ext, a0, a1, a2, a2 + args[3] - 1);
2361         break;
2362
2363     case INDEX_op_extract2_i64:
2364     case INDEX_op_extract2_i32:
2365         tcg_out_extr(s, ext, a0, REG0(2), REG0(1), args[3]);
2366         break;
2367
2368     case INDEX_op_add2_i32:
2369         tcg_out_addsub2(s, TCG_TYPE_I32, a0, a1, REG0(2), REG0(3),
2370                         (int32_t)args[4], args[5], const_args[4],
2371                         const_args[5], false);
2372         break;
2373     case INDEX_op_add2_i64:
2374         tcg_out_addsub2(s, TCG_TYPE_I64, a0, a1, REG0(2), REG0(3), args[4],
2375                         args[5], const_args[4], const_args[5], false);
2376         break;
2377     case INDEX_op_sub2_i32:
2378         tcg_out_addsub2(s, TCG_TYPE_I32, a0, a1, REG0(2), REG0(3),
2379                         (int32_t)args[4], args[5], const_args[4],
2380                         const_args[5], true);
2381         break;
2382     case INDEX_op_sub2_i64:
2383         tcg_out_addsub2(s, TCG_TYPE_I64, a0, a1, REG0(2), REG0(3), args[4],
2384                         args[5], const_args[4], const_args[5], true);
2385         break;
2386
2387     case INDEX_op_muluh_i64:
2388         tcg_out_insn(s, 3508, UMULH, TCG_TYPE_I64, a0, a1, a2);
2389         break;
2390     case INDEX_op_mulsh_i64:
2391         tcg_out_insn(s, 3508, SMULH, TCG_TYPE_I64, a0, a1, a2);
2392         break;
2393
2394     case INDEX_op_mb:
2395         tcg_out_mb(s, a0);
2396         break;
2397
2398     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
2399     case INDEX_op_mov_i64:
2400     case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
2401     case INDEX_op_exit_tb:  /* Always emitted via tcg_out_exit_tb.  */
2402     case INDEX_op_goto_tb:  /* Always emitted via tcg_out_goto_tb.  */
2403     case INDEX_op_ext8s_i32:  /* Always emitted via tcg_reg_alloc_op.  */
2404     case INDEX_op_ext8s_i64:
2405     case INDEX_op_ext8u_i32:
2406     case INDEX_op_ext8u_i64:
2407     case INDEX_op_ext16s_i64:
2408     case INDEX_op_ext16s_i32:
2409     case INDEX_op_ext16u_i64:
2410     case INDEX_op_ext16u_i32:
2411     case INDEX_op_ext32s_i64:
2412     case INDEX_op_ext32u_i64:
2413     case INDEX_op_ext_i32_i64:
2414     case INDEX_op_extu_i32_i64:
2415     case INDEX_op_extrl_i64_i32:
2416     default:
2417         g_assert_not_reached();
2418     }
2419
2420 #undef REG0
2421 }
2422
2423 static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2424                            unsigned vecl, unsigned vece,
2425                            const TCGArg args[TCG_MAX_OP_ARGS],
2426                            const int const_args[TCG_MAX_OP_ARGS])
2427 {
2428     static const AArch64Insn cmp_vec_insn[16] = {
2429         [TCG_COND_EQ] = I3616_CMEQ,
2430         [TCG_COND_GT] = I3616_CMGT,
2431         [TCG_COND_GE] = I3616_CMGE,
2432         [TCG_COND_GTU] = I3616_CMHI,
2433         [TCG_COND_GEU] = I3616_CMHS,
2434     };
2435     static const AArch64Insn cmp_scalar_insn[16] = {
2436         [TCG_COND_EQ] = I3611_CMEQ,
2437         [TCG_COND_GT] = I3611_CMGT,
2438         [TCG_COND_GE] = I3611_CMGE,
2439         [TCG_COND_GTU] = I3611_CMHI,
2440         [TCG_COND_GEU] = I3611_CMHS,
2441     };
2442     static const AArch64Insn cmp0_vec_insn[16] = {
2443         [TCG_COND_EQ] = I3617_CMEQ0,
2444         [TCG_COND_GT] = I3617_CMGT0,
2445         [TCG_COND_GE] = I3617_CMGE0,
2446         [TCG_COND_LT] = I3617_CMLT0,
2447         [TCG_COND_LE] = I3617_CMLE0,
2448     };
2449     static const AArch64Insn cmp0_scalar_insn[16] = {
2450         [TCG_COND_EQ] = I3612_CMEQ0,
2451         [TCG_COND_GT] = I3612_CMGT0,
2452         [TCG_COND_GE] = I3612_CMGE0,
2453         [TCG_COND_LT] = I3612_CMLT0,
2454         [TCG_COND_LE] = I3612_CMLE0,
2455     };
2456
2457     TCGType type = vecl + TCG_TYPE_V64;
2458     unsigned is_q = vecl;
2459     bool is_scalar = !is_q && vece == MO_64;
2460     TCGArg a0, a1, a2, a3;
2461     int cmode, imm8;
2462
2463     a0 = args[0];
2464     a1 = args[1];
2465     a2 = args[2];
2466
2467     switch (opc) {
2468     case INDEX_op_ld_vec:
2469         tcg_out_ld(s, type, a0, a1, a2);
2470         break;
2471     case INDEX_op_st_vec:
2472         tcg_out_st(s, type, a0, a1, a2);
2473         break;
2474     case INDEX_op_dupm_vec:
2475         tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
2476         break;
2477     case INDEX_op_add_vec:
2478         if (is_scalar) {
2479             tcg_out_insn(s, 3611, ADD, vece, a0, a1, a2);
2480         } else {
2481             tcg_out_insn(s, 3616, ADD, is_q, vece, a0, a1, a2);
2482         }
2483         break;
2484     case INDEX_op_sub_vec:
2485         if (is_scalar) {
2486             tcg_out_insn(s, 3611, SUB, vece, a0, a1, a2);
2487         } else {
2488             tcg_out_insn(s, 3616, SUB, is_q, vece, a0, a1, a2);
2489         }
2490         break;
2491     case INDEX_op_mul_vec:
2492         tcg_out_insn(s, 3616, MUL, is_q, vece, a0, a1, a2);
2493         break;
2494     case INDEX_op_neg_vec:
2495         if (is_scalar) {
2496             tcg_out_insn(s, 3612, NEG, vece, a0, a1);
2497         } else {
2498             tcg_out_insn(s, 3617, NEG, is_q, vece, a0, a1);
2499         }
2500         break;
2501     case INDEX_op_abs_vec:
2502         if (is_scalar) {
2503             tcg_out_insn(s, 3612, ABS, vece, a0, a1);
2504         } else {
2505             tcg_out_insn(s, 3617, ABS, is_q, vece, a0, a1);
2506         }
2507         break;
2508     case INDEX_op_and_vec:
2509         if (const_args[2]) {
2510             is_shimm1632(~a2, &cmode, &imm8);
2511             if (a0 == a1) {
2512                 tcg_out_insn(s, 3606, BIC, is_q, a0, 0, cmode, imm8);
2513                 return;
2514             }
2515             tcg_out_insn(s, 3606, MVNI, is_q, a0, 0, cmode, imm8);
2516             a2 = a0;
2517         }
2518         tcg_out_insn(s, 3616, AND, is_q, 0, a0, a1, a2);
2519         break;
2520     case INDEX_op_or_vec:
2521         if (const_args[2]) {
2522             is_shimm1632(a2, &cmode, &imm8);
2523             if (a0 == a1) {
2524                 tcg_out_insn(s, 3606, ORR, is_q, a0, 0, cmode, imm8);
2525                 return;
2526             }
2527             tcg_out_insn(s, 3606, MOVI, is_q, a0, 0, cmode, imm8);
2528             a2 = a0;
2529         }
2530         tcg_out_insn(s, 3616, ORR, is_q, 0, a0, a1, a2);
2531         break;
2532     case INDEX_op_andc_vec:
2533         if (const_args[2]) {
2534             is_shimm1632(a2, &cmode, &imm8);
2535             if (a0 == a1) {
2536                 tcg_out_insn(s, 3606, BIC, is_q, a0, 0, cmode, imm8);
2537                 return;
2538             }
2539             tcg_out_insn(s, 3606, MOVI, is_q, a0, 0, cmode, imm8);
2540             a2 = a0;
2541         }
2542         tcg_out_insn(s, 3616, BIC, is_q, 0, a0, a1, a2);
2543         break;
2544     case INDEX_op_orc_vec:
2545         if (const_args[2]) {
2546             is_shimm1632(~a2, &cmode, &imm8);
2547             if (a0 == a1) {
2548                 tcg_out_insn(s, 3606, ORR, is_q, a0, 0, cmode, imm8);
2549                 return;
2550             }
2551             tcg_out_insn(s, 3606, MVNI, is_q, a0, 0, cmode, imm8);
2552             a2 = a0;
2553         }
2554         tcg_out_insn(s, 3616, ORN, is_q, 0, a0, a1, a2);
2555         break;
2556     case INDEX_op_xor_vec:
2557         tcg_out_insn(s, 3616, EOR, is_q, 0, a0, a1, a2);
2558         break;
2559     case INDEX_op_ssadd_vec:
2560         if (is_scalar) {
2561             tcg_out_insn(s, 3611, SQADD, vece, a0, a1, a2);
2562         } else {
2563             tcg_out_insn(s, 3616, SQADD, is_q, vece, a0, a1, a2);
2564         }
2565         break;
2566     case INDEX_op_sssub_vec:
2567         if (is_scalar) {
2568             tcg_out_insn(s, 3611, SQSUB, vece, a0, a1, a2);
2569         } else {
2570             tcg_out_insn(s, 3616, SQSUB, is_q, vece, a0, a1, a2);
2571         }
2572         break;
2573     case INDEX_op_usadd_vec:
2574         if (is_scalar) {
2575             tcg_out_insn(s, 3611, UQADD, vece, a0, a1, a2);
2576         } else {
2577             tcg_out_insn(s, 3616, UQADD, is_q, vece, a0, a1, a2);
2578         }
2579         break;
2580     case INDEX_op_ussub_vec:
2581         if (is_scalar) {
2582             tcg_out_insn(s, 3611, UQSUB, vece, a0, a1, a2);
2583         } else {
2584             tcg_out_insn(s, 3616, UQSUB, is_q, vece, a0, a1, a2);
2585         }
2586         break;
2587     case INDEX_op_smax_vec:
2588         tcg_out_insn(s, 3616, SMAX, is_q, vece, a0, a1, a2);
2589         break;
2590     case INDEX_op_smin_vec:
2591         tcg_out_insn(s, 3616, SMIN, is_q, vece, a0, a1, a2);
2592         break;
2593     case INDEX_op_umax_vec:
2594         tcg_out_insn(s, 3616, UMAX, is_q, vece, a0, a1, a2);
2595         break;
2596     case INDEX_op_umin_vec:
2597         tcg_out_insn(s, 3616, UMIN, is_q, vece, a0, a1, a2);
2598         break;
2599     case INDEX_op_not_vec:
2600         tcg_out_insn(s, 3617, NOT, is_q, 0, a0, a1);
2601         break;
2602     case INDEX_op_shli_vec:
2603         if (is_scalar) {
2604             tcg_out_insn(s, 3609, SHL, a0, a1, a2 + (8 << vece));
2605         } else {
2606             tcg_out_insn(s, 3614, SHL, is_q, a0, a1, a2 + (8 << vece));
2607         }
2608         break;
2609     case INDEX_op_shri_vec:
2610         if (is_scalar) {
2611             tcg_out_insn(s, 3609, USHR, a0, a1, (16 << vece) - a2);
2612         } else {
2613             tcg_out_insn(s, 3614, USHR, is_q, a0, a1, (16 << vece) - a2);
2614         }
2615         break;
2616     case INDEX_op_sari_vec:
2617         if (is_scalar) {
2618             tcg_out_insn(s, 3609, SSHR, a0, a1, (16 << vece) - a2);
2619         } else {
2620             tcg_out_insn(s, 3614, SSHR, is_q, a0, a1, (16 << vece) - a2);
2621         }
2622         break;
2623     case INDEX_op_aa64_sli_vec:
2624         if (is_scalar) {
2625             tcg_out_insn(s, 3609, SLI, a0, a2, args[3] + (8 << vece));
2626         } else {
2627             tcg_out_insn(s, 3614, SLI, is_q, a0, a2, args[3] + (8 << vece));
2628         }
2629         break;
2630     case INDEX_op_shlv_vec:
2631         if (is_scalar) {
2632             tcg_out_insn(s, 3611, USHL, vece, a0, a1, a2);
2633         } else {
2634             tcg_out_insn(s, 3616, USHL, is_q, vece, a0, a1, a2);
2635         }
2636         break;
2637     case INDEX_op_aa64_sshl_vec:
2638         if (is_scalar) {
2639             tcg_out_insn(s, 3611, SSHL, vece, a0, a1, a2);
2640         } else {
2641             tcg_out_insn(s, 3616, SSHL, is_q, vece, a0, a1, a2);
2642         }
2643         break;
2644     case INDEX_op_cmp_vec:
2645         {
2646             TCGCond cond = args[3];
2647             AArch64Insn insn;
2648
2649             if (cond == TCG_COND_NE) {
2650                 if (const_args[2]) {
2651                     if (is_scalar) {
2652                         tcg_out_insn(s, 3611, CMTST, vece, a0, a1, a1);
2653                     } else {
2654                         tcg_out_insn(s, 3616, CMTST, is_q, vece, a0, a1, a1);
2655                     }
2656                 } else {
2657                     if (is_scalar) {
2658                         tcg_out_insn(s, 3611, CMEQ, vece, a0, a1, a2);
2659                     } else {
2660                         tcg_out_insn(s, 3616, CMEQ, is_q, vece, a0, a1, a2);
2661                     }
2662                     tcg_out_insn(s, 3617, NOT, is_q, 0, a0, a0);
2663                 }
2664             } else {
2665                 if (const_args[2]) {
2666                     if (is_scalar) {
2667                         insn = cmp0_scalar_insn[cond];
2668                         if (insn) {
2669                             tcg_out_insn_3612(s, insn, vece, a0, a1);
2670                             break;
2671                         }
2672                     } else {
2673                         insn = cmp0_vec_insn[cond];
2674                         if (insn) {
2675                             tcg_out_insn_3617(s, insn, is_q, vece, a0, a1);
2676                             break;
2677                         }
2678                     }
2679                     tcg_out_dupi_vec(s, type, MO_8, TCG_VEC_TMP0, 0);
2680                     a2 = TCG_VEC_TMP0;
2681                 }
2682                 if (is_scalar) {
2683                     insn = cmp_scalar_insn[cond];
2684                     if (insn == 0) {
2685                         TCGArg t;
2686                         t = a1, a1 = a2, a2 = t;
2687                         cond = tcg_swap_cond(cond);
2688                         insn = cmp_scalar_insn[cond];
2689                         tcg_debug_assert(insn != 0);
2690                     }
2691                     tcg_out_insn_3611(s, insn, vece, a0, a1, a2);
2692                 } else {
2693                     insn = cmp_vec_insn[cond];
2694                     if (insn == 0) {
2695                         TCGArg t;
2696                         t = a1, a1 = a2, a2 = t;
2697                         cond = tcg_swap_cond(cond);
2698                         insn = cmp_vec_insn[cond];
2699                         tcg_debug_assert(insn != 0);
2700                     }
2701                     tcg_out_insn_3616(s, insn, is_q, vece, a0, a1, a2);
2702                 }
2703             }
2704         }
2705         break;
2706
2707     case INDEX_op_bitsel_vec:
2708         a3 = args[3];
2709         if (a0 == a3) {
2710             tcg_out_insn(s, 3616, BIT, is_q, 0, a0, a2, a1);
2711         } else if (a0 == a2) {
2712             tcg_out_insn(s, 3616, BIF, is_q, 0, a0, a3, a1);
2713         } else {
2714             if (a0 != a1) {
2715                 tcg_out_mov(s, type, a0, a1);
2716             }
2717             tcg_out_insn(s, 3616, BSL, is_q, 0, a0, a2, a3);
2718         }
2719         break;
2720
2721     case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
2722     case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
2723     default:
2724         g_assert_not_reached();
2725     }
2726 }
2727
2728 int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
2729 {
2730     switch (opc) {
2731     case INDEX_op_add_vec:
2732     case INDEX_op_sub_vec:
2733     case INDEX_op_and_vec:
2734     case INDEX_op_or_vec:
2735     case INDEX_op_xor_vec:
2736     case INDEX_op_andc_vec:
2737     case INDEX_op_orc_vec:
2738     case INDEX_op_neg_vec:
2739     case INDEX_op_abs_vec:
2740     case INDEX_op_not_vec:
2741     case INDEX_op_cmp_vec:
2742     case INDEX_op_shli_vec:
2743     case INDEX_op_shri_vec:
2744     case INDEX_op_sari_vec:
2745     case INDEX_op_ssadd_vec:
2746     case INDEX_op_sssub_vec:
2747     case INDEX_op_usadd_vec:
2748     case INDEX_op_ussub_vec:
2749     case INDEX_op_shlv_vec:
2750     case INDEX_op_bitsel_vec:
2751         return 1;
2752     case INDEX_op_rotli_vec:
2753     case INDEX_op_shrv_vec:
2754     case INDEX_op_sarv_vec:
2755     case INDEX_op_rotlv_vec:
2756     case INDEX_op_rotrv_vec:
2757         return -1;
2758     case INDEX_op_mul_vec:
2759     case INDEX_op_smax_vec:
2760     case INDEX_op_smin_vec:
2761     case INDEX_op_umax_vec:
2762     case INDEX_op_umin_vec:
2763         return vece < MO_64;
2764
2765     default:
2766         return 0;
2767     }
2768 }
2769
2770 void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
2771                        TCGArg a0, ...)
2772 {
2773     va_list va;
2774     TCGv_vec v0, v1, v2, t1, t2, c1;
2775     TCGArg a2;
2776
2777     va_start(va, a0);
2778     v0 = temp_tcgv_vec(arg_temp(a0));
2779     v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
2780     a2 = va_arg(va, TCGArg);
2781     va_end(va);
2782
2783     switch (opc) {
2784     case INDEX_op_rotli_vec:
2785         t1 = tcg_temp_new_vec(type);
2786         tcg_gen_shri_vec(vece, t1, v1, -a2 & ((8 << vece) - 1));
2787         vec_gen_4(INDEX_op_aa64_sli_vec, type, vece,
2788                   tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(v1), a2);
2789         tcg_temp_free_vec(t1);
2790         break;
2791
2792     case INDEX_op_shrv_vec:
2793     case INDEX_op_sarv_vec:
2794         /* Right shifts are negative left shifts for AArch64.  */
2795         v2 = temp_tcgv_vec(arg_temp(a2));
2796         t1 = tcg_temp_new_vec(type);
2797         tcg_gen_neg_vec(vece, t1, v2);
2798         opc = (opc == INDEX_op_shrv_vec
2799                ? INDEX_op_shlv_vec : INDEX_op_aa64_sshl_vec);
2800         vec_gen_3(opc, type, vece, tcgv_vec_arg(v0),
2801                   tcgv_vec_arg(v1), tcgv_vec_arg(t1));
2802         tcg_temp_free_vec(t1);
2803         break;
2804
2805     case INDEX_op_rotlv_vec:
2806         v2 = temp_tcgv_vec(arg_temp(a2));
2807         t1 = tcg_temp_new_vec(type);
2808         c1 = tcg_constant_vec(type, vece, 8 << vece);
2809         tcg_gen_sub_vec(vece, t1, v2, c1);
2810         /* Right shifts are negative left shifts for AArch64.  */
2811         vec_gen_3(INDEX_op_shlv_vec, type, vece, tcgv_vec_arg(t1),
2812                   tcgv_vec_arg(v1), tcgv_vec_arg(t1));
2813         vec_gen_3(INDEX_op_shlv_vec, type, vece, tcgv_vec_arg(v0),
2814                   tcgv_vec_arg(v1), tcgv_vec_arg(v2));
2815         tcg_gen_or_vec(vece, v0, v0, t1);
2816         tcg_temp_free_vec(t1);
2817         break;
2818
2819     case INDEX_op_rotrv_vec:
2820         v2 = temp_tcgv_vec(arg_temp(a2));
2821         t1 = tcg_temp_new_vec(type);
2822         t2 = tcg_temp_new_vec(type);
2823         c1 = tcg_constant_vec(type, vece, 8 << vece);
2824         tcg_gen_neg_vec(vece, t1, v2);
2825         tcg_gen_sub_vec(vece, t2, c1, v2);
2826         /* Right shifts are negative left shifts for AArch64.  */
2827         vec_gen_3(INDEX_op_shlv_vec, type, vece, tcgv_vec_arg(t1),
2828                   tcgv_vec_arg(v1), tcgv_vec_arg(t1));
2829         vec_gen_3(INDEX_op_shlv_vec, type, vece, tcgv_vec_arg(t2),
2830                   tcgv_vec_arg(v1), tcgv_vec_arg(t2));
2831         tcg_gen_or_vec(vece, v0, t1, t2);
2832         tcg_temp_free_vec(t1);
2833         tcg_temp_free_vec(t2);
2834         break;
2835
2836     default:
2837         g_assert_not_reached();
2838     }
2839 }
2840
2841 static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
2842 {
2843     switch (op) {
2844     case INDEX_op_goto_ptr:
2845         return C_O0_I1(r);
2846
2847     case INDEX_op_ld8u_i32:
2848     case INDEX_op_ld8s_i32:
2849     case INDEX_op_ld16u_i32:
2850     case INDEX_op_ld16s_i32:
2851     case INDEX_op_ld_i32:
2852     case INDEX_op_ld8u_i64:
2853     case INDEX_op_ld8s_i64:
2854     case INDEX_op_ld16u_i64:
2855     case INDEX_op_ld16s_i64:
2856     case INDEX_op_ld32u_i64:
2857     case INDEX_op_ld32s_i64:
2858     case INDEX_op_ld_i64:
2859     case INDEX_op_neg_i32:
2860     case INDEX_op_neg_i64:
2861     case INDEX_op_not_i32:
2862     case INDEX_op_not_i64:
2863     case INDEX_op_bswap16_i32:
2864     case INDEX_op_bswap32_i32:
2865     case INDEX_op_bswap16_i64:
2866     case INDEX_op_bswap32_i64:
2867     case INDEX_op_bswap64_i64:
2868     case INDEX_op_ext8s_i32:
2869     case INDEX_op_ext16s_i32:
2870     case INDEX_op_ext8u_i32:
2871     case INDEX_op_ext16u_i32:
2872     case INDEX_op_ext8s_i64:
2873     case INDEX_op_ext16s_i64:
2874     case INDEX_op_ext32s_i64:
2875     case INDEX_op_ext8u_i64:
2876     case INDEX_op_ext16u_i64:
2877     case INDEX_op_ext32u_i64:
2878     case INDEX_op_ext_i32_i64:
2879     case INDEX_op_extu_i32_i64:
2880     case INDEX_op_extract_i32:
2881     case INDEX_op_extract_i64:
2882     case INDEX_op_sextract_i32:
2883     case INDEX_op_sextract_i64:
2884         return C_O1_I1(r, r);
2885
2886     case INDEX_op_st8_i32:
2887     case INDEX_op_st16_i32:
2888     case INDEX_op_st_i32:
2889     case INDEX_op_st8_i64:
2890     case INDEX_op_st16_i64:
2891     case INDEX_op_st32_i64:
2892     case INDEX_op_st_i64:
2893         return C_O0_I2(rZ, r);
2894
2895     case INDEX_op_add_i32:
2896     case INDEX_op_add_i64:
2897     case INDEX_op_sub_i32:
2898     case INDEX_op_sub_i64:
2899     case INDEX_op_setcond_i32:
2900     case INDEX_op_setcond_i64:
2901     case INDEX_op_negsetcond_i32:
2902     case INDEX_op_negsetcond_i64:
2903         return C_O1_I2(r, r, rA);
2904
2905     case INDEX_op_mul_i32:
2906     case INDEX_op_mul_i64:
2907     case INDEX_op_div_i32:
2908     case INDEX_op_div_i64:
2909     case INDEX_op_divu_i32:
2910     case INDEX_op_divu_i64:
2911     case INDEX_op_rem_i32:
2912     case INDEX_op_rem_i64:
2913     case INDEX_op_remu_i32:
2914     case INDEX_op_remu_i64:
2915     case INDEX_op_muluh_i64:
2916     case INDEX_op_mulsh_i64:
2917         return C_O1_I2(r, r, r);
2918
2919     case INDEX_op_and_i32:
2920     case INDEX_op_and_i64:
2921     case INDEX_op_or_i32:
2922     case INDEX_op_or_i64:
2923     case INDEX_op_xor_i32:
2924     case INDEX_op_xor_i64:
2925     case INDEX_op_andc_i32:
2926     case INDEX_op_andc_i64:
2927     case INDEX_op_orc_i32:
2928     case INDEX_op_orc_i64:
2929     case INDEX_op_eqv_i32:
2930     case INDEX_op_eqv_i64:
2931         return C_O1_I2(r, r, rL);
2932
2933     case INDEX_op_shl_i32:
2934     case INDEX_op_shr_i32:
2935     case INDEX_op_sar_i32:
2936     case INDEX_op_rotl_i32:
2937     case INDEX_op_rotr_i32:
2938     case INDEX_op_shl_i64:
2939     case INDEX_op_shr_i64:
2940     case INDEX_op_sar_i64:
2941     case INDEX_op_rotl_i64:
2942     case INDEX_op_rotr_i64:
2943         return C_O1_I2(r, r, ri);
2944
2945     case INDEX_op_clz_i32:
2946     case INDEX_op_ctz_i32:
2947     case INDEX_op_clz_i64:
2948     case INDEX_op_ctz_i64:
2949         return C_O1_I2(r, r, rAL);
2950
2951     case INDEX_op_brcond_i32:
2952     case INDEX_op_brcond_i64:
2953         return C_O0_I2(r, rA);
2954
2955     case INDEX_op_movcond_i32:
2956     case INDEX_op_movcond_i64:
2957         return C_O1_I4(r, r, rA, rZ, rZ);
2958
2959     case INDEX_op_qemu_ld_a32_i32:
2960     case INDEX_op_qemu_ld_a64_i32:
2961     case INDEX_op_qemu_ld_a32_i64:
2962     case INDEX_op_qemu_ld_a64_i64:
2963         return C_O1_I1(r, r);
2964     case INDEX_op_qemu_ld_a32_i128:
2965     case INDEX_op_qemu_ld_a64_i128:
2966         return C_O2_I1(r, r, r);
2967     case INDEX_op_qemu_st_a32_i32:
2968     case INDEX_op_qemu_st_a64_i32:
2969     case INDEX_op_qemu_st_a32_i64:
2970     case INDEX_op_qemu_st_a64_i64:
2971         return C_O0_I2(rZ, r);
2972     case INDEX_op_qemu_st_a32_i128:
2973     case INDEX_op_qemu_st_a64_i128:
2974         return C_O0_I3(rZ, rZ, r);
2975
2976     case INDEX_op_deposit_i32:
2977     case INDEX_op_deposit_i64:
2978         return C_O1_I2(r, 0, rZ);
2979
2980     case INDEX_op_extract2_i32:
2981     case INDEX_op_extract2_i64:
2982         return C_O1_I2(r, rZ, rZ);
2983
2984     case INDEX_op_add2_i32:
2985     case INDEX_op_add2_i64:
2986     case INDEX_op_sub2_i32:
2987     case INDEX_op_sub2_i64:
2988         return C_O2_I4(r, r, rZ, rZ, rA, rMZ);
2989
2990     case INDEX_op_add_vec:
2991     case INDEX_op_sub_vec:
2992     case INDEX_op_mul_vec:
2993     case INDEX_op_xor_vec:
2994     case INDEX_op_ssadd_vec:
2995     case INDEX_op_sssub_vec:
2996     case INDEX_op_usadd_vec:
2997     case INDEX_op_ussub_vec:
2998     case INDEX_op_smax_vec:
2999     case INDEX_op_smin_vec:
3000     case INDEX_op_umax_vec:
3001     case INDEX_op_umin_vec:
3002     case INDEX_op_shlv_vec:
3003     case INDEX_op_shrv_vec:
3004     case INDEX_op_sarv_vec:
3005     case INDEX_op_aa64_sshl_vec:
3006         return C_O1_I2(w, w, w);
3007     case INDEX_op_not_vec:
3008     case INDEX_op_neg_vec:
3009     case INDEX_op_abs_vec:
3010     case INDEX_op_shli_vec:
3011     case INDEX_op_shri_vec:
3012     case INDEX_op_sari_vec:
3013         return C_O1_I1(w, w);
3014     case INDEX_op_ld_vec:
3015     case INDEX_op_dupm_vec:
3016         return C_O1_I1(w, r);
3017     case INDEX_op_st_vec:
3018         return C_O0_I2(w, r);
3019     case INDEX_op_dup_vec:
3020         return C_O1_I1(w, wr);
3021     case INDEX_op_or_vec:
3022     case INDEX_op_andc_vec:
3023         return C_O1_I2(w, w, wO);
3024     case INDEX_op_and_vec:
3025     case INDEX_op_orc_vec:
3026         return C_O1_I2(w, w, wN);
3027     case INDEX_op_cmp_vec:
3028         return C_O1_I2(w, w, wZ);
3029     case INDEX_op_bitsel_vec:
3030         return C_O1_I3(w, w, w, w);
3031     case INDEX_op_aa64_sli_vec:
3032         return C_O1_I2(w, 0, w);
3033
3034     default:
3035         g_assert_not_reached();
3036     }
3037 }
3038
3039 static void tcg_target_init(TCGContext *s)
3040 {
3041     tcg_target_available_regs[TCG_TYPE_I32] = 0xffffffffu;
3042     tcg_target_available_regs[TCG_TYPE_I64] = 0xffffffffu;
3043     tcg_target_available_regs[TCG_TYPE_V64] = 0xffffffff00000000ull;
3044     tcg_target_available_regs[TCG_TYPE_V128] = 0xffffffff00000000ull;
3045
3046     tcg_target_call_clobber_regs = -1ull;
3047     tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X19);
3048     tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X20);
3049     tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X21);
3050     tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X22);
3051     tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X23);
3052     tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X24);
3053     tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X25);
3054     tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X26);
3055     tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X27);
3056     tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X28);
3057     tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_X29);
3058     tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V8);
3059     tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V9);
3060     tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V10);
3061     tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V11);
3062     tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V12);
3063     tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V13);
3064     tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V14);
3065     tcg_regset_reset_reg(tcg_target_call_clobber_regs, TCG_REG_V15);
3066
3067     s->reserved_regs = 0;
3068     tcg_regset_set_reg(s->reserved_regs, TCG_REG_SP);
3069     tcg_regset_set_reg(s->reserved_regs, TCG_REG_FP);
3070     tcg_regset_set_reg(s->reserved_regs, TCG_REG_X18); /* platform register */
3071     tcg_regset_set_reg(s->reserved_regs, TCG_REG_TMP0);
3072     tcg_regset_set_reg(s->reserved_regs, TCG_REG_TMP1);
3073     tcg_regset_set_reg(s->reserved_regs, TCG_REG_TMP2);
3074     tcg_regset_set_reg(s->reserved_regs, TCG_VEC_TMP0);
3075 }
3076
3077 /* Saving pairs: (X19, X20) .. (X27, X28), (X29(fp), X30(lr)).  */
3078 #define PUSH_SIZE  ((30 - 19 + 1) * 8)
3079
3080 #define FRAME_SIZE \
3081     ((PUSH_SIZE \
3082       + TCG_STATIC_CALL_ARGS_SIZE \
3083       + CPU_TEMP_BUF_NLONGS * sizeof(long) \
3084       + TCG_TARGET_STACK_ALIGN - 1) \
3085      & ~(TCG_TARGET_STACK_ALIGN - 1))
3086
3087 /* We're expecting a 2 byte uleb128 encoded value.  */
3088 QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
3089
3090 /* We're expecting to use a single ADDI insn.  */
3091 QEMU_BUILD_BUG_ON(FRAME_SIZE - PUSH_SIZE > 0xfff);
3092
3093 static void tcg_target_qemu_prologue(TCGContext *s)
3094 {
3095     TCGReg r;
3096
3097     tcg_out_bti(s, BTI_C);
3098
3099     /* Push (FP, LR) and allocate space for all saved registers.  */
3100     tcg_out_insn(s, 3314, STP, TCG_REG_FP, TCG_REG_LR,
3101                  TCG_REG_SP, -PUSH_SIZE, 1, 1);
3102
3103     /* Set up frame pointer for canonical unwinding.  */
3104     tcg_out_movr_sp(s, TCG_TYPE_I64, TCG_REG_FP, TCG_REG_SP);
3105
3106     /* Store callee-preserved regs x19..x28.  */
3107     for (r = TCG_REG_X19; r <= TCG_REG_X27; r += 2) {
3108         int ofs = (r - TCG_REG_X19 + 2) * 8;
3109         tcg_out_insn(s, 3314, STP, r, r + 1, TCG_REG_SP, ofs, 1, 0);
3110     }
3111
3112     /* Make stack space for TCG locals.  */
3113     tcg_out_insn(s, 3401, SUBI, TCG_TYPE_I64, TCG_REG_SP, TCG_REG_SP,
3114                  FRAME_SIZE - PUSH_SIZE);
3115
3116     /* Inform TCG about how to find TCG locals with register, offset, size.  */
3117     tcg_set_frame(s, TCG_REG_SP, TCG_STATIC_CALL_ARGS_SIZE,
3118                   CPU_TEMP_BUF_NLONGS * sizeof(long));
3119
3120 #if !defined(CONFIG_SOFTMMU)
3121     /*
3122      * Note that XZR cannot be encoded in the address base register slot,
3123      * as that actually encodes SP.  Depending on the guest, we may need
3124      * to zero-extend the guest address via the address index register slot,
3125      * therefore we need to load even a zero guest base into a register.
3126      */
3127     tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_GUEST_BASE, guest_base);
3128     tcg_regset_set_reg(s->reserved_regs, TCG_REG_GUEST_BASE);
3129 #endif
3130
3131     tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
3132     tcg_out_insn(s, 3207, BR, tcg_target_call_iarg_regs[1]);
3133
3134     /*
3135      * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
3136      * and fall through to the rest of the epilogue.
3137      */
3138     tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
3139     tcg_out_bti(s, BTI_J);
3140     tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_X0, 0);
3141
3142     /* TB epilogue */
3143     tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
3144     tcg_out_bti(s, BTI_J);
3145
3146     /* Remove TCG locals stack space.  */
3147     tcg_out_insn(s, 3401, ADDI, TCG_TYPE_I64, TCG_REG_SP, TCG_REG_SP,
3148                  FRAME_SIZE - PUSH_SIZE);
3149
3150     /* Restore registers x19..x28.  */
3151     for (r = TCG_REG_X19; r <= TCG_REG_X27; r += 2) {
3152         int ofs = (r - TCG_REG_X19 + 2) * 8;
3153         tcg_out_insn(s, 3314, LDP, r, r + 1, TCG_REG_SP, ofs, 1, 0);
3154     }
3155
3156     /* Pop (FP, LR), restore SP to previous frame.  */
3157     tcg_out_insn(s, 3314, LDP, TCG_REG_FP, TCG_REG_LR,
3158                  TCG_REG_SP, PUSH_SIZE, 0, 1);
3159     tcg_out_insn(s, 3207, RET, TCG_REG_LR);
3160 }
3161
3162 static void tcg_out_tb_start(TCGContext *s)
3163 {
3164     tcg_out_bti(s, BTI_J);
3165 }
3166
3167 static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
3168 {
3169     int i;
3170     for (i = 0; i < count; ++i) {
3171         p[i] = NOP;
3172     }
3173 }
3174
3175 typedef struct {
3176     DebugFrameHeader h;
3177     uint8_t fde_def_cfa[4];
3178     uint8_t fde_reg_ofs[24];
3179 } DebugFrame;
3180
3181 #define ELF_HOST_MACHINE EM_AARCH64
3182
3183 static const DebugFrame debug_frame = {
3184     .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3185     .h.cie.id = -1,
3186     .h.cie.version = 1,
3187     .h.cie.code_align = 1,
3188     .h.cie.data_align = 0x78,             /* sleb128 -8 */
3189     .h.cie.return_column = TCG_REG_LR,
3190
3191     /* Total FDE size does not include the "len" member.  */
3192     .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3193
3194     .fde_def_cfa = {
3195         12, TCG_REG_SP,                 /* DW_CFA_def_cfa sp, ... */
3196         (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
3197         (FRAME_SIZE >> 7)
3198     },
3199     .fde_reg_ofs = {
3200         0x80 + 28, 1,                   /* DW_CFA_offset, x28,  -8 */
3201         0x80 + 27, 2,                   /* DW_CFA_offset, x27, -16 */
3202         0x80 + 26, 3,                   /* DW_CFA_offset, x26, -24 */
3203         0x80 + 25, 4,                   /* DW_CFA_offset, x25, -32 */
3204         0x80 + 24, 5,                   /* DW_CFA_offset, x24, -40 */
3205         0x80 + 23, 6,                   /* DW_CFA_offset, x23, -48 */
3206         0x80 + 22, 7,                   /* DW_CFA_offset, x22, -56 */
3207         0x80 + 21, 8,                   /* DW_CFA_offset, x21, -64 */
3208         0x80 + 20, 9,                   /* DW_CFA_offset, x20, -72 */
3209         0x80 + 19, 10,                  /* DW_CFA_offset, x1p, -80 */
3210         0x80 + 30, 11,                  /* DW_CFA_offset,  lr, -88 */
3211         0x80 + 29, 12,                  /* DW_CFA_offset,  fp, -96 */
3212     }
3213 };
3214
3215 void tcg_register_jit(const void *buf, size_t buf_size)
3216 {
3217     tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
3218 }