tcg/i386/tcg-target.c.inc

   1 /*
   2  * Tiny Code Generator for QEMU
   3  *
   4  * Copyright (c) 2008 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 #include "../tcg-ldst.c.inc"
  26 #include "../tcg-pool.c.inc"
  27
#ifdef CONFIG_DEBUG_TCG
/*
 * Printable names of the host registers, listed in register-number order
 * (presumably indexed by TCGReg): the eight legacy integer registers,
 * %r8-%r15, then the %xmm registers starting at index 16.  The %r8-%r15
 * entries are present on 32-bit hosts too, which keeps %xmm0 at index 16.
 * Only compiled when CONFIG_DEBUG_TCG is defined.
 */
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
#endif
  43
/*
 * Host registers in the order this table offers them; presumably the
 * register allocator walks it front to back (the consumer is outside
 * this view).  Integer registers come first, then the vector registers.
 */
static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R9,
    TCG_REG_R8,
    TCG_REG_RCX,
    TCG_REG_RDX,
    TCG_REG_RSI,
    TCG_REG_RDI,
    TCG_REG_RAX,
#else
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
    TCG_REG_EBP,
    TCG_REG_ECX,
    TCG_REG_EDX,
    TCG_REG_EAX,
#endif
    TCG_REG_XMM0,
    TCG_REG_XMM1,
    TCG_REG_XMM2,
    TCG_REG_XMM3,
    TCG_REG_XMM4,
    TCG_REG_XMM5,
#ifndef _WIN64
    /* The Win64 ABI treats xmm6-xmm15 as callee-saved, and we do not save
       any of them.  Therefore only allow xmm0-xmm5 to be allocated on
       Win64; every other host also gets the registers listed below.  */
    TCG_REG_XMM6,
    TCG_REG_XMM7,
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_XMM8,
    TCG_REG_XMM9,
    TCG_REG_XMM10,
    TCG_REG_XMM11,
    TCG_REG_XMM12,
    TCG_REG_XMM13,
    TCG_REG_XMM14,
    TCG_REG_XMM15,
#endif
#endif
};
  93
/* Vector temporary for the backend (assumed from the name; its uses are
   outside this view).  XMM5 is available on every host ABI listed in
   tcg_target_reg_alloc_order, Win64 included.  */
#define TCG_TMP_VEC  TCG_REG_XMM5
  95
/*
 * Integer registers that carry call arguments, in argument order:
 * RCX, RDX, R8, R9 on Win64; RDI, RSI, RDX, RCX, R8, R9 on other
 * 64-bit hosts.  The list is empty on 32-bit hosts.
 */
static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
#if defined(_WIN64)
    TCG_REG_RCX,
    TCG_REG_RDX,
#else
    TCG_REG_RDI,
    TCG_REG_RSI,
    TCG_REG_RDX,
    TCG_REG_RCX,
#endif
    TCG_REG_R8,
    TCG_REG_R9,
#else
    /* 32 bit mode uses stack based calling convention (GCC default). */
#endif
};
 113
 114 static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
 115 {
 116     switch (kind) {
 117     case TCG_CALL_RET_NORMAL:
 118         tcg_debug_assert(slot >= 0 && slot <= 1);
 119         return slot ? TCG_REG_EDX : TCG_REG_EAX;
 120 #ifdef _WIN64
 121     case TCG_CALL_RET_BY_VEC:
 122         tcg_debug_assert(slot == 0);
 123         return TCG_REG_XMM0;
 124 #endif
 125     default:
 126         g_assert_not_reached();
 127     }
 128 }
 129
/*
 * Constants we accept.  These constraint bits are tested by
 * tcg_target_const_match(); for 64-bit values they mean:
 *   S32 - the value equals its own sign-extended low 32 bits
 *   U32 - the value equals its own zero-extended low 32 bits
 *   I32 - the bitwise inverse of the value is a sign-extended 32-bit value
 * For TCG_TYPE_I32 any of those three accepts every value.
 *   WSZ - the value equals the operation width in bits (32 or 64)
 */
#define TCG_CT_CONST_S32 0x100
#define TCG_CT_CONST_U32 0x200
#define TCG_CT_CONST_I32 0x400
#define TCG_CT_CONST_WSZ 0x800
 135
/* Registers used with L constraint, which are the first argument
   registers on x86_64, and two random call clobbered registers on
   i386.  Note that on x86_64 these expand to reads of
   tcg_target_call_iarg_regs[], so they are not integer constant
   expressions there.  */
#if TCG_TARGET_REG_BITS == 64
# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
#else
# define TCG_REG_L0 TCG_REG_EAX
# define TCG_REG_L1 TCG_REG_EDX
#endif
 146
/*
 * Register-set bitmasks, one bit per register number.  General registers
 * occupy the low bits (16 on 64-bit hosts, 8 on 32-bit hosts) and vector
 * registers start at bit 16.  ALL_BYTEL_REGS is every general register on
 * 64-bit hosts but only registers 0-3 (%eax, %ecx, %edx, %ebx) on 32-bit
 * hosts; going by the name, these are the registers usable as low-byte
 * operands.
 */
#if TCG_TARGET_REG_BITS == 64
# define ALL_GENERAL_REGS      0x0000ffffu
# define ALL_VECTOR_REGS       0xffff0000u
# define ALL_BYTEL_REGS        ALL_GENERAL_REGS
#else
# define ALL_GENERAL_REGS      0x000000ffu
# define ALL_VECTOR_REGS       0x00ff0000u
# define ALL_BYTEL_REGS        0x0000000fu
#endif
/* With CONFIG_SOFTMMU, a mask holding the bits of TCG_REG_L0 and
   TCG_REG_L1; otherwise an empty mask.  */
#ifdef CONFIG_SOFTMMU
# define SOFTMMU_RESERVE_REGS  ((1 << TCG_REG_L0) | (1 << TCG_REG_L1))
#else
# define SOFTMMU_RESERVE_REGS  0
#endif
 161
/* For 64-bit, we always know that CMOV is available.  On 32-bit hosts it
   is a runtime test.  The have_* macros below test feature bits in
   cpuinfo, which is declared outside this view.  */
#if TCG_TARGET_REG_BITS == 64
# define have_cmov      true
#else
# define have_cmov      (cpuinfo & CPUINFO_CMOV)
#endif
#define have_bmi2       (cpuinfo & CPUINFO_BMI2)
#define have_lzcnt      (cpuinfo & CPUINFO_LZCNT)
 170
/* NOTE(review): presumably the code address that generated code jumps to
   when leaving a translation block; it is assigned and used outside this
   view -- confirm against the prologue generator.  */
static const tcg_insn_unit *tb_ret_addr;
 172
 173 static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
 174                         intptr_t value, intptr_t addend)
 175 {
 176     value += addend;
 177     switch(type) {
 178     case R_386_PC32:
 179         value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
 180         if (value != (int32_t)value) {
 181             return false;
 182         }
 183         /* FALLTHRU */
 184     case R_386_32:
 185         tcg_patch32(code_ptr, value);
 186         break;
 187     case R_386_PC8:
 188         value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
 189         if (value != (int8_t)value) {
 190             return false;
 191         }
 192         tcg_patch8(code_ptr, value);
 193         break;
 194     default:
 195         g_assert_not_reached();
 196     }
 197     return true;
 198 }
 199
 200 /* test if a constant matches the constraint */
 201 static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
 202 {
 203     if (ct & TCG_CT_CONST) {
 204         return 1;
 205     }
 206     if (type == TCG_TYPE_I32) {
 207         if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | TCG_CT_CONST_I32)) {
 208             return 1;
 209         }
 210     } else {
 211         if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
 212             return 1;
 213         }
 214         if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
 215             return 1;
 216         }
 217         if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
 218             return 1;
 219         }
 220     }
 221     if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
 222         return 1;
 223     }
 224     return 0;
 225 }
 226
/* Low three bits of a register number, i.e. the part placed in a ModRM
   or SIB field.  Bit 3 of the number is encoded separately by the
   prefix emitters (REX/VEX/EVEX).  */
# define LOWREGMASK(x)  ((x) & 7)
 228
/*
 * Prefix flags.  These are ORed into an opcode value above its low eight
 * bits; the tcg_out_opc, tcg_out_vex_opc and tcg_out_evex_opc emitters
 * turn them into prefix bytes or prefix fields.  On 32-bit hosts the
 * REX- and gs-related flags are defined as 0, so they have no effect.
 */
#define P_EXT           0x100           /* 0x0f opcode prefix */
#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
#define P_DATA16        0x400           /* 0x66 opcode prefix */
#define P_VEXW          0x1000          /* Set VEX.W = 1 */
#if TCG_TARGET_REG_BITS == 64
# define P_REXW         P_VEXW          /* Set REX.W = 1; match VEXW */
# define P_REXB_R       0x2000          /* REG field as byte register */
# define P_REXB_RM      0x4000          /* R/M field as byte register */
# define P_GS           0x8000          /* gs segment override */
#else
# define P_REXW         0
# define P_REXB_R       0
# define P_REXB_RM      0
# define P_GS           0
#endif
#define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
#define P_VEXL          0x80000         /* Set VEX.L = 1 */
#define P_EVEX          0x100000        /* Requires EVEX encoding */
 249
/*
 * Opcode definitions.  Each value is the final opcode byte in the low
 * eight bits, ORed with the P_* flags that select its escape bytes and
 * prefixes.  Entries carrying P_EVEX are routed to the EVEX emitter by
 * tcg_out_vex_modrm().  Several entries are base values to which a
 * condition code or an (ARITH_FOO << 3) sub-opcode is added, as noted
 * on the individual lines.
 */
#define OPC_ARITH_EvIz  (0x81)
#define OPC_ARITH_EvIb  (0x83)
#define OPC_ARITH_GvEv  (0x03)          /* ... plus (ARITH_FOO << 3) */
#define OPC_ANDN        (0xf2 | P_EXT38)
#define OPC_ADD_GvEv    (OPC_ARITH_GvEv | (ARITH_ADD << 3))
#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
#define OPC_BSF         (0xbc | P_EXT)
#define OPC_BSR         (0xbd | P_EXT)
#define OPC_BSWAP       (0xc8 | P_EXT)
#define OPC_CALL_Jz     (0xe8)
#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
#define OPC_CMP_GvEv    (OPC_ARITH_GvEv | (ARITH_CMP << 3))
#define OPC_DEC_r32     (0x48)
#define OPC_IMUL_GvEv   (0xaf | P_EXT)
#define OPC_IMUL_GvEvIb (0x6b)
#define OPC_IMUL_GvEvIz (0x69)
#define OPC_INC_r32     (0x40)
#define OPC_JCC_long    (0x80 | P_EXT)  /* ... plus condition code */
#define OPC_JCC_short   (0x70)          /* ... plus condition code */
#define OPC_JMP_long    (0xe9)
#define OPC_JMP_short   (0xeb)
#define OPC_LEA         (0x8d)
#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
#define OPC_MOVB_EvGv   (0x88)          /* stores, more or less */
#define OPC_MOVL_EvGv   (0x89)          /* stores, more or less */
#define OPC_MOVL_GvEv   (0x8b)          /* loads, more or less */
#define OPC_MOVB_EvIz   (0xc6)
#define OPC_MOVL_EvIz   (0xc7)
#define OPC_MOVB_Ib     (0xb0)
#define OPC_MOVL_Iv     (0xb8)
#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
#define OPC_MOVSBL      (0xbe | P_EXT)
#define OPC_MOVSWL      (0xbf | P_EXT)
#define OPC_MOVSLQ      (0x63 | P_REXW)
#define OPC_MOVZBL      (0xb6 | P_EXT)
#define OPC_MOVZWL      (0xb7 | P_EXT)
#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
#define OPC_VPABSQ      (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
#define OPC_PEXTRD      (0x16 | P_EXT3A | P_DATA16)
#define OPC_PINSRD      (0x22 | P_EXT3A | P_DATA16)
#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
#define OPC_VPMAXSQ     (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
#define OPC_VPMAXUQ     (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
#define OPC_VPMINSQ     (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
#define OPC_VPMINUQ     (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
#define OPC_VPMULLQ     (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_POR         (0xeb | P_EXT | P_DATA16)
#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
#define OPC_VPSRAQ      (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
#define OPC_POP_r32     (0x58)
#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
#define OPC_PUSH_r32    (0x50)
#define OPC_PUSH_Iv     (0x68)
#define OPC_PUSH_Ib     (0x6a)
#define OPC_RET         (0xc3)
#define OPC_SETCC       (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
#define OPC_SHIFT_1     (0xd1)
#define OPC_SHIFT_Ib    (0xc1)
#define OPC_SHIFT_cl    (0xd3)
#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
#define OPC_SHUFPS      (0xc6 | P_EXT)
#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
#define OPC_SHRD_Ib     (0xac | P_EXT)
#define OPC_TESTL       (0x85)
#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
#define OPC_UD2         (0x0b | P_EXT)
#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
#define OPC_VPROLVD     (0x15 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPROLVQ     (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPRORVD     (0x14 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPRORVQ     (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDW     (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDD     (0x71 | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPSHLDQ     (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVW    (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVD    (0x71 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHLDVQ    (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVW    (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVD    (0x73 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHRDVQ    (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVW     (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPSRAVW     (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
#define OPC_VPSRAVQ     (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVW     (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPTERNLOGQ  (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VZEROUPPER  (0x77 | P_EXT)
#define OPC_XCHG_ax_r32 (0x90)
#define OPC_XCHG_EvGv   (0x87)

/* Group opcodes; the operation is selected by the EXT3_* and EXT5_*
   codes (OPC_GRP14 has the same value as OPC_PSHIFTQ_Ib).  */
#define OPC_GRP3_Eb     (0xf6)
#define OPC_GRP3_Ev     (0xf7)
#define OPC_GRP5        (0xff)
#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
 446
/* Group 1 opcode extensions for 0x80-0x83.
   These are also used as modifiers for OPC_ARITH: shifted left by 3 and
   combined with OPC_ARITH_GvEv (see OPC_ADD_GvEv and tgen_arithr).  */
#define ARITH_ADD 0
#define ARITH_OR  1
#define ARITH_ADC 2
#define ARITH_SBB 3
#define ARITH_AND 4
#define ARITH_SUB 5
#define ARITH_XOR 6
#define ARITH_CMP 7

/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
#define SHIFT_ROL 0
#define SHIFT_ROR 1
#define SHIFT_SHL 4
#define SHIFT_SHR 5
#define SHIFT_SAR 7

/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with
   OPC_GRP3_Eb and OPC_GRP3_Ev.  */
#define EXT3_TESTi 0
#define EXT3_NOT   2
#define EXT3_NEG   3
#define EXT3_MUL   4
#define EXT3_IMUL  5
#define EXT3_DIV   6
#define EXT3_IDIV  7

/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
#define EXT5_INC_Ev     0
#define EXT5_DEC_Ev     1
#define EXT5_CALLN_Ev   2
#define EXT5_JMPN_Ev    4
 479
/* Condition codes to be added to OPC_JCC_{long,short}.
   JCC_JMP (-1) is not a condition code; presumably it marks an
   unconditional jump -- its handling is outside this view.  */
#define JCC_JMP (-1)
#define JCC_JO  0x0
#define JCC_JNO 0x1
#define JCC_JB  0x2
#define JCC_JAE 0x3
#define JCC_JE  0x4
#define JCC_JNE 0x5
#define JCC_JBE 0x6
#define JCC_JA  0x7
#define JCC_JS  0x8
#define JCC_JNS 0x9
#define JCC_JP  0xa
#define JCC_JNP 0xb
#define JCC_JL  0xc
#define JCC_JGE 0xd
#define JCC_JLE 0xe
#define JCC_JG  0xf
 498
/*
 * Map a TCG comparison condition to the x86 condition code (JCC_*).
 * Signed comparisons use L/GE/LE/G and unsigned ones use B/AE/BE/A.
 * Conditions without an initializer here read back as 0, which is JCC_JO.
 */
static const uint8_t tcg_cond_to_jcc[] = {
    [TCG_COND_EQ] = JCC_JE,
    [TCG_COND_NE] = JCC_JNE,
    [TCG_COND_LT] = JCC_JL,
    [TCG_COND_GE] = JCC_JGE,
    [TCG_COND_LE] = JCC_JLE,
    [TCG_COND_GT] = JCC_JG,
    [TCG_COND_LTU] = JCC_JB,
    [TCG_COND_GEU] = JCC_JAE,
    [TCG_COND_LEU] = JCC_JBE,
    [TCG_COND_GTU] = JCC_JA,
};
 511
 512 #if TCG_TARGET_REG_BITS == 64
 513 static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
 514 {
 515     int rex;
 516
 517     if (opc & P_GS) {
 518         tcg_out8(s, 0x65);
 519     }
 520     if (opc & P_DATA16) {
 521         /* We should never be asking for both 16 and 64-bit operation.  */
 522         tcg_debug_assert((opc & P_REXW) == 0);
 523         tcg_out8(s, 0x66);
 524     }
 525     if (opc & P_SIMDF3) {
 526         tcg_out8(s, 0xf3);
 527     } else if (opc & P_SIMDF2) {
 528         tcg_out8(s, 0xf2);
 529     }
 530
 531     rex = 0;
 532     rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
 533     rex |= (r & 8) >> 1;                /* REX.R */
 534     rex |= (x & 8) >> 2;                /* REX.X */
 535     rex |= (rm & 8) >> 3;               /* REX.B */
 536
 537     /* P_REXB_{R,RM} indicates that the given register is the low byte.
 538        For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
 539        as otherwise the encoding indicates %[abcd]h.  Note that the values
 540        that are ORed in merely indicate that the REX byte must be present;
 541        those bits get discarded in output.  */
 542     rex |= opc & (r >= 4 ? P_REXB_R : 0);
 543     rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
 544
 545     if (rex) {
 546         tcg_out8(s, (uint8_t)(rex | 0x40));
 547     }
 548
 549     if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
 550         tcg_out8(s, 0x0f);
 551         if (opc & P_EXT38) {
 552             tcg_out8(s, 0x38);
 553         } else if (opc & P_EXT3A) {
 554             tcg_out8(s, 0x3a);
 555         }
 556     }
 557
 558     tcg_out8(s, opc);
 559 }
 560 #else
 561 static void tcg_out_opc(TCGContext *s, int opc)
 562 {
 563     if (opc & P_DATA16) {
 564         tcg_out8(s, 0x66);
 565     }
 566     if (opc & P_SIMDF3) {
 567         tcg_out8(s, 0xf3);
 568     } else if (opc & P_SIMDF2) {
 569         tcg_out8(s, 0xf2);
 570     }
 571     if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
 572         tcg_out8(s, 0x0f);
 573         if (opc & P_EXT38) {
 574             tcg_out8(s, 0x38);
 575         } else if (opc & P_EXT3A) {
 576             tcg_out8(s, 0x3a);
 577         }
 578     }
 579     tcg_out8(s, opc);
 580 }
 581 /* Discard the register arguments to tcg_out_opc early, so as not to penalize
 582    the 32-bit compilation paths.  This method works with all versions of gcc,
 583    whereas relying on optimization may not be able to exclude them.  */
 584 #define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
 585 #endif
 586
 587 static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
 588 {
 589     tcg_out_opc(s, opc, r, rm, 0);
 590     tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 591 }
 592
 593 static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
 594                             int rm, int index)
 595 {
 596     int tmp;
 597
 598     if (opc & P_GS) {
 599         tcg_out8(s, 0x65);
 600     }
 601     /* Use the two byte form if possible, which cannot encode
 602        VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
 603     if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
 604         && ((rm | index) & 8) == 0) {
 605         /* Two byte VEX prefix.  */
 606         tcg_out8(s, 0xc5);
 607
 608         tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
 609     } else {
 610         /* Three byte VEX prefix.  */
 611         tcg_out8(s, 0xc4);
 612
 613         /* VEX.m-mmmm */
 614         if (opc & P_EXT3A) {
 615             tmp = 3;
 616         } else if (opc & P_EXT38) {
 617             tmp = 2;
 618         } else if (opc & P_EXT) {
 619             tmp = 1;
 620         } else {
 621             g_assert_not_reached();
 622         }
 623         tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
 624         tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
 625         tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
 626         tcg_out8(s, tmp);
 627
 628         tmp = (opc & P_VEXW ? 0x80 : 0);       /* VEX.W */
 629     }
 630
 631     tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
 632     /* VEX.pp */
 633     if (opc & P_DATA16) {
 634         tmp |= 1;                          /* 0x66 */
 635     } else if (opc & P_SIMDF3) {
 636         tmp |= 2;                          /* 0xf3 */
 637     } else if (opc & P_SIMDF2) {
 638         tmp |= 3;                          /* 0xf2 */
 639     }
 640     tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
 641     tcg_out8(s, tmp);
 642     tcg_out8(s, opc);
 643 }
 644
 645 static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
 646                              int rm, int index)
 647 {
 648     /* The entire 4-byte evex prefix; with R' and V' set. */
 649     uint32_t p = 0x08041062;
 650     int mm, pp;
 651
 652     tcg_debug_assert(have_avx512vl);
 653
 654     /* EVEX.mm */
 655     if (opc & P_EXT3A) {
 656         mm = 3;
 657     } else if (opc & P_EXT38) {
 658         mm = 2;
 659     } else if (opc & P_EXT) {
 660         mm = 1;
 661     } else {
 662         g_assert_not_reached();
 663     }
 664
 665     /* EVEX.pp */
 666     if (opc & P_DATA16) {
 667         pp = 1;                          /* 0x66 */
 668     } else if (opc & P_SIMDF3) {
 669         pp = 2;                          /* 0xf3 */
 670     } else if (opc & P_SIMDF2) {
 671         pp = 3;                          /* 0xf2 */
 672     } else {
 673         pp = 0;
 674     }
 675
 676     p = deposit32(p, 8, 2, mm);
 677     p = deposit32(p, 13, 1, (rm & 8) == 0);             /* EVEX.RXB.B */
 678     p = deposit32(p, 14, 1, (index & 8) == 0);          /* EVEX.RXB.X */
 679     p = deposit32(p, 15, 1, (r & 8) == 0);              /* EVEX.RXB.R */
 680     p = deposit32(p, 16, 2, pp);
 681     p = deposit32(p, 19, 4, ~v);
 682     p = deposit32(p, 23, 1, (opc & P_VEXW) != 0);
 683     p = deposit32(p, 29, 2, (opc & P_VEXL) != 0);
 684
 685     tcg_out32(s, p);
 686     tcg_out8(s, opc);
 687 }
 688
 689 static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
 690 {
 691     if (opc & P_EVEX) {
 692         tcg_out_evex_opc(s, opc, r, v, rm, 0);
 693     } else {
 694         tcg_out_vex_opc(s, opc, r, v, rm, 0);
 695     }
 696     tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 697 }
 698
 699 /* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
 700    We handle either RM and INDEX missing with a negative value.  In 64-bit
 701    mode for absolute addresses, ~RM is the size of the immediate operand
 702    that will follow the instruction.  */
 703
 704 static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
 705                                int shift, intptr_t offset)
 706 {
 707     int mod, len;
 708
 709     if (index < 0 && rm < 0) {
 710         if (TCG_TARGET_REG_BITS == 64) {
 711             /* Try for a rip-relative addressing mode.  This has replaced
 712                the 32-bit-mode absolute addressing encoding.  */
 713             intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
 714             intptr_t disp = offset - pc;
 715             if (disp == (int32_t)disp) {
 716                 tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
 717                 tcg_out32(s, disp);
 718                 return;
 719             }
 720
 721             /* Try for an absolute address encoding.  This requires the
 722                use of the MODRM+SIB encoding and is therefore larger than
 723                rip-relative addressing.  */
 724             if (offset == (int32_t)offset) {
 725                 tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
 726                 tcg_out8(s, (4 << 3) | 5);
 727                 tcg_out32(s, offset);
 728                 return;
 729             }
 730
 731             /* ??? The memory isn't directly addressable.  */
 732             g_assert_not_reached();
 733         } else {
 734             /* Absolute address.  */
 735             tcg_out8(s, (r << 3) | 5);
 736             tcg_out32(s, offset);
 737             return;
 738         }
 739     }
 740
 741     /* Find the length of the immediate addend.  Note that the encoding
 742        that would be used for (%ebp) indicates absolute addressing.  */
 743     if (rm < 0) {
 744         mod = 0, len = 4, rm = 5;
 745     } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
 746         mod = 0, len = 0;
 747     } else if (offset == (int8_t)offset) {
 748         mod = 0x40, len = 1;
 749     } else {
 750         mod = 0x80, len = 4;
 751     }
 752
 753     /* Use a single byte MODRM format if possible.  Note that the encoding
 754        that would be used for %esp is the escape to the two byte form.  */
 755     if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
 756         /* Single byte MODRM format.  */
 757         tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 758     } else {
 759         /* Two byte MODRM+SIB format.  */
 760
 761         /* Note that the encoding that would place %esp into the index
 762            field indicates no index register.  In 64-bit mode, the REX.X
 763            bit counts, so %r12 can be used as the index.  */
 764         if (index < 0) {
 765             index = 4;
 766         } else {
 767             tcg_debug_assert(index != TCG_REG_ESP);
 768         }
 769
 770         tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
 771         tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
 772     }
 773
 774     if (len == 1) {
 775         tcg_out8(s, offset);
 776     } else if (len == 4) {
 777         tcg_out32(s, offset);
 778     }
 779 }
 780
 781 static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
 782                                      int index, int shift, intptr_t offset)
 783 {
 784     tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
 785     tcg_out_sib_offset(s, r, rm, index, shift, offset);
 786 }
 787
 788 static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
 789                                          int rm, int index, int shift,
 790                                          intptr_t offset)
 791 {
 792     tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
 793     tcg_out_sib_offset(s, r, rm, index, shift, offset);
 794 }
 795
/* A simplification of tcg_out_modrm_sib_offset with no index or shift:
   emit OPC with a plain "rm + offset" memory operand and R in the reg
   field.  The index is passed as -1 (missing) and the shift as 0.  */
static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
                                        int rm, intptr_t offset)
{
    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
}
 802
/* VEX-prefixed counterpart of tcg_out_modrm_offset: emit OPC with a plain
   "rm + offset" memory operand, passing V through to the VEX prefix.
   The index is passed as -1 (missing) and the shift as 0.  */
static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
                                            int v, int rm, intptr_t offset)
{
    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
}
 808
 809 /* Output an opcode with an expected reference to the constant pool.  */
 810 static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
 811 {
 812     tcg_out_opc(s, opc, r, 0, 0);
 813     /* Absolute for 32-bit, pc-relative for 64-bit.  */
 814     tcg_out8(s, LOWREGMASK(r) << 3 | 5);
 815     tcg_out32(s, 0);
 816 }
 817
 818 /* Output an opcode with an expected reference to the constant pool.  */
 819 static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
 820 {
 821     tcg_out_vex_opc(s, opc, r, 0, 0, 0);
 822     /* Absolute for 32-bit, pc-relative for 64-bit.  */
 823     tcg_out8(s, LOWREGMASK(r) << 3 | 5);
 824     tcg_out32(s, 0);
 825 }
 826
 827 /* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
 828 static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
 829 {
 830     /* Propagate an opcode prefix, such as P_REXW.  */
 831     int ext = subop & ~0x7;
 832     subop &= 0x7;
 833
 834     tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
 835 }
 836
 837 static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
 838 {
 839     int rexw = 0;
 840
 841     if (arg == ret) {
 842         return true;
 843     }
 844     switch (type) {
 845     case TCG_TYPE_I64:
 846         rexw = P_REXW;
 847         /* fallthru */
 848     case TCG_TYPE_I32:
 849         if (ret < 16) {
 850             if (arg < 16) {
 851                 tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
 852             } else {
 853                 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
 854             }
 855         } else {
 856             if (arg < 16) {
 857                 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
 858             } else {
 859                 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
 860             }
 861         }
 862         break;
 863
 864     case TCG_TYPE_V64:
 865         tcg_debug_assert(ret >= 16 && arg >= 16);
 866         tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
 867         break;
 868     case TCG_TYPE_V128:
 869         tcg_debug_assert(ret >= 16 && arg >= 16);
 870         tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
 871         break;
 872     case TCG_TYPE_V256:
 873         tcg_debug_assert(ret >= 16 && arg >= 16);
 874         tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
 875         break;
 876
 877     default:
 878         g_assert_not_reached();
 879     }
 880     return true;
 881 }
 882
 883 static const int avx2_dup_insn[4] = {
 884     OPC_VPBROADCASTB, OPC_VPBROADCASTW,
 885     OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
 886 };
 887
 888 static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
 889                             TCGReg r, TCGReg a)
 890 {
 891     if (have_avx2) {
 892         int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 893         tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
 894     } else {
 895         switch (vece) {
 896         case MO_8:
 897             /* ??? With zero in a register, use PSHUFB.  */
 898             tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
 899             a = r;
 900             /* FALLTHRU */
 901         case MO_16:
 902             tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
 903             a = r;
 904             /* FALLTHRU */
 905         case MO_32:
 906             tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
 907             /* imm8 operand: all output lanes selected from input lane 0.  */
 908             tcg_out8(s, 0);
 909             break;
 910         case MO_64:
 911             tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
 912             break;
 913         default:
 914             g_assert_not_reached();
 915         }
 916     }
 917     return true;
 918 }
 919
 920 static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
 921                              TCGReg r, TCGReg base, intptr_t offset)
 922 {
 923     if (have_avx2) {
 924         int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 925         tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
 926                                  r, 0, base, offset);
 927     } else {
 928         switch (vece) {
 929         case MO_64:
 930             tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
 931             break;
 932         case MO_32:
 933             tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
 934             break;
 935         case MO_16:
 936             tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
 937             tcg_out8(s, 0); /* imm8 */
 938             tcg_out_dup_vec(s, type, vece, r, r);
 939             break;
 940         case MO_8:
 941             tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
 942             tcg_out8(s, 0); /* imm8 */
 943             tcg_out_dup_vec(s, type, vece, r, r);
 944             break;
 945         default:
 946             g_assert_not_reached();
 947         }
 948     }
 949     return true;
 950 }
 951
 952 static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
 953                              TCGReg ret, int64_t arg)
 954 {
 955     int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 956
 957     if (arg == 0) {
 958         tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
 959         return;
 960     }
 961     if (arg == -1) {
 962         tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
 963         return;
 964     }
 965
 966     if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
 967         if (have_avx2) {
 968             tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
 969         } else {
 970             tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
 971         }
 972         new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
 973     } else {
 974         if (type == TCG_TYPE_V64) {
 975             tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
 976         } else if (have_avx2) {
 977             tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
 978         } else {
 979             tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
 980         }
 981         if (TCG_TARGET_REG_BITS == 64) {
 982             new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
 983         } else {
 984             new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
 985         }
 986     }
 987 }
 988
 989 static void tcg_out_movi_vec(TCGContext *s, TCGType type,
 990                              TCGReg ret, tcg_target_long arg)
 991 {
 992     if (arg == 0) {
 993         tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
 994         return;
 995     }
 996     if (arg == -1) {
 997         tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
 998         return;
 999     }
1000
1001     int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
1002     tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
1003     if (TCG_TARGET_REG_BITS == 64) {
1004         new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
1005     } else {
1006         new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
1007     }
1008 }
1009
1010 static void tcg_out_movi_int(TCGContext *s, TCGType type,
1011                              TCGReg ret, tcg_target_long arg)
1012 {
1013     tcg_target_long diff;
1014
1015     if (arg == 0) {
1016         tgen_arithr(s, ARITH_XOR, ret, ret);
1017         return;
1018     }
1019     if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
1020         tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
1021         tcg_out32(s, arg);
1022         return;
1023     }
1024     if (arg == (int32_t)arg) {
1025         tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
1026         tcg_out32(s, arg);
1027         return;
1028     }
1029
1030     /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
1031     diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
1032     if (diff == (int32_t)diff) {
1033         tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
1034         tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
1035         tcg_out32(s, diff);
1036         return;
1037     }
1038
1039     tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
1040     tcg_out64(s, arg);
1041 }
1042
1043 static void tcg_out_movi(TCGContext *s, TCGType type,
1044                          TCGReg ret, tcg_target_long arg)
1045 {
1046     switch (type) {
1047     case TCG_TYPE_I32:
1048 #if TCG_TARGET_REG_BITS == 64
1049     case TCG_TYPE_I64:
1050 #endif
1051         if (ret < 16) {
1052             tcg_out_movi_int(s, type, ret, arg);
1053         } else {
1054             tcg_out_movi_vec(s, type, ret, arg);
1055         }
1056         break;
1057     default:
1058         g_assert_not_reached();
1059     }
1060 }
1061
1062 static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
1063 {
1064     int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1065     tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2);
1066     return true;
1067 }
1068
1069 static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs,
1070                              tcg_target_long imm)
1071 {
1072     /* This function is only used for passing structs by reference. */
1073     tcg_debug_assert(imm == (int32_t)imm);
1074     tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm);
1075 }
1076
1077 static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
1078 {
1079     if (val == (int8_t)val) {
1080         tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
1081         tcg_out8(s, val);
1082     } else if (val == (int32_t)val) {
1083         tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
1084         tcg_out32(s, val);
1085     } else {
1086         g_assert_not_reached();
1087     }
1088 }
1089
1090 static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1091 {
1092     /* Given the strength of x86 memory ordering, we only need care for
1093        store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
1094        faster than "mfence", so don't bother with the sse insn.  */
1095     if (a0 & TCG_MO_ST_LD) {
1096         tcg_out8(s, 0xf0);
1097         tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1098         tcg_out8(s, 0);
1099     }
1100 }
1101
1102 static inline void tcg_out_push(TCGContext *s, int reg)
1103 {
1104     tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1105 }
1106
1107 static inline void tcg_out_pop(TCGContext *s, int reg)
1108 {
1109     tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1110 }
1111
1112 static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1113                        TCGReg arg1, intptr_t arg2)
1114 {
1115     switch (type) {
1116     case TCG_TYPE_I32:
1117         if (ret < 16) {
1118             tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1119         } else {
1120             tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1121         }
1122         break;
1123     case TCG_TYPE_I64:
1124         if (ret < 16) {
1125             tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1126             break;
1127         }
1128         /* FALLTHRU */
1129     case TCG_TYPE_V64:
1130         /* There is no instruction that can validate 8-byte alignment.  */
1131         tcg_debug_assert(ret >= 16);
1132         tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1133         break;
1134     case TCG_TYPE_V128:
1135         /*
1136          * The gvec infrastructure is asserts that v128 vector loads
1137          * and stores use a 16-byte aligned offset.  Validate that the
1138          * final pointer is aligned by using an insn that will SIGSEGV.
1139          */
1140         tcg_debug_assert(ret >= 16);
1141         tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
1142         break;
1143     case TCG_TYPE_V256:
1144         /*
1145          * The gvec infrastructure only requires 16-byte alignment,
1146          * so here we must use an unaligned load.
1147          */
1148         tcg_debug_assert(ret >= 16);
1149         tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1150                                  ret, 0, arg1, arg2);
1151         break;
1152     default:
1153         g_assert_not_reached();
1154     }
1155 }
1156
1157 static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1158                        TCGReg arg1, intptr_t arg2)
1159 {
1160     switch (type) {
1161     case TCG_TYPE_I32:
1162         if (arg < 16) {
1163             tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1164         } else {
1165             tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1166         }
1167         break;
1168     case TCG_TYPE_I64:
1169         if (arg < 16) {
1170             tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1171             break;
1172         }
1173         /* FALLTHRU */
1174     case TCG_TYPE_V64:
1175         /* There is no instruction that can validate 8-byte alignment.  */
1176         tcg_debug_assert(arg >= 16);
1177         tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1178         break;
1179     case TCG_TYPE_V128:
1180         /*
1181          * The gvec infrastructure is asserts that v128 vector loads
1182          * and stores use a 16-byte aligned offset.  Validate that the
1183          * final pointer is aligned by using an insn that will SIGSEGV.
1184          *
1185          * This specific instance is also used by TCG_CALL_RET_BY_VEC,
1186          * for _WIN64, which must have SSE2 but may not have AVX.
1187          */
1188         tcg_debug_assert(arg >= 16);
1189         if (have_avx1) {
1190             tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
1191         } else {
1192             tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2);
1193         }
1194         break;
1195     case TCG_TYPE_V256:
1196         /*
1197          * The gvec infrastructure only requires 16-byte alignment,
1198          * so here we must use an unaligned store.
1199          */
1200         tcg_debug_assert(arg >= 16);
1201         tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1202                                  arg, 0, arg1, arg2);
1203         break;
1204     default:
1205         g_assert_not_reached();
1206     }
1207 }
1208
1209 static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1210                         TCGReg base, intptr_t ofs)
1211 {
1212     int rexw = 0;
1213     if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1214         if (val != (int32_t)val) {
1215             return false;
1216         }
1217         rexw = P_REXW;
1218     } else if (type != TCG_TYPE_I32) {
1219         return false;
1220     }
1221     tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1222     tcg_out32(s, val);
1223     return true;
1224 }
1225
1226 static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1227 {
1228     /* Propagate an opcode prefix, such as P_DATA16.  */
1229     int ext = subopc & ~0x7;
1230     subopc &= 0x7;
1231
1232     if (count == 1) {
1233         tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1234     } else {
1235         tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1236         tcg_out8(s, count);
1237     }
1238 }
1239
1240 static inline void tcg_out_bswap32(TCGContext *s, int reg)
1241 {
1242     tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1243 }
1244
1245 static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1246 {
1247     tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1248 }
1249
1250 static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src)
1251 {
1252     /* movzbl */
1253     tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1254     tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1255 }
1256
1257 static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1258 {
1259     int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1260     /* movsbl */
1261     tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1262     tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1263 }
1264
1265 static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src)
1266 {
1267     /* movzwl */
1268     tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1269 }
1270
1271 static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1272 {
1273     int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1274     /* movsw[lq] */
1275     tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1276 }
1277
1278 static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src)
1279 {
1280     /* 32-bit mov zero extends.  */
1281     tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1282 }
1283
1284 static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src)
1285 {
1286     tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
1287     tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1288 }
1289
1290 static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1291 {
1292     tcg_out_ext32s(s, dest, src);
1293 }
1294
1295 static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1296 {
1297     if (dest != src) {
1298         tcg_out_ext32u(s, dest, src);
1299     }
1300 }
1301
1302 static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src)
1303 {
1304     tcg_out_ext32u(s, dest, src);
1305 }
1306
1307 static inline void tcg_out_bswap64(TCGContext *s, int reg)
1308 {
1309     tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1310 }
1311
1312 static void tgen_arithi(TCGContext *s, int c, int r0,
1313                         tcg_target_long val, int cf)
1314 {
1315     int rexw = 0;
1316
1317     if (TCG_TARGET_REG_BITS == 64) {
1318         rexw = c & -8;
1319         c &= 7;
1320     }
1321
1322     /* ??? While INC is 2 bytes shorter than ADDL $1, they also induce
1323        partial flags update stalls on Pentium4 and are not recommended
1324        by current Intel optimization manuals.  */
1325     if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
1326         int is_inc = (c == ARITH_ADD) ^ (val < 0);
1327         if (TCG_TARGET_REG_BITS == 64) {
1328             /* The single-byte increment encodings are re-tasked as the
1329                REX prefixes.  Use the MODRM encoding.  */
1330             tcg_out_modrm(s, OPC_GRP5 + rexw,
1331                           (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1332         } else {
1333             tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1334         }
1335         return;
1336     }
1337
1338     if (c == ARITH_AND) {
1339         if (TCG_TARGET_REG_BITS == 64) {
1340             if (val == 0xffffffffu) {
1341                 tcg_out_ext32u(s, r0, r0);
1342                 return;
1343             }
1344             if (val == (uint32_t)val) {
1345                 /* AND with no high bits set can use a 32-bit operation.  */
1346                 rexw = 0;
1347             }
1348         }
1349         if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1350             tcg_out_ext8u(s, r0, r0);
1351             return;
1352         }
1353         if (val == 0xffffu) {
1354             tcg_out_ext16u(s, r0, r0);
1355             return;
1356         }
1357     }
1358
1359     if (val == (int8_t)val) {
1360         tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1361         tcg_out8(s, val);
1362         return;
1363     }
1364     if (rexw == 0 || val == (int32_t)val) {
1365         tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1366         tcg_out32(s, val);
1367         return;
1368     }
1369
1370     g_assert_not_reached();
1371 }
1372
1373 static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1374 {
1375     if (val != 0) {
1376         tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1377     }
1378 }
1379
1380 /* Set SMALL to force a short forward branch.  */
1381 static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small)
1382 {
1383     int32_t val, val1;
1384
1385     if (l->has_value) {
1386         val = tcg_pcrel_diff(s, l->u.value_ptr);
1387         val1 = val - 2;
1388         if ((int8_t)val1 == val1) {
1389             if (opc == -1) {
1390                 tcg_out8(s, OPC_JMP_short);
1391             } else {
1392                 tcg_out8(s, OPC_JCC_short + opc);
1393             }
1394             tcg_out8(s, val1);
1395         } else {
1396             tcg_debug_assert(!small);
1397             if (opc == -1) {
1398                 tcg_out8(s, OPC_JMP_long);
1399                 tcg_out32(s, val - 5);
1400             } else {
1401                 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1402                 tcg_out32(s, val - 6);
1403             }
1404         }
1405     } else if (small) {
1406         if (opc == -1) {
1407             tcg_out8(s, OPC_JMP_short);
1408         } else {
1409             tcg_out8(s, OPC_JCC_short + opc);
1410         }
1411         tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1412         s->code_ptr += 1;
1413     } else {
1414         if (opc == -1) {
1415             tcg_out8(s, OPC_JMP_long);
1416         } else {
1417             tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1418         }
1419         tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1420         s->code_ptr += 4;
1421     }
1422 }
1423
1424 static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
1425                         int const_arg2, int rexw)
1426 {
1427     if (const_arg2) {
1428         if (arg2 == 0) {
1429             /* test r, r */
1430             tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1431         } else {
1432             tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1433         }
1434     } else {
1435         tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1436     }
1437 }
1438
1439 static void tcg_out_brcond(TCGContext *s, int rexw, TCGCond cond,
1440                            TCGArg arg1, TCGArg arg2, int const_arg2,
1441                            TCGLabel *label, bool small)
1442 {
1443     tcg_out_cmp(s, arg1, arg2, const_arg2, rexw);
1444     tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1445 }
1446
#if TCG_TARGET_REG_BITS == 32
/*
 * Emit a conditional branch on a double-word comparison.
 *
 * args[0]/args[1] are the two halves of the first operand and
 * args[2]/args[3] the matching halves of the second; const_args[2] and
 * const_args[3] say whether the latter are constants.  args[4] is the
 * condition and args[5] the target label.  The args[1]/args[3] pair is
 * compared first for the ordered conditions, i.e. it is treated as the
 * more significant half.
 *
 * For ordered conditions the sequence is: branch if the strict form of
 * the condition holds on the high halves; skip the rest if the high
 * halves differ; otherwise branch on the unsigned form of the condition
 * applied to the low halves.
 */
static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
                            const int *const_args, bool small)
{
    TCGLabel *label_next = gen_new_label();
    TCGLabel *label_this = arg_label(args[5]);
    TCGCond hi_cond, lo_cond;

    switch (args[4]) {
    case TCG_COND_EQ:
        /* Unequal low halves skip the branch; then test the high halves.  */
        tcg_out_brcond(s, 0, TCG_COND_NE, args[0], args[2], const_args[2],
                       label_next, 1);
        tcg_out_brcond(s, 0, TCG_COND_EQ, args[1], args[3], const_args[3],
                       label_this, small);
        tcg_out_label(s, label_next);
        return;
    case TCG_COND_NE:
        /* Either half differing is enough to take the branch.  */
        tcg_out_brcond(s, 0, TCG_COND_NE, args[0], args[2], const_args[2],
                       label_this, small);
        tcg_out_brcond(s, 0, TCG_COND_NE, args[1], args[3], const_args[3],
                       label_this, small);
        tcg_out_label(s, label_next);
        return;

    case TCG_COND_LT:
        hi_cond = TCG_COND_LT;
        lo_cond = TCG_COND_LTU;
        break;
    case TCG_COND_LE:
        hi_cond = TCG_COND_LT;
        lo_cond = TCG_COND_LEU;
        break;
    case TCG_COND_GT:
        hi_cond = TCG_COND_GT;
        lo_cond = TCG_COND_GTU;
        break;
    case TCG_COND_GE:
        hi_cond = TCG_COND_GT;
        lo_cond = TCG_COND_GEU;
        break;
    case TCG_COND_LTU:
        hi_cond = TCG_COND_LTU;
        lo_cond = TCG_COND_LTU;
        break;
    case TCG_COND_LEU:
        hi_cond = TCG_COND_LTU;
        lo_cond = TCG_COND_LEU;
        break;
    case TCG_COND_GTU:
        hi_cond = TCG_COND_GTU;
        lo_cond = TCG_COND_GTU;
        break;
    case TCG_COND_GEU:
        hi_cond = TCG_COND_GTU;
        lo_cond = TCG_COND_GEU;
        break;
    default:
        g_assert_not_reached();
    }

    tcg_out_brcond(s, 0, hi_cond, args[1], args[3], const_args[3],
                   label_this, small);
    /* High halves differ and the branch was not taken: result is false.  */
    tcg_out_jxx(s, JCC_JNE, label_next, 1);
    tcg_out_brcond(s, 0, lo_cond, args[0], args[2], const_args[2],
                   label_this, small);

    tcg_out_label(s, label_next);
}
#endif
1529
1530 static void tcg_out_setcond(TCGContext *s, int rexw, TCGCond cond,
1531                             TCGArg dest, TCGArg arg1, TCGArg arg2,
1532                             int const_arg2)
1533 {
1534     bool inv = false;
1535
1536     switch (cond) {
1537     case TCG_COND_NE:
1538         inv = true;
1539         /* fall through */
1540     case TCG_COND_EQ:
1541         /* If arg2 is 0, convert to LTU/GEU vs 1. */
1542         if (const_arg2 && arg2 == 0) {
1543             arg2 = 1;
1544             goto do_ltu;
1545         }
1546         break;
1547
1548     case TCG_COND_LEU:
1549         inv = true;
1550         /* fall through */
1551     case TCG_COND_GTU:
1552         /* If arg2 is a register, swap for LTU/GEU. */
1553         if (!const_arg2) {
1554             TCGReg t = arg1;
1555             arg1 = arg2;
1556             arg2 = t;
1557             goto do_ltu;
1558         }
1559         break;
1560
1561     case TCG_COND_GEU:
1562         inv = true;
1563         /* fall through */
1564     case TCG_COND_LTU:
1565     do_ltu:
1566         /*
1567          * Relying on the carry bit, use SBB to produce -1 if LTU, 0 if GEU.
1568          * We can then use NEG or INC to produce the desired result.
1569          * This is always smaller than the SETCC expansion.
1570          */
1571         tcg_out_cmp(s, arg1, arg2, const_arg2, rexw);
1572         tgen_arithr(s, ARITH_SBB, dest, dest);              /* T:-1 F:0 */
1573         if (inv) {
1574             tgen_arithi(s, ARITH_ADD, dest, 1, 0);          /* T:0  F:1 */
1575         } else {
1576             tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_NEG, dest);  /* T:1  F:0 */
1577         }
1578         return;
1579
1580     default:
1581         break;
1582     }
1583
1584     tcg_out_cmp(s, arg1, arg2, const_arg2, rexw);
1585     tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1586     tcg_out_ext8u(s, dest, dest);
1587 }
1588
#if TCG_TARGET_REG_BITS == 32
/*
 * Emit code that sets args[0] to 1 or 0 according to a double-word
 * comparison.  args[1..5] and const_args[1..] are handed to
 * tcg_out_brcond2 shifted down by one slot, with the label slot filled
 * in locally.
 */
static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
                             const int *const_args)
{
    TCGArg new_args[6];
    TCGLabel *label_true, *label_over;
    bool dest_overlaps;

    memcpy(new_args, args + 1, 5 * sizeof(TCGArg));

    dest_overlaps = args[0] == args[1] || args[0] == args[2]
                    || (!const_args[3] && args[0] == args[3])
                    || (!const_args[4] && args[0] == args[4]);

    if (!dest_overlaps) {
        /*
         * The destination is not an input: clear it first, jump over on
         * the inverted condition, and increment in the true case.  This
         * results in smaller code.
         */
        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);

        label_over = gen_new_label();
        new_args[4] = tcg_invert_cond(new_args[4]);
        new_args[5] = label_arg(label_over);
        tcg_out_brcond2(s, new_args, const_args + 1, 1);

        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
        tcg_out_label(s, label_over);
        return;
    }

    /*
     * The destination overlaps one of the argument registers, so it must
     * not be written before the comparison: don't do anything tricky.
     */
    label_true = gen_new_label();
    label_over = gen_new_label();

    new_args[5] = label_arg(label_true);
    tcg_out_brcond2(s, new_args, const_args + 1, 1);

    /* False path.  */
    tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
    tcg_out_jxx(s, JCC_JMP, label_over, 1);

    /* True path.  */
    tcg_out_label(s, label_true);
    tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);

    tcg_out_label(s, label_over);
}
#endif
1632
1633 static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1634                          TCGReg dest, TCGReg v1)
1635 {
1636     if (have_cmov) {
1637         tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1638     } else {
1639         TCGLabel *over = gen_new_label();
1640         tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1641         tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1642         tcg_out_label(s, over);
1643     }
1644 }
1645
1646 static void tcg_out_movcond(TCGContext *s, int rexw, TCGCond cond,
1647                             TCGReg dest, TCGReg c1, TCGArg c2, int const_c2,
1648                             TCGReg v1)
1649 {
1650     tcg_out_cmp(s, c1, c2, const_c2, rexw);
1651     tcg_out_cmov(s, cond, rexw, dest, v1);
1652 }
1653
1654 static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1655                         TCGArg arg2, bool const_a2)
1656 {
1657     if (have_bmi1) {
1658         tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1659         if (const_a2) {
1660             tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1661         } else {
1662             tcg_debug_assert(dest != arg2);
1663             tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1664         }
1665     } else {
1666         tcg_debug_assert(dest != arg2);
1667         tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1668         tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1669     }
1670 }
1671
1672 static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1673                         TCGArg arg2, bool const_a2)
1674 {
1675     if (have_lzcnt) {
1676         tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1677         if (const_a2) {
1678             tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1679         } else {
1680             tcg_debug_assert(dest != arg2);
1681             tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1682         }
1683     } else {
1684         tcg_debug_assert(!const_a2);
1685         tcg_debug_assert(dest != arg1);
1686         tcg_debug_assert(dest != arg2);
1687
1688         /* Recall that the output of BSR is the index not the count.  */
1689         tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1690         tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1691
1692         /* Since we have destroyed the flags from BSR, we have to re-test.  */
1693         tcg_out_cmp(s, arg1, 0, 1, rexw);
1694         tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1695     }
1696 }
1697
1698 static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
1699 {
1700     intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1701
1702     if (disp == (int32_t)disp) {
1703         tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1704         tcg_out32(s, disp);
1705     } else {
1706         /* rip-relative addressing into the constant pool.
1707            This is 6 + 8 = 14 bytes, as compared to using an
1708            immediate load 10 + 6 = 16 bytes, plus we may
1709            be able to re-use the pool constant for more calls.  */
1710         tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1711         tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1712         new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1713         tcg_out32(s, 0);
1714     }
1715 }
1716
1717 static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest,
1718                          const TCGHelperInfo *info)
1719 {
1720     tcg_out_branch(s, 1, dest);
1721
1722 #ifndef _WIN32
1723     if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) {
1724         /*
1725          * The sysv i386 abi for struct return places a reference as the
1726          * first argument of the stack, and pops that argument with the
1727          * return statement.  Since we want to retain the aligned stack
1728          * pointer for the callee, we do not want to actually push that
1729          * argument before the call but rely on the normal store to the
1730          * stack slot.  But we do need to compensate for the pop in order
1731          * to reset our correct stack pointer value.
1732          * Pushing a garbage value back onto the stack is quickest.
1733          */
1734         tcg_out_push(s, TCG_REG_EAX);
1735     }
1736 #endif
1737 }
1738
/* Emit an unconditional jump to DEST (tcg_out_branch with call == 0). */
static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
{
    tcg_out_branch(s, 0, dest);
}
1743
1744 static void tcg_out_nopn(TCGContext *s, int n)
1745 {
1746     int i;
1747     /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1748      * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1749      * duplicate prefix, and all of the interesting recent cores can
1750      * decode and discard the duplicates in a single cycle.
1751      */
1752     tcg_debug_assert(n >= 1);
1753     for (i = 1; i < n; ++i) {
1754         tcg_out8(s, 0x66);
1755     }
1756     tcg_out8(s, 0x90);
1757 }
1758
1759 /* Test register R vs immediate bits I, setting Z flag for EQ/NE. */
1760 static void __attribute__((unused))
1761 tcg_out_testi(TCGContext *s, TCGReg r, uint32_t i)
1762 {
1763     /*
1764      * This is used for testing alignment, so we can usually use testb.
1765      * For i686, we have to use testl for %esi/%edi.
1766      */
1767     if (i <= 0xff && (TCG_TARGET_REG_BITS == 64 || r < 4)) {
1768         tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, r);
1769         tcg_out8(s, i);
1770     } else {
1771         tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_TESTi, r);
1772         tcg_out32(s, i);
1773     }
1774 }
1775
/*
 * Host-side address operand for a guest memory access, as filled in by
 * prepare_host_addr() and consumed by the qemu_ld/st emitters.
 */
typedef struct {
    TCGReg base;        /* base register (set to the guest address register) */
    int index;          /* index register, or -1 (as in x86_guest_base) */
    int ofs;            /* constant displacement */
    int seg;            /* segment prefix flag added to the opcode, or 0 */
    TCGAtomAlign aa;    /* result of atom_and_align_for_opc() for the access */
} HostAddress;
1783
1784 bool tcg_target_has_memory_bswap(MemOp memop)
1785 {
1786     TCGAtomAlign aa;
1787
1788     if (!have_movbe) {
1789         return false;
1790     }
1791     if ((memop & MO_SIZE) < MO_128) {
1792         return true;
1793     }
1794
1795     /*
1796      * Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA,
1797      * but do allow a pair of 64-bit operations, i.e. MOVBEQ.
1798      */
1799     aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true);
1800     return aa.atom < MO_128;
1801 }
1802
1803 /*
1804  * Because i686 has no register parameters and because x86_64 has xchg
1805  * to handle addr/data register overlap, we have placed all input arguments
 * before we might need a scratch reg.
1807  *
1808  * Even then, a scratch is only needed for l->raddr.  Rather than expose
1809  * a general-purpose scratch when we don't actually know it's available,
1810  * use the ra_gen hook to load into RAX if needed.
1811  */
1812 #if TCG_TARGET_REG_BITS == 64
1813 static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg)
1814 {
1815     if (arg < 0) {
1816         arg = TCG_REG_RAX;
1817     }
1818     tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr);
1819     return arg;
1820 }
1821 static const TCGLdstHelperParam ldst_helper_param = {
1822     .ra_gen = ldst_ra_gen
1823 };
1824 #else
1825 static const TCGLdstHelperParam ldst_helper_param = { };
1826 #endif
1827
1828 static void tcg_out_vec_to_pair(TCGContext *s, TCGType type,
1829                                 TCGReg l, TCGReg h, TCGReg v)
1830 {
1831     int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1832
1833     /* vpmov{d,q} %v, %l */
1834     tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, v, 0, l);
1835     /* vpextr{d,q} $1, %v, %h */
1836     tcg_out_vex_modrm(s, OPC_PEXTRD + rexw, v, 0, h);
1837     tcg_out8(s, 1);
1838 }
1839
1840 static void tcg_out_pair_to_vec(TCGContext *s, TCGType type,
1841                                 TCGReg v, TCGReg l, TCGReg h)
1842 {
1843     int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1844
1845     /* vmov{d,q} %l, %v */
1846     tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, v, 0, l);
1847     /* vpinsr{d,q} $1, %h, %v, %v */
1848     tcg_out_vex_modrm(s, OPC_PINSRD + rexw, v, v, h);
1849     tcg_out8(s, 1);
1850 }
1851
1852 /*
1853  * Generate code for the slow path for a load at the end of block
1854  */
1855 static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1856 {
1857     MemOp opc = get_memop(l->oi);
1858     tcg_insn_unit **label_ptr = &l->label_ptr[0];
1859
1860     /* resolve label address */
1861     tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1862     if (label_ptr[1]) {
1863         tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1864     }
1865
1866     tcg_out_ld_helper_args(s, l, &ldst_helper_param);
1867     tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]);
1868     tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param);
1869
1870     tcg_out_jmp(s, l->raddr);
1871     return true;
1872 }
1873
1874 /*
1875  * Generate code for the slow path for a store at the end of block
1876  */
1877 static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1878 {
1879     MemOp opc = get_memop(l->oi);
1880     tcg_insn_unit **label_ptr = &l->label_ptr[0];
1881
1882     /* resolve label address */
1883     tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1884     if (label_ptr[1]) {
1885         tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1886     }
1887
1888     tcg_out_st_helper_args(s, l, &ldst_helper_param);
1889     tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]);
1890
1891     tcg_out_jmp(s, l->raddr);
1892     return true;
1893 }
1894
1895 #ifndef CONFIG_SOFTMMU
1896 static HostAddress x86_guest_base = {
1897     .index = -1
1898 };
1899
1900 #if defined(__x86_64__) && defined(__linux__)
1901 # include <asm/prctl.h>
1902 # include <sys/prctl.h>
1903 int arch_prctl(int code, unsigned long addr);
1904 static inline int setup_guest_base_seg(void)
1905 {
1906     if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
1907         return P_GS;
1908     }
1909     return 0;
1910 }
1911 #elif defined(__x86_64__) && \
1912       (defined (__FreeBSD__) || defined (__FreeBSD_kernel__))
1913 # include <machine/sysarch.h>
1914 static inline int setup_guest_base_seg(void)
1915 {
1916     if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
1917         return P_GS;
1918     }
1919     return 0;
1920 }
1921 #else
1922 static inline int setup_guest_base_seg(void)
1923 {
1924     return 0;
1925 }
1926 #endif /* setup_guest_base_seg */
1927 #endif /* !SOFTMMU */
1928
/*
 * Smallest TLB mask/table offset this backend accepts.
 * NOTE(review): presumably checked by common TCG code against the value
 * returned by tlb_mask_table_ofs(); INT_MIN would then mean "no lower
 * limit" -- confirm against the consumer, which is not in this file.
 */
#define MIN_TLB_MASK_TABLE_OFS  INT_MIN
1930
/*
 * For softmmu, perform the TLB load and compare.
 * For useronly, perform any required alignment tests.
 * In both cases, return a TCGLabelQemuLdst structure if the slow path
 * is required and fill in @h with the host address for the fast path.
 *
 * @s: code generation context
 * @h: output; host address operand for the fast-path access
 * @addrlo: register holding the (low part of the) guest address
 * @addrhi: register holding the high part of the guest address; only
 *          compared on a 32-bit host with a 64-bit guest address
 * @oi: combined memory operation and mmu index
 * @is_ld: true for a load, false for a store
 *
 * Returns NULL when no slow path was generated (user-only with no
 * alignment requirement).
 */
static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
                                           TCGReg addrlo, TCGReg addrhi,
                                           MemOpIdx oi, bool is_ld)
{
    TCGLabelQemuLdst *ldst = NULL;
    MemOp opc = get_memop(oi);
    MemOp s_bits = opc & MO_SIZE;
    unsigned a_mask;

#ifdef CONFIG_SOFTMMU
    /* TCG_REG_L0 ends up holding the TLB entry's addend (loaded below). */
    h->index = TCG_REG_L0;
    h->ofs = 0;
    h->seg = 0;
#else
    *h = x86_guest_base;
#endif
    h->base = addrlo;
    h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128);
    /* Low address bits that must be clear for the required alignment. */
    a_mask = (1 << h->aa.align) - 1;

#ifdef CONFIG_SOFTMMU
    /* Offset of the comparator field within the TLB entry. */
    int cmp_ofs = is_ld ? offsetof(CPUTLBEntry, addr_read)
                        : offsetof(CPUTLBEntry, addr_write);
    /*
     * ttype/trexw: operations on the guest address.
     * hrexw: operations on host pointers.
     * tlbtype/tlbrexw: the initial copy and shift of the address.
     */
    TCGType ttype = TCG_TYPE_I32;
    TCGType tlbtype = TCG_TYPE_I32;
    int trexw = 0, hrexw = 0, tlbrexw = 0;
    unsigned mem_index = get_mmuidx(oi);
    unsigned s_mask = (1 << s_bits) - 1;
    int fast_ofs = tlb_mask_table_ofs(s, mem_index);
    int tlb_mask;

    /* Softmmu always needs a slow path for the TLB miss case. */
    ldst = new_ldst_label(s);
    ldst->is_ld = is_ld;
    ldst->oi = oi;
    ldst->addrlo_reg = addrlo;
    ldst->addrhi_reg = addrhi;

    if (TCG_TARGET_REG_BITS == 64) {
        ttype = s->addr_type;
        trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW);
        if (TCG_TYPE_PTR == TCG_TYPE_I64) {
            hrexw = P_REXW;
            /* Use 64-bit ops only when 32 bits cannot cover the index. */
            if (s->page_bits + s->tlb_dyn_max_bits > 32) {
                tlbtype = TCG_TYPE_I64;
                tlbrexw = P_REXW;
            }
        }
    }

    /* L0 = addrlo >> (page_bits - CPU_TLB_ENTRY_BITS) */
    tcg_out_mov(s, tlbtype, TCG_REG_L0, addrlo);
    tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0,
                   s->page_bits - CPU_TLB_ENTRY_BITS);

    /* L0 &= CPUTLBDescFast.mask, read relative to TCG_AREG0 */
    tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0,
                         fast_ofs + offsetof(CPUTLBDescFast, mask));

    /* L0 += CPUTLBDescFast.table; L0 now addresses the TLB entry */
    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0,
                         fast_ofs + offsetof(CPUTLBDescFast, table));

    /*
     * If the required alignment is at least as large as the access, simply
     * copy the address and mask.  For lesser alignments, check that we don't
     * cross pages for the complete access.
     */
    if (a_mask >= s_mask) {
        tcg_out_mov(s, ttype, TCG_REG_L1, addrlo);
    } else {
        tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1,
                             addrlo, s_mask - a_mask);
    }
    /* Keep the page bits and the alignment bits for the comparison. */
    tlb_mask = s->page_mask | a_mask;
    tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0);

    /* cmp 0(TCG_REG_L0), TCG_REG_L1 */
    tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw,
                         TCG_REG_L1, TCG_REG_L0, cmp_ofs);

    /* jne slow_path */
    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
    /* Record where the 4-byte displacement lives; it is patched later. */
    ldst->label_ptr[0] = s->code_ptr;
    s->code_ptr += 4;

    if (TCG_TARGET_REG_BITS == 32 && s->addr_type == TCG_TYPE_I64) {
        /* cmp 4(TCG_REG_L0), addrhi */
        tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, TCG_REG_L0, cmp_ofs + 4);

        /* jne slow_path */
        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
        ldst->label_ptr[1] = s->code_ptr;
        s->code_ptr += 4;
    }

    /* TLB Hit.  */
    tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0,
               offsetof(CPUTLBEntry, addend));
#else
    /* User-only: a slow path is needed only to catch misalignment. */
    if (a_mask) {
        ldst = new_ldst_label(s);

        ldst->is_ld = is_ld;
        ldst->oi = oi;
        ldst->addrlo_reg = addrlo;
        ldst->addrhi_reg = addrhi;

        tcg_out_testi(s, addrlo, a_mask);
        /* jne slow_path */
        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
        /* Record where the 4-byte displacement lives; it is patched later. */
        ldst->label_ptr[0] = s->code_ptr;
        s->code_ptr += 4;
    }
#endif

    return ldst;
}
2051
/*
 * Emit the fast-path guest load from host address @h.
 *
 * @datalo: destination register (low half for two-register results)
 * @datahi: destination register for the high half; used only for 64-bit
 *          loads on a 32-bit host and for 128-bit loads
 * @h: host address operand (base, index, offset, segment, atomicity)
 * @type: type of the destination; selects REX.W for the sign-extending
 *        loads
 * @memop: size, signedness and byte-swap of the access
 */
static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                   HostAddress h, TCGType type, MemOp memop)
{
    bool use_movbe = false;
    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
    int movop = OPC_MOVL_GvEv;

    /* Do big-endian loads with movbe.  */
    if (memop & MO_BSWAP) {
        tcg_debug_assert(have_movbe);
        use_movbe = true;
        movop = OPC_MOVBE_GyMy;
    }

    switch (memop & MO_SSIZE) {
    case MO_UB:
        tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
    case MO_SB:
        tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
    case MO_UW:
        if (use_movbe) {
            /* There is no extending movbe; only low 16-bits are modified.  */
            if (datalo != h.base && datalo != h.index) {
                /* XOR breaks dependency chains.  */
                tgen_arithr(s, ARITH_XOR, datalo, datalo);
                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
                                         datalo, h.base, h.index, 0, h.ofs);
            } else {
                /* datalo is part of the address: load first, extend after. */
                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
                                         datalo, h.base, h.index, 0, h.ofs);
                tcg_out_ext16u(s, datalo, datalo);
            }
        } else {
            tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
        }
        break;
    case MO_SW:
        if (use_movbe) {
            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
                                     datalo, h.base, h.index, 0, h.ofs);
            tcg_out_ext16s(s, type, datalo, datalo);
        } else {
            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg,
                                     datalo, h.base, h.index, 0, h.ofs);
        }
        break;
    case MO_UL:
        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
#if TCG_TARGET_REG_BITS == 64
    case MO_SL:
        if (use_movbe) {
            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_ext32s(s, datalo, datalo);
        } else {
            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
        }
        break;
#endif
    case MO_UQ:
        if (TCG_TARGET_REG_BITS == 64) {
            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            break;
        }
        /* 32-bit host: two 4-byte loads into the datalo/datahi pair. */
        if (use_movbe) {
            /* With movbe, swap which register receives which half. */
            TCGReg t = datalo;
            datalo = datahi;
            datahi = t;
        }
        if (h.base == datalo || h.index == datalo) {
            /*
             * The first load would overwrite part of the address, so
             * form the address in datahi and load through that instead.
             */
            tcg_out_modrm_sib_offset(s, OPC_LEA, datahi,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0);
            tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4);
        } else {
            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
                                     h.base, h.index, 0, h.ofs + 4);
        }
        break;

    case MO_128:
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);

        /*
         * Without 16-byte atomicity, use integer regs.
         * That is where we want the data, and it allows bswaps.
         */
        if (h.aa.atom < MO_128) {
            if (use_movbe) {
                /* With movbe, swap which register receives which half. */
                TCGReg t = datalo;
                datalo = datahi;
                datahi = t;
            }
            if (h.base == datalo || h.index == datalo) {
                /*
                 * The first load would overwrite part of the address, so
                 * form the address in datahi and load through that instead.
                 */
                tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, datahi,
                                         h.base, h.index, 0, h.ofs);
                tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
                                     datalo, datahi, 0);
                tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
                                     datahi, datahi, 8);
            } else {
                tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
                                         h.base, h.index, 0, h.ofs);
                tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
                                         h.base, h.index, 0, h.ofs + 8);
            }
            break;
        }

        /*
         * With 16-byte atomicity, a vector load is required.
         * If we already have 16-byte alignment, then VMOVDQA always works.
         * Else if VMOVDQU has atomicity with dynamic alignment, use that.
         * Else we require a runtime test for alignment for VMOVDQA;
         * use VMOVDQU on the unaligned nonatomic path for simplicity.
         */
        if (h.aa.align >= MO_128) {
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
        } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
        } else {
            /* l1: unaligned path; l2: join point after either load. */
            TCGLabel *l1 = gen_new_label();
            TCGLabel *l2 = gen_new_label();

            tcg_out_testi(s, h.base, 15);
            tcg_out_jxx(s, JCC_JNE, l1, true);

            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
            tcg_out_jxx(s, JCC_JMP, l2, true);

            tcg_out_label(s, l1);
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
            tcg_out_label(s, l2);
        }
        /* Move the loaded vector into the integer result pair. */
        tcg_out_vec_to_pair(s, TCG_TYPE_I64, datalo, datahi, TCG_TMP_VEC);
        break;

    default:
        g_assert_not_reached();
    }
}
2212
2213 static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi,
2214                             TCGReg addrlo, TCGReg addrhi,
2215                             MemOpIdx oi, TCGType data_type)
2216 {
2217     TCGLabelQemuLdst *ldst;
2218     HostAddress h;
2219
2220     ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, true);
2221     tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi));
2222
2223     if (ldst) {
2224         ldst->type = data_type;
2225         ldst->datalo_reg = datalo;
2226         ldst->datahi_reg = datahi;
2227         ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2228     }
2229 }
2230
2231 static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2232                                    HostAddress h, MemOp memop)
2233 {
2234     bool use_movbe = false;
2235     int movop = OPC_MOVL_EvGv;
2236
2237     /*
2238      * Do big-endian stores with movbe or softmmu.
2239      * User-only without movbe will have its swapping done generically.
2240      */
2241     if (memop & MO_BSWAP) {
2242         tcg_debug_assert(have_movbe);
2243         use_movbe = true;
2244         movop = OPC_MOVBE_MyGy;
2245     }
2246
2247     switch (memop & MO_SIZE) {
2248     case MO_8:
2249         /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
2250         tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
2251         tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg,
2252                                  datalo, h.base, h.index, 0, h.ofs);
2253         break;
2254     case MO_16:
2255         tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo,
2256                                  h.base, h.index, 0, h.ofs);
2257         break;
2258     case MO_32:
2259         tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2260                                  h.base, h.index, 0, h.ofs);
2261         break;
2262     case MO_64:
2263         if (TCG_TARGET_REG_BITS == 64) {
2264             tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2265                                      h.base, h.index, 0, h.ofs);
2266         } else {
2267             if (use_movbe) {
2268                 TCGReg t = datalo;
2269                 datalo = datahi;
2270                 datahi = t;
2271             }
2272             tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2273                                      h.base, h.index, 0, h.ofs);
2274             tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
2275                                      h.base, h.index, 0, h.ofs + 4);
2276         }
2277         break;
2278
2279     case MO_128:
2280         tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2281
2282         /*
2283          * Without 16-byte atomicity, use integer regs.
2284          * That is where we have the data, and it allows bswaps.
2285          */
2286         if (h.aa.atom < MO_128) {
2287             if (use_movbe) {
2288                 TCGReg t = datalo;
2289                 datalo = datahi;
2290                 datahi = t;
2291             }
2292             tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2293                                      h.base, h.index, 0, h.ofs);
2294             tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
2295                                      h.base, h.index, 0, h.ofs + 8);
2296             break;
2297         }
2298
2299         /*
2300          * With 16-byte atomicity, a vector store is required.
2301          * If we already have 16-byte alignment, then VMOVDQA always works.
2302          * Else if VMOVDQU has atomicity with dynamic alignment, use that.
2303          * Else use we require a runtime test for alignment for VMOVDQA;
2304          * use VMOVDQU on the unaligned nonatomic path for simplicity.
2305          */
2306         tcg_out_pair_to_vec(s, TCG_TYPE_I64, TCG_TMP_VEC, datalo, datahi);
2307         if (h.aa.align >= MO_128) {
2308             tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
2309                                          TCG_TMP_VEC, 0,
2310                                          h.base, h.index, 0, h.ofs);
2311         } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
2312             tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
2313                                          TCG_TMP_VEC, 0,
2314                                          h.base, h.index, 0, h.ofs);
2315         } else {
2316             TCGLabel *l1 = gen_new_label();
2317             TCGLabel *l2 = gen_new_label();
2318
2319             tcg_out_testi(s, h.base, 15);
2320             tcg_out_jxx(s, JCC_JNE, l1, true);
2321
2322             tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
2323                                          TCG_TMP_VEC, 0,
2324                                          h.base, h.index, 0, h.ofs);
2325             tcg_out_jxx(s, JCC_JMP, l2, true);
2326
2327             tcg_out_label(s, l1);
2328             tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
2329                                          TCG_TMP_VEC, 0,
2330                                          h.base, h.index, 0, h.ofs);
2331             tcg_out_label(s, l2);
2332         }
2333         break;
2334
2335     default:
2336         g_assert_not_reached();
2337     }
2338 }
2339
2340 static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi,
2341                             TCGReg addrlo, TCGReg addrhi,
2342                             MemOpIdx oi, TCGType data_type)
2343 {
2344     TCGLabelQemuLdst *ldst;
2345     HostAddress h;
2346
2347     ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, false);
2348     tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi));
2349
2350     if (ldst) {
2351         ldst->type = data_type;
2352         ldst->datalo_reg = datalo;
2353         ldst->datahi_reg = datahi;
2354         ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2355     }
2356 }
2357
2358 static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
2359 {
2360     /* Reuse the zeroing that exists for goto_ptr.  */
2361     if (a0 == 0) {
2362         tcg_out_jmp(s, tcg_code_gen_epilogue);
2363     } else {
2364         tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2365         tcg_out_jmp(s, tb_ret_addr);
2366     }
2367 }
2368
2369 static void tcg_out_goto_tb(TCGContext *s, int which)
2370 {
2371     /*
2372      * Jump displacement must be aligned for atomic patching;
2373      * see if we need to add extra nops before jump
2374      */
2375     int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
2376     if (gap != 1) {
2377         tcg_out_nopn(s, gap - 1);
2378     }
2379     tcg_out8(s, OPC_JMP_long); /* jmp im */
2380     set_jmp_insn_offset(s, which);
2381     tcg_out32(s, 0);
2382     set_jmp_reset_offset(s, which);
2383 }
2384
2385 void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
2386                               uintptr_t jmp_rx, uintptr_t jmp_rw)
2387 {
2388     /* patch the branch destination */
2389     uintptr_t addr = tb->jmp_target_addr[n];
2390     qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4));
2391     /* no need to flush icache explicitly */
2392 }
2393
2394 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2395                               const TCGArg args[TCG_MAX_OP_ARGS],
2396                               const int const_args[TCG_MAX_OP_ARGS])
2397 {
2398     TCGArg a0, a1, a2;
2399     int c, const_a2, vexop, rexw = 0;
2400
2401 #if TCG_TARGET_REG_BITS == 64
2402 # define OP_32_64(x) \
2403         case glue(glue(INDEX_op_, x), _i64): \
2404             rexw = P_REXW; /* FALLTHRU */    \
2405         case glue(glue(INDEX_op_, x), _i32)
2406 #else
2407 # define OP_32_64(x) \
2408         case glue(glue(INDEX_op_, x), _i32)
2409 #endif
2410
2411     /* Hoist the loads of the most common arguments.  */
2412     a0 = args[0];
2413     a1 = args[1];
2414     a2 = args[2];
2415     const_a2 = const_args[2];
2416
2417     switch (opc) {
2418     case INDEX_op_goto_ptr:
2419         /* jmp to the given host address (could be epilogue) */
2420         tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2421         break;
2422     case INDEX_op_br:
2423         tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2424         break;
2425     OP_32_64(ld8u):
2426         /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2427         tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2428         break;
2429     OP_32_64(ld8s):
2430         tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2431         break;
2432     OP_32_64(ld16u):
2433         /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2434         tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2435         break;
2436     OP_32_64(ld16s):
2437         tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2438         break;
2439 #if TCG_TARGET_REG_BITS == 64
2440     case INDEX_op_ld32u_i64:
2441 #endif
2442     case INDEX_op_ld_i32:
2443         tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2444         break;
2445
2446     OP_32_64(st8):
2447         if (const_args[0]) {
2448             tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2449             tcg_out8(s, a0);
2450         } else {
2451             tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2452         }
2453         break;
2454     OP_32_64(st16):
2455         if (const_args[0]) {
2456             tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2457             tcg_out16(s, a0);
2458         } else {
2459             tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2460         }
2461         break;
2462 #if TCG_TARGET_REG_BITS == 64
2463     case INDEX_op_st32_i64:
2464 #endif
2465     case INDEX_op_st_i32:
2466         if (const_args[0]) {
2467             tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2468             tcg_out32(s, a0);
2469         } else {
2470             tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2471         }
2472         break;
2473
2474     OP_32_64(add):
2475         /* For 3-operand addition, use LEA.  */
2476         if (a0 != a1) {
2477             TCGArg c3 = 0;
2478             if (const_a2) {
2479                 c3 = a2, a2 = -1;
2480             } else if (a0 == a2) {
2481                 /* Watch out for dest = src + dest, since we've removed
2482                    the matching constraint on the add.  */
2483                 tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2484                 break;
2485             }
2486
2487             tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2488             break;
2489         }
2490         c = ARITH_ADD;
2491         goto gen_arith;
2492     OP_32_64(sub):
2493         c = ARITH_SUB;
2494         goto gen_arith;
2495     OP_32_64(and):
2496         c = ARITH_AND;
2497         goto gen_arith;
2498     OP_32_64(or):
2499         c = ARITH_OR;
2500         goto gen_arith;
2501     OP_32_64(xor):
2502         c = ARITH_XOR;
2503         goto gen_arith;
2504     gen_arith:
2505         if (const_a2) {
2506             tgen_arithi(s, c + rexw, a0, a2, 0);
2507         } else {
2508             tgen_arithr(s, c + rexw, a0, a2);
2509         }
2510         break;
2511
2512     OP_32_64(andc):
2513         if (const_a2) {
2514             tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2515             tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2516         } else {
2517             tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2518         }
2519         break;
2520
2521     OP_32_64(mul):
2522         if (const_a2) {
2523             int32_t val;
2524             val = a2;
2525             if (val == (int8_t)val) {
2526                 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2527                 tcg_out8(s, val);
2528             } else {
2529                 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2530                 tcg_out32(s, val);
2531             }
2532         } else {
2533             tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2534         }
2535         break;
2536
2537     OP_32_64(div2):
2538         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2539         break;
2540     OP_32_64(divu2):
2541         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2542         break;
2543
2544     OP_32_64(shl):
2545         /* For small constant 3-operand shift, use LEA.  */
2546         if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2547             if (a2 - 1 == 0) {
2548                 /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2549                 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2550             } else {
2551                 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2552                 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2553             }
2554             break;
2555         }
2556         c = SHIFT_SHL;
2557         vexop = OPC_SHLX;
2558         goto gen_shift_maybe_vex;
2559     OP_32_64(shr):
2560         c = SHIFT_SHR;
2561         vexop = OPC_SHRX;
2562         goto gen_shift_maybe_vex;
2563     OP_32_64(sar):
2564         c = SHIFT_SAR;
2565         vexop = OPC_SARX;
2566         goto gen_shift_maybe_vex;
2567     OP_32_64(rotl):
2568         c = SHIFT_ROL;
2569         goto gen_shift;
2570     OP_32_64(rotr):
2571         c = SHIFT_ROR;
2572         goto gen_shift;
2573     gen_shift_maybe_vex:
2574         if (have_bmi2) {
2575             if (!const_a2) {
2576                 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2577                 break;
2578             }
2579             tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2580         }
2581         /* FALLTHRU */
2582     gen_shift:
2583         if (const_a2) {
2584             tcg_out_shifti(s, c + rexw, a0, a2);
2585         } else {
2586             tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2587         }
2588         break;
2589
2590     OP_32_64(ctz):
2591         tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2592         break;
2593     OP_32_64(clz):
2594         tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2595         break;
2596     OP_32_64(ctpop):
2597         tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2598         break;
2599
2600     OP_32_64(brcond):
2601         tcg_out_brcond(s, rexw, a2, a0, a1, const_args[1],
2602                        arg_label(args[3]), 0);
2603         break;
2604     OP_32_64(setcond):
2605         tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2);
2606         break;
2607     OP_32_64(movcond):
2608         tcg_out_movcond(s, rexw, args[5], a0, a1, a2, const_a2, args[3]);
2609         break;
2610
2611     OP_32_64(bswap16):
2612         if (a2 & TCG_BSWAP_OS) {
2613             /* Output must be sign-extended. */
2614             if (rexw) {
2615                 tcg_out_bswap64(s, a0);
2616                 tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48);
2617             } else {
2618                 tcg_out_bswap32(s, a0);
2619                 tcg_out_shifti(s, SHIFT_SAR, a0, 16);
2620             }
2621         } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
2622             /* Output must be zero-extended, but input isn't. */
2623             tcg_out_bswap32(s, a0);
2624             tcg_out_shifti(s, SHIFT_SHR, a0, 16);
2625         } else {
2626             tcg_out_rolw_8(s, a0);
2627         }
2628         break;
2629     OP_32_64(bswap32):
2630         tcg_out_bswap32(s, a0);
2631         if (rexw && (a2 & TCG_BSWAP_OS)) {
2632             tcg_out_ext32s(s, a0, a0);
2633         }
2634         break;
2635
2636     OP_32_64(neg):
2637         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2638         break;
2639     OP_32_64(not):
2640         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2641         break;
2642
2643     case INDEX_op_qemu_ld_a64_i32:
2644         if (TCG_TARGET_REG_BITS == 32) {
2645             tcg_out_qemu_ld(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2646             break;
2647         }
2648         /* fall through */
2649     case INDEX_op_qemu_ld_a32_i32:
2650         tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2651         break;
2652     case INDEX_op_qemu_ld_a32_i64:
2653         if (TCG_TARGET_REG_BITS == 64) {
2654             tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2655         } else {
2656             tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2657         }
2658         break;
2659     case INDEX_op_qemu_ld_a64_i64:
2660         if (TCG_TARGET_REG_BITS == 64) {
2661             tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2662         } else {
2663             tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2664         }
2665         break;
2666     case INDEX_op_qemu_ld_a32_i128:
2667     case INDEX_op_qemu_ld_a64_i128:
2668         tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2669         tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
2670         break;
2671
2672     case INDEX_op_qemu_st_a64_i32:
2673     case INDEX_op_qemu_st8_a64_i32:
2674         if (TCG_TARGET_REG_BITS == 32) {
2675             tcg_out_qemu_st(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2676             break;
2677         }
2678         /* fall through */
2679     case INDEX_op_qemu_st_a32_i32:
2680     case INDEX_op_qemu_st8_a32_i32:
2681         tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2682         break;
2683     case INDEX_op_qemu_st_a32_i64:
2684         if (TCG_TARGET_REG_BITS == 64) {
2685             tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2686         } else {
2687             tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2688         }
2689         break;
2690     case INDEX_op_qemu_st_a64_i64:
2691         if (TCG_TARGET_REG_BITS == 64) {
2692             tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2693         } else {
2694             tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2695         }
2696         break;
2697     case INDEX_op_qemu_st_a32_i128:
2698     case INDEX_op_qemu_st_a64_i128:
2699         tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2700         tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
2701         break;
2702
2703     OP_32_64(mulu2):
2704         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2705         break;
2706     OP_32_64(muls2):
2707         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2708         break;
2709     OP_32_64(add2):
2710         if (const_args[4]) {
2711             tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2712         } else {
2713             tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2714         }
2715         if (const_args[5]) {
2716             tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2717         } else {
2718             tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2719         }
2720         break;
2721     OP_32_64(sub2):
2722         if (const_args[4]) {
2723             tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2724         } else {
2725             tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2726         }
2727         if (const_args[5]) {
2728             tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2729         } else {
2730             tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2731         }
2732         break;
2733
2734 #if TCG_TARGET_REG_BITS == 32
2735     case INDEX_op_brcond2_i32:
2736         tcg_out_brcond2(s, args, const_args, 0);
2737         break;
2738     case INDEX_op_setcond2_i32:
2739         tcg_out_setcond2(s, args, const_args);
2740         break;
2741 #else /* TCG_TARGET_REG_BITS == 64 */
2742     case INDEX_op_ld32s_i64:
2743         tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2744         break;
2745     case INDEX_op_ld_i64:
2746         tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2747         break;
2748     case INDEX_op_st_i64:
2749         if (const_args[0]) {
2750             tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2751             tcg_out32(s, a0);
2752         } else {
2753             tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2754         }
2755         break;
2756
2757     case INDEX_op_bswap64_i64:
2758         tcg_out_bswap64(s, a0);
2759         break;
2760     case INDEX_op_extrh_i64_i32:
2761         tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2762         break;
2763 #endif
2764
2765     OP_32_64(deposit):
2766         if (args[3] == 0 && args[4] == 8) {
2767             /* load bits 0..7 */
2768             if (const_a2) {
2769                 tcg_out_opc(s, OPC_MOVB_Ib | P_REXB_RM | LOWREGMASK(a0),
2770                             0, a0, 0);
2771                 tcg_out8(s, a2);
2772             } else {
2773                 tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2774             }
2775         } else if (TCG_TARGET_REG_BITS == 32 && args[3] == 8 && args[4] == 8) {
2776             /* load bits 8..15 */
2777             if (const_a2) {
2778                 tcg_out8(s, OPC_MOVB_Ib + a0 + 4);
2779                 tcg_out8(s, a2);
2780             } else {
2781                 tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2782             }
2783         } else if (args[3] == 0 && args[4] == 16) {
2784             /* load bits 0..15 */
2785             if (const_a2) {
2786                 tcg_out_opc(s, OPC_MOVL_Iv | P_DATA16 | LOWREGMASK(a0),
2787                             0, a0, 0);
2788                 tcg_out16(s, a2);
2789             } else {
2790                 tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2791             }
2792         } else {
2793             g_assert_not_reached();
2794         }
2795         break;
2796
2797     case INDEX_op_extract_i64:
2798         if (a2 + args[3] == 32) {
2799             /* This is a 32-bit zero-extending right shift.  */
2800             tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2801             tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2802             break;
2803         }
2804         /* FALLTHRU */
2805     case INDEX_op_extract_i32:
2806         /* On the off-chance that we can use the high-byte registers.
2807            Otherwise we emit the same ext16 + shift pattern that we
2808            would have gotten from the normal tcg-op.c expansion.  */
2809         tcg_debug_assert(a2 == 8 && args[3] == 8);
2810         if (a1 < 4 && a0 < 8) {
2811             tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2812         } else {
2813             tcg_out_ext16u(s, a0, a1);
2814             tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2815         }
2816         break;
2817
2818     case INDEX_op_sextract_i32:
2819         /* We don't implement sextract_i64, as we cannot sign-extend to
2820            64-bits without using the REX prefix that explicitly excludes
2821            access to the high-byte registers.  */
2822         tcg_debug_assert(a2 == 8 && args[3] == 8);
2823         if (a1 < 4 && a0 < 8) {
2824             tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2825         } else {
2826             tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1);
2827             tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2828         }
2829         break;
2830
2831     OP_32_64(extract2):
2832         /* Note that SHRD outputs to the r/m operand.  */
2833         tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
2834         tcg_out8(s, args[3]);
2835         break;
2836
2837     case INDEX_op_mb:
2838         tcg_out_mb(s, a0);
2839         break;
2840     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
2841     case INDEX_op_mov_i64:
2842     case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
2843     case INDEX_op_exit_tb:  /* Always emitted via tcg_out_exit_tb.  */
2844     case INDEX_op_goto_tb:  /* Always emitted via tcg_out_goto_tb.  */
2845     case INDEX_op_ext8s_i32:  /* Always emitted via tcg_reg_alloc_op.  */
2846     case INDEX_op_ext8s_i64:
2847     case INDEX_op_ext8u_i32:
2848     case INDEX_op_ext8u_i64:
2849     case INDEX_op_ext16s_i32:
2850     case INDEX_op_ext16s_i64:
2851     case INDEX_op_ext16u_i32:
2852     case INDEX_op_ext16u_i64:
2853     case INDEX_op_ext32s_i64:
2854     case INDEX_op_ext32u_i64:
2855     case INDEX_op_ext_i32_i64:
2856     case INDEX_op_extu_i32_i64:
2857     case INDEX_op_extrl_i64_i32:
2858     default:
2859         g_assert_not_reached();
2860     }
2861
2862 #undef OP_32_64
2863 }
2864
2865 static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2866                            unsigned vecl, unsigned vece,
2867                            const TCGArg args[TCG_MAX_OP_ARGS],
2868                            const int const_args[TCG_MAX_OP_ARGS])
2869 {
2870     static int const add_insn[4] = {
2871         OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2872     };
2873     static int const ssadd_insn[4] = {
2874         OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
2875     };
2876     static int const usadd_insn[4] = {
2877         OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
2878     };
2879     static int const sub_insn[4] = {
2880         OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2881     };
2882     static int const sssub_insn[4] = {
2883         OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
2884     };
2885     static int const ussub_insn[4] = {
2886         OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
2887     };
2888     static int const mul_insn[4] = {
2889         OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ
2890     };
2891     static int const shift_imm_insn[4] = {
2892         OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2893     };
2894     static int const cmpeq_insn[4] = {
2895         OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2896     };
2897     static int const cmpgt_insn[4] = {
2898         OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2899     };
2900     static int const punpckl_insn[4] = {
2901         OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2902     };
2903     static int const punpckh_insn[4] = {
2904         OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2905     };
2906     static int const packss_insn[4] = {
2907         OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2908     };
2909     static int const packus_insn[4] = {
2910         OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2911     };
2912     static int const smin_insn[4] = {
2913         OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ
2914     };
2915     static int const smax_insn[4] = {
2916         OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ
2917     };
2918     static int const umin_insn[4] = {
2919         OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ
2920     };
2921     static int const umax_insn[4] = {
2922         OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ
2923     };
2924     static int const rotlv_insn[4] = {
2925         OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ
2926     };
2927     static int const rotrv_insn[4] = {
2928         OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ
2929     };
2930     static int const shlv_insn[4] = {
2931         OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ
2932     };
2933     static int const shrv_insn[4] = {
2934         OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ
2935     };
2936     static int const sarv_insn[4] = {
2937         OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ
2938     };
2939     static int const shls_insn[4] = {
2940         OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
2941     };
2942     static int const shrs_insn[4] = {
2943         OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
2944     };
2945     static int const sars_insn[4] = {
2946         OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ
2947     };
2948     static int const vpshldi_insn[4] = {
2949         OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ
2950     };
2951     static int const vpshldv_insn[4] = {
2952         OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ
2953     };
2954     static int const vpshrdv_insn[4] = {
2955         OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ
2956     };
2957     static int const abs_insn[4] = {
2958         OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ
2959     };
2960
2961     TCGType type = vecl + TCG_TYPE_V64;
2962     int insn, sub;
2963     TCGArg a0, a1, a2, a3;
2964
2965     a0 = args[0];
2966     a1 = args[1];
2967     a2 = args[2];
2968
2969     switch (opc) {
2970     case INDEX_op_add_vec:
2971         insn = add_insn[vece];
2972         goto gen_simd;
2973     case INDEX_op_ssadd_vec:
2974         insn = ssadd_insn[vece];
2975         goto gen_simd;
2976     case INDEX_op_usadd_vec:
2977         insn = usadd_insn[vece];
2978         goto gen_simd;
2979     case INDEX_op_sub_vec:
2980         insn = sub_insn[vece];
2981         goto gen_simd;
2982     case INDEX_op_sssub_vec:
2983         insn = sssub_insn[vece];
2984         goto gen_simd;
2985     case INDEX_op_ussub_vec:
2986         insn = ussub_insn[vece];
2987         goto gen_simd;
2988     case INDEX_op_mul_vec:
2989         insn = mul_insn[vece];
2990         goto gen_simd;
2991     case INDEX_op_and_vec:
2992         insn = OPC_PAND;
2993         goto gen_simd;
2994     case INDEX_op_or_vec:
2995         insn = OPC_POR;
2996         goto gen_simd;
2997     case INDEX_op_xor_vec:
2998         insn = OPC_PXOR;
2999         goto gen_simd;
3000     case INDEX_op_smin_vec:
3001         insn = smin_insn[vece];
3002         goto gen_simd;
3003     case INDEX_op_umin_vec:
3004         insn = umin_insn[vece];
3005         goto gen_simd;
3006     case INDEX_op_smax_vec:
3007         insn = smax_insn[vece];
3008         goto gen_simd;
3009     case INDEX_op_umax_vec:
3010         insn = umax_insn[vece];
3011         goto gen_simd;
3012     case INDEX_op_shlv_vec:
3013         insn = shlv_insn[vece];
3014         goto gen_simd;
3015     case INDEX_op_shrv_vec:
3016         insn = shrv_insn[vece];
3017         goto gen_simd;
3018     case INDEX_op_sarv_vec:
3019         insn = sarv_insn[vece];
3020         goto gen_simd;
3021     case INDEX_op_rotlv_vec:
3022         insn = rotlv_insn[vece];
3023         goto gen_simd;
3024     case INDEX_op_rotrv_vec:
3025         insn = rotrv_insn[vece];
3026         goto gen_simd;
3027     case INDEX_op_shls_vec:
3028         insn = shls_insn[vece];
3029         goto gen_simd;
3030     case INDEX_op_shrs_vec:
3031         insn = shrs_insn[vece];
3032         goto gen_simd;
3033     case INDEX_op_sars_vec:
3034         insn = sars_insn[vece];
3035         goto gen_simd;
3036     case INDEX_op_x86_punpckl_vec:
3037         insn = punpckl_insn[vece];
3038         goto gen_simd;
3039     case INDEX_op_x86_punpckh_vec:
3040         insn = punpckh_insn[vece];
3041         goto gen_simd;
3042     case INDEX_op_x86_packss_vec:
3043         insn = packss_insn[vece];
3044         goto gen_simd;
3045     case INDEX_op_x86_packus_vec:
3046         insn = packus_insn[vece];
3047         goto gen_simd;
3048     case INDEX_op_x86_vpshldv_vec:
3049         insn = vpshldv_insn[vece];
3050         a1 = a2;
3051         a2 = args[3];
3052         goto gen_simd;
3053     case INDEX_op_x86_vpshrdv_vec:
3054         insn = vpshrdv_insn[vece];
3055         a1 = a2;
3056         a2 = args[3];
3057         goto gen_simd;
3058 #if TCG_TARGET_REG_BITS == 32
3059     case INDEX_op_dup2_vec:
3060         /* First merge the two 32-bit inputs to a single 64-bit element. */
3061         tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
3062         /* Then replicate the 64-bit elements across the rest of the vector. */
3063         if (type != TCG_TYPE_V64) {
3064             tcg_out_dup_vec(s, type, MO_64, a0, a0);
3065         }
3066         break;
3067 #endif
3068     case INDEX_op_abs_vec:
3069         insn = abs_insn[vece];
3070         a2 = a1;
3071         a1 = 0;
3072         goto gen_simd;
3073     gen_simd:
3074         tcg_debug_assert(insn != OPC_UD2);
3075         if (type == TCG_TYPE_V256) {
3076             insn |= P_VEXL;
3077         }
3078         tcg_out_vex_modrm(s, insn, a0, a1, a2);
3079         break;
3080
3081     case INDEX_op_cmp_vec:
3082         sub = args[3];
3083         if (sub == TCG_COND_EQ) {
3084             insn = cmpeq_insn[vece];
3085         } else if (sub == TCG_COND_GT) {
3086             insn = cmpgt_insn[vece];
3087         } else {
3088             g_assert_not_reached();
3089         }
3090         goto gen_simd;
3091
3092     case INDEX_op_andc_vec:
3093         insn = OPC_PANDN;
3094         if (type == TCG_TYPE_V256) {
3095             insn |= P_VEXL;
3096         }
3097         tcg_out_vex_modrm(s, insn, a0, a2, a1);
3098         break;
3099
3100     case INDEX_op_shli_vec:
3101         insn = shift_imm_insn[vece];
3102         sub = 6;
3103         goto gen_shift;
3104     case INDEX_op_shri_vec:
3105         insn = shift_imm_insn[vece];
3106         sub = 2;
3107         goto gen_shift;
3108     case INDEX_op_sari_vec:
3109         if (vece == MO_64) {
3110             insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX;
3111         } else {
3112             insn = shift_imm_insn[vece];
3113         }
3114         sub = 4;
3115         goto gen_shift;
3116     case INDEX_op_rotli_vec:
3117         insn = OPC_PSHIFTD_Ib | P_EVEX;  /* VPROL[DQ] */
3118         if (vece == MO_64) {
3119             insn |= P_VEXW;
3120         }
3121         sub = 1;
3122         goto gen_shift;
3123     gen_shift:
3124         tcg_debug_assert(vece != MO_8);
3125         if (type == TCG_TYPE_V256) {
3126             insn |= P_VEXL;
3127         }
3128         tcg_out_vex_modrm(s, insn, sub, a0, a1);
3129         tcg_out8(s, a2);
3130         break;
3131
3132     case INDEX_op_ld_vec:
3133         tcg_out_ld(s, type, a0, a1, a2);
3134         break;
3135     case INDEX_op_st_vec:
3136         tcg_out_st(s, type, a0, a1, a2);
3137         break;
3138     case INDEX_op_dupm_vec:
3139         tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
3140         break;
3141
3142     case INDEX_op_x86_shufps_vec:
3143         insn = OPC_SHUFPS;
3144         sub = args[3];
3145         goto gen_simd_imm8;
3146     case INDEX_op_x86_blend_vec:
3147         if (vece == MO_16) {
3148             insn = OPC_PBLENDW;
3149         } else if (vece == MO_32) {
3150             insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
3151         } else {
3152             g_assert_not_reached();
3153         }
3154         sub = args[3];
3155         goto gen_simd_imm8;
3156     case INDEX_op_x86_vperm2i128_vec:
3157         insn = OPC_VPERM2I128;
3158         sub = args[3];
3159         goto gen_simd_imm8;
3160     case INDEX_op_x86_vpshldi_vec:
3161         insn = vpshldi_insn[vece];
3162         sub = args[3];
3163         goto gen_simd_imm8;
3164
3165     case INDEX_op_not_vec:
3166         insn = OPC_VPTERNLOGQ;
3167         a2 = a1;
3168         sub = 0x33; /* !B */
3169         goto gen_simd_imm8;
3170     case INDEX_op_nor_vec:
3171         insn = OPC_VPTERNLOGQ;
3172         sub = 0x11; /* norCB */
3173         goto gen_simd_imm8;
3174     case INDEX_op_nand_vec:
3175         insn = OPC_VPTERNLOGQ;
3176         sub = 0x77; /* nandCB */
3177         goto gen_simd_imm8;
3178     case INDEX_op_eqv_vec:
3179         insn = OPC_VPTERNLOGQ;
3180         sub = 0x99; /* xnorCB */
3181         goto gen_simd_imm8;
3182     case INDEX_op_orc_vec:
3183         insn = OPC_VPTERNLOGQ;
3184         sub = 0xdd; /* orB!C */
3185         goto gen_simd_imm8;
3186
3187     case INDEX_op_bitsel_vec:
3188         insn = OPC_VPTERNLOGQ;
3189         a3 = args[3];
3190         if (a0 == a1) {
3191             a1 = a2;
3192             a2 = a3;
3193             sub = 0xca; /* A?B:C */
3194         } else if (a0 == a2) {
3195             a2 = a3;
3196             sub = 0xe2; /* B?A:C */
3197         } else {
3198             tcg_out_mov(s, type, a0, a3);
3199             sub = 0xb8; /* B?C:A */
3200         }
3201         goto gen_simd_imm8;
3202
3203     gen_simd_imm8:
3204         tcg_debug_assert(insn != OPC_UD2);
3205         if (type == TCG_TYPE_V256) {
3206             insn |= P_VEXL;
3207         }
3208         tcg_out_vex_modrm(s, insn, a0, a1, a2);
3209         tcg_out8(s, sub);
3210         break;
3211
3212     case INDEX_op_x86_vpblendvb_vec:
3213         insn = OPC_VPBLENDVB;
3214         if (type == TCG_TYPE_V256) {
3215             insn |= P_VEXL;
3216         }
3217         tcg_out_vex_modrm(s, insn, a0, a1, a2);
3218         tcg_out8(s, args[3] << 4);
3219         break;
3220
3221     case INDEX_op_x86_psrldq_vec:
3222         tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
3223         tcg_out8(s, a2);
3224         break;
3225
3226     case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
3227     case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
3228     default:
3229         g_assert_not_reached();
3230     }
3231 }
3232
3233 static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
3234 {
3235     switch (op) {
3236     case INDEX_op_goto_ptr:
3237         return C_O0_I1(r);
3238
3239     case INDEX_op_ld8u_i32:
3240     case INDEX_op_ld8u_i64:
3241     case INDEX_op_ld8s_i32:
3242     case INDEX_op_ld8s_i64:
3243     case INDEX_op_ld16u_i32:
3244     case INDEX_op_ld16u_i64:
3245     case INDEX_op_ld16s_i32:
3246     case INDEX_op_ld16s_i64:
3247     case INDEX_op_ld_i32:
3248     case INDEX_op_ld32u_i64:
3249     case INDEX_op_ld32s_i64:
3250     case INDEX_op_ld_i64:
3251         return C_O1_I1(r, r);
3252
3253     case INDEX_op_st8_i32:
3254     case INDEX_op_st8_i64:
3255         return C_O0_I2(qi, r);
3256
3257     case INDEX_op_st16_i32:
3258     case INDEX_op_st16_i64:
3259     case INDEX_op_st_i32:
3260     case INDEX_op_st32_i64:
3261         return C_O0_I2(ri, r);
3262
3263     case INDEX_op_st_i64:
3264         return C_O0_I2(re, r);
3265
3266     case INDEX_op_add_i32:
3267     case INDEX_op_add_i64:
3268         return C_O1_I2(r, r, re);
3269
3270     case INDEX_op_sub_i32:
3271     case INDEX_op_sub_i64:
3272     case INDEX_op_mul_i32:
3273     case INDEX_op_mul_i64:
3274     case INDEX_op_or_i32:
3275     case INDEX_op_or_i64:
3276     case INDEX_op_xor_i32:
3277     case INDEX_op_xor_i64:
3278         return C_O1_I2(r, 0, re);
3279
3280     case INDEX_op_and_i32:
3281     case INDEX_op_and_i64:
3282         return C_O1_I2(r, 0, reZ);
3283
3284     case INDEX_op_andc_i32:
3285     case INDEX_op_andc_i64:
3286         return C_O1_I2(r, r, rI);
3287
3288     case INDEX_op_shl_i32:
3289     case INDEX_op_shl_i64:
3290     case INDEX_op_shr_i32:
3291     case INDEX_op_shr_i64:
3292     case INDEX_op_sar_i32:
3293     case INDEX_op_sar_i64:
3294         return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);
3295
3296     case INDEX_op_rotl_i32:
3297     case INDEX_op_rotl_i64:
3298     case INDEX_op_rotr_i32:
3299     case INDEX_op_rotr_i64:
3300         return C_O1_I2(r, 0, ci);
3301
3302     case INDEX_op_brcond_i32:
3303     case INDEX_op_brcond_i64:
3304         return C_O0_I2(r, re);
3305
3306     case INDEX_op_bswap16_i32:
3307     case INDEX_op_bswap16_i64:
3308     case INDEX_op_bswap32_i32:
3309     case INDEX_op_bswap32_i64:
3310     case INDEX_op_bswap64_i64:
3311     case INDEX_op_neg_i32:
3312     case INDEX_op_neg_i64:
3313     case INDEX_op_not_i32:
3314     case INDEX_op_not_i64:
3315     case INDEX_op_extrh_i64_i32:
3316         return C_O1_I1(r, 0);
3317
3318     case INDEX_op_ext8s_i32:
3319     case INDEX_op_ext8s_i64:
3320     case INDEX_op_ext8u_i32:
3321     case INDEX_op_ext8u_i64:
3322         return C_O1_I1(r, q);
3323
3324     case INDEX_op_ext16s_i32:
3325     case INDEX_op_ext16s_i64:
3326     case INDEX_op_ext16u_i32:
3327     case INDEX_op_ext16u_i64:
3328     case INDEX_op_ext32s_i64:
3329     case INDEX_op_ext32u_i64:
3330     case INDEX_op_ext_i32_i64:
3331     case INDEX_op_extu_i32_i64:
3332     case INDEX_op_extrl_i64_i32:
3333     case INDEX_op_extract_i32:
3334     case INDEX_op_extract_i64:
3335     case INDEX_op_sextract_i32:
3336     case INDEX_op_ctpop_i32:
3337     case INDEX_op_ctpop_i64:
3338         return C_O1_I1(r, r);
3339
3340     case INDEX_op_extract2_i32:
3341     case INDEX_op_extract2_i64:
3342         return C_O1_I2(r, 0, r);
3343
3344     case INDEX_op_deposit_i32:
3345     case INDEX_op_deposit_i64:
3346         return C_O1_I2(q, 0, qi);
3347
3348     case INDEX_op_setcond_i32:
3349     case INDEX_op_setcond_i64:
3350         return C_O1_I2(q, r, re);
3351
3352     case INDEX_op_movcond_i32:
3353     case INDEX_op_movcond_i64:
3354         return C_O1_I4(r, r, re, r, 0);
3355
3356     case INDEX_op_div2_i32:
3357     case INDEX_op_div2_i64:
3358     case INDEX_op_divu2_i32:
3359     case INDEX_op_divu2_i64:
3360         return C_O2_I3(a, d, 0, 1, r);
3361
3362     case INDEX_op_mulu2_i32:
3363     case INDEX_op_mulu2_i64:
3364     case INDEX_op_muls2_i32:
3365     case INDEX_op_muls2_i64:
3366         return C_O2_I2(a, d, a, r);
3367
3368     case INDEX_op_add2_i32:
3369     case INDEX_op_add2_i64:
3370     case INDEX_op_sub2_i32:
3371     case INDEX_op_sub2_i64:
3372         return C_N1_O1_I4(r, r, 0, 1, re, re);
3373
3374     case INDEX_op_ctz_i32:
3375     case INDEX_op_ctz_i64:
3376         return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3377
3378     case INDEX_op_clz_i32:
3379     case INDEX_op_clz_i64:
3380         return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3381
3382     case INDEX_op_qemu_ld_a32_i32:
3383         return C_O1_I1(r, L);
3384     case INDEX_op_qemu_ld_a64_i32:
3385         return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O1_I2(r, L, L);
3386
3387     case INDEX_op_qemu_st_a32_i32:
3388         return C_O0_I2(L, L);
3389     case INDEX_op_qemu_st_a64_i32:
3390         return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3391     case INDEX_op_qemu_st8_a32_i32:
3392         return C_O0_I2(s, L);
3393     case INDEX_op_qemu_st8_a64_i32:
3394         return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(s, L) : C_O0_I3(s, L, L);
3395
3396     case INDEX_op_qemu_ld_a32_i64:
3397         return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I1(r, r, L);
3398     case INDEX_op_qemu_ld_a64_i64:
3399         return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I2(r, r, L, L);
3400
3401     case INDEX_op_qemu_st_a32_i64:
3402         return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3403     case INDEX_op_qemu_st_a64_i64:
3404         return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I4(L, L, L, L);
3405
3406     case INDEX_op_qemu_ld_a32_i128:
3407     case INDEX_op_qemu_ld_a64_i128:
3408         tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
3409         return C_O2_I1(r, r, L);
3410     case INDEX_op_qemu_st_a32_i128:
3411     case INDEX_op_qemu_st_a64_i128:
3412         tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
3413         return C_O0_I3(L, L, L);
3414
3415     case INDEX_op_brcond2_i32:
3416         return C_O0_I4(r, r, ri, ri);
3417
3418     case INDEX_op_setcond2_i32:
3419         return C_O1_I4(r, r, r, ri, ri);
3420
3421     case INDEX_op_ld_vec:
3422     case INDEX_op_dupm_vec:
3423         return C_O1_I1(x, r);
3424
3425     case INDEX_op_st_vec:
3426         return C_O0_I2(x, r);
3427
3428     case INDEX_op_add_vec:
3429     case INDEX_op_sub_vec:
3430     case INDEX_op_mul_vec:
3431     case INDEX_op_and_vec:
3432     case INDEX_op_or_vec:
3433     case INDEX_op_xor_vec:
3434     case INDEX_op_andc_vec:
3435     case INDEX_op_orc_vec:
3436     case INDEX_op_nand_vec:
3437     case INDEX_op_nor_vec:
3438     case INDEX_op_eqv_vec:
3439     case INDEX_op_ssadd_vec:
3440     case INDEX_op_usadd_vec:
3441     case INDEX_op_sssub_vec:
3442     case INDEX_op_ussub_vec:
3443     case INDEX_op_smin_vec:
3444     case INDEX_op_umin_vec:
3445     case INDEX_op_smax_vec:
3446     case INDEX_op_umax_vec:
3447     case INDEX_op_shlv_vec:
3448     case INDEX_op_shrv_vec:
3449     case INDEX_op_sarv_vec:
3450     case INDEX_op_rotlv_vec:
3451     case INDEX_op_rotrv_vec:
3452     case INDEX_op_shls_vec:
3453     case INDEX_op_shrs_vec:
3454     case INDEX_op_sars_vec:
3455     case INDEX_op_cmp_vec:
3456     case INDEX_op_x86_shufps_vec:
3457     case INDEX_op_x86_blend_vec:
3458     case INDEX_op_x86_packss_vec:
3459     case INDEX_op_x86_packus_vec:
3460     case INDEX_op_x86_vperm2i128_vec:
3461     case INDEX_op_x86_punpckl_vec:
3462     case INDEX_op_x86_punpckh_vec:
3463     case INDEX_op_x86_vpshldi_vec:
3464 #if TCG_TARGET_REG_BITS == 32
3465     case INDEX_op_dup2_vec:
3466 #endif
3467         return C_O1_I2(x, x, x);
3468
3469     case INDEX_op_abs_vec:
3470     case INDEX_op_dup_vec:
3471     case INDEX_op_not_vec:
3472     case INDEX_op_shli_vec:
3473     case INDEX_op_shri_vec:
3474     case INDEX_op_sari_vec:
3475     case INDEX_op_rotli_vec:
3476     case INDEX_op_x86_psrldq_vec:
3477         return C_O1_I1(x, x);
3478
3479     case INDEX_op_x86_vpshldv_vec:
3480     case INDEX_op_x86_vpshrdv_vec:
3481         return C_O1_I3(x, 0, x, x);
3482
3483     case INDEX_op_bitsel_vec:
3484     case INDEX_op_x86_vpblendvb_vec:
3485         return C_O1_I3(x, x, x, x);
3486
3487     default:
3488         g_assert_not_reached();
3489     }
3490 }
3491
3492 int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3493 {
3494     switch (opc) {
3495     case INDEX_op_add_vec:
3496     case INDEX_op_sub_vec:
3497     case INDEX_op_and_vec:
3498     case INDEX_op_or_vec:
3499     case INDEX_op_xor_vec:
3500     case INDEX_op_andc_vec:
3501     case INDEX_op_orc_vec:
3502     case INDEX_op_nand_vec:
3503     case INDEX_op_nor_vec:
3504     case INDEX_op_eqv_vec:
3505     case INDEX_op_not_vec:
3506     case INDEX_op_bitsel_vec:
3507         return 1;
3508     case INDEX_op_cmp_vec:
3509     case INDEX_op_cmpsel_vec:
3510         return -1;
3511
3512     case INDEX_op_rotli_vec:
3513         return have_avx512vl && vece >= MO_32 ? 1 : -1;
3514
3515     case INDEX_op_shli_vec:
3516     case INDEX_op_shri_vec:
3517         /* We must expand the operation for MO_8.  */
3518         return vece == MO_8 ? -1 : 1;
3519
3520     case INDEX_op_sari_vec:
3521         switch (vece) {
3522         case MO_8:
3523             return -1;
3524         case MO_16:
3525         case MO_32:
3526             return 1;
3527         case MO_64:
3528             if (have_avx512vl) {
3529                 return 1;
3530             }
3531             /*
3532              * We can emulate this for MO_64, but it does not pay off
3533              * unless we're producing at least 4 values.
3534              */
3535             return type >= TCG_TYPE_V256 ? -1 : 0;
3536         }
3537         return 0;
3538
3539     case INDEX_op_shls_vec:
3540     case INDEX_op_shrs_vec:
3541         return vece >= MO_16;
3542     case INDEX_op_sars_vec:
3543         switch (vece) {
3544         case MO_16:
3545         case MO_32:
3546             return 1;
3547         case MO_64:
3548             return have_avx512vl;
3549         }
3550         return 0;
3551     case INDEX_op_rotls_vec:
3552         return vece >= MO_16 ? -1 : 0;
3553
3554     case INDEX_op_shlv_vec:
3555     case INDEX_op_shrv_vec:
3556         switch (vece) {
3557         case MO_16:
3558             return have_avx512bw;
3559         case MO_32:
3560         case MO_64:
3561             return have_avx2;
3562         }
3563         return 0;
3564     case INDEX_op_sarv_vec:
3565         switch (vece) {
3566         case MO_16:
3567             return have_avx512bw;
3568         case MO_32:
3569             return have_avx2;
3570         case MO_64:
3571             return have_avx512vl;
3572         }
3573         return 0;
3574     case INDEX_op_rotlv_vec:
3575     case INDEX_op_rotrv_vec:
3576         switch (vece) {
3577         case MO_16:
3578             return have_avx512vbmi2 ? -1 : 0;
3579         case MO_32:
3580         case MO_64:
3581             return have_avx512vl ? 1 : have_avx2 ? -1 : 0;
3582         }
3583         return 0;
3584
3585     case INDEX_op_mul_vec:
3586         switch (vece) {
3587         case MO_8:
3588             return -1;
3589         case MO_64:
3590             return have_avx512dq;
3591         }
3592         return 1;
3593
3594     case INDEX_op_ssadd_vec:
3595     case INDEX_op_usadd_vec:
3596     case INDEX_op_sssub_vec:
3597     case INDEX_op_ussub_vec:
3598         return vece <= MO_16;
3599     case INDEX_op_smin_vec:
3600     case INDEX_op_smax_vec:
3601     case INDEX_op_umin_vec:
3602     case INDEX_op_umax_vec:
3603     case INDEX_op_abs_vec:
3604         return vece <= MO_32 || have_avx512vl;
3605
3606     default:
3607         return 0;
3608     }
3609 }
3610
3611 static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
3612                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3613 {
3614     TCGv_vec t1, t2;
3615
3616     tcg_debug_assert(vece == MO_8);
3617
3618     t1 = tcg_temp_new_vec(type);
3619     t2 = tcg_temp_new_vec(type);
3620
3621     /*
3622      * Unpack to W, shift, and repack.  Tricky bits:
3623      * (1) Use punpck*bw x,x to produce DDCCBBAA,
3624      *     i.e. duplicate in other half of the 16-bit lane.
3625      * (2) For right-shift, add 8 so that the high half of the lane
3626      *     becomes zero.  For left-shift, and left-rotate, we must
3627      *     shift up and down again.
3628      * (3) Step 2 leaves high half zero such that PACKUSWB
3629      *     (pack with unsigned saturation) does not modify
3630      *     the quantity.
3631      */
3632     vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3633               tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3634     vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3635               tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3636
3637     if (opc != INDEX_op_rotli_vec) {
3638         imm += 8;
3639     }
3640     if (opc == INDEX_op_shri_vec) {
3641         tcg_gen_shri_vec(MO_16, t1, t1, imm);
3642         tcg_gen_shri_vec(MO_16, t2, t2, imm);
3643     } else {
3644         tcg_gen_shli_vec(MO_16, t1, t1, imm);
3645         tcg_gen_shli_vec(MO_16, t2, t2, imm);
3646         tcg_gen_shri_vec(MO_16, t1, t1, 8);
3647         tcg_gen_shri_vec(MO_16, t2, t2, 8);
3648     }
3649
3650     vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3651               tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3652     tcg_temp_free_vec(t1);
3653     tcg_temp_free_vec(t2);
3654 }
3655
3656 static void expand_vec_sari(TCGType type, unsigned vece,
3657                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3658 {
3659     TCGv_vec t1, t2;
3660
3661     switch (vece) {
3662     case MO_8:
3663         /* Unpack to W, shift, and repack, as in expand_vec_shi.  */
3664         t1 = tcg_temp_new_vec(type);
3665         t2 = tcg_temp_new_vec(type);
3666         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3667                   tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3668         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3669                   tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3670         tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3671         tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3672         vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3673                   tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3674         tcg_temp_free_vec(t1);
3675         tcg_temp_free_vec(t2);
3676         break;
3677
3678     case MO_64:
3679         t1 = tcg_temp_new_vec(type);
3680         if (imm <= 32) {
3681             /*
3682              * We can emulate a small sign extend by performing an arithmetic
3683              * 32-bit shift and overwriting the high half of a 64-bit logical
3684              * shift.  Note that the ISA says shift of 32 is valid, but TCG
3685              * does not, so we have to bound the smaller shift -- we get the
3686              * same result in the high half either way.
3687              */
3688             tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
3689             tcg_gen_shri_vec(MO_64, v0, v1, imm);
3690             vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3691                       tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3692                       tcgv_vec_arg(t1), 0xaa);
3693         } else {
3694             /* Otherwise we will need to use a compare vs 0 to produce
3695              * the sign-extend, shift and merge.
3696              */
3697             tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1,
3698                             tcg_constant_vec(type, MO_64, 0), v1);
3699             tcg_gen_shri_vec(MO_64, v0, v1, imm);
3700             tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3701             tcg_gen_or_vec(MO_64, v0, v0, t1);
3702         }
3703         tcg_temp_free_vec(t1);
3704         break;
3705
3706     default:
3707         g_assert_not_reached();
3708     }
3709 }
3710
3711 static void expand_vec_rotli(TCGType type, unsigned vece,
3712                              TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3713 {
3714     TCGv_vec t;
3715
3716     if (vece == MO_8) {
3717         expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
3718         return;
3719     }
3720
3721     if (have_avx512vbmi2) {
3722         vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
3723                   tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
3724         return;
3725     }
3726
3727     t = tcg_temp_new_vec(type);
3728     tcg_gen_shli_vec(vece, t, v1, imm);
3729     tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
3730     tcg_gen_or_vec(vece, v0, v0, t);
3731     tcg_temp_free_vec(t);
3732 }
3733
3734 static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
3735                             TCGv_vec v1, TCGv_vec sh, bool right)
3736 {
3737     TCGv_vec t;
3738
3739     if (have_avx512vbmi2) {
3740         vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec,
3741                   type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1),
3742                   tcgv_vec_arg(v1), tcgv_vec_arg(sh));
3743         return;
3744     }
3745
3746     t = tcg_temp_new_vec(type);
3747     tcg_gen_dupi_vec(vece, t, 8 << vece);
3748     tcg_gen_sub_vec(vece, t, t, sh);
3749     if (right) {
3750         tcg_gen_shlv_vec(vece, t, v1, t);
3751         tcg_gen_shrv_vec(vece, v0, v1, sh);
3752     } else {
3753         tcg_gen_shrv_vec(vece, t, v1, t);
3754         tcg_gen_shlv_vec(vece, v0, v1, sh);
3755     }
3756     tcg_gen_or_vec(vece, v0, v0, t);
3757     tcg_temp_free_vec(t);
3758 }
3759
3760 static void expand_vec_rotls(TCGType type, unsigned vece,
3761                              TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
3762 {
3763     TCGv_vec t = tcg_temp_new_vec(type);
3764
3765     tcg_debug_assert(vece != MO_8);
3766
3767     if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) {
3768         tcg_gen_dup_i32_vec(vece, t, lsh);
3769         if (vece >= MO_32) {
3770             tcg_gen_rotlv_vec(vece, v0, v1, t);
3771         } else {
3772             expand_vec_rotv(type, vece, v0, v1, t, false);
3773         }
3774     } else {
3775         TCGv_i32 rsh = tcg_temp_new_i32();
3776
3777         tcg_gen_neg_i32(rsh, lsh);
3778         tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
3779         tcg_gen_shls_vec(vece, t, v1, lsh);
3780         tcg_gen_shrs_vec(vece, v0, v1, rsh);
3781         tcg_gen_or_vec(vece, v0, v0, t);
3782
3783         tcg_temp_free_i32(rsh);
3784     }
3785
3786     tcg_temp_free_vec(t);
3787 }
3788
3789 static void expand_vec_mul(TCGType type, unsigned vece,
3790                            TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3791 {
3792     TCGv_vec t1, t2, t3, t4, zero;
3793
3794     tcg_debug_assert(vece == MO_8);
3795
3796     /*
3797      * Unpack v1 bytes to words, 0 | x.
3798      * Unpack v2 bytes to words, y | 0.
3799      * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3800      * Shift logical right by 8 bits to clear the high 8 bytes before
3801      * using an unsigned saturated pack.
3802      *
3803      * The difference between the V64, V128 and V256 cases is merely how
3804      * we distribute the expansion between temporaries.
3805      */
3806     switch (type) {
3807     case TCG_TYPE_V64:
3808         t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3809         t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3810         zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3811         vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3812                   tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3813         vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3814                   tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3815         tcg_gen_mul_vec(MO_16, t1, t1, t2);
3816         tcg_gen_shri_vec(MO_16, t1, t1, 8);
3817         vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3818                   tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3819         tcg_temp_free_vec(t1);
3820         tcg_temp_free_vec(t2);
3821         break;
3822
3823     case TCG_TYPE_V128:
3824     case TCG_TYPE_V256:
3825         t1 = tcg_temp_new_vec(type);
3826         t2 = tcg_temp_new_vec(type);
3827         t3 = tcg_temp_new_vec(type);
3828         t4 = tcg_temp_new_vec(type);
3829         zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3830         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3831                   tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3832         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3833                   tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3834         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3835                   tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3836         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3837                   tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3838         tcg_gen_mul_vec(MO_16, t1, t1, t2);
3839         tcg_gen_mul_vec(MO_16, t3, t3, t4);
3840         tcg_gen_shri_vec(MO_16, t1, t1, 8);
3841         tcg_gen_shri_vec(MO_16, t3, t3, 8);
3842         vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3843                   tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3844         tcg_temp_free_vec(t1);
3845         tcg_temp_free_vec(t2);
3846         tcg_temp_free_vec(t3);
3847         tcg_temp_free_vec(t4);
3848         break;
3849
3850     default:
3851         g_assert_not_reached();
3852     }
3853 }
3854
3855 static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
3856                                  TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3857 {
3858     enum {
3859         NEED_INV  = 1,
3860         NEED_SWAP = 2,
3861         NEED_BIAS = 4,
3862         NEED_UMIN = 8,
3863         NEED_UMAX = 16,
3864     };
3865     TCGv_vec t1, t2, t3;
3866     uint8_t fixup;
3867
3868     switch (cond) {
3869     case TCG_COND_EQ:
3870     case TCG_COND_GT:
3871         fixup = 0;
3872         break;
3873     case TCG_COND_NE:
3874     case TCG_COND_LE:
3875         fixup = NEED_INV;
3876         break;
3877     case TCG_COND_LT:
3878         fixup = NEED_SWAP;
3879         break;
3880     case TCG_COND_GE:
3881         fixup = NEED_SWAP | NEED_INV;
3882         break;
3883     case TCG_COND_LEU:
3884         if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
3885             fixup = NEED_UMIN;
3886         } else {
3887             fixup = NEED_BIAS | NEED_INV;
3888         }
3889         break;
3890     case TCG_COND_GTU:
3891         if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
3892             fixup = NEED_UMIN | NEED_INV;
3893         } else {
3894             fixup = NEED_BIAS;
3895         }
3896         break;
3897     case TCG_COND_GEU:
3898         if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
3899             fixup = NEED_UMAX;
3900         } else {
3901             fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
3902         }
3903         break;
3904     case TCG_COND_LTU:
3905         if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
3906             fixup = NEED_UMAX | NEED_INV;
3907         } else {
3908             fixup = NEED_BIAS | NEED_SWAP;
3909         }
3910         break;
3911     default:
3912         g_assert_not_reached();
3913     }
3914
3915     if (fixup & NEED_INV) {
3916         cond = tcg_invert_cond(cond);
3917     }
3918     if (fixup & NEED_SWAP) {
3919         t1 = v1, v1 = v2, v2 = t1;
3920         cond = tcg_swap_cond(cond);
3921     }
3922
3923     t1 = t2 = NULL;
3924     if (fixup & (NEED_UMIN | NEED_UMAX)) {
3925         t1 = tcg_temp_new_vec(type);
3926         if (fixup & NEED_UMIN) {
3927             tcg_gen_umin_vec(vece, t1, v1, v2);
3928         } else {
3929             tcg_gen_umax_vec(vece, t1, v1, v2);
3930         }
3931         v2 = t1;
3932         cond = TCG_COND_EQ;
3933     } else if (fixup & NEED_BIAS) {
3934         t1 = tcg_temp_new_vec(type);
3935         t2 = tcg_temp_new_vec(type);
3936         t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
3937         tcg_gen_sub_vec(vece, t1, v1, t3);
3938         tcg_gen_sub_vec(vece, t2, v2, t3);
3939         v1 = t1;
3940         v2 = t2;
3941         cond = tcg_signed_cond(cond);
3942     }
3943
3944     tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
3945     /* Expand directly; do not recurse.  */
3946     vec_gen_4(INDEX_op_cmp_vec, type, vece,
3947               tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
3948
3949     if (t1) {
3950         tcg_temp_free_vec(t1);
3951         if (t2) {
3952             tcg_temp_free_vec(t2);
3953         }
3954     }
3955     return fixup & NEED_INV;
3956 }
3957
3958 static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
3959                            TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3960 {
3961     if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
3962         tcg_gen_not_vec(vece, v0, v0);
3963     }
3964 }
3965
3966 static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
3967                               TCGv_vec c1, TCGv_vec c2,
3968                               TCGv_vec v3, TCGv_vec v4, TCGCond cond)
3969 {
3970     TCGv_vec t = tcg_temp_new_vec(type);
3971
3972     if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
3973         /* Invert the sense of the compare by swapping arguments.  */
3974         TCGv_vec x;
3975         x = v3, v3 = v4, v4 = x;
3976     }
3977     vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
3978               tcgv_vec_arg(v0), tcgv_vec_arg(v4),
3979               tcgv_vec_arg(v3), tcgv_vec_arg(t));
3980     tcg_temp_free_vec(t);
3981 }
3982
3983 void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
3984                        TCGArg a0, ...)
3985 {
3986     va_list va;
3987     TCGArg a2;
3988     TCGv_vec v0, v1, v2, v3, v4;
3989
3990     va_start(va, a0);
3991     v0 = temp_tcgv_vec(arg_temp(a0));
3992     v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3993     a2 = va_arg(va, TCGArg);
3994
3995     switch (opc) {
3996     case INDEX_op_shli_vec:
3997     case INDEX_op_shri_vec:
3998         expand_vec_shi(type, vece, opc, v0, v1, a2);
3999         break;
4000
4001     case INDEX_op_sari_vec:
4002         expand_vec_sari(type, vece, v0, v1, a2);
4003         break;
4004
4005     case INDEX_op_rotli_vec:
4006         expand_vec_rotli(type, vece, v0, v1, a2);
4007         break;
4008
4009     case INDEX_op_rotls_vec:
4010         expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
4011         break;
4012
4013     case INDEX_op_rotlv_vec:
4014         v2 = temp_tcgv_vec(arg_temp(a2));
4015         expand_vec_rotv(type, vece, v0, v1, v2, false);
4016         break;
4017     case INDEX_op_rotrv_vec:
4018         v2 = temp_tcgv_vec(arg_temp(a2));
4019         expand_vec_rotv(type, vece, v0, v1, v2, true);
4020         break;
4021
4022     case INDEX_op_mul_vec:
4023         v2 = temp_tcgv_vec(arg_temp(a2));
4024         expand_vec_mul(type, vece, v0, v1, v2);
4025         break;
4026
4027     case INDEX_op_cmp_vec:
4028         v2 = temp_tcgv_vec(arg_temp(a2));
4029         expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
4030         break;
4031
4032     case INDEX_op_cmpsel_vec:
4033         v2 = temp_tcgv_vec(arg_temp(a2));
4034         v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4035         v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4036         expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
4037         break;
4038
4039     default:
4040         break;
4041     }
4042
4043     va_end(va);
4044 }
4045
/*
 * Registers saved and restored around generated code.
 * tcg_target_qemu_prologue pushes them in array order and pops them in
 * reverse.  The order is also mirrored by the fde_reg_ofs tables in
 * debug_frame, so the two must be kept in step.
 */
static const int tcg_target_callee_save_regs[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
#if defined(_WIN64)
    /* Saved in addition on Win64 hosts. */
    TCG_REG_RDI,
    TCG_REG_RSI,
#endif
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14, /* Currently used for the global env. */
    TCG_REG_R15,
#else
    TCG_REG_EBP, /* Currently used for the global env. */
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
#endif
};
4065
/*
 * The frame size is computed with macros so that the same value is
 * available both to tcg_target_qemu_prologue and to the debug_frame
 * data handed over by tcg_register_jit.
 */

/*
 * Bytes occupied by the callee-saved register pushes, plus one more
 * register-sized slot (the debug_frame tables place the return address
 * in that first slot).
 */
#define PUSH_SIZE \
    ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
     * (TCG_TARGET_REG_BITS / 8))

/*
 * Whole frame: the pushes, the static call argument area and the TCG
 * temporary buffer, rounded up to a multiple of TCG_TARGET_STACK_ALIGN.
 */
#define FRAME_SIZE \
    ((PUSH_SIZE \
      + TCG_STATIC_CALL_ARGS_SIZE \
      + CPU_TEMP_BUF_NLONGS * sizeof(long) \
      + TCG_TARGET_STACK_ALIGN - 1) \
     & ~(TCG_TARGET_STACK_ALIGN - 1))
4079
/*
 * Generate global QEMU prologue and epilogue code.
 *
 * The emitted prologue pushes the callee-saved registers, loads
 * TCG_AREG0 from the first incoming argument, lowers the stack pointer
 * by the rest of the frame and jumps to the address given as the second
 * argument.  The epilogue follows immediately; its two entry points are
 * recorded in tcg_code_gen_epilogue and tb_ret_addr.
 */
static void tcg_target_qemu_prologue(TCGContext *s)
{
    int i, stack_addend;

    /* TB prologue */

    /* Reserve some stack space, also for TCG temps.  */
    /* stack_addend is the part of the frame not covered by the pushes. */
    stack_addend = FRAME_SIZE - PUSH_SIZE;
    /* The temp buffer starts right after the static call argument area. */
    tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
                  CPU_TEMP_BUF_NLONGS * sizeof(long));

    /* Save all callee saved registers.  */
    for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
        tcg_out_push(s, tcg_target_callee_save_regs[i]);
    }

#if TCG_TARGET_REG_BITS == 32
    /*
     * The arguments are read from the stack: the first one lies above
     * the pushed registers and one further 4-byte slot (presumably the
     * return address).
     */
    tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
               (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
    /* jmp *tb.  */
    /* Second argument; the offset accounts for the ESP adjustment above. */
    tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
                         (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
                         + stack_addend);
#else
# if !defined(CONFIG_SOFTMMU)
    /*
     * Choose how a non-zero guest_base is applied: through a segment if
     * setup_guest_base_seg() provides one, as a displacement if it fits
     * in a signed 32-bit value, and otherwise through a reserved index
     * register loaded here.
     */
    if (guest_base) {
        int seg = setup_guest_base_seg();
        if (seg != 0) {
            x86_guest_base.seg = seg;
        } else if (guest_base == (int32_t)guest_base) {
            x86_guest_base.ofs = guest_base;
        } else {
            /* Choose R12 because, as a base, it requires a SIB byte. */
            x86_guest_base.index = TCG_REG_R12;
            tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base);
            tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index);
        }
    }
# endif
    /* The arguments arrive in the first two call argument registers. */
    tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
    tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
    /* jmp *tb.  */
    tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
#endif

    /*
     * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
     * and fall through to the rest of the epilogue.
     */
    tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
    tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);

    /* TB epilogue */
    tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);

    /* Undo the stack adjustment made by the prologue. */
    tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);

    /* Emit VZEROUPPER before returning when AVX2 is available. */
    if (have_avx2) {
        tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
    }
    /* Restore the callee-saved registers in reverse push order. */
    for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
        tcg_out_pop(s, tcg_target_callee_save_regs[i]);
    }
    tcg_out_opc(s, OPC_RET, 0, 0, 0);
}
4147
4148 static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
4149 {
4150     memset(p, 0x90, count);
4151 }
4152
4153 static void tcg_target_init(TCGContext *s)
4154 {
4155     tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
4156     if (TCG_TARGET_REG_BITS == 64) {
4157         tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
4158     }
4159     if (have_avx1) {
4160         tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
4161         tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
4162     }
4163     if (have_avx2) {
4164         tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
4165     }
4166
4167     tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
4168     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
4169     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
4170     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
4171     if (TCG_TARGET_REG_BITS == 64) {
4172 #if !defined(_WIN64)
4173         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
4174         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
4175 #endif
4176         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
4177         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
4178         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
4179         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
4180     }
4181
4182     s->reserved_regs = 0;
4183     tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
4184     tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC);
4185 #ifdef _WIN64
4186     /* These are call saved, and we don't save them, so don't use them. */
4187     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
4188     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
4189     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
4190     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
4191     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
4192     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
4193     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
4194     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
4195     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
4196     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
4197 #endif
4198 }
4199
/*
 * Layout of the unwind data passed to tcg_register_jit_int: the common
 * header followed by the backend's call frame instructions.
 */
typedef struct {
    DebugFrameHeader h;
    /* DW_CFA_def_cfa: opcode, register, two-byte uleb128 FRAME_SIZE. */
    uint8_t fde_def_cfa[4];
    /*
     * Two-byte DW_CFA_offset entries.  Sized for the 64-bit table
     * (7 entries); the 32-bit table fills only 10 of the 14 bytes and
     * leaves the remainder zero-initialized.
     */
    uint8_t fde_reg_ofs[14];
} DebugFrame;
4205
/*
 * We're expecting a 2 byte uleb128 encoded value: fde_def_cfa stores
 * FRAME_SIZE as two bytes of 7 payload bits each, so FRAME_SIZE must
 * stay below 1 << 14.
 */
QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
4208
/*
 * Static unwind description of the frame built by
 * tcg_target_qemu_prologue, provided for ELF hosts only.
 *
 * In both tables each fde_reg_ofs pair is (0x80 | register number)
 * followed by the slot index; the index times data_align gives the
 * offset shown in the trailing comment.
 */
#if !defined(__ELF__)
    /* Host machine without ELF. */
#elif TCG_TARGET_REG_BITS == 64
#define ELF_HOST_MACHINE EM_X86_64
/*
 * 64-bit table.  It lists the non-_WIN64 contents of
 * tcg_target_callee_save_regs; NOTE(review): the extra _WIN64 entries
 * are not represented, presumably because such hosts are not ELF.
 */
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x78,             /* sleb128 -8 */
    .h.cie.return_column = 16,

    /* Total FDE size does not include the "len" member.  */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
        /* The following ordering must match tcg_target_callee_save_regs.  */
        0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
        0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
        0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
        0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
        0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
        0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
    }
};
#else
#define ELF_HOST_MACHINE EM_386
/* 32-bit table; the unused tail of fde_reg_ofs stays zero. */
static const DebugFrame debug_frame = {
    .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
    .h.cie.id = -1,
    .h.cie.version = 1,
    .h.cie.code_align = 1,
    .h.cie.data_align = 0x7c,             /* sleb128 -4 */
    .h.cie.return_column = 8,

    /* Total FDE size does not include the "len" member.  */
    .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),

    .fde_def_cfa = {
        12, 4,                          /* DW_CFA_def_cfa %esp, ... */
        (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
        (FRAME_SIZE >> 7)
    },
    .fde_reg_ofs = {
        0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
        /* The following ordering must match tcg_target_callee_save_regs.  */
        0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
        0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
        0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
        0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
    }
};
#endif
4268
#if defined(ELF_HOST_MACHINE)
/*
 * Register the code buffer @buf of @buf_size bytes by forwarding it to
 * tcg_register_jit_int together with this backend's debug_frame.
 * Only built when an ELF host machine was selected above.
 */
void tcg_register_jit(const void *buf, size_t buf_size)
{
    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
}
#endif