tcg/i386/tcg-target.c.inc

   1 /*
   2  * Tiny Code Generator for QEMU
   3  *
   4  * Copyright (c) 2008 Fabrice Bellard
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a copy
   7  * of this software and associated documentation files (the "Software"), to deal
   8  * in the Software without restriction, including without limitation the rights
   9  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  10  * copies of the Software, and to permit persons to whom the Software is
  11  * furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  19  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  22  * THE SOFTWARE.
  23  */
  24
  25 #include "../tcg-ldst.c.inc"
  26 #include "../tcg-pool.c.inc"
  27
#ifdef CONFIG_DEBUG_TCG
/*
 * Printable register names, one per TCG_TARGET_NB_REGS slot.  Only built
 * when CONFIG_DEBUG_TCG is defined.
 *
 * The "%r8".."%r15" names are listed unconditionally, so "%xmm0" sits at
 * index 16 in both the 32-bit and 64-bit layouts; only the 64-bit build
 * appends "%xmm8".."%xmm15".
 */
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
    "%r8",  "%r9",  "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
#endif
  43
/*
 * Registers in allocation-preference order (the consumer of this table is
 * outside this section).  General registers come first, then the vector
 * registers.  The stack pointer (RSP/ESP) is never listed.
 */
static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R9,
    TCG_REG_R8,
    TCG_REG_RCX,
    TCG_REG_RDX,
    TCG_REG_RSI,
    TCG_REG_RDI,
    TCG_REG_RAX,
#else
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
    TCG_REG_EBP,
    TCG_REG_ECX,
    TCG_REG_EDX,
    TCG_REG_EAX,
#endif
    TCG_REG_XMM0,
    TCG_REG_XMM1,
    TCG_REG_XMM2,
    TCG_REG_XMM3,
    TCG_REG_XMM4,
    TCG_REG_XMM5,
#ifndef _WIN64
    /* The Win64 ABI has xmm6-xmm15 as callee-saved, and we do not save
       any of them.  Therefore only allow xmm0-xmm5 to be allocated
       on Win64; every other host also gets the registers below.  */
    TCG_REG_XMM6,
    TCG_REG_XMM7,
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_XMM8,
    TCG_REG_XMM9,
    TCG_REG_XMM10,
    TCG_REG_XMM11,
    TCG_REG_XMM12,
    TCG_REG_XMM13,
    TCG_REG_XMM14,
    TCG_REG_XMM15,
#endif
#endif
};
  93
/* Vector temporary.  xmm5 lies within the xmm0-xmm5 range that
   tcg_target_reg_alloc_order lists for every host ABI.
   NOTE(review): presumably reserved as a backend scratch register;
   its users are outside this section -- confirm.  */
#define TCG_TMP_VEC  TCG_REG_XMM5
  95
/*
 * Integer registers that carry call arguments, in argument order.
 *   Win64:          RCX, RDX, R8, R9
 *   other x86_64:   RDI, RSI, RDX, RCX, R8, R9
 *   32-bit:         none (the table is empty)
 */
static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
#if defined(_WIN64)
    TCG_REG_RCX,
    TCG_REG_RDX,
#else
    TCG_REG_RDI,
    TCG_REG_RSI,
    TCG_REG_RDX,
    TCG_REG_RCX,
#endif
    TCG_REG_R8,
    TCG_REG_R9,
#else
    /* 32 bit mode uses stack based calling convention (GCC default). */
#endif
};
 113
 114 static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
 115 {
 116     switch (kind) {
 117     case TCG_CALL_RET_NORMAL:
 118         tcg_debug_assert(slot >= 0 && slot <= 1);
 119         return slot ? TCG_REG_EDX : TCG_REG_EAX;
 120 #ifdef _WIN64
 121     case TCG_CALL_RET_BY_VEC:
 122         tcg_debug_assert(slot == 0);
 123         return TCG_REG_XMM0;
 124 #endif
 125     default:
 126         g_assert_not_reached();
 127     }
 128 }
 129
 130 /* Constants we accept.  */
 131 #define TCG_CT_CONST_S32 0x100
 132 #define TCG_CT_CONST_U32 0x200
 133 #define TCG_CT_CONST_I32 0x400
 134 #define TCG_CT_CONST_WSZ 0x800
 135
/* Registers used with L constraint, which are the first argument
   registers on x86_64, and two random call clobbered registers on
   i386.

   On x86_64 these expand to reads of tcg_target_call_iarg_regs[], i.e.
   RCX/RDX on Win64 and RDI/RSI otherwise, so they are array element
   reads rather than integer constant expressions.  On i386 they are
   EAX and EDX.  */
#if TCG_TARGET_REG_BITS == 64
# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
#else
# define TCG_REG_L0 TCG_REG_EAX
# define TCG_REG_L1 TCG_REG_EDX
#endif
 146
/*
 * Register-set bitmasks; bit N stands for register number N.
 *
 * 64-bit: bits 0-15 are the general registers, bits 16-31 the vector
 * registers.  32-bit: eight of each (bits 0-7 and bits 16-23), and
 * ALL_BYTEL_REGS is narrowed to registers 0-3, which
 * tcg_target_reg_names lists as %eax, %ecx, %edx and %ebx.
 * NOTE(review): "BYTEL" presumably means "usable as a low-byte operand"
 * -- confirm at the constraint definitions.
 */
#if TCG_TARGET_REG_BITS == 64
# define ALL_GENERAL_REGS      0x0000ffffu
# define ALL_VECTOR_REGS       0xffff0000u
# define ALL_BYTEL_REGS        ALL_GENERAL_REGS
#else
# define ALL_GENERAL_REGS      0x000000ffu
# define ALL_VECTOR_REGS       0x00ff0000u
# define ALL_BYTEL_REGS        0x0000000fu
#endif
/* With CONFIG_SOFTMMU the two L-constraint registers form the reserved
   set; without it the set is empty.  */
#ifdef CONFIG_SOFTMMU
# define SOFTMMU_RESERVE_REGS  ((1 << TCG_REG_L0) | (1 << TCG_REG_L1))
#else
# define SOFTMMU_RESERVE_REGS  0
#endif
 161
/* For 64-bit, we always know that CMOV is available.  On 32-bit it is
   looked up in the cpuinfo feature word at run time.  */
#if TCG_TARGET_REG_BITS == 64
# define have_cmov      true
#else
# define have_cmov      (cpuinfo & CPUINFO_CMOV)
#endif
/* BMI2 and LZCNT are always run-time checks against cpuinfo.  These
   expand to the masked bit, not to a normalized 0/1 value.  */
#define have_bmi2       (cpuinfo & CPUINFO_BMI2)
#define have_lzcnt      (cpuinfo & CPUINFO_LZCNT)
 170
/* NOTE(review): presumably the code address that translation blocks
   return to; it is assigned and used outside this section -- confirm.  */
static const tcg_insn_unit *tb_ret_addr;
 172
 173 static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
 174                         intptr_t value, intptr_t addend)
 175 {
 176     value += addend;
 177     switch(type) {
 178     case R_386_PC32:
 179         value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
 180         if (value != (int32_t)value) {
 181             return false;
 182         }
 183         /* FALLTHRU */
 184     case R_386_32:
 185         tcg_patch32(code_ptr, value);
 186         break;
 187     case R_386_PC8:
 188         value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
 189         if (value != (int8_t)value) {
 190             return false;
 191         }
 192         tcg_patch8(code_ptr, value);
 193         break;
 194     default:
 195         g_assert_not_reached();
 196     }
 197     return true;
 198 }
 199
 200 /* test if a constant matches the constraint */
 201 static bool tcg_target_const_match(int64_t val, TCGType type, int ct, int vece)
 202 {
 203     if (ct & TCG_CT_CONST) {
 204         return 1;
 205     }
 206     if (type == TCG_TYPE_I32) {
 207         if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | TCG_CT_CONST_I32)) {
 208             return 1;
 209         }
 210     } else {
 211         if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
 212             return 1;
 213         }
 214         if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
 215             return 1;
 216         }
 217         if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
 218             return 1;
 219         }
 220     }
 221     if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
 222         return 1;
 223     }
 224     return 0;
 225 }
 226
 227 # define LOWREGMASK(x)  ((x) & 7)
 228
/*
 * Opcode modifier flags.  An opcode value keeps the final opcode byte in
 * its low 8 bits and ORs in the flags below to request prefix and escape
 * bytes from the tcg_out_opc / tcg_out_vex_opc / tcg_out_evex_opc
 * emitters.  On 32-bit hosts the REX and segment flags are defined as 0,
 * so tests against them are always false.
 */
#define P_EXT           0x100           /* 0x0f opcode prefix */
#define P_EXT38         0x200           /* 0x0f 0x38 opcode prefix */
#define P_DATA16        0x400           /* 0x66 opcode prefix */
#define P_VEXW          0x1000          /* Set VEX.W = 1 */
#if TCG_TARGET_REG_BITS == 64
# define P_REXW         P_VEXW          /* Set REX.W = 1; match VEXW */
# define P_REXB_R       0x2000          /* REG field as byte register */
# define P_REXB_RM      0x4000          /* R/M field as byte register */
# define P_GS           0x8000          /* gs segment override */
#else
# define P_REXW         0
# define P_REXB_R       0
# define P_REXB_RM      0
# define P_GS           0
#endif
#define P_EXT3A         0x10000         /* 0x0f 0x3a opcode prefix */
#define P_SIMDF3        0x20000         /* 0xf3 opcode prefix */
#define P_SIMDF2        0x40000         /* 0xf2 opcode prefix */
#define P_VEXL          0x80000         /* Set VEX.L = 1 */
#define P_EVEX          0x100000        /* Requires EVEX encoding */
 249
/*
 * Opcode table: each value is the final opcode byte combined with the P_*
 * flags that select its prefix and escape bytes.
 *
 * Entries that combine P_DATA16 with P_VEXW must be emitted through the
 * VEX/EVEX emitters: on 64-bit hosts P_VEXW aliases P_REXW, and the legacy
 * tcg_out_opc() asserts that P_DATA16 and P_REXW are never both set.
 */
#define OPC_ARITH_EvIz  (0x81)
#define OPC_ARITH_EvIb  (0x83)
#define OPC_ARITH_GvEv  (0x03)          /* ... plus (ARITH_FOO << 3) */
#define OPC_ANDN        (0xf2 | P_EXT38)
#define OPC_ADD_GvEv    (OPC_ARITH_GvEv | (ARITH_ADD << 3))
#define OPC_AND_GvEv    (OPC_ARITH_GvEv | (ARITH_AND << 3))
#define OPC_BLENDPS     (0x0c | P_EXT3A | P_DATA16)
#define OPC_BSF         (0xbc | P_EXT)
#define OPC_BSR         (0xbd | P_EXT)
#define OPC_BSWAP       (0xc8 | P_EXT)
#define OPC_CALL_Jz     (0xe8)
#define OPC_CMOVCC      (0x40 | P_EXT)  /* ... plus condition code */
#define OPC_CMP_GvEv    (OPC_ARITH_GvEv | (ARITH_CMP << 3))
#define OPC_DEC_r32     (0x48)
#define OPC_IMUL_GvEv   (0xaf | P_EXT)
#define OPC_IMUL_GvEvIb (0x6b)
#define OPC_IMUL_GvEvIz (0x69)
#define OPC_INC_r32     (0x40)
#define OPC_JCC_long    (0x80 | P_EXT)  /* ... plus condition code */
#define OPC_JCC_short   (0x70)          /* ... plus condition code */
#define OPC_JMP_long    (0xe9)
#define OPC_JMP_short   (0xeb)
#define OPC_LEA         (0x8d)
#define OPC_LZCNT       (0xbd | P_EXT | P_SIMDF3)
#define OPC_MOVB_EvGv   (0x88)          /* stores, more or less */
#define OPC_MOVL_EvGv   (0x89)          /* stores, more or less */
#define OPC_MOVL_GvEv   (0x8b)          /* loads, more or less */
#define OPC_MOVB_EvIz   (0xc6)
#define OPC_MOVL_EvIz   (0xc7)
#define OPC_MOVB_Ib     (0xb0)
#define OPC_MOVL_Iv     (0xb8)
#define OPC_MOVBE_GyMy  (0xf0 | P_EXT38)
#define OPC_MOVBE_MyGy  (0xf1 | P_EXT38)
#define OPC_MOVD_VyEy   (0x6e | P_EXT | P_DATA16)
#define OPC_MOVD_EyVy   (0x7e | P_EXT | P_DATA16)
#define OPC_MOVDDUP     (0x12 | P_EXT | P_SIMDF2)
#define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
#define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
#define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
#define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
#define OPC_MOVQ_VqWq   (0x7e | P_EXT | P_SIMDF3)
#define OPC_MOVQ_WqVq   (0xd6 | P_EXT | P_DATA16)
#define OPC_MOVSBL      (0xbe | P_EXT)
#define OPC_MOVSWL      (0xbf | P_EXT)
#define OPC_MOVSLQ      (0x63 | P_REXW)
#define OPC_MOVZBL      (0xb6 | P_EXT)
#define OPC_MOVZWL      (0xb7 | P_EXT)
#define OPC_PABSB       (0x1c | P_EXT38 | P_DATA16)
#define OPC_PABSW       (0x1d | P_EXT38 | P_DATA16)
#define OPC_PABSD       (0x1e | P_EXT38 | P_DATA16)
#define OPC_VPABSQ      (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PACKSSDW    (0x6b | P_EXT | P_DATA16)
#define OPC_PACKSSWB    (0x63 | P_EXT | P_DATA16)
#define OPC_PACKUSDW    (0x2b | P_EXT38 | P_DATA16)
#define OPC_PACKUSWB    (0x67 | P_EXT | P_DATA16)
#define OPC_PADDB       (0xfc | P_EXT | P_DATA16)
#define OPC_PADDW       (0xfd | P_EXT | P_DATA16)
#define OPC_PADDD       (0xfe | P_EXT | P_DATA16)
#define OPC_PADDQ       (0xd4 | P_EXT | P_DATA16)
#define OPC_PADDSB      (0xec | P_EXT | P_DATA16)
#define OPC_PADDSW      (0xed | P_EXT | P_DATA16)
#define OPC_PADDUB      (0xdc | P_EXT | P_DATA16)
#define OPC_PADDUW      (0xdd | P_EXT | P_DATA16)
#define OPC_PAND        (0xdb | P_EXT | P_DATA16)
#define OPC_PANDN       (0xdf | P_EXT | P_DATA16)
#define OPC_PBLENDW     (0x0e | P_EXT3A | P_DATA16)
#define OPC_PCMPEQB     (0x74 | P_EXT | P_DATA16)
#define OPC_PCMPEQW     (0x75 | P_EXT | P_DATA16)
#define OPC_PCMPEQD     (0x76 | P_EXT | P_DATA16)
#define OPC_PCMPEQQ     (0x29 | P_EXT38 | P_DATA16)
#define OPC_PCMPGTB     (0x64 | P_EXT | P_DATA16)
#define OPC_PCMPGTW     (0x65 | P_EXT | P_DATA16)
#define OPC_PCMPGTD     (0x66 | P_EXT | P_DATA16)
#define OPC_PCMPGTQ     (0x37 | P_EXT38 | P_DATA16)
#define OPC_PEXTRD      (0x16 | P_EXT3A | P_DATA16)
#define OPC_PINSRD      (0x22 | P_EXT3A | P_DATA16)
#define OPC_PMAXSB      (0x3c | P_EXT38 | P_DATA16)
#define OPC_PMAXSW      (0xee | P_EXT | P_DATA16)
#define OPC_PMAXSD      (0x3d | P_EXT38 | P_DATA16)
#define OPC_VPMAXSQ     (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMAXUB      (0xde | P_EXT | P_DATA16)
#define OPC_PMAXUW      (0x3e | P_EXT38 | P_DATA16)
#define OPC_PMAXUD      (0x3f | P_EXT38 | P_DATA16)
#define OPC_VPMAXUQ     (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINSB      (0x38 | P_EXT38 | P_DATA16)
#define OPC_PMINSW      (0xea | P_EXT | P_DATA16)
#define OPC_PMINSD      (0x39 | P_EXT38 | P_DATA16)
#define OPC_VPMINSQ     (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMINUB      (0xda | P_EXT | P_DATA16)
#define OPC_PMINUW      (0x3a | P_EXT38 | P_DATA16)
#define OPC_PMINUD      (0x3b | P_EXT38 | P_DATA16)
#define OPC_VPMINUQ     (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PMOVSXBW    (0x20 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXWD    (0x23 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXDQ    (0x25 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXBW    (0x30 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXWD    (0x33 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXDQ    (0x35 | P_EXT38 | P_DATA16)
#define OPC_PMULLW      (0xd5 | P_EXT | P_DATA16)
#define OPC_PMULLD      (0x40 | P_EXT38 | P_DATA16)
#define OPC_VPMULLQ     (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_POR         (0xeb | P_EXT | P_DATA16)
#define OPC_PSHUFB      (0x00 | P_EXT38 | P_DATA16)
#define OPC_PSHUFD      (0x70 | P_EXT | P_DATA16)
#define OPC_PSHUFLW     (0x70 | P_EXT | P_SIMDF2)
#define OPC_PSHUFHW     (0x70 | P_EXT | P_SIMDF3)
#define OPC_PSHIFTW_Ib  (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTD_Ib  (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
#define OPC_PSHIFTQ_Ib  (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSLLW       (0xf1 | P_EXT | P_DATA16)
#define OPC_PSLLD       (0xf2 | P_EXT | P_DATA16)
#define OPC_PSLLQ       (0xf3 | P_EXT | P_DATA16)
#define OPC_PSRAW       (0xe1 | P_EXT | P_DATA16)
#define OPC_PSRAD       (0xe2 | P_EXT | P_DATA16)
#define OPC_VPSRAQ      (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_PSRLW       (0xd1 | P_EXT | P_DATA16)
#define OPC_PSRLD       (0xd2 | P_EXT | P_DATA16)
#define OPC_PSRLQ       (0xd3 | P_EXT | P_DATA16)
#define OPC_PSUBB       (0xf8 | P_EXT | P_DATA16)
#define OPC_PSUBW       (0xf9 | P_EXT | P_DATA16)
#define OPC_PSUBD       (0xfa | P_EXT | P_DATA16)
#define OPC_PSUBQ       (0xfb | P_EXT | P_DATA16)
#define OPC_PSUBSB      (0xe8 | P_EXT | P_DATA16)
#define OPC_PSUBSW      (0xe9 | P_EXT | P_DATA16)
#define OPC_PSUBUB      (0xd8 | P_EXT | P_DATA16)
#define OPC_PSUBUW      (0xd9 | P_EXT | P_DATA16)
#define OPC_PUNPCKLBW   (0x60 | P_EXT | P_DATA16)
#define OPC_PUNPCKLWD   (0x61 | P_EXT | P_DATA16)
#define OPC_PUNPCKLDQ   (0x62 | P_EXT | P_DATA16)
#define OPC_PUNPCKLQDQ  (0x6c | P_EXT | P_DATA16)
#define OPC_PUNPCKHBW   (0x68 | P_EXT | P_DATA16)
#define OPC_PUNPCKHWD   (0x69 | P_EXT | P_DATA16)
#define OPC_PUNPCKHDQ   (0x6a | P_EXT | P_DATA16)
#define OPC_PUNPCKHQDQ  (0x6d | P_EXT | P_DATA16)
#define OPC_PXOR        (0xef | P_EXT | P_DATA16)
#define OPC_POP_r32     (0x58)
#define OPC_POPCNT      (0xb8 | P_EXT | P_SIMDF3)
#define OPC_PUSH_r32    (0x50)
#define OPC_PUSH_Iv     (0x68)
#define OPC_PUSH_Ib     (0x6a)
#define OPC_RET         (0xc3)
#define OPC_SETCC       (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
#define OPC_SHIFT_1     (0xd1)
#define OPC_SHIFT_Ib    (0xc1)
#define OPC_SHIFT_cl    (0xd3)
#define OPC_SARX        (0xf7 | P_EXT38 | P_SIMDF3)
#define OPC_SHUFPS      (0xc6 | P_EXT)
#define OPC_SHLX        (0xf7 | P_EXT38 | P_DATA16)
#define OPC_SHRX        (0xf7 | P_EXT38 | P_SIMDF2)
#define OPC_SHRD_Ib     (0xac | P_EXT)
#define OPC_TESTL       (0x85)
#define OPC_TZCNT       (0xbc | P_EXT | P_SIMDF3)
#define OPC_UD2         (0x0b | P_EXT)
#define OPC_VPBLENDD    (0x02 | P_EXT3A | P_DATA16)
#define OPC_VPBLENDVB   (0x4c | P_EXT3A | P_DATA16)
#define OPC_VPINSRB     (0x20 | P_EXT3A | P_DATA16)
#define OPC_VPINSRW     (0xc4 | P_EXT | P_DATA16)
#define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
#define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
#define OPC_VPERMQ      (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
#define OPC_VPERM2I128  (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
#define OPC_VPROLVD     (0x15 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPROLVQ     (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPRORVD     (0x14 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPRORVQ     (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDW     (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDD     (0x71 | P_EXT3A | P_DATA16 | P_EVEX)
#define OPC_VPSHLDQ     (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVW    (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHLDVD    (0x71 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHLDVQ    (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVW    (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSHRDVD    (0x73 | P_EXT38 | P_DATA16 | P_EVEX)
#define OPC_VPSHRDVQ    (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVW     (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSLLVD     (0x47 | P_EXT38 | P_DATA16)
#define OPC_VPSLLVQ     (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPSRAVW     (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRAVD     (0x46 | P_EXT38 | P_DATA16)
#define OPC_VPSRAVQ     (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVW     (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VPSRLVD     (0x45 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVQ     (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
#define OPC_VPTERNLOGQ  (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
#define OPC_VZEROUPPER  (0x77 | P_EXT)
#define OPC_XCHG_ax_r32 (0x90)
#define OPC_XCHG_EvGv   (0x87)

/* Group opcodes: the operation is selected by an EXT3_* / EXT5_* code
   rather than by the opcode byte itself.  */
#define OPC_GRP3_Eb     (0xf6)
#define OPC_GRP3_Ev     (0xf7)
#define OPC_GRP5        (0xff)
#define OPC_GRP14       (0x73 | P_EXT | P_DATA16)
 446
/* Group 1 opcode extensions for 0x80-0x83.
   These are also used as modifiers for OPC_ARITH_GvEv, where the code is
   shifted left by 3 and merged into the opcode byte (see OPC_ADD_GvEv).  */
#define ARITH_ADD 0
#define ARITH_OR  1
#define ARITH_ADC 2
#define ARITH_SBB 3
#define ARITH_AND 4
#define ARITH_SUB 5
#define ARITH_XOR 6
#define ARITH_CMP 7
 457
/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.
   Only the rotate and shift forms listed here are defined; codes 2, 3
   and 6 have no name in this table.  */
#define SHIFT_ROL 0
#define SHIFT_ROR 1
#define SHIFT_SHL 4
#define SHIFT_SHR 5
#define SHIFT_SAR 7
 464
/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with
   OPC_GRP3_Eb (0xf6) and OPC_GRP3_Ev (0xf7).  */
#define EXT3_TESTi 0
#define EXT3_NOT   2
#define EXT3_NEG   3
#define EXT3_MUL   4
#define EXT3_IMUL  5
#define EXT3_DIV   6
#define EXT3_IDIV  7
 473
/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.
   CALLN/JMPN are the indirect call and jump through an Ev operand.  */
#define EXT5_INC_Ev     0
#define EXT5_DEC_Ev     1
#define EXT5_CALLN_Ev   2
#define EXT5_JMPN_Ev    4
 479
/* Condition codes to be added to OPC_JCC_{long,short}.  OPC_CMOVCC and
   OPC_SETCC are likewise documented as taking a condition code.
   JCC_JMP (-1) is not an encodable condition.
   NOTE(review): presumably a marker for an unconditional jump; its users
   are outside this section -- confirm.  */
#define JCC_JMP (-1)
#define JCC_JO  0x0
#define JCC_JNO 0x1
#define JCC_JB  0x2
#define JCC_JAE 0x3
#define JCC_JE  0x4
#define JCC_JNE 0x5
#define JCC_JBE 0x6
#define JCC_JA  0x7
#define JCC_JS  0x8
#define JCC_JNS 0x9
#define JCC_JP  0xa
#define JCC_JNP 0xb
#define JCC_JL  0xc
#define JCC_JGE 0xd
#define JCC_JLE 0xe
#define JCC_JG  0xf
 498
/*
 * Map a TCG comparison condition to its x86 condition code.  Signed
 * comparisons use the L/GE/LE/G codes, unsigned ones the B/AE/BE/A codes.
 * Indices without an initializer are zero-filled by C, which coincides
 * with JCC_JO, so only the conditions listed here give a meaningful result.
 */
static const uint8_t tcg_cond_to_jcc[] = {
    [TCG_COND_EQ] = JCC_JE,
    [TCG_COND_NE] = JCC_JNE,
    [TCG_COND_LT] = JCC_JL,
    [TCG_COND_GE] = JCC_JGE,
    [TCG_COND_LE] = JCC_JLE,
    [TCG_COND_GT] = JCC_JG,
    [TCG_COND_LTU] = JCC_JB,
    [TCG_COND_GEU] = JCC_JAE,
    [TCG_COND_LEU] = JCC_JBE,
    [TCG_COND_GTU] = JCC_JA,
};
 511
 512 #if TCG_TARGET_REG_BITS == 64
 513 static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
 514 {
 515     int rex;
 516
 517     if (opc & P_GS) {
 518         tcg_out8(s, 0x65);
 519     }
 520     if (opc & P_DATA16) {
 521         /* We should never be asking for both 16 and 64-bit operation.  */
 522         tcg_debug_assert((opc & P_REXW) == 0);
 523         tcg_out8(s, 0x66);
 524     }
 525     if (opc & P_SIMDF3) {
 526         tcg_out8(s, 0xf3);
 527     } else if (opc & P_SIMDF2) {
 528         tcg_out8(s, 0xf2);
 529     }
 530
 531     rex = 0;
 532     rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
 533     rex |= (r & 8) >> 1;                /* REX.R */
 534     rex |= (x & 8) >> 2;                /* REX.X */
 535     rex |= (rm & 8) >> 3;               /* REX.B */
 536
 537     /* P_REXB_{R,RM} indicates that the given register is the low byte.
 538        For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
 539        as otherwise the encoding indicates %[abcd]h.  Note that the values
 540        that are ORed in merely indicate that the REX byte must be present;
 541        those bits get discarded in output.  */
 542     rex |= opc & (r >= 4 ? P_REXB_R : 0);
 543     rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
 544
 545     if (rex) {
 546         tcg_out8(s, (uint8_t)(rex | 0x40));
 547     }
 548
 549     if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
 550         tcg_out8(s, 0x0f);
 551         if (opc & P_EXT38) {
 552             tcg_out8(s, 0x38);
 553         } else if (opc & P_EXT3A) {
 554             tcg_out8(s, 0x3a);
 555         }
 556     }
 557
 558     tcg_out8(s, opc);
 559 }
 560 #else
 561 static void tcg_out_opc(TCGContext *s, int opc)
 562 {
 563     if (opc & P_DATA16) {
 564         tcg_out8(s, 0x66);
 565     }
 566     if (opc & P_SIMDF3) {
 567         tcg_out8(s, 0xf3);
 568     } else if (opc & P_SIMDF2) {
 569         tcg_out8(s, 0xf2);
 570     }
 571     if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
 572         tcg_out8(s, 0x0f);
 573         if (opc & P_EXT38) {
 574             tcg_out8(s, 0x38);
 575         } else if (opc & P_EXT3A) {
 576             tcg_out8(s, 0x3a);
 577         }
 578     }
 579     tcg_out8(s, opc);
 580 }
 581 /* Discard the register arguments to tcg_out_opc early, so as not to penalize
 582    the 32-bit compilation paths.  This method works with all versions of gcc,
 583    whereas relying on optimization may not be able to exclude them.  */
 584 #define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
 585 #endif
 586
 587 static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
 588 {
 589     tcg_out_opc(s, opc, r, rm, 0);
 590     tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 591 }
 592
 593 static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
 594                             int rm, int index)
 595 {
 596     int tmp;
 597
 598     if (opc & P_GS) {
 599         tcg_out8(s, 0x65);
 600     }
 601     /* Use the two byte form if possible, which cannot encode
 602        VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
 603     if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
 604         && ((rm | index) & 8) == 0) {
 605         /* Two byte VEX prefix.  */
 606         tcg_out8(s, 0xc5);
 607
 608         tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
 609     } else {
 610         /* Three byte VEX prefix.  */
 611         tcg_out8(s, 0xc4);
 612
 613         /* VEX.m-mmmm */
 614         if (opc & P_EXT3A) {
 615             tmp = 3;
 616         } else if (opc & P_EXT38) {
 617             tmp = 2;
 618         } else if (opc & P_EXT) {
 619             tmp = 1;
 620         } else {
 621             g_assert_not_reached();
 622         }
 623         tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
 624         tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
 625         tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
 626         tcg_out8(s, tmp);
 627
 628         tmp = (opc & P_VEXW ? 0x80 : 0);       /* VEX.W */
 629     }
 630
 631     tmp |= (opc & P_VEXL ? 0x04 : 0);      /* VEX.L */
 632     /* VEX.pp */
 633     if (opc & P_DATA16) {
 634         tmp |= 1;                          /* 0x66 */
 635     } else if (opc & P_SIMDF3) {
 636         tmp |= 2;                          /* 0xf3 */
 637     } else if (opc & P_SIMDF2) {
 638         tmp |= 3;                          /* 0xf2 */
 639     }
 640     tmp |= (~v & 15) << 3;                 /* VEX.vvvv */
 641     tcg_out8(s, tmp);
 642     tcg_out8(s, opc);
 643 }
 644
 645 static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
 646                              int rm, int index)
 647 {
 648     /* The entire 4-byte evex prefix; with R' and V' set. */
 649     uint32_t p = 0x08041062;
 650     int mm, pp;
 651
 652     tcg_debug_assert(have_avx512vl);
 653
 654     /* EVEX.mm */
 655     if (opc & P_EXT3A) {
 656         mm = 3;
 657     } else if (opc & P_EXT38) {
 658         mm = 2;
 659     } else if (opc & P_EXT) {
 660         mm = 1;
 661     } else {
 662         g_assert_not_reached();
 663     }
 664
 665     /* EVEX.pp */
 666     if (opc & P_DATA16) {
 667         pp = 1;                          /* 0x66 */
 668     } else if (opc & P_SIMDF3) {
 669         pp = 2;                          /* 0xf3 */
 670     } else if (opc & P_SIMDF2) {
 671         pp = 3;                          /* 0xf2 */
 672     } else {
 673         pp = 0;
 674     }
 675
 676     p = deposit32(p, 8, 2, mm);
 677     p = deposit32(p, 13, 1, (rm & 8) == 0);             /* EVEX.RXB.B */
 678     p = deposit32(p, 14, 1, (index & 8) == 0);          /* EVEX.RXB.X */
 679     p = deposit32(p, 15, 1, (r & 8) == 0);              /* EVEX.RXB.R */
 680     p = deposit32(p, 16, 2, pp);
 681     p = deposit32(p, 19, 4, ~v);
 682     p = deposit32(p, 23, 1, (opc & P_VEXW) != 0);
 683     p = deposit32(p, 29, 2, (opc & P_VEXL) != 0);
 684
 685     tcg_out32(s, p);
 686     tcg_out8(s, opc);
 687 }
 688
 689 static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
 690 {
 691     if (opc & P_EVEX) {
 692         tcg_out_evex_opc(s, opc, r, v, rm, 0);
 693     } else {
 694         tcg_out_vex_opc(s, opc, r, v, rm, 0);
 695     }
 696     tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 697 }
 698
 699 /* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
 700    We handle either RM and INDEX missing with a negative value.  In 64-bit
 701    mode for absolute addresses, ~RM is the size of the immediate operand
 702    that will follow the instruction.  */
 703
 704 static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
 705                                int shift, intptr_t offset)
 706 {
 707     int mod, len;
 708
 709     if (index < 0 && rm < 0) {
 710         if (TCG_TARGET_REG_BITS == 64) {
 711             /* Try for a rip-relative addressing mode.  This has replaced
 712                the 32-bit-mode absolute addressing encoding.  */
 713             intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
 714             intptr_t disp = offset - pc;
 715             if (disp == (int32_t)disp) {
 716                 tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
 717                 tcg_out32(s, disp);
 718                 return;
 719             }
 720
 721             /* Try for an absolute address encoding.  This requires the
 722                use of the MODRM+SIB encoding and is therefore larger than
 723                rip-relative addressing.  */
 724             if (offset == (int32_t)offset) {
 725                 tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
 726                 tcg_out8(s, (4 << 3) | 5);
 727                 tcg_out32(s, offset);
 728                 return;
 729             }
 730
 731             /* ??? The memory isn't directly addressable.  */
 732             g_assert_not_reached();
 733         } else {
 734             /* Absolute address.  */
 735             tcg_out8(s, (r << 3) | 5);
 736             tcg_out32(s, offset);
 737             return;
 738         }
 739     }
 740
 741     /* Find the length of the immediate addend.  Note that the encoding
 742        that would be used for (%ebp) indicates absolute addressing.  */
 743     if (rm < 0) {
 744         mod = 0, len = 4, rm = 5;
 745     } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
 746         mod = 0, len = 0;
 747     } else if (offset == (int8_t)offset) {
 748         mod = 0x40, len = 1;
 749     } else {
 750         mod = 0x80, len = 4;
 751     }
 752
 753     /* Use a single byte MODRM format if possible.  Note that the encoding
 754        that would be used for %esp is the escape to the two byte form.  */
 755     if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
 756         /* Single byte MODRM format.  */
 757         tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
 758     } else {
 759         /* Two byte MODRM+SIB format.  */
 760
 761         /* Note that the encoding that would place %esp into the index
 762            field indicates no index register.  In 64-bit mode, the REX.X
 763            bit counts, so %r12 can be used as the index.  */
 764         if (index < 0) {
 765             index = 4;
 766         } else {
 767             tcg_debug_assert(index != TCG_REG_ESP);
 768         }
 769
 770         tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
 771         tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
 772     }
 773
 774     if (len == 1) {
 775         tcg_out8(s, offset);
 776     } else if (len == 4) {
 777         tcg_out32(s, offset);
 778     }
 779 }
 780
 781 static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
 782                                      int index, int shift, intptr_t offset)
 783 {
 784     tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
 785     tcg_out_sib_offset(s, r, rm, index, shift, offset);
 786 }
 787
 788 static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
 789                                          int rm, int index, int shift,
 790                                          intptr_t offset)
 791 {
 792     tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
 793     tcg_out_sib_offset(s, r, rm, index, shift, offset);
 794 }
 795
 796 /* A simplification of the above with no index or shift.  */
 797 static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
 798                                         int rm, intptr_t offset)
 799 {
 800     tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
 801 }
 802
 803 static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
 804                                             int v, int rm, intptr_t offset)
 805 {
 806     tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
 807 }
 808
 809 /* Output an opcode with an expected reference to the constant pool.  */
 810 static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
 811 {
 812     tcg_out_opc(s, opc, r, 0, 0);
 813     /* Absolute for 32-bit, pc-relative for 64-bit.  */
 814     tcg_out8(s, LOWREGMASK(r) << 3 | 5);
 815     tcg_out32(s, 0);
 816 }
 817
 818 /* Output an opcode with an expected reference to the constant pool.  */
 819 static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
 820 {
 821     tcg_out_vex_opc(s, opc, r, 0, 0, 0);
 822     /* Absolute for 32-bit, pc-relative for 64-bit.  */
 823     tcg_out8(s, LOWREGMASK(r) << 3 | 5);
 824     tcg_out32(s, 0);
 825 }
 826
 827 /* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
 828 static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
 829 {
 830     /* Propagate an opcode prefix, such as P_REXW.  */
 831     int ext = subop & ~0x7;
 832     subop &= 0x7;
 833
 834     tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
 835 }
 836
 837 static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
 838 {
 839     int rexw = 0;
 840
 841     if (arg == ret) {
 842         return true;
 843     }
 844     switch (type) {
 845     case TCG_TYPE_I64:
 846         rexw = P_REXW;
 847         /* fallthru */
 848     case TCG_TYPE_I32:
 849         if (ret < 16) {
 850             if (arg < 16) {
 851                 tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
 852             } else {
 853                 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
 854             }
 855         } else {
 856             if (arg < 16) {
 857                 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
 858             } else {
 859                 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
 860             }
 861         }
 862         break;
 863
 864     case TCG_TYPE_V64:
 865         tcg_debug_assert(ret >= 16 && arg >= 16);
 866         tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
 867         break;
 868     case TCG_TYPE_V128:
 869         tcg_debug_assert(ret >= 16 && arg >= 16);
 870         tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
 871         break;
 872     case TCG_TYPE_V256:
 873         tcg_debug_assert(ret >= 16 && arg >= 16);
 874         tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
 875         break;
 876
 877     default:
 878         g_assert_not_reached();
 879     }
 880     return true;
 881 }
 882
/*
 * AVX2 broadcast opcodes, looked up as avx2_dup_insn[vece] by
 * tcg_out_dup_vec() and tcg_out_dupm_vec().
 * NOTE(review): the entry order assumes MO_8, MO_16, MO_32, MO_64 are
 * the indices 0..3 -- confirm against the MemOp definition.
 */
static const int avx2_dup_insn[4] = {
    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
};
 887
 888 static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
 889                             TCGReg r, TCGReg a)
 890 {
 891     if (have_avx2) {
 892         int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 893         tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
 894     } else {
 895         switch (vece) {
 896         case MO_8:
 897             /* ??? With zero in a register, use PSHUFB.  */
 898             tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
 899             a = r;
 900             /* FALLTHRU */
 901         case MO_16:
 902             tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
 903             a = r;
 904             /* FALLTHRU */
 905         case MO_32:
 906             tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
 907             /* imm8 operand: all output lanes selected from input lane 0.  */
 908             tcg_out8(s, 0);
 909             break;
 910         case MO_64:
 911             tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
 912             break;
 913         default:
 914             g_assert_not_reached();
 915         }
 916     }
 917     return true;
 918 }
 919
 920 static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
 921                              TCGReg r, TCGReg base, intptr_t offset)
 922 {
 923     if (have_avx2) {
 924         int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 925         tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
 926                                  r, 0, base, offset);
 927     } else {
 928         switch (vece) {
 929         case MO_64:
 930             tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
 931             break;
 932         case MO_32:
 933             tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
 934             break;
 935         case MO_16:
 936             tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
 937             tcg_out8(s, 0); /* imm8 */
 938             tcg_out_dup_vec(s, type, vece, r, r);
 939             break;
 940         case MO_8:
 941             tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
 942             tcg_out8(s, 0); /* imm8 */
 943             tcg_out_dup_vec(s, type, vece, r, r);
 944             break;
 945         default:
 946             g_assert_not_reached();
 947         }
 948     }
 949     return true;
 950 }
 951
 952 static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
 953                              TCGReg ret, int64_t arg)
 954 {
 955     int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
 956
 957     if (arg == 0) {
 958         tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
 959         return;
 960     }
 961     if (arg == -1) {
 962         tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
 963         return;
 964     }
 965
 966     if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
 967         if (have_avx2) {
 968             tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
 969         } else {
 970             tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
 971         }
 972         new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
 973     } else {
 974         if (type == TCG_TYPE_V64) {
 975             tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
 976         } else if (have_avx2) {
 977             tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
 978         } else {
 979             tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
 980         }
 981         if (TCG_TARGET_REG_BITS == 64) {
 982             new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
 983         } else {
 984             new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
 985         }
 986     }
 987 }
 988
 989 static void tcg_out_movi_vec(TCGContext *s, TCGType type,
 990                              TCGReg ret, tcg_target_long arg)
 991 {
 992     if (arg == 0) {
 993         tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
 994         return;
 995     }
 996     if (arg == -1) {
 997         tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
 998         return;
 999     }
1000
1001     int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
1002     tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
1003     if (TCG_TARGET_REG_BITS == 64) {
1004         new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
1005     } else {
1006         new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
1007     }
1008 }
1009
1010 static void tcg_out_movi_int(TCGContext *s, TCGType type,
1011                              TCGReg ret, tcg_target_long arg)
1012 {
1013     tcg_target_long diff;
1014
1015     if (arg == 0) {
1016         tgen_arithr(s, ARITH_XOR, ret, ret);
1017         return;
1018     }
1019     if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
1020         tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
1021         tcg_out32(s, arg);
1022         return;
1023     }
1024     if (arg == (int32_t)arg) {
1025         tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
1026         tcg_out32(s, arg);
1027         return;
1028     }
1029
1030     /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
1031     diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
1032     if (diff == (int32_t)diff) {
1033         tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
1034         tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
1035         tcg_out32(s, diff);
1036         return;
1037     }
1038
1039     tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
1040     tcg_out64(s, arg);
1041 }
1042
1043 static void tcg_out_movi(TCGContext *s, TCGType type,
1044                          TCGReg ret, tcg_target_long arg)
1045 {
1046     switch (type) {
1047     case TCG_TYPE_I32:
1048 #if TCG_TARGET_REG_BITS == 64
1049     case TCG_TYPE_I64:
1050 #endif
1051         if (ret < 16) {
1052             tcg_out_movi_int(s, type, ret, arg);
1053         } else {
1054             tcg_out_movi_vec(s, type, ret, arg);
1055         }
1056         break;
1057     default:
1058         g_assert_not_reached();
1059     }
1060 }
1061
1062 static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
1063 {
1064     int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1065     tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2);
1066     return true;
1067 }
1068
1069 static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs,
1070                              tcg_target_long imm)
1071 {
1072     /* This function is only used for passing structs by reference. */
1073     tcg_debug_assert(imm == (int32_t)imm);
1074     tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm);
1075 }
1076
1077 static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
1078 {
1079     if (val == (int8_t)val) {
1080         tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
1081         tcg_out8(s, val);
1082     } else if (val == (int32_t)val) {
1083         tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
1084         tcg_out32(s, val);
1085     } else {
1086         g_assert_not_reached();
1087     }
1088 }
1089
1090 static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1091 {
1092     /* Given the strength of x86 memory ordering, we only need care for
1093        store-load ordering.  Experimentally, "lock orl $0,0(%esp)" is
1094        faster than "mfence", so don't bother with the sse insn.  */
1095     if (a0 & TCG_MO_ST_LD) {
1096         tcg_out8(s, 0xf0);
1097         tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1098         tcg_out8(s, 0);
1099     }
1100 }
1101
1102 static inline void tcg_out_push(TCGContext *s, int reg)
1103 {
1104     tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1105 }
1106
1107 static inline void tcg_out_pop(TCGContext *s, int reg)
1108 {
1109     tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1110 }
1111
1112 static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1113                        TCGReg arg1, intptr_t arg2)
1114 {
1115     switch (type) {
1116     case TCG_TYPE_I32:
1117         if (ret < 16) {
1118             tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1119         } else {
1120             tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1121         }
1122         break;
1123     case TCG_TYPE_I64:
1124         if (ret < 16) {
1125             tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1126             break;
1127         }
1128         /* FALLTHRU */
1129     case TCG_TYPE_V64:
1130         /* There is no instruction that can validate 8-byte alignment.  */
1131         tcg_debug_assert(ret >= 16);
1132         tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1133         break;
1134     case TCG_TYPE_V128:
1135         /*
1136          * The gvec infrastructure is asserts that v128 vector loads
1137          * and stores use a 16-byte aligned offset.  Validate that the
1138          * final pointer is aligned by using an insn that will SIGSEGV.
1139          */
1140         tcg_debug_assert(ret >= 16);
1141         tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
1142         break;
1143     case TCG_TYPE_V256:
1144         /*
1145          * The gvec infrastructure only requires 16-byte alignment,
1146          * so here we must use an unaligned load.
1147          */
1148         tcg_debug_assert(ret >= 16);
1149         tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1150                                  ret, 0, arg1, arg2);
1151         break;
1152     default:
1153         g_assert_not_reached();
1154     }
1155 }
1156
1157 static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1158                        TCGReg arg1, intptr_t arg2)
1159 {
1160     switch (type) {
1161     case TCG_TYPE_I32:
1162         if (arg < 16) {
1163             tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1164         } else {
1165             tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1166         }
1167         break;
1168     case TCG_TYPE_I64:
1169         if (arg < 16) {
1170             tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1171             break;
1172         }
1173         /* FALLTHRU */
1174     case TCG_TYPE_V64:
1175         /* There is no instruction that can validate 8-byte alignment.  */
1176         tcg_debug_assert(arg >= 16);
1177         tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1178         break;
1179     case TCG_TYPE_V128:
1180         /*
1181          * The gvec infrastructure is asserts that v128 vector loads
1182          * and stores use a 16-byte aligned offset.  Validate that the
1183          * final pointer is aligned by using an insn that will SIGSEGV.
1184          *
1185          * This specific instance is also used by TCG_CALL_RET_BY_VEC,
1186          * for _WIN64, which must have SSE2 but may not have AVX.
1187          */
1188         tcg_debug_assert(arg >= 16);
1189         if (have_avx1) {
1190             tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
1191         } else {
1192             tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2);
1193         }
1194         break;
1195     case TCG_TYPE_V256:
1196         /*
1197          * The gvec infrastructure only requires 16-byte alignment,
1198          * so here we must use an unaligned store.
1199          */
1200         tcg_debug_assert(arg >= 16);
1201         tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1202                                  arg, 0, arg1, arg2);
1203         break;
1204     default:
1205         g_assert_not_reached();
1206     }
1207 }
1208
1209 static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1210                         TCGReg base, intptr_t ofs)
1211 {
1212     int rexw = 0;
1213     if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1214         if (val != (int32_t)val) {
1215             return false;
1216         }
1217         rexw = P_REXW;
1218     } else if (type != TCG_TYPE_I32) {
1219         return false;
1220     }
1221     tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1222     tcg_out32(s, val);
1223     return true;
1224 }
1225
1226 static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1227 {
1228     /* Propagate an opcode prefix, such as P_DATA16.  */
1229     int ext = subopc & ~0x7;
1230     subopc &= 0x7;
1231
1232     if (count == 1) {
1233         tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1234     } else {
1235         tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1236         tcg_out8(s, count);
1237     }
1238 }
1239
1240 static inline void tcg_out_bswap32(TCGContext *s, int reg)
1241 {
1242     tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1243 }
1244
1245 static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1246 {
1247     tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1248 }
1249
1250 static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src)
1251 {
1252     /* movzbl */
1253     tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1254     tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1255 }
1256
1257 static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1258 {
1259     int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1260     /* movsbl */
1261     tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1262     tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1263 }
1264
1265 static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src)
1266 {
1267     /* movzwl */
1268     tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1269 }
1270
1271 static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1272 {
1273     int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1274     /* movsw[lq] */
1275     tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1276 }
1277
1278 static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src)
1279 {
1280     /* 32-bit mov zero extends.  */
1281     tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1282 }
1283
1284 static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src)
1285 {
1286     tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
1287     tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1288 }
1289
1290 static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1291 {
1292     tcg_out_ext32s(s, dest, src);
1293 }
1294
1295 static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1296 {
1297     if (dest != src) {
1298         tcg_out_ext32u(s, dest, src);
1299     }
1300 }
1301
1302 static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src)
1303 {
1304     tcg_out_ext32u(s, dest, src);
1305 }
1306
1307 static inline void tcg_out_bswap64(TCGContext *s, int reg)
1308 {
1309     tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1310 }
1311
1312 static void tgen_arithi(TCGContext *s, int c, int r0,
1313                         tcg_target_long val, int cf)
1314 {
1315     int rexw = 0;
1316
1317     if (TCG_TARGET_REG_BITS == 64) {
1318         rexw = c & -8;
1319         c &= 7;
1320     }
1321
1322     /* ??? While INC is 2 bytes shorter than ADDL $1, they also induce
1323        partial flags update stalls on Pentium4 and are not recommended
1324        by current Intel optimization manuals.  */
1325     if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
1326         int is_inc = (c == ARITH_ADD) ^ (val < 0);
1327         if (TCG_TARGET_REG_BITS == 64) {
1328             /* The single-byte increment encodings are re-tasked as the
1329                REX prefixes.  Use the MODRM encoding.  */
1330             tcg_out_modrm(s, OPC_GRP5 + rexw,
1331                           (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1332         } else {
1333             tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1334         }
1335         return;
1336     }
1337
1338     if (c == ARITH_AND) {
1339         if (TCG_TARGET_REG_BITS == 64) {
1340             if (val == 0xffffffffu) {
1341                 tcg_out_ext32u(s, r0, r0);
1342                 return;
1343             }
1344             if (val == (uint32_t)val) {
1345                 /* AND with no high bits set can use a 32-bit operation.  */
1346                 rexw = 0;
1347             }
1348         }
1349         if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1350             tcg_out_ext8u(s, r0, r0);
1351             return;
1352         }
1353         if (val == 0xffffu) {
1354             tcg_out_ext16u(s, r0, r0);
1355             return;
1356         }
1357     }
1358
1359     if (val == (int8_t)val) {
1360         tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1361         tcg_out8(s, val);
1362         return;
1363     }
1364     if (rexw == 0 || val == (int32_t)val) {
1365         tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1366         tcg_out32(s, val);
1367         return;
1368     }
1369
1370     g_assert_not_reached();
1371 }
1372
1373 static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1374 {
1375     if (val != 0) {
1376         tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1377     }
1378 }
1379
1380 /* Set SMALL to force a short forward branch.  */
1381 static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small)
1382 {
1383     int32_t val, val1;
1384
1385     if (l->has_value) {
1386         val = tcg_pcrel_diff(s, l->u.value_ptr);
1387         val1 = val - 2;
1388         if ((int8_t)val1 == val1) {
1389             if (opc == -1) {
1390                 tcg_out8(s, OPC_JMP_short);
1391             } else {
1392                 tcg_out8(s, OPC_JCC_short + opc);
1393             }
1394             tcg_out8(s, val1);
1395         } else {
1396             tcg_debug_assert(!small);
1397             if (opc == -1) {
1398                 tcg_out8(s, OPC_JMP_long);
1399                 tcg_out32(s, val - 5);
1400             } else {
1401                 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1402                 tcg_out32(s, val - 6);
1403             }
1404         }
1405     } else if (small) {
1406         if (opc == -1) {
1407             tcg_out8(s, OPC_JMP_short);
1408         } else {
1409             tcg_out8(s, OPC_JCC_short + opc);
1410         }
1411         tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1412         s->code_ptr += 1;
1413     } else {
1414         if (opc == -1) {
1415             tcg_out8(s, OPC_JMP_long);
1416         } else {
1417             tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1418         }
1419         tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1420         s->code_ptr += 4;
1421     }
1422 }
1423
1424 static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
1425                         int const_arg2, int rexw)
1426 {
1427     if (const_arg2) {
1428         if (arg2 == 0) {
1429             /* test r, r */
1430             tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1431         } else {
1432             tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1433         }
1434     } else {
1435         tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1436     }
1437 }
1438
1439 static void tcg_out_brcond(TCGContext *s, int rexw, TCGCond cond,
1440                            TCGArg arg1, TCGArg arg2, int const_arg2,
1441                            TCGLabel *label, bool small)
1442 {
1443     tcg_out_cmp(s, arg1, arg2, const_arg2, rexw);
1444     tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1445 }
1446
1447 #if TCG_TARGET_REG_BITS == 32
1448 static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
1449                             const int *const_args, bool small)
1450 {
1451     TCGLabel *label_next = gen_new_label();
1452     TCGLabel *label_this = arg_label(args[5]);
1453
1454     switch(args[4]) {
1455     case TCG_COND_EQ:
1456         tcg_out_brcond(s, 0, TCG_COND_NE, args[0], args[2], const_args[2],
1457                        label_next, 1);
1458         tcg_out_brcond(s, 0, TCG_COND_EQ, args[1], args[3], const_args[3],
1459                        label_this, small);
1460         break;
1461     case TCG_COND_NE:
1462         tcg_out_brcond(s, 0, TCG_COND_NE, args[0], args[2], const_args[2],
1463                        label_this, small);
1464         tcg_out_brcond(s, 0, TCG_COND_NE, args[1], args[3], const_args[3],
1465                        label_this, small);
1466         break;
1467     case TCG_COND_LT:
1468         tcg_out_brcond(s, 0, TCG_COND_LT, args[1], args[3], const_args[3],
1469                        label_this, small);
1470         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1471         tcg_out_brcond(s, 0, TCG_COND_LTU, args[0], args[2], const_args[2],
1472                        label_this, small);
1473         break;
1474     case TCG_COND_LE:
1475         tcg_out_brcond(s, 0, TCG_COND_LT, args[1], args[3], const_args[3],
1476                        label_this, small);
1477         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1478         tcg_out_brcond(s, 0, TCG_COND_LEU, args[0], args[2], const_args[2],
1479                        label_this, small);
1480         break;
1481     case TCG_COND_GT:
1482         tcg_out_brcond(s, 0, TCG_COND_GT, args[1], args[3], const_args[3],
1483                        label_this, small);
1484         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1485         tcg_out_brcond(s, 0, TCG_COND_GTU, args[0], args[2], const_args[2],
1486                        label_this, small);
1487         break;
1488     case TCG_COND_GE:
1489         tcg_out_brcond(s, 0, TCG_COND_GT, args[1], args[3], const_args[3],
1490                        label_this, small);
1491         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1492         tcg_out_brcond(s, 0, TCG_COND_GEU, args[0], args[2], const_args[2],
1493                        label_this, small);
1494         break;
1495     case TCG_COND_LTU:
1496         tcg_out_brcond(s, 0, TCG_COND_LTU, args[1], args[3], const_args[3],
1497                        label_this, small);
1498         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1499         tcg_out_brcond(s, 0, TCG_COND_LTU, args[0], args[2], const_args[2],
1500                        label_this, small);
1501         break;
1502     case TCG_COND_LEU:
1503         tcg_out_brcond(s, 0, TCG_COND_LTU, args[1], args[3], const_args[3],
1504                        label_this, small);
1505         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1506         tcg_out_brcond(s, 0, TCG_COND_LEU, args[0], args[2], const_args[2],
1507                        label_this, small);
1508         break;
1509     case TCG_COND_GTU:
1510         tcg_out_brcond(s, 0, TCG_COND_GTU, args[1], args[3], const_args[3],
1511                        label_this, small);
1512         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1513         tcg_out_brcond(s, 0, TCG_COND_GTU, args[0], args[2], const_args[2],
1514                        label_this, small);
1515         break;
1516     case TCG_COND_GEU:
1517         tcg_out_brcond(s, 0, TCG_COND_GTU, args[1], args[3], const_args[3],
1518                        label_this, small);
1519         tcg_out_jxx(s, JCC_JNE, label_next, 1);
1520         tcg_out_brcond(s, 0, TCG_COND_GEU, args[0], args[2], const_args[2],
1521                        label_this, small);
1522         break;
1523     default:
1524         g_assert_not_reached();
1525     }
1526     tcg_out_label(s, label_next);
1527 }
1528 #endif
1529
/*
 * Set DEST according to the comparison "ARG1 COND ARG2".
 *
 * REXW is added to opcodes to select the operand width.  ARG2 is a
 * constant when CONST_ARG2 is non-zero and a register otherwise.  When
 * NEG is false the result is 1/0; when NEG is true it is -1/0.
 *
 * Several conditions have shorter expansions than the generic SETCC
 * sequence and are handled first:
 *   - EQ/NE against constant 0 are rewritten as LTU/GEU against 1;
 *   - GTU/LEU against a register swap operands to become LTU/GEU;
 *   - LTU/GEU use the carry flag with SBB;
 *   - LT/GE against constant 0 extract the sign bit with a shift.
 * Everything else falls through to the generic SETCC path at the end.
 */
static void tcg_out_setcond(TCGContext *s, int rexw, TCGCond cond,
                            TCGArg dest, TCGArg arg1, TCGArg arg2,
                            int const_arg2, bool neg)
{
    /* Set when the specialised sequence computes the opposite condition. */
    bool inv = false;
    bool cleared;

    switch (cond) {
    case TCG_COND_NE:
        inv = true;
        /* fall through */
    case TCG_COND_EQ:
        /* If arg2 is 0, convert to LTU/GEU vs 1. */
        if (const_arg2 && arg2 == 0) {
            arg2 = 1;
            goto do_ltu;
        }
        break;

    case TCG_COND_LEU:
        inv = true;
        /* fall through */
    case TCG_COND_GTU:
        /* If arg2 is a register, swap for LTU/GEU. */
        if (!const_arg2) {
            TCGReg t = arg1;
            arg1 = arg2;
            arg2 = t;
            goto do_ltu;
        }
        break;

    case TCG_COND_GEU:
        inv = true;
        /* fall through */
    case TCG_COND_LTU:
    do_ltu:
        /*
         * Relying on the carry bit, use SBB to produce -1 if LTU, 0 if GEU.
         * We can then use NEG or INC to produce the desired result.
         * This is always smaller than the SETCC expansion.
         */
        tcg_out_cmp(s, arg1, arg2, const_arg2, rexw);

        /* X - X - C = -C = (C ? -1 : 0) */
        tgen_arithr(s, ARITH_SBB + (neg ? rexw : 0), dest, dest);
        if (inv && neg) {
            /* ~(C ? -1 : 0) = (C ? 0 : -1) */
            tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
        } else if (inv) {
            /* (C ? -1 : 0) + 1 = (C ? 0 : 1) */
            tgen_arithi(s, ARITH_ADD, dest, 1, 0);
        } else if (!neg) {
            /* -(C ? -1 : 0) = (C ? 1 : 0) */
            tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_NEG, dest);
        }
        /* The remaining case (!inv && neg) needs no fix-up: SBB gave -1/0. */
        return;

    case TCG_COND_GE:
        inv = true;
        /* fall through */
    case TCG_COND_LT:
        /* If arg2 is 0, extract the sign bit. */
        if (const_arg2 && arg2 == 0) {
            tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, dest, arg1);
            if (inv) {
                /* For GE, complement first so the sign bit is inverted. */
                tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
            }
            /* Arithmetic shift yields -1/0, logical shift yields 1/0. */
            tcg_out_shifti(s, (neg ? SHIFT_SAR : SHIFT_SHR) + rexw,
                           dest, rexw ? 63 : 31);
            return;
        }
        break;

    default:
        break;
    }

    /*
     * If dest does not overlap the inputs, clearing it first is preferred.
     * The XOR breaks any false dependency for the low-byte write to dest,
     * and is also one byte smaller than MOVZBL.
     */
    cleared = false;
    if (dest != arg1 && (const_arg2 || dest != arg2)) {
        tgen_arithr(s, ARITH_XOR, dest, dest);
        cleared = true;
    }

    tcg_out_cmp(s, arg1, arg2, const_arg2, rexw);
    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);

    if (!cleared) {
        /* dest was not pre-cleared, so zero-extend the byte result now. */
        tcg_out_ext8u(s, dest, dest);
    }
    if (neg) {
        tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, dest);
    }
}
1629
1630 #if TCG_TARGET_REG_BITS == 32
1631 static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1632                              const int *const_args)
1633 {
1634     TCGArg new_args[6];
1635     TCGLabel *label_true, *label_over;
1636
1637     memcpy(new_args, args+1, 5*sizeof(TCGArg));
1638
1639     if (args[0] == args[1] || args[0] == args[2]
1640         || (!const_args[3] && args[0] == args[3])
1641         || (!const_args[4] && args[0] == args[4])) {
1642         /* When the destination overlaps with one of the argument
1643            registers, don't do anything tricky.  */
1644         label_true = gen_new_label();
1645         label_over = gen_new_label();
1646
1647         new_args[5] = label_arg(label_true);
1648         tcg_out_brcond2(s, new_args, const_args+1, 1);
1649
1650         tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1651         tcg_out_jxx(s, JCC_JMP, label_over, 1);
1652         tcg_out_label(s, label_true);
1653
1654         tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1655         tcg_out_label(s, label_over);
1656     } else {
1657         /* When the destination does not overlap one of the arguments,
1658            clear the destination first, jump if cond false, and emit an
1659            increment in the true case.  This results in smaller code.  */
1660
1661         tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1662
1663         label_over = gen_new_label();
1664         new_args[4] = tcg_invert_cond(new_args[4]);
1665         new_args[5] = label_arg(label_over);
1666         tcg_out_brcond2(s, new_args, const_args+1, 1);
1667
1668         tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1669         tcg_out_label(s, label_over);
1670     }
1671 }
1672 #endif
1673
1674 static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1675                          TCGReg dest, TCGReg v1)
1676 {
1677     if (have_cmov) {
1678         tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1679     } else {
1680         TCGLabel *over = gen_new_label();
1681         tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1682         tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1683         tcg_out_label(s, over);
1684     }
1685 }
1686
1687 static void tcg_out_movcond(TCGContext *s, int rexw, TCGCond cond,
1688                             TCGReg dest, TCGReg c1, TCGArg c2, int const_c2,
1689                             TCGReg v1)
1690 {
1691     tcg_out_cmp(s, c1, c2, const_c2, rexw);
1692     tcg_out_cmov(s, cond, rexw, dest, v1);
1693 }
1694
1695 static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1696                         TCGArg arg2, bool const_a2)
1697 {
1698     if (have_bmi1) {
1699         tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1700         if (const_a2) {
1701             tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1702         } else {
1703             tcg_debug_assert(dest != arg2);
1704             tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1705         }
1706     } else {
1707         tcg_debug_assert(dest != arg2);
1708         tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1709         tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1710     }
1711 }
1712
1713 static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1714                         TCGArg arg2, bool const_a2)
1715 {
1716     if (have_lzcnt) {
1717         tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1718         if (const_a2) {
1719             tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1720         } else {
1721             tcg_debug_assert(dest != arg2);
1722             tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1723         }
1724     } else {
1725         tcg_debug_assert(!const_a2);
1726         tcg_debug_assert(dest != arg1);
1727         tcg_debug_assert(dest != arg2);
1728
1729         /* Recall that the output of BSR is the index not the count.  */
1730         tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1731         tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1732
1733         /* Since we have destroyed the flags from BSR, we have to re-test.  */
1734         tcg_out_cmp(s, arg1, 0, 1, rexw);
1735         tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1736     }
1737 }
1738
1739 static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
1740 {
1741     intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1742
1743     if (disp == (int32_t)disp) {
1744         tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1745         tcg_out32(s, disp);
1746     } else {
1747         /* rip-relative addressing into the constant pool.
1748            This is 6 + 8 = 14 bytes, as compared to using an
1749            immediate load 10 + 6 = 16 bytes, plus we may
1750            be able to re-use the pool constant for more calls.  */
1751         tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1752         tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1753         new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1754         tcg_out32(s, 0);
1755     }
1756 }
1757
1758 static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest,
1759                          const TCGHelperInfo *info)
1760 {
1761     tcg_out_branch(s, 1, dest);
1762
1763 #ifndef _WIN32
1764     if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) {
1765         /*
1766          * The sysv i386 abi for struct return places a reference as the
1767          * first argument of the stack, and pops that argument with the
1768          * return statement.  Since we want to retain the aligned stack
1769          * pointer for the callee, we do not want to actually push that
1770          * argument before the call but rely on the normal store to the
1771          * stack slot.  But we do need to compensate for the pop in order
1772          * to reset our correct stack pointer value.
1773          * Pushing a garbage value back onto the stack is quickest.
1774          */
1775         tcg_out_push(s, TCG_REG_EAX);
1776     }
1777 #endif
1778 }
1779
1780 static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
1781 {
1782     tcg_out_branch(s, 0, dest);
1783 }
1784
1785 static void tcg_out_nopn(TCGContext *s, int n)
1786 {
1787     int i;
1788     /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1789      * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1790      * duplicate prefix, and all of the interesting recent cores can
1791      * decode and discard the duplicates in a single cycle.
1792      */
1793     tcg_debug_assert(n >= 1);
1794     for (i = 1; i < n; ++i) {
1795         tcg_out8(s, 0x66);
1796     }
1797     tcg_out8(s, 0x90);
1798 }
1799
1800 /* Test register R vs immediate bits I, setting Z flag for EQ/NE. */
1801 static void __attribute__((unused))
1802 tcg_out_testi(TCGContext *s, TCGReg r, uint32_t i)
1803 {
1804     /*
1805      * This is used for testing alignment, so we can usually use testb.
1806      * For i686, we have to use testl for %esi/%edi.
1807      */
1808     if (i <= 0xff && (TCG_TARGET_REG_BITS == 64 || r < 4)) {
1809         tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, r);
1810         tcg_out8(s, i);
1811     } else {
1812         tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_TESTi, r);
1813         tcg_out32(s, i);
1814     }
1815 }
1816
/*
 * Description of the host address used for a guest memory access.
 * Filled in by prepare_host_addr() and passed by value to the direct
 * load/store emitters, which hand base/index/ofs to the modrm+sib
 * encoders and add seg into the opcode value.
 */
typedef struct {
    /* Base register; prepare_host_addr() sets this to the guest address reg. */
    TCGReg base;
    /* Index register; -1 in x86_guest_base (presumably "no index"). */
    int index;
    /* Constant displacement. */
    int ofs;
    /* Added to the opcode by users; presumably a segment-prefix flag. */
    int seg;
    /* Atomicity and alignment computed by atom_and_align_for_opc(). */
    TCGAtomAlign aa;
} HostAddress;
1824
1825 bool tcg_target_has_memory_bswap(MemOp memop)
1826 {
1827     TCGAtomAlign aa;
1828
1829     if (!have_movbe) {
1830         return false;
1831     }
1832     if ((memop & MO_SIZE) < MO_128) {
1833         return true;
1834     }
1835
1836     /*
1837      * Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA,
1838      * but do allow a pair of 64-bit operations, i.e. MOVBEQ.
1839      */
1840     aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true);
1841     return aa.atom < MO_128;
1842 }
1843
1844 /*
1845  * Because i686 has no register parameters and because x86_64 has xchg
1846  * to handle addr/data register overlap, we have placed all input arguments
 * before we might need a scratch reg.
1848  *
1849  * Even then, a scratch is only needed for l->raddr.  Rather than expose
1850  * a general-purpose scratch when we don't actually know it's available,
1851  * use the ra_gen hook to load into RAX if needed.
1852  */
1853 #if TCG_TARGET_REG_BITS == 64
1854 static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg)
1855 {
1856     if (arg < 0) {
1857         arg = TCG_REG_RAX;
1858     }
1859     tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr);
1860     return arg;
1861 }
1862 static const TCGLdstHelperParam ldst_helper_param = {
1863     .ra_gen = ldst_ra_gen
1864 };
1865 #else
1866 static const TCGLdstHelperParam ldst_helper_param = { };
1867 #endif
1868
1869 static void tcg_out_vec_to_pair(TCGContext *s, TCGType type,
1870                                 TCGReg l, TCGReg h, TCGReg v)
1871 {
1872     int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1873
1874     /* vpmov{d,q} %v, %l */
1875     tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, v, 0, l);
1876     /* vpextr{d,q} $1, %v, %h */
1877     tcg_out_vex_modrm(s, OPC_PEXTRD + rexw, v, 0, h);
1878     tcg_out8(s, 1);
1879 }
1880
1881 static void tcg_out_pair_to_vec(TCGContext *s, TCGType type,
1882                                 TCGReg v, TCGReg l, TCGReg h)
1883 {
1884     int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1885
1886     /* vmov{d,q} %l, %v */
1887     tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, v, 0, l);
1888     /* vpinsr{d,q} $1, %h, %v, %v */
1889     tcg_out_vex_modrm(s, OPC_PINSRD + rexw, v, v, h);
1890     tcg_out8(s, 1);
1891 }
1892
1893 /*
1894  * Generate code for the slow path for a load at the end of block
1895  */
1896 static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1897 {
1898     MemOp opc = get_memop(l->oi);
1899     tcg_insn_unit **label_ptr = &l->label_ptr[0];
1900
1901     /* resolve label address */
1902     tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1903     if (label_ptr[1]) {
1904         tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1905     }
1906
1907     tcg_out_ld_helper_args(s, l, &ldst_helper_param);
1908     tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]);
1909     tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param);
1910
1911     tcg_out_jmp(s, l->raddr);
1912     return true;
1913 }
1914
1915 /*
1916  * Generate code for the slow path for a store at the end of block
1917  */
1918 static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1919 {
1920     MemOp opc = get_memop(l->oi);
1921     tcg_insn_unit **label_ptr = &l->label_ptr[0];
1922
1923     /* resolve label address */
1924     tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1925     if (label_ptr[1]) {
1926         tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1927     }
1928
1929     tcg_out_st_helper_args(s, l, &ldst_helper_param);
1930     tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]);
1931
1932     tcg_out_jmp(s, l->raddr);
1933     return true;
1934 }
1935
1936 #ifndef CONFIG_SOFTMMU
1937 static HostAddress x86_guest_base = {
1938     .index = -1
1939 };
1940
1941 #if defined(__x86_64__) && defined(__linux__)
1942 # include <asm/prctl.h>
1943 # include <sys/prctl.h>
1944 int arch_prctl(int code, unsigned long addr);
1945 static inline int setup_guest_base_seg(void)
1946 {
1947     if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
1948         return P_GS;
1949     }
1950     return 0;
1951 }
1952 #elif defined(__x86_64__) && \
1953       (defined (__FreeBSD__) || defined (__FreeBSD_kernel__))
1954 # include <machine/sysarch.h>
1955 static inline int setup_guest_base_seg(void)
1956 {
1957     if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
1958         return P_GS;
1959     }
1960     return 0;
1961 }
1962 #else
1963 static inline int setup_guest_base_seg(void)
1964 {
1965     return 0;
1966 }
1967 #endif /* setup_guest_base_seg */
1968 #endif /* !SOFTMMU */
1969
/*
 * Lower bound on the TLB mask/table offset for this backend.
 * NOTE(review): presumably consumed by the common code behind
 * tlb_mask_table_ofs(); INT_MIN would impose no practical limit on the
 * int offset used in prepare_host_addr() -- confirm.
 */
#define MIN_TLB_MASK_TABLE_OFS  INT_MIN
1971
/*
 * For softmmu, perform the TLB load and compare.
 * For useronly, perform any required alignment tests.
 * In both cases, return a TCGLabelQemuLdst structure if the slow path
 * is required and fill in @h with the host address for the fast path.
 *
 * @s:      code generation context
 * @h:      output; host address description for the fast path
 * @addrlo: register holding the (low part of the) guest address
 * @addrhi: register holding the high part of the guest address; only
 *          compared on 32-bit hosts with a 64-bit address type
 * @oi:     combined memory operation and mmu index
 * @is_ld:  true for a load, false for a store
 *
 * Returns the new slow-path label, or NULL when no slow path is needed
 * (user-only with no alignment requirement).  The caller is expected to
 * complete the label (type, data registers, raddr) after emitting the
 * fast-path access.
 */
static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
                                           TCGReg addrlo, TCGReg addrhi,
                                           MemOpIdx oi, bool is_ld)
{
    TCGLabelQemuLdst *ldst = NULL;
    MemOp opc = get_memop(oi);
    MemOp s_bits = opc & MO_SIZE;
    unsigned a_mask;

#ifdef CONFIG_SOFTMMU
    /* The host address is addrlo + L0, with L0 loaded from the TLB below. */
    h->index = TCG_REG_L0;
    h->ofs = 0;
    h->seg = 0;
#else
    /* Start from the user-only template; only the base is per-access. */
    *h = x86_guest_base;
#endif
    h->base = addrlo;
    h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128);
    /* Low address bits that must be clear for the required alignment. */
    a_mask = (1 << h->aa.align) - 1;

#ifdef CONFIG_SOFTMMU
    /* Offset of the comparator within the TLB entry for this access kind. */
    int cmp_ofs = is_ld ? offsetof(CPUTLBEntry, addr_read)
                        : offsetof(CPUTLBEntry, addr_write);
    TCGType ttype = TCG_TYPE_I32;
    TCGType tlbtype = TCG_TYPE_I32;
    /*
     * REX.W selectors: trexw follows the guest address type, hrexw the
     * host pointer type, tlbrexw the width used for the index shift.
     */
    int trexw = 0, hrexw = 0, tlbrexw = 0;
    unsigned mem_index = get_mmuidx(oi);
    /* Low bits covered by the access size. */
    unsigned s_mask = (1 << s_bits) - 1;
    int fast_ofs = tlb_mask_table_ofs(s, mem_index);
    int tlb_mask;

    /* Softmmu always has a slow path (TLB miss). */
    ldst = new_ldst_label(s);
    ldst->is_ld = is_ld;
    ldst->oi = oi;
    ldst->addrlo_reg = addrlo;
    ldst->addrhi_reg = addrhi;

    if (TCG_TARGET_REG_BITS == 64) {
        ttype = s->addr_type;
        trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW);
        if (TCG_TYPE_PTR == TCG_TYPE_I64) {
            hrexw = P_REXW;
            /* A 32-bit shift suffices unless the index bits exceed 32. */
            if (s->page_bits + s->tlb_dyn_max_bits > 32) {
                tlbtype = TCG_TYPE_I64;
                tlbrexw = P_REXW;
            }
        }
    }

    /* L0 = addrlo >> (page_bits - CPU_TLB_ENTRY_BITS) */
    tcg_out_mov(s, tlbtype, TCG_REG_L0, addrlo);
    tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0,
                   s->page_bits - CPU_TLB_ENTRY_BITS);

    /* L0 &= CPUTLBDescFast.mask, read relative to TCG_AREG0 */
    tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0,
                         fast_ofs + offsetof(CPUTLBDescFast, mask));

    /* L0 += CPUTLBDescFast.table; L0 now addresses the TLB entry */
    tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0,
                         fast_ofs + offsetof(CPUTLBDescFast, table));

    /*
     * If the required alignment is at least as large as the access, simply
     * copy the address and mask.  For lesser alignments, check that we don't
     * cross pages for the complete access.
     */
    if (a_mask >= s_mask) {
        tcg_out_mov(s, ttype, TCG_REG_L1, addrlo);
    } else {
        tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1,
                             addrlo, s_mask - a_mask);
    }
    /* Keep the page bits plus the alignment bits for the comparison. */
    tlb_mask = s->page_mask | a_mask;
    tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0);

    /* cmp cmp_ofs(TCG_REG_L0), TCG_REG_L1 */
    tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw,
                         TCG_REG_L1, TCG_REG_L0, cmp_ofs);

    /*
     * jne slow_path -- the 4-byte displacement is left unwritten here;
     * its location is recorded so the slow-path emitter can patch it.
     */
    tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
    ldst->label_ptr[0] = s->code_ptr;
    s->code_ptr += 4;

    if (TCG_TARGET_REG_BITS == 32 && s->addr_type == TCG_TYPE_I64) {
        /* cmp cmp_ofs+4(TCG_REG_L0), addrhi */
        tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, TCG_REG_L0, cmp_ofs + 4);

        /* jne slow_path -- second patch site, recorded like the first */
        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
        ldst->label_ptr[1] = s->code_ptr;
        s->code_ptr += 4;
    }

    /* TLB Hit: replace the entry pointer in L0 with the entry's addend. */
    tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0,
               offsetof(CPUTLBEntry, addend));
#else
    /* User-only: a slow path exists only to handle misaligned accesses. */
    if (a_mask) {
        ldst = new_ldst_label(s);

        ldst->is_ld = is_ld;
        ldst->oi = oi;
        ldst->addrlo_reg = addrlo;
        ldst->addrhi_reg = addrhi;

        tcg_out_testi(s, addrlo, a_mask);
        /* jne slow_path -- displacement patched by the slow-path emitter */
        tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
        ldst->label_ptr[0] = s->code_ptr;
        s->code_ptr += 4;
    }
#endif

    return ldst;
}
2092
/*
 * Emit the fast-path guest load from host address H into DATALO (and
 * DATAHI for accesses that need a register pair).
 *
 * @type selects REX.W for the sign-extending forms; @memop supplies the
 * size, signedness and byte-swap request.  Byte-swapped loads require
 * movbe (asserted).  h.seg is added into every memory opcode
 * (presumably a segment-prefix flag -- confirm).
 */
static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                   HostAddress h, TCGType type, MemOp memop)
{
    bool use_movbe = false;
    int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
    int movop = OPC_MOVL_GvEv;

    /* Do big-endian loads with movbe.  */
    if (memop & MO_BSWAP) {
        tcg_debug_assert(have_movbe);
        use_movbe = true;
        movop = OPC_MOVBE_GyMy;
    }

    switch (memop & MO_SSIZE) {
    case MO_UB:
        tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
    case MO_SB:
        tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
    case MO_UW:
        if (use_movbe) {
            /* There is no extending movbe; only low 16-bits are modified.  */
            if (datalo != h.base && datalo != h.index) {
                /* XOR breaks dependency chains.  */
                tgen_arithr(s, ARITH_XOR, datalo, datalo);
                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
                                         datalo, h.base, h.index, 0, h.ofs);
            } else {
                /*
                 * datalo is part of the address, so it cannot be cleared
                 * first; zero-extend after the load instead.
                 */
                tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
                                         datalo, h.base, h.index, 0, h.ofs);
                tcg_out_ext16u(s, datalo, datalo);
            }
        } else {
            tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
        }
        break;
    case MO_SW:
        if (use_movbe) {
            /* Swapped 16-bit load followed by an explicit sign-extend. */
            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
                                     datalo, h.base, h.index, 0, h.ofs);
            tcg_out_ext16s(s, type, datalo, datalo);
        } else {
            tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg,
                                     datalo, h.base, h.index, 0, h.ofs);
        }
        break;
    case MO_UL:
        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
#if TCG_TARGET_REG_BITS == 64
    case MO_SL:
        if (use_movbe) {
            /* Swapped 32-bit load followed by an explicit sign-extend. */
            tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_ext32s(s, datalo, datalo);
        } else {
            tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
        }
        break;
#endif
    case MO_UQ:
        if (TCG_TARGET_REG_BITS == 64) {
            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            break;
        }
        /* 32-bit host: two 32-bit loads at ofs and ofs + 4. */
        if (use_movbe) {
            /* Byte-swapping each half also requires exchanging the halves. */
            TCGReg t = datalo;
            datalo = datahi;
            datahi = t;
        }
        if (h.base == datalo || h.index == datalo) {
            /*
             * The first load would overwrite part of the address, so
             * form the address in datahi and load both halves from it.
             * The LEA itself is emitted without h.seg.
             */
            tcg_out_modrm_sib_offset(s, OPC_LEA, datahi,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0);
            tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4);
        } else {
            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
                                     h.base, h.index, 0, h.ofs + 4);
        }
        break;

    case MO_128:
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);

        /*
         * Without 16-byte atomicity, use integer regs.
         * That is where we want the data, and it allows bswaps.
         */
        if (h.aa.atom < MO_128) {
            if (use_movbe) {
                /* Byte-swapping each half also requires exchanging them. */
                TCGReg t = datalo;
                datalo = datahi;
                datahi = t;
            }
            if (h.base == datalo || h.index == datalo) {
                /*
                 * Same overlap handling as the 32-bit MO_UQ case: form
                 * the address in datahi, then load both halves from it.
                 */
                tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, datahi,
                                         h.base, h.index, 0, h.ofs);
                tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
                                     datalo, datahi, 0);
                tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
                                     datahi, datahi, 8);
            } else {
                tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
                                         h.base, h.index, 0, h.ofs);
                tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
                                         h.base, h.index, 0, h.ofs + 8);
            }
            break;
        }

        /*
         * With 16-byte atomicity, a vector load is required.
         * If we already have 16-byte alignment, then VMOVDQA always works.
         * Else if VMOVDQU has atomicity with dynamic alignment, use that.
         * Else we require a runtime test for alignment for VMOVDQA;
         * use VMOVDQU on the unaligned nonatomic path for simplicity.
         */
        if (h.aa.align >= MO_128) {
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
        } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
        } else {
            /* l1: unaligned path; l2: join point after either load. */
            TCGLabel *l1 = gen_new_label();
            TCGLabel *l2 = gen_new_label();

            /* The runtime alignment test looks at h.base only. */
            tcg_out_testi(s, h.base, 15);
            tcg_out_jxx(s, JCC_JNE, l1, true);

            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
            tcg_out_jxx(s, JCC_JMP, l2, true);

            tcg_out_label(s, l1);
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
            tcg_out_label(s, l2);
        }
        /* Move the loaded vector into the integer register pair. */
        tcg_out_vec_to_pair(s, TCG_TYPE_I64, datalo, datahi, TCG_TMP_VEC);
        break;

    default:
        g_assert_not_reached();
    }
}
2253
2254 static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi,
2255                             TCGReg addrlo, TCGReg addrhi,
2256                             MemOpIdx oi, TCGType data_type)
2257 {
2258     TCGLabelQemuLdst *ldst;
2259     HostAddress h;
2260
2261     ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, true);
2262     tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi));
2263
2264     if (ldst) {
2265         ldst->type = data_type;
2266         ldst->datalo_reg = datalo;
2267         ldst->datahi_reg = datahi;
2268         ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2269     }
2270 }
2271
/*
 * Emit the fast-path guest store of DATALO (and DATAHI for accesses that
 * use a register pair) to host address H.
 *
 * @memop supplies the size and byte-swap request.  Byte-swapped stores
 * require movbe (asserted).  h.seg is added into every memory opcode
 * (presumably a segment-prefix flag -- confirm).
 */
static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
                                   HostAddress h, MemOp memop)
{
    bool use_movbe = false;
    int movop = OPC_MOVL_EvGv;

    /*
     * Do big-endian stores with movbe or softmmu.
     * User-only without movbe will have its swapping done generically.
     */
    if (memop & MO_BSWAP) {
        tcg_debug_assert(have_movbe);
        use_movbe = true;
        movop = OPC_MOVBE_MyGy;
    }

    switch (memop & MO_SIZE) {
    case MO_8:
        /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
        tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg,
                                 datalo, h.base, h.index, 0, h.ofs);
        break;
    case MO_16:
        tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
    case MO_32:
        tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
                                 h.base, h.index, 0, h.ofs);
        break;
    case MO_64:
        if (TCG_TARGET_REG_BITS == 64) {
            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
        } else {
            /* 32-bit host: two 32-bit stores at ofs and ofs + 4. */
            if (use_movbe) {
                /* Byte-swapping each half also requires exchanging them. */
                TCGReg t = datalo;
                datalo = datahi;
                datahi = t;
            }
            tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
                                     h.base, h.index, 0, h.ofs + 4);
        }
        break;

    case MO_128:
        tcg_debug_assert(TCG_TARGET_REG_BITS == 64);

        /*
         * Without 16-byte atomicity, use integer regs.
         * That is where we have the data, and it allows bswaps.
         */
        if (h.aa.atom < MO_128) {
            if (use_movbe) {
                /* Byte-swapping each half also requires exchanging them. */
                TCGReg t = datalo;
                datalo = datahi;
                datahi = t;
            }
            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
                                     h.base, h.index, 0, h.ofs);
            tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
                                     h.base, h.index, 0, h.ofs + 8);
            break;
        }

        /*
         * With 16-byte atomicity, a vector store is required.
         * If we already have 16-byte alignment, then VMOVDQA always works.
         * Else if VMOVDQU has atomicity with dynamic alignment, use that.
         * Else we require a runtime test for alignment for VMOVDQA;
         * use VMOVDQU on the unaligned nonatomic path for simplicity.
         */
        /* Gather the integer pair into the temporary vector first. */
        tcg_out_pair_to_vec(s, TCG_TYPE_I64, TCG_TMP_VEC, datalo, datahi);
        if (h.aa.align >= MO_128) {
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
        } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
        } else {
            /* l1: unaligned path; l2: join point after either store. */
            TCGLabel *l1 = gen_new_label();
            TCGLabel *l2 = gen_new_label();

            /* The runtime alignment test looks at h.base only. */
            tcg_out_testi(s, h.base, 15);
            tcg_out_jxx(s, JCC_JNE, l1, true);

            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
            tcg_out_jxx(s, JCC_JMP, l2, true);

            tcg_out_label(s, l1);
            tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
                                         TCG_TMP_VEC, 0,
                                         h.base, h.index, 0, h.ofs);
            tcg_out_label(s, l2);
        }
        break;

    default:
        g_assert_not_reached();
    }
}
2380
2381 static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi,
2382                             TCGReg addrlo, TCGReg addrhi,
2383                             MemOpIdx oi, TCGType data_type)
2384 {
2385     TCGLabelQemuLdst *ldst;
2386     HostAddress h;
2387
2388     ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, false);
2389     tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi));
2390
2391     if (ldst) {
2392         ldst->type = data_type;
2393         ldst->datalo_reg = datalo;
2394         ldst->datahi_reg = datahi;
2395         ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2396     }
2397 }
2398
2399 static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
2400 {
2401     /* Reuse the zeroing that exists for goto_ptr.  */
2402     if (a0 == 0) {
2403         tcg_out_jmp(s, tcg_code_gen_epilogue);
2404     } else {
2405         tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2406         tcg_out_jmp(s, tb_ret_addr);
2407     }
2408 }
2409
2410 static void tcg_out_goto_tb(TCGContext *s, int which)
2411 {
2412     /*
2413      * Jump displacement must be aligned for atomic patching;
2414      * see if we need to add extra nops before jump
2415      */
2416     int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
2417     if (gap != 1) {
2418         tcg_out_nopn(s, gap - 1);
2419     }
2420     tcg_out8(s, OPC_JMP_long); /* jmp im */
2421     set_jmp_insn_offset(s, which);
2422     tcg_out32(s, 0);
2423     set_jmp_reset_offset(s, which);
2424 }
2425
2426 void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
2427                               uintptr_t jmp_rx, uintptr_t jmp_rw)
2428 {
2429     /* patch the branch destination */
2430     uintptr_t addr = tb->jmp_target_addr[n];
2431     qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4));
2432     /* no need to flush icache explicitly */
2433 }
2434
2435 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2436                               const TCGArg args[TCG_MAX_OP_ARGS],
2437                               const int const_args[TCG_MAX_OP_ARGS])
2438 {
2439     TCGArg a0, a1, a2;
2440     int c, const_a2, vexop, rexw = 0;
2441
2442 #if TCG_TARGET_REG_BITS == 64
2443 # define OP_32_64(x) \
2444         case glue(glue(INDEX_op_, x), _i64): \
2445             rexw = P_REXW; /* FALLTHRU */    \
2446         case glue(glue(INDEX_op_, x), _i32)
2447 #else
2448 # define OP_32_64(x) \
2449         case glue(glue(INDEX_op_, x), _i32)
2450 #endif
2451
2452     /* Hoist the loads of the most common arguments.  */
2453     a0 = args[0];
2454     a1 = args[1];
2455     a2 = args[2];
2456     const_a2 = const_args[2];
2457
2458     switch (opc) {
2459     case INDEX_op_goto_ptr:
2460         /* jmp to the given host address (could be epilogue) */
2461         tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2462         break;
2463     case INDEX_op_br:
2464         tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2465         break;
2466     OP_32_64(ld8u):
2467         /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2468         tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2469         break;
2470     OP_32_64(ld8s):
2471         tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2472         break;
2473     OP_32_64(ld16u):
2474         /* Note that we can ignore REXW for the zero-extend to 64-bit.  */
2475         tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2476         break;
2477     OP_32_64(ld16s):
2478         tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2479         break;
2480 #if TCG_TARGET_REG_BITS == 64
2481     case INDEX_op_ld32u_i64:
2482 #endif
2483     case INDEX_op_ld_i32:
2484         tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2485         break;
2486
2487     OP_32_64(st8):
2488         if (const_args[0]) {
2489             tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2490             tcg_out8(s, a0);
2491         } else {
2492             tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2493         }
2494         break;
2495     OP_32_64(st16):
2496         if (const_args[0]) {
2497             tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2498             tcg_out16(s, a0);
2499         } else {
2500             tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2501         }
2502         break;
2503 #if TCG_TARGET_REG_BITS == 64
2504     case INDEX_op_st32_i64:
2505 #endif
2506     case INDEX_op_st_i32:
2507         if (const_args[0]) {
2508             tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2509             tcg_out32(s, a0);
2510         } else {
2511             tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2512         }
2513         break;
2514
2515     OP_32_64(add):
2516         /* For 3-operand addition, use LEA.  */
2517         if (a0 != a1) {
2518             TCGArg c3 = 0;
2519             if (const_a2) {
2520                 c3 = a2, a2 = -1;
2521             } else if (a0 == a2) {
2522                 /* Watch out for dest = src + dest, since we've removed
2523                    the matching constraint on the add.  */
2524                 tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2525                 break;
2526             }
2527
2528             tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2529             break;
2530         }
2531         c = ARITH_ADD;
2532         goto gen_arith;
2533     OP_32_64(sub):
2534         c = ARITH_SUB;
2535         goto gen_arith;
2536     OP_32_64(and):
2537         c = ARITH_AND;
2538         goto gen_arith;
2539     OP_32_64(or):
2540         c = ARITH_OR;
2541         goto gen_arith;
2542     OP_32_64(xor):
2543         c = ARITH_XOR;
2544         goto gen_arith;
2545     gen_arith:
2546         if (const_a2) {
2547             tgen_arithi(s, c + rexw, a0, a2, 0);
2548         } else {
2549             tgen_arithr(s, c + rexw, a0, a2);
2550         }
2551         break;
2552
2553     OP_32_64(andc):
2554         if (const_a2) {
2555             tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2556             tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2557         } else {
2558             tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2559         }
2560         break;
2561
2562     OP_32_64(mul):
2563         if (const_a2) {
2564             int32_t val;
2565             val = a2;
2566             if (val == (int8_t)val) {
2567                 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2568                 tcg_out8(s, val);
2569             } else {
2570                 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2571                 tcg_out32(s, val);
2572             }
2573         } else {
2574             tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2575         }
2576         break;
2577
2578     OP_32_64(div2):
2579         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2580         break;
2581     OP_32_64(divu2):
2582         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2583         break;
2584
2585     OP_32_64(shl):
2586         /* For small constant 3-operand shift, use LEA.  */
2587         if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2588             if (a2 - 1 == 0) {
2589                 /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2590                 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2591             } else {
2592                 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2593                 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2594             }
2595             break;
2596         }
2597         c = SHIFT_SHL;
2598         vexop = OPC_SHLX;
2599         goto gen_shift_maybe_vex;
2600     OP_32_64(shr):
2601         c = SHIFT_SHR;
2602         vexop = OPC_SHRX;
2603         goto gen_shift_maybe_vex;
2604     OP_32_64(sar):
2605         c = SHIFT_SAR;
2606         vexop = OPC_SARX;
2607         goto gen_shift_maybe_vex;
2608     OP_32_64(rotl):
2609         c = SHIFT_ROL;
2610         goto gen_shift;
2611     OP_32_64(rotr):
2612         c = SHIFT_ROR;
2613         goto gen_shift;
2614     gen_shift_maybe_vex:
2615         if (have_bmi2) {
2616             if (!const_a2) {
2617                 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2618                 break;
2619             }
2620             tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2621         }
2622         /* FALLTHRU */
2623     gen_shift:
2624         if (const_a2) {
2625             tcg_out_shifti(s, c + rexw, a0, a2);
2626         } else {
2627             tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2628         }
2629         break;
2630
2631     OP_32_64(ctz):
2632         tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2633         break;
2634     OP_32_64(clz):
2635         tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2636         break;
2637     OP_32_64(ctpop):
2638         tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2639         break;
2640
2641     OP_32_64(brcond):
2642         tcg_out_brcond(s, rexw, a2, a0, a1, const_args[1],
2643                        arg_label(args[3]), 0);
2644         break;
2645     OP_32_64(setcond):
2646         tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, false);
2647         break;
2648     OP_32_64(negsetcond):
2649         tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, true);
2650         break;
2651     OP_32_64(movcond):
2652         tcg_out_movcond(s, rexw, args[5], a0, a1, a2, const_a2, args[3]);
2653         break;
2654
2655     OP_32_64(bswap16):
2656         if (a2 & TCG_BSWAP_OS) {
2657             /* Output must be sign-extended. */
2658             if (rexw) {
2659                 tcg_out_bswap64(s, a0);
2660                 tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48);
2661             } else {
2662                 tcg_out_bswap32(s, a0);
2663                 tcg_out_shifti(s, SHIFT_SAR, a0, 16);
2664             }
2665         } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
2666             /* Output must be zero-extended, but input isn't. */
2667             tcg_out_bswap32(s, a0);
2668             tcg_out_shifti(s, SHIFT_SHR, a0, 16);
2669         } else {
2670             tcg_out_rolw_8(s, a0);
2671         }
2672         break;
2673     OP_32_64(bswap32):
2674         tcg_out_bswap32(s, a0);
2675         if (rexw && (a2 & TCG_BSWAP_OS)) {
2676             tcg_out_ext32s(s, a0, a0);
2677         }
2678         break;
2679
2680     OP_32_64(neg):
2681         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2682         break;
2683     OP_32_64(not):
2684         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2685         break;
2686
2687     case INDEX_op_qemu_ld_a64_i32:
2688         if (TCG_TARGET_REG_BITS == 32) {
2689             tcg_out_qemu_ld(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2690             break;
2691         }
2692         /* fall through */
2693     case INDEX_op_qemu_ld_a32_i32:
2694         tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2695         break;
2696     case INDEX_op_qemu_ld_a32_i64:
2697         if (TCG_TARGET_REG_BITS == 64) {
2698             tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2699         } else {
2700             tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2701         }
2702         break;
2703     case INDEX_op_qemu_ld_a64_i64:
2704         if (TCG_TARGET_REG_BITS == 64) {
2705             tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2706         } else {
2707             tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2708         }
2709         break;
2710     case INDEX_op_qemu_ld_a32_i128:
2711     case INDEX_op_qemu_ld_a64_i128:
2712         tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2713         tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
2714         break;
2715
2716     case INDEX_op_qemu_st_a64_i32:
2717     case INDEX_op_qemu_st8_a64_i32:
2718         if (TCG_TARGET_REG_BITS == 32) {
2719             tcg_out_qemu_st(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2720             break;
2721         }
2722         /* fall through */
2723     case INDEX_op_qemu_st_a32_i32:
2724     case INDEX_op_qemu_st8_a32_i32:
2725         tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2726         break;
2727     case INDEX_op_qemu_st_a32_i64:
2728         if (TCG_TARGET_REG_BITS == 64) {
2729             tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2730         } else {
2731             tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2732         }
2733         break;
2734     case INDEX_op_qemu_st_a64_i64:
2735         if (TCG_TARGET_REG_BITS == 64) {
2736             tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2737         } else {
2738             tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2739         }
2740         break;
2741     case INDEX_op_qemu_st_a32_i128:
2742     case INDEX_op_qemu_st_a64_i128:
2743         tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2744         tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
2745         break;
2746
2747     OP_32_64(mulu2):
2748         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2749         break;
2750     OP_32_64(muls2):
2751         tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2752         break;
2753     OP_32_64(add2):
2754         if (const_args[4]) {
2755             tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2756         } else {
2757             tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2758         }
2759         if (const_args[5]) {
2760             tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2761         } else {
2762             tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2763         }
2764         break;
2765     OP_32_64(sub2):
2766         if (const_args[4]) {
2767             tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2768         } else {
2769             tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2770         }
2771         if (const_args[5]) {
2772             tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2773         } else {
2774             tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2775         }
2776         break;
2777
2778 #if TCG_TARGET_REG_BITS == 32
2779     case INDEX_op_brcond2_i32:
2780         tcg_out_brcond2(s, args, const_args, 0);
2781         break;
2782     case INDEX_op_setcond2_i32:
2783         tcg_out_setcond2(s, args, const_args);
2784         break;
2785 #else /* TCG_TARGET_REG_BITS == 64 */
2786     case INDEX_op_ld32s_i64:
2787         tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2788         break;
2789     case INDEX_op_ld_i64:
2790         tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2791         break;
2792     case INDEX_op_st_i64:
2793         if (const_args[0]) {
2794             tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2795             tcg_out32(s, a0);
2796         } else {
2797             tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2798         }
2799         break;
2800
2801     case INDEX_op_bswap64_i64:
2802         tcg_out_bswap64(s, a0);
2803         break;
2804     case INDEX_op_extrh_i64_i32:
2805         tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2806         break;
2807 #endif
2808
2809     OP_32_64(deposit):
2810         if (args[3] == 0 && args[4] == 8) {
2811             /* load bits 0..7 */
2812             if (const_a2) {
2813                 tcg_out_opc(s, OPC_MOVB_Ib | P_REXB_RM | LOWREGMASK(a0),
2814                             0, a0, 0);
2815                 tcg_out8(s, a2);
2816             } else {
2817                 tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2818             }
2819         } else if (TCG_TARGET_REG_BITS == 32 && args[3] == 8 && args[4] == 8) {
2820             /* load bits 8..15 */
2821             if (const_a2) {
2822                 tcg_out8(s, OPC_MOVB_Ib + a0 + 4);
2823                 tcg_out8(s, a2);
2824             } else {
2825                 tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2826             }
2827         } else if (args[3] == 0 && args[4] == 16) {
2828             /* load bits 0..15 */
2829             if (const_a2) {
2830                 tcg_out_opc(s, OPC_MOVL_Iv | P_DATA16 | LOWREGMASK(a0),
2831                             0, a0, 0);
2832                 tcg_out16(s, a2);
2833             } else {
2834                 tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2835             }
2836         } else {
2837             g_assert_not_reached();
2838         }
2839         break;
2840
2841     case INDEX_op_extract_i64:
2842         if (a2 + args[3] == 32) {
2843             /* This is a 32-bit zero-extending right shift.  */
2844             tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2845             tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2846             break;
2847         }
2848         /* FALLTHRU */
2849     case INDEX_op_extract_i32:
2850         /* On the off-chance that we can use the high-byte registers.
2851            Otherwise we emit the same ext16 + shift pattern that we
2852            would have gotten from the normal tcg-op.c expansion.  */
2853         tcg_debug_assert(a2 == 8 && args[3] == 8);
2854         if (a1 < 4 && a0 < 8) {
2855             tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2856         } else {
2857             tcg_out_ext16u(s, a0, a1);
2858             tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2859         }
2860         break;
2861
2862     case INDEX_op_sextract_i32:
2863         /* We don't implement sextract_i64, as we cannot sign-extend to
2864            64-bits without using the REX prefix that explicitly excludes
2865            access to the high-byte registers.  */
2866         tcg_debug_assert(a2 == 8 && args[3] == 8);
2867         if (a1 < 4 && a0 < 8) {
2868             tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2869         } else {
2870             tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1);
2871             tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2872         }
2873         break;
2874
2875     OP_32_64(extract2):
2876         /* Note that SHRD outputs to the r/m operand.  */
2877         tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
2878         tcg_out8(s, args[3]);
2879         break;
2880
2881     case INDEX_op_mb:
2882         tcg_out_mb(s, a0);
2883         break;
2884     case INDEX_op_mov_i32:  /* Always emitted via tcg_out_mov.  */
2885     case INDEX_op_mov_i64:
2886     case INDEX_op_call:     /* Always emitted via tcg_out_call.  */
2887     case INDEX_op_exit_tb:  /* Always emitted via tcg_out_exit_tb.  */
2888     case INDEX_op_goto_tb:  /* Always emitted via tcg_out_goto_tb.  */
2889     case INDEX_op_ext8s_i32:  /* Always emitted via tcg_reg_alloc_op.  */
2890     case INDEX_op_ext8s_i64:
2891     case INDEX_op_ext8u_i32:
2892     case INDEX_op_ext8u_i64:
2893     case INDEX_op_ext16s_i32:
2894     case INDEX_op_ext16s_i64:
2895     case INDEX_op_ext16u_i32:
2896     case INDEX_op_ext16u_i64:
2897     case INDEX_op_ext32s_i64:
2898     case INDEX_op_ext32u_i64:
2899     case INDEX_op_ext_i32_i64:
2900     case INDEX_op_extu_i32_i64:
2901     case INDEX_op_extrl_i64_i32:
2902     default:
2903         g_assert_not_reached();
2904     }
2905
2906 #undef OP_32_64
2907 }
2908
2909 static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2910                            unsigned vecl, unsigned vece,
2911                            const TCGArg args[TCG_MAX_OP_ARGS],
2912                            const int const_args[TCG_MAX_OP_ARGS])
2913 {
         /*
          * Emit the host instruction(s) for a single vector opcode.
          *
          * s          - code generation context, passed to every emitter call.
          * opc        - the vector opcode to emit; opcodes not handled by the
          *              switch below end in g_assert_not_reached().
          * vecl       - vector length selector; the operand type is computed
          *              as vecl + TCG_TYPE_V64.
          * vece       - element size index; it indexes the 4-entry opcode
          *              tables below and is compared with MO_8 .. MO_64.
          * args       - operand array; args[0..2] are cached in a0..a2 and
          *              args[3] is read by the cases needing a fourth operand.
          * const_args - not referenced by this function.
          *
          * Each table below maps an element size index to an opcode.
          * OPC_UD2 is the placeholder for an element size the table has no
          * instruction for; the shared tails gen_simd and gen_simd_imm8
          * assert that such an entry was not selected.
          */
2914     static int const add_insn[4] = {
2915         OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2916     };
2917     static int const ssadd_insn[4] = {
2918         OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
2919     };
2920     static int const usadd_insn[4] = {
2921         OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
2922     };
2923     static int const sub_insn[4] = {
2924         OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2925     };
2926     static int const sssub_insn[4] = {
2927         OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
2928     };
2929     static int const ussub_insn[4] = {
2930         OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
2931     };
2932     static int const mul_insn[4] = {
2933         OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ
2934     };
         /* Shared by shli/shri/sari; the specific shift is chosen via 'sub'. */
2935     static int const shift_imm_insn[4] = {
2936         OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2937     };
2938     static int const cmpeq_insn[4] = {
2939         OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2940     };
2941     static int const cmpgt_insn[4] = {
2942         OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2943     };
2944     static int const punpckl_insn[4] = {
2945         OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2946     };
2947     static int const punpckh_insn[4] = {
2948         OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2949     };
2950     static int const packss_insn[4] = {
2951         OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2952     };
2953     static int const packus_insn[4] = {
2954         OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2955     };
2956     static int const smin_insn[4] = {
2957         OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ
2958     };
2959     static int const smax_insn[4] = {
2960         OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ
2961     };
2962     static int const umin_insn[4] = {
2963         OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ
2964     };
2965     static int const umax_insn[4] = {
2966         OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ
2967     };
2968     static int const rotlv_insn[4] = {
2969         OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ
2970     };
2971     static int const rotrv_insn[4] = {
2972         OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ
2973     };
2974     static int const shlv_insn[4] = {
2975         OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ
2976     };
2977     static int const shrv_insn[4] = {
2978         OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ
2979     };
2980     static int const sarv_insn[4] = {
2981         OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ
2982     };
2983     static int const shls_insn[4] = {
2984         OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
2985     };
2986     static int const shrs_insn[4] = {
2987         OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
2988     };
2989     static int const sars_insn[4] = {
2990         OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ
2991     };
2992     static int const vpshldi_insn[4] = {
2993         OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ
2994     };
2995     static int const vpshldv_insn[4] = {
2996         OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ
2997     };
2998     static int const vpshrdv_insn[4] = {
2999         OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ
3000     };
3001     static int const abs_insn[4] = {
3002         OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ
3003     };
3004
         /* Operand type: vecl is an offset from TCG_TYPE_V64. */
3005     TCGType type = vecl + TCG_TYPE_V64;
         /*
          * insn is the opcode (plus prefix flags) being built.  sub is a
          * per-path scratch value: the comparison condition for cmp_vec, the
          * first-operand-slot value for gen_shift, or the trailing immediate
          * byte for gen_simd_imm8.  a3 is only assigned by bitsel_vec.
          */
3006     int insn, sub;
3007     TCGArg a0, a1, a2, a3;
3008
3009     a0 = args[0];
3010     a1 = args[1];
3011     a2 = args[2];
3012
3013     switch (opc) {
         /*
          * Three-operand operations: select the opcode (from a table indexed
          * by vece, or a fixed opcode) and finish in the shared gen_simd tail.
          */
3014     case INDEX_op_add_vec:
3015         insn = add_insn[vece];
3016         goto gen_simd;
3017     case INDEX_op_ssadd_vec:
3018         insn = ssadd_insn[vece];
3019         goto gen_simd;
3020     case INDEX_op_usadd_vec:
3021         insn = usadd_insn[vece];
3022         goto gen_simd;
3023     case INDEX_op_sub_vec:
3024         insn = sub_insn[vece];
3025         goto gen_simd;
3026     case INDEX_op_sssub_vec:
3027         insn = sssub_insn[vece];
3028         goto gen_simd;
3029     case INDEX_op_ussub_vec:
3030         insn = ussub_insn[vece];
3031         goto gen_simd;
3032     case INDEX_op_mul_vec:
3033         insn = mul_insn[vece];
3034         goto gen_simd;
3035     case INDEX_op_and_vec:
3036         insn = OPC_PAND;
3037         goto gen_simd;
3038     case INDEX_op_or_vec:
3039         insn = OPC_POR;
3040         goto gen_simd;
3041     case INDEX_op_xor_vec:
3042         insn = OPC_PXOR;
3043         goto gen_simd;
3044     case INDEX_op_smin_vec:
3045         insn = smin_insn[vece];
3046         goto gen_simd;
3047     case INDEX_op_umin_vec:
3048         insn = umin_insn[vece];
3049         goto gen_simd;
3050     case INDEX_op_smax_vec:
3051         insn = smax_insn[vece];
3052         goto gen_simd;
3053     case INDEX_op_umax_vec:
3054         insn = umax_insn[vece];
3055         goto gen_simd;
3056     case INDEX_op_shlv_vec:
3057         insn = shlv_insn[vece];
3058         goto gen_simd;
3059     case INDEX_op_shrv_vec:
3060         insn = shrv_insn[vece];
3061         goto gen_simd;
3062     case INDEX_op_sarv_vec:
3063         insn = sarv_insn[vece];
3064         goto gen_simd;
3065     case INDEX_op_rotlv_vec:
3066         insn = rotlv_insn[vece];
3067         goto gen_simd;
3068     case INDEX_op_rotrv_vec:
3069         insn = rotrv_insn[vece];
3070         goto gen_simd;
3071     case INDEX_op_shls_vec:
3072         insn = shls_insn[vece];
3073         goto gen_simd;
3074     case INDEX_op_shrs_vec:
3075         insn = shrs_insn[vece];
3076         goto gen_simd;
3077     case INDEX_op_sars_vec:
3078         insn = sars_insn[vece];
3079         goto gen_simd;
3080     case INDEX_op_x86_punpckl_vec:
3081         insn = punpckl_insn[vece];
3082         goto gen_simd;
3083     case INDEX_op_x86_punpckh_vec:
3084         insn = punpckh_insn[vece];
3085         goto gen_simd;
3086     case INDEX_op_x86_packss_vec:
3087         insn = packss_insn[vece];
3088         goto gen_simd;
3089     case INDEX_op_x86_packus_vec:
3090         insn = packus_insn[vece];
3091         goto gen_simd;
         /*
          * For vpshldv/vpshrdv only args[2] and args[3] are forwarded as the
          * source operands; args[1] is not passed to the emitter.
          * NOTE(review): presumably args[1] is constrained to share the
          * output register a0 -- confirm against the constraint table.
          */
3092     case INDEX_op_x86_vpshldv_vec:
3093         insn = vpshldv_insn[vece];
3094         a1 = a2;
3095         a2 = args[3];
3096         goto gen_simd;
3097     case INDEX_op_x86_vpshrdv_vec:
3098         insn = vpshrdv_insn[vece];
3099         a1 = a2;
3100         a2 = args[3];
3101         goto gen_simd;
3102 #if TCG_TARGET_REG_BITS == 32
3103     case INDEX_op_dup2_vec:
3104         /* First merge the two 32-bit inputs to a single 64-bit element. */
3105         tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
3106         /* Then replicate the 64-bit elements across the rest of the vector. */
3107         if (type != TCG_TYPE_V64) {
3108             tcg_out_dup_vec(s, type, MO_64, a0, a0);
3109         }
3110         break;
3111 #endif
3112     case INDEX_op_abs_vec:
             /*
              * Single-input operation: the input moves to the last operand
              * slot and the middle slot is passed as 0.
              */
3113         insn = abs_insn[vece];
3114         a2 = a1;
3115         a1 = 0;
3116         goto gen_simd;
         /*
          * Shared tail: reject a placeholder opcode, set P_VEXL for 256-bit
          * vectors, then emit insn with operands (a0, a1, a2).
          */
3117     gen_simd:
3118         tcg_debug_assert(insn != OPC_UD2);
3119         if (type == TCG_TYPE_V256) {
3120             insn |= P_VEXL;
3121         }
3122         tcg_out_vex_modrm(s, insn, a0, a1, a2);
3123         break;
3124
3125     case INDEX_op_cmp_vec:
             /*
              * args[3] holds the condition.  Only TCG_COND_EQ and
              * TCG_COND_GT are accepted here; anything else aborts.
              */
3126         sub = args[3];
3127         if (sub == TCG_COND_EQ) {
3128             insn = cmpeq_insn[vece];
3129         } else if (sub == TCG_COND_GT) {
3130             insn = cmpgt_insn[vece];
3131         } else {
3132             g_assert_not_reached();
3133         }
3134         goto gen_simd;
3135
3136     case INDEX_op_andc_vec:
             /*
              * Emitted directly rather than through gen_simd because the two
              * sources are passed in swapped order (a2, a1).
              * NOTE(review): presumably because PANDN complements its first
              * source operand -- confirm against the ISA reference.
              */
3137         insn = OPC_PANDN;
3138         if (type == TCG_TYPE_V256) {
3139             insn |= P_VEXL;
3140         }
3141         tcg_out_vex_modrm(s, insn, a0, a2, a1);
3142         break;
3143
         /*
          * Shifts and rotates by an immediate.  Each case selects the opcode
          * and a value for 'sub', which gen_shift passes in the first operand
          * slot of tcg_out_vex_modrm; a2 becomes the trailing immediate byte.
          * NOTE(review): 'sub' is presumably the ModRM opcode-extension
          * digit selecting the specific shift -- confirm.
          */
3144     case INDEX_op_shli_vec:
3145         insn = shift_imm_insn[vece];
3146         sub = 6;
3147         goto gen_shift;
3148     case INDEX_op_shri_vec:
3149         insn = shift_imm_insn[vece];
3150         sub = 2;
3151         goto gen_shift;
3152     case INDEX_op_sari_vec:
             /*
              * 64-bit elements bypass shift_imm_insn and use OPC_PSHIFTD_Ib
              * with the P_VEXW and P_EVEX flags instead.
              */
3153         if (vece == MO_64) {
3154             insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX;
3155         } else {
3156             insn = shift_imm_insn[vece];
3157         }
3158         sub = 4;
3159         goto gen_shift;
3160     case INDEX_op_rotli_vec:
3161         insn = OPC_PSHIFTD_Ib | P_EVEX;  /* VPROL[DQ] */
3162         if (vece == MO_64) {
3163             insn |= P_VEXW;
3164         }
3165         sub = 1;
3166         goto gen_shift;
         /*
          * Shared tail for immediate shifts: byte elements are not supported;
          * operands are (sub, a0, a1) followed by the immediate a2.
          */
3167     gen_shift:
3168         tcg_debug_assert(vece != MO_8);
3169         if (type == TCG_TYPE_V256) {
3170             insn |= P_VEXL;
3171         }
3172         tcg_out_vex_modrm(s, insn, sub, a0, a1);
3173         tcg_out8(s, a2);
3174         break;
3175
         /* Memory forms delegate to tcg_out_ld/tcg_out_st/tcg_out_dupm_vec. */
3176     case INDEX_op_ld_vec:
3177         tcg_out_ld(s, type, a0, a1, a2);
3178         break;
3179     case INDEX_op_st_vec:
3180         tcg_out_st(s, type, a0, a1, a2);
3181         break;
3182     case INDEX_op_dupm_vec:
3183         tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
3184         break;
3185
         /*
          * Operations whose trailing immediate byte comes from args[3];
          * 'sub' carries it to the gen_simd_imm8 tail.
          */
3186     case INDEX_op_x86_shufps_vec:
3187         insn = OPC_SHUFPS;
3188         sub = args[3];
3189         goto gen_simd_imm8;
3190     case INDEX_op_x86_blend_vec:
             /* Only MO_16 and MO_32 are handled; other sizes abort. */
3191         if (vece == MO_16) {
3192             insn = OPC_PBLENDW;
3193         } else if (vece == MO_32) {
3194             insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
3195         } else {
3196             g_assert_not_reached();
3197         }
3198         sub = args[3];
3199         goto gen_simd_imm8;
3200     case INDEX_op_x86_vperm2i128_vec:
3201         insn = OPC_VPERM2I128;
3202         sub = args[3];
3203         goto gen_simd_imm8;
3204     case INDEX_op_x86_vpshldi_vec:
3205         insn = vpshldi_insn[vece];
3206         sub = args[3];
3207         goto gen_simd_imm8;
3208
         /*
          * Logic operations built on OPC_VPTERNLOGQ.  'sub' is the immediate
          * byte selecting the function, as annotated on each case.
          */
3209     case INDEX_op_not_vec:
3210         insn = OPC_VPTERNLOGQ;
             /* The single input is used for both source operands. */
3211         a2 = a1;
3212         sub = 0x33; /* !B */
3213         goto gen_simd_imm8;
3214     case INDEX_op_nor_vec:
3215         insn = OPC_VPTERNLOGQ;
3216         sub = 0x11; /* norCB */
3217         goto gen_simd_imm8;
3218     case INDEX_op_nand_vec:
3219         insn = OPC_VPTERNLOGQ;
3220         sub = 0x77; /* nandCB */
3221         goto gen_simd_imm8;
3222     case INDEX_op_eqv_vec:
3223         insn = OPC_VPTERNLOGQ;
3224         sub = 0x99; /* xnorCB */
3225         goto gen_simd_imm8;
3226     case INDEX_op_orc_vec:
3227         insn = OPC_VPTERNLOGQ;
3228         sub = 0xdd; /* orB!C */
3229         goto gen_simd_imm8;
3230
3231     case INDEX_op_bitsel_vec:
             /*
              * Four-operand select.  The immediate and the operand order
              * depend on which input, if any, already occupies the output
              * register a0.  When neither a1 nor a2 matches a0, args[3] is
              * copied into a0 first and a1/a2 are left as they are.
              */
3232         insn = OPC_VPTERNLOGQ;
3233         a3 = args[3];
3234         if (a0 == a1) {
3235             a1 = a2;
3236             a2 = a3;
3237             sub = 0xca; /* A?B:C */
3238         } else if (a0 == a2) {
3239             a2 = a3;
3240             sub = 0xe2; /* B?A:C */
3241         } else {
3242             tcg_out_mov(s, type, a0, a3);
3243             sub = 0xb8; /* B?C:A */
3244         }
3245         goto gen_simd_imm8;
3246
         /*
          * Shared tail: identical to gen_simd, followed by the immediate
          * byte held in 'sub'.
          */
3247     gen_simd_imm8:
3248         tcg_debug_assert(insn != OPC_UD2);
3249         if (type == TCG_TYPE_V256) {
3250             insn |= P_VEXL;
3251         }
3252         tcg_out_vex_modrm(s, insn, a0, a1, a2);
3253         tcg_out8(s, sub);
3254         break;
3255
3256     case INDEX_op_x86_vpblendvb_vec:
             /*
              * The fourth operand is emitted as a trailing byte, shifted
              * into the high nibble.
              * NOTE(review): presumably a register number encoded in
              * imm8[7:4] -- confirm against the ISA reference.
              */
3257         insn = OPC_VPBLENDVB;
3258         if (type == TCG_TYPE_V256) {
3259             insn |= P_VEXL;
3260         }
3261         tcg_out_vex_modrm(s, insn, a0, a1, a2);
3262         tcg_out8(s, args[3] << 4);
3263         break;
3264
3265     case INDEX_op_x86_psrldq_vec:
             /*
              * Fixed value 3 in the first operand slot, with a2 as the
              * immediate byte.  Unlike the other paths, P_VEXL is never
              * applied here, whatever the vector type.
              */
3266         tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
3267         tcg_out8(s, a2);
3268         break;
3269
3270     case INDEX_op_mov_vec:  /* Always emitted via tcg_out_mov.  */
3271     case INDEX_op_dup_vec:  /* Always emitted via tcg_out_dup_vec.  */
3272     default:
3273         g_assert_not_reached();
3274     }
3275 }
3276
3277 static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
3278 {
3279     switch (op) {
3280     case INDEX_op_goto_ptr:
3281         return C_O0_I1(r);
3282
3283     case INDEX_op_ld8u_i32:
3284     case INDEX_op_ld8u_i64:
3285     case INDEX_op_ld8s_i32:
3286     case INDEX_op_ld8s_i64:
3287     case INDEX_op_ld16u_i32:
3288     case INDEX_op_ld16u_i64:
3289     case INDEX_op_ld16s_i32:
3290     case INDEX_op_ld16s_i64:
3291     case INDEX_op_ld_i32:
3292     case INDEX_op_ld32u_i64:
3293     case INDEX_op_ld32s_i64:
3294     case INDEX_op_ld_i64:
3295         return C_O1_I1(r, r);
3296
3297     case INDEX_op_st8_i32:
3298     case INDEX_op_st8_i64:
3299         return C_O0_I2(qi, r);
3300
3301     case INDEX_op_st16_i32:
3302     case INDEX_op_st16_i64:
3303     case INDEX_op_st_i32:
3304     case INDEX_op_st32_i64:
3305         return C_O0_I2(ri, r);
3306
3307     case INDEX_op_st_i64:
3308         return C_O0_I2(re, r);
3309
3310     case INDEX_op_add_i32:
3311     case INDEX_op_add_i64:
3312         return C_O1_I2(r, r, re);
3313
3314     case INDEX_op_sub_i32:
3315     case INDEX_op_sub_i64:
3316     case INDEX_op_mul_i32:
3317     case INDEX_op_mul_i64:
3318     case INDEX_op_or_i32:
3319     case INDEX_op_or_i64:
3320     case INDEX_op_xor_i32:
3321     case INDEX_op_xor_i64:
3322         return C_O1_I2(r, 0, re);
3323
3324     case INDEX_op_and_i32:
3325     case INDEX_op_and_i64:
3326         return C_O1_I2(r, 0, reZ);
3327
3328     case INDEX_op_andc_i32:
3329     case INDEX_op_andc_i64:
3330         return C_O1_I2(r, r, rI);
3331
3332     case INDEX_op_shl_i32:
3333     case INDEX_op_shl_i64:
3334     case INDEX_op_shr_i32:
3335     case INDEX_op_shr_i64:
3336     case INDEX_op_sar_i32:
3337     case INDEX_op_sar_i64:
3338         return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);
3339
3340     case INDEX_op_rotl_i32:
3341     case INDEX_op_rotl_i64:
3342     case INDEX_op_rotr_i32:
3343     case INDEX_op_rotr_i64:
3344         return C_O1_I2(r, 0, ci);
3345
3346     case INDEX_op_brcond_i32:
3347     case INDEX_op_brcond_i64:
3348         return C_O0_I2(r, re);
3349
3350     case INDEX_op_bswap16_i32:
3351     case INDEX_op_bswap16_i64:
3352     case INDEX_op_bswap32_i32:
3353     case INDEX_op_bswap32_i64:
3354     case INDEX_op_bswap64_i64:
3355     case INDEX_op_neg_i32:
3356     case INDEX_op_neg_i64:
3357     case INDEX_op_not_i32:
3358     case INDEX_op_not_i64:
3359     case INDEX_op_extrh_i64_i32:
3360         return C_O1_I1(r, 0);
3361
3362     case INDEX_op_ext8s_i32:
3363     case INDEX_op_ext8s_i64:
3364     case INDEX_op_ext8u_i32:
3365     case INDEX_op_ext8u_i64:
3366         return C_O1_I1(r, q);
3367
3368     case INDEX_op_ext16s_i32:
3369     case INDEX_op_ext16s_i64:
3370     case INDEX_op_ext16u_i32:
3371     case INDEX_op_ext16u_i64:
3372     case INDEX_op_ext32s_i64:
3373     case INDEX_op_ext32u_i64:
3374     case INDEX_op_ext_i32_i64:
3375     case INDEX_op_extu_i32_i64:
3376     case INDEX_op_extrl_i64_i32:
3377     case INDEX_op_extract_i32:
3378     case INDEX_op_extract_i64:
3379     case INDEX_op_sextract_i32:
3380     case INDEX_op_ctpop_i32:
3381     case INDEX_op_ctpop_i64:
3382         return C_O1_I1(r, r);
3383
3384     case INDEX_op_extract2_i32:
3385     case INDEX_op_extract2_i64:
3386         return C_O1_I2(r, 0, r);
3387
3388     case INDEX_op_deposit_i32:
3389     case INDEX_op_deposit_i64:
3390         return C_O1_I2(q, 0, qi);
3391
3392     case INDEX_op_setcond_i32:
3393     case INDEX_op_setcond_i64:
3394     case INDEX_op_negsetcond_i32:
3395     case INDEX_op_negsetcond_i64:
3396         return C_O1_I2(q, r, re);
3397
3398     case INDEX_op_movcond_i32:
3399     case INDEX_op_movcond_i64:
3400         return C_O1_I4(r, r, re, r, 0);
3401
3402     case INDEX_op_div2_i32:
3403     case INDEX_op_div2_i64:
3404     case INDEX_op_divu2_i32:
3405     case INDEX_op_divu2_i64:
3406         return C_O2_I3(a, d, 0, 1, r);
3407
3408     case INDEX_op_mulu2_i32:
3409     case INDEX_op_mulu2_i64:
3410     case INDEX_op_muls2_i32:
3411     case INDEX_op_muls2_i64:
3412         return C_O2_I2(a, d, a, r);
3413
3414     case INDEX_op_add2_i32:
3415     case INDEX_op_add2_i64:
3416     case INDEX_op_sub2_i32:
3417     case INDEX_op_sub2_i64:
3418         return C_N1_O1_I4(r, r, 0, 1, re, re);
3419
3420     case INDEX_op_ctz_i32:
3421     case INDEX_op_ctz_i64:
3422         return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3423
3424     case INDEX_op_clz_i32:
3425     case INDEX_op_clz_i64:
3426         return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3427
3428     case INDEX_op_qemu_ld_a32_i32:
3429         return C_O1_I1(r, L);
3430     case INDEX_op_qemu_ld_a64_i32:
3431         return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O1_I2(r, L, L);
3432
3433     case INDEX_op_qemu_st_a32_i32:
3434         return C_O0_I2(L, L);
3435     case INDEX_op_qemu_st_a64_i32:
3436         return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3437     case INDEX_op_qemu_st8_a32_i32:
3438         return C_O0_I2(s, L);
3439     case INDEX_op_qemu_st8_a64_i32:
3440         return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(s, L) : C_O0_I3(s, L, L);
3441
3442     case INDEX_op_qemu_ld_a32_i64:
3443         return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I1(r, r, L);
3444     case INDEX_op_qemu_ld_a64_i64:
3445         return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I2(r, r, L, L);
3446
3447     case INDEX_op_qemu_st_a32_i64:
3448         return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3449     case INDEX_op_qemu_st_a64_i64:
3450         return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I4(L, L, L, L);
3451
3452     case INDEX_op_qemu_ld_a32_i128:
3453     case INDEX_op_qemu_ld_a64_i128:
3454         tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
3455         return C_O2_I1(r, r, L);
3456     case INDEX_op_qemu_st_a32_i128:
3457     case INDEX_op_qemu_st_a64_i128:
3458         tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
3459         return C_O0_I3(L, L, L);
3460
3461     case INDEX_op_brcond2_i32:
3462         return C_O0_I4(r, r, ri, ri);
3463
3464     case INDEX_op_setcond2_i32:
3465         return C_O1_I4(r, r, r, ri, ri);
3466
3467     case INDEX_op_ld_vec:
3468     case INDEX_op_dupm_vec:
3469         return C_O1_I1(x, r);
3470
3471     case INDEX_op_st_vec:
3472         return C_O0_I2(x, r);
3473
3474     case INDEX_op_add_vec:
3475     case INDEX_op_sub_vec:
3476     case INDEX_op_mul_vec:
3477     case INDEX_op_and_vec:
3478     case INDEX_op_or_vec:
3479     case INDEX_op_xor_vec:
3480     case INDEX_op_andc_vec:
3481     case INDEX_op_orc_vec:
3482     case INDEX_op_nand_vec:
3483     case INDEX_op_nor_vec:
3484     case INDEX_op_eqv_vec:
3485     case INDEX_op_ssadd_vec:
3486     case INDEX_op_usadd_vec:
3487     case INDEX_op_sssub_vec:
3488     case INDEX_op_ussub_vec:
3489     case INDEX_op_smin_vec:
3490     case INDEX_op_umin_vec:
3491     case INDEX_op_smax_vec:
3492     case INDEX_op_umax_vec:
3493     case INDEX_op_shlv_vec:
3494     case INDEX_op_shrv_vec:
3495     case INDEX_op_sarv_vec:
3496     case INDEX_op_rotlv_vec:
3497     case INDEX_op_rotrv_vec:
3498     case INDEX_op_shls_vec:
3499     case INDEX_op_shrs_vec:
3500     case INDEX_op_sars_vec:
3501     case INDEX_op_cmp_vec:
3502     case INDEX_op_x86_shufps_vec:
3503     case INDEX_op_x86_blend_vec:
3504     case INDEX_op_x86_packss_vec:
3505     case INDEX_op_x86_packus_vec:
3506     case INDEX_op_x86_vperm2i128_vec:
3507     case INDEX_op_x86_punpckl_vec:
3508     case INDEX_op_x86_punpckh_vec:
3509     case INDEX_op_x86_vpshldi_vec:
3510 #if TCG_TARGET_REG_BITS == 32
3511     case INDEX_op_dup2_vec:
3512 #endif
3513         return C_O1_I2(x, x, x);
3514
3515     case INDEX_op_abs_vec:
3516     case INDEX_op_dup_vec:
3517     case INDEX_op_not_vec:
3518     case INDEX_op_shli_vec:
3519     case INDEX_op_shri_vec:
3520     case INDEX_op_sari_vec:
3521     case INDEX_op_rotli_vec:
3522     case INDEX_op_x86_psrldq_vec:
3523         return C_O1_I1(x, x);
3524
3525     case INDEX_op_x86_vpshldv_vec:
3526     case INDEX_op_x86_vpshrdv_vec:
3527         return C_O1_I3(x, 0, x, x);
3528
3529     case INDEX_op_bitsel_vec:
3530     case INDEX_op_x86_vpblendvb_vec:
3531         return C_O1_I3(x, x, x, x);
3532
3533     default:
3534         g_assert_not_reached();
3535     }
3536 }
3537
3538 int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3539 {
3540     switch (opc) {
3541     case INDEX_op_add_vec:
3542     case INDEX_op_sub_vec:
3543     case INDEX_op_and_vec:
3544     case INDEX_op_or_vec:
3545     case INDEX_op_xor_vec:
3546     case INDEX_op_andc_vec:
3547     case INDEX_op_orc_vec:
3548     case INDEX_op_nand_vec:
3549     case INDEX_op_nor_vec:
3550     case INDEX_op_eqv_vec:
3551     case INDEX_op_not_vec:
3552     case INDEX_op_bitsel_vec:
3553         return 1;
3554     case INDEX_op_cmp_vec:
3555     case INDEX_op_cmpsel_vec:
3556         return -1;
3557
3558     case INDEX_op_rotli_vec:
3559         return have_avx512vl && vece >= MO_32 ? 1 : -1;
3560
3561     case INDEX_op_shli_vec:
3562     case INDEX_op_shri_vec:
3563         /* We must expand the operation for MO_8.  */
3564         return vece == MO_8 ? -1 : 1;
3565
3566     case INDEX_op_sari_vec:
3567         switch (vece) {
3568         case MO_8:
3569             return -1;
3570         case MO_16:
3571         case MO_32:
3572             return 1;
3573         case MO_64:
3574             if (have_avx512vl) {
3575                 return 1;
3576             }
3577             /*
3578              * We can emulate this for MO_64, but it does not pay off
3579              * unless we're producing at least 4 values.
3580              */
3581             return type >= TCG_TYPE_V256 ? -1 : 0;
3582         }
3583         return 0;
3584
3585     case INDEX_op_shls_vec:
3586     case INDEX_op_shrs_vec:
3587         return vece >= MO_16;
3588     case INDEX_op_sars_vec:
3589         switch (vece) {
3590         case MO_16:
3591         case MO_32:
3592             return 1;
3593         case MO_64:
3594             return have_avx512vl;
3595         }
3596         return 0;
3597     case INDEX_op_rotls_vec:
3598         return vece >= MO_16 ? -1 : 0;
3599
3600     case INDEX_op_shlv_vec:
3601     case INDEX_op_shrv_vec:
3602         switch (vece) {
3603         case MO_16:
3604             return have_avx512bw;
3605         case MO_32:
3606         case MO_64:
3607             return have_avx2;
3608         }
3609         return 0;
3610     case INDEX_op_sarv_vec:
3611         switch (vece) {
3612         case MO_16:
3613             return have_avx512bw;
3614         case MO_32:
3615             return have_avx2;
3616         case MO_64:
3617             return have_avx512vl;
3618         }
3619         return 0;
3620     case INDEX_op_rotlv_vec:
3621     case INDEX_op_rotrv_vec:
3622         switch (vece) {
3623         case MO_16:
3624             return have_avx512vbmi2 ? -1 : 0;
3625         case MO_32:
3626         case MO_64:
3627             return have_avx512vl ? 1 : have_avx2 ? -1 : 0;
3628         }
3629         return 0;
3630
3631     case INDEX_op_mul_vec:
3632         switch (vece) {
3633         case MO_8:
3634             return -1;
3635         case MO_64:
3636             return have_avx512dq;
3637         }
3638         return 1;
3639
3640     case INDEX_op_ssadd_vec:
3641     case INDEX_op_usadd_vec:
3642     case INDEX_op_sssub_vec:
3643     case INDEX_op_ussub_vec:
3644         return vece <= MO_16;
3645     case INDEX_op_smin_vec:
3646     case INDEX_op_smax_vec:
3647     case INDEX_op_umin_vec:
3648     case INDEX_op_umax_vec:
3649     case INDEX_op_abs_vec:
3650         return vece <= MO_32 || have_avx512vl;
3651
3652     default:
3653         return 0;
3654     }
3655 }
3656
3657 static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
3658                            TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3659 {
3660     TCGv_vec t1, t2;
3661
3662     tcg_debug_assert(vece == MO_8);
3663
3664     t1 = tcg_temp_new_vec(type);
3665     t2 = tcg_temp_new_vec(type);
3666
3667     /*
3668      * Unpack to W, shift, and repack.  Tricky bits:
3669      * (1) Use punpck*bw x,x to produce DDCCBBAA,
3670      *     i.e. duplicate in other half of the 16-bit lane.
3671      * (2) For right-shift, add 8 so that the high half of the lane
3672      *     becomes zero.  For left-shift, and left-rotate, we must
3673      *     shift up and down again.
3674      * (3) Step 2 leaves high half zero such that PACKUSWB
3675      *     (pack with unsigned saturation) does not modify
3676      *     the quantity.
3677      */
3678     vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3679               tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3680     vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3681               tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3682
3683     if (opc != INDEX_op_rotli_vec) {
3684         imm += 8;
3685     }
3686     if (opc == INDEX_op_shri_vec) {
3687         tcg_gen_shri_vec(MO_16, t1, t1, imm);
3688         tcg_gen_shri_vec(MO_16, t2, t2, imm);
3689     } else {
3690         tcg_gen_shli_vec(MO_16, t1, t1, imm);
3691         tcg_gen_shli_vec(MO_16, t2, t2, imm);
3692         tcg_gen_shri_vec(MO_16, t1, t1, 8);
3693         tcg_gen_shri_vec(MO_16, t2, t2, 8);
3694     }
3695
3696     vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3697               tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3698     tcg_temp_free_vec(t1);
3699     tcg_temp_free_vec(t2);
3700 }
3701
3702 static void expand_vec_sari(TCGType type, unsigned vece,
3703                             TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3704 {
3705     TCGv_vec t1, t2;
3706
3707     switch (vece) {
3708     case MO_8:
3709         /* Unpack to W, shift, and repack, as in expand_vec_shi.  */
3710         t1 = tcg_temp_new_vec(type);
3711         t2 = tcg_temp_new_vec(type);
3712         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3713                   tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3714         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3715                   tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3716         tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3717         tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3718         vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3719                   tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3720         tcg_temp_free_vec(t1);
3721         tcg_temp_free_vec(t2);
3722         break;
3723
3724     case MO_64:
3725         t1 = tcg_temp_new_vec(type);
3726         if (imm <= 32) {
3727             /*
3728              * We can emulate a small sign extend by performing an arithmetic
3729              * 32-bit shift and overwriting the high half of a 64-bit logical
3730              * shift.  Note that the ISA says shift of 32 is valid, but TCG
3731              * does not, so we have to bound the smaller shift -- we get the
3732              * same result in the high half either way.
3733              */
3734             tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
3735             tcg_gen_shri_vec(MO_64, v0, v1, imm);
3736             vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3737                       tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3738                       tcgv_vec_arg(t1), 0xaa);
3739         } else {
3740             /* Otherwise we will need to use a compare vs 0 to produce
3741              * the sign-extend, shift and merge.
3742              */
3743             tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1,
3744                             tcg_constant_vec(type, MO_64, 0), v1);
3745             tcg_gen_shri_vec(MO_64, v0, v1, imm);
3746             tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3747             tcg_gen_or_vec(MO_64, v0, v0, t1);
3748         }
3749         tcg_temp_free_vec(t1);
3750         break;
3751
3752     default:
3753         g_assert_not_reached();
3754     }
3755 }
3756
3757 static void expand_vec_rotli(TCGType type, unsigned vece,
3758                              TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3759 {
3760     TCGv_vec t;
3761
3762     if (vece == MO_8) {
3763         expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
3764         return;
3765     }
3766
3767     if (have_avx512vbmi2) {
3768         vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
3769                   tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
3770         return;
3771     }
3772
3773     t = tcg_temp_new_vec(type);
3774     tcg_gen_shli_vec(vece, t, v1, imm);
3775     tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
3776     tcg_gen_or_vec(vece, v0, v0, t);
3777     tcg_temp_free_vec(t);
3778 }
3779
3780 static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
3781                             TCGv_vec v1, TCGv_vec sh, bool right)
3782 {
3783     TCGv_vec t;
3784
3785     if (have_avx512vbmi2) {
3786         vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec,
3787                   type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1),
3788                   tcgv_vec_arg(v1), tcgv_vec_arg(sh));
3789         return;
3790     }
3791
3792     t = tcg_temp_new_vec(type);
3793     tcg_gen_dupi_vec(vece, t, 8 << vece);
3794     tcg_gen_sub_vec(vece, t, t, sh);
3795     if (right) {
3796         tcg_gen_shlv_vec(vece, t, v1, t);
3797         tcg_gen_shrv_vec(vece, v0, v1, sh);
3798     } else {
3799         tcg_gen_shrv_vec(vece, t, v1, t);
3800         tcg_gen_shlv_vec(vece, v0, v1, sh);
3801     }
3802     tcg_gen_or_vec(vece, v0, v0, t);
3803     tcg_temp_free_vec(t);
3804 }
3805
3806 static void expand_vec_rotls(TCGType type, unsigned vece,
3807                              TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
3808 {
3809     TCGv_vec t = tcg_temp_new_vec(type);
3810
3811     tcg_debug_assert(vece != MO_8);
3812
3813     if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) {
3814         tcg_gen_dup_i32_vec(vece, t, lsh);
3815         if (vece >= MO_32) {
3816             tcg_gen_rotlv_vec(vece, v0, v1, t);
3817         } else {
3818             expand_vec_rotv(type, vece, v0, v1, t, false);
3819         }
3820     } else {
3821         TCGv_i32 rsh = tcg_temp_new_i32();
3822
3823         tcg_gen_neg_i32(rsh, lsh);
3824         tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
3825         tcg_gen_shls_vec(vece, t, v1, lsh);
3826         tcg_gen_shrs_vec(vece, v0, v1, rsh);
3827         tcg_gen_or_vec(vece, v0, v0, t);
3828
3829         tcg_temp_free_i32(rsh);
3830     }
3831
3832     tcg_temp_free_vec(t);
3833 }
3834
3835 static void expand_vec_mul(TCGType type, unsigned vece,
3836                            TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3837 {
3838     TCGv_vec t1, t2, t3, t4, zero;
3839
3840     tcg_debug_assert(vece == MO_8);
3841
3842     /*
3843      * Unpack v1 bytes to words, 0 | x.
3844      * Unpack v2 bytes to words, y | 0.
3845      * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3846      * Shift logical right by 8 bits to clear the high 8 bytes before
3847      * using an unsigned saturated pack.
3848      *
3849      * The difference between the V64, V128 and V256 cases is merely how
3850      * we distribute the expansion between temporaries.
3851      */
3852     switch (type) {
3853     case TCG_TYPE_V64:
3854         t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3855         t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3856         zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3857         vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3858                   tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3859         vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3860                   tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3861         tcg_gen_mul_vec(MO_16, t1, t1, t2);
3862         tcg_gen_shri_vec(MO_16, t1, t1, 8);
3863         vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3864                   tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3865         tcg_temp_free_vec(t1);
3866         tcg_temp_free_vec(t2);
3867         break;
3868
3869     case TCG_TYPE_V128:
3870     case TCG_TYPE_V256:
3871         t1 = tcg_temp_new_vec(type);
3872         t2 = tcg_temp_new_vec(type);
3873         t3 = tcg_temp_new_vec(type);
3874         t4 = tcg_temp_new_vec(type);
3875         zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3876         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3877                   tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3878         vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3879                   tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3880         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3881                   tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3882         vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3883                   tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3884         tcg_gen_mul_vec(MO_16, t1, t1, t2);
3885         tcg_gen_mul_vec(MO_16, t3, t3, t4);
3886         tcg_gen_shri_vec(MO_16, t1, t1, 8);
3887         tcg_gen_shri_vec(MO_16, t3, t3, 8);
3888         vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3889                   tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3890         tcg_temp_free_vec(t1);
3891         tcg_temp_free_vec(t2);
3892         tcg_temp_free_vec(t3);
3893         tcg_temp_free_vec(t4);
3894         break;
3895
3896     default:
3897         g_assert_not_reached();
3898     }
3899 }
3900
3901 static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
3902                                  TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3903 {
3904     enum {
3905         NEED_INV  = 1,
3906         NEED_SWAP = 2,
3907         NEED_BIAS = 4,
3908         NEED_UMIN = 8,
3909         NEED_UMAX = 16,
3910     };
3911     TCGv_vec t1, t2, t3;
3912     uint8_t fixup;
3913
3914     switch (cond) {
3915     case TCG_COND_EQ:
3916     case TCG_COND_GT:
3917         fixup = 0;
3918         break;
3919     case TCG_COND_NE:
3920     case TCG_COND_LE:
3921         fixup = NEED_INV;
3922         break;
3923     case TCG_COND_LT:
3924         fixup = NEED_SWAP;
3925         break;
3926     case TCG_COND_GE:
3927         fixup = NEED_SWAP | NEED_INV;
3928         break;
3929     case TCG_COND_LEU:
3930         if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
3931             fixup = NEED_UMIN;
3932         } else {
3933             fixup = NEED_BIAS | NEED_INV;
3934         }
3935         break;
3936     case TCG_COND_GTU:
3937         if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
3938             fixup = NEED_UMIN | NEED_INV;
3939         } else {
3940             fixup = NEED_BIAS;
3941         }
3942         break;
3943     case TCG_COND_GEU:
3944         if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
3945             fixup = NEED_UMAX;
3946         } else {
3947             fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
3948         }
3949         break;
3950     case TCG_COND_LTU:
3951         if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
3952             fixup = NEED_UMAX | NEED_INV;
3953         } else {
3954             fixup = NEED_BIAS | NEED_SWAP;
3955         }
3956         break;
3957     default:
3958         g_assert_not_reached();
3959     }
3960
3961     if (fixup & NEED_INV) {
3962         cond = tcg_invert_cond(cond);
3963     }
3964     if (fixup & NEED_SWAP) {
3965         t1 = v1, v1 = v2, v2 = t1;
3966         cond = tcg_swap_cond(cond);
3967     }
3968
3969     t1 = t2 = NULL;
3970     if (fixup & (NEED_UMIN | NEED_UMAX)) {
3971         t1 = tcg_temp_new_vec(type);
3972         if (fixup & NEED_UMIN) {
3973             tcg_gen_umin_vec(vece, t1, v1, v2);
3974         } else {
3975             tcg_gen_umax_vec(vece, t1, v1, v2);
3976         }
3977         v2 = t1;
3978         cond = TCG_COND_EQ;
3979     } else if (fixup & NEED_BIAS) {
3980         t1 = tcg_temp_new_vec(type);
3981         t2 = tcg_temp_new_vec(type);
3982         t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
3983         tcg_gen_sub_vec(vece, t1, v1, t3);
3984         tcg_gen_sub_vec(vece, t2, v2, t3);
3985         v1 = t1;
3986         v2 = t2;
3987         cond = tcg_signed_cond(cond);
3988     }
3989
3990     tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
3991     /* Expand directly; do not recurse.  */
3992     vec_gen_4(INDEX_op_cmp_vec, type, vece,
3993               tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
3994
3995     if (t1) {
3996         tcg_temp_free_vec(t1);
3997         if (t2) {
3998             tcg_temp_free_vec(t2);
3999         }
4000     }
4001     return fixup & NEED_INV;
4002 }
4003
4004 static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
4005                            TCGv_vec v1, TCGv_vec v2, TCGCond cond)
4006 {
4007     if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
4008         tcg_gen_not_vec(vece, v0, v0);
4009     }
4010 }
4011
4012 static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
4013                               TCGv_vec c1, TCGv_vec c2,
4014                               TCGv_vec v3, TCGv_vec v4, TCGCond cond)
4015 {
4016     TCGv_vec t = tcg_temp_new_vec(type);
4017
4018     if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
4019         /* Invert the sense of the compare by swapping arguments.  */
4020         TCGv_vec x;
4021         x = v3, v3 = v4, v4 = x;
4022     }
4023     vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
4024               tcgv_vec_arg(v0), tcgv_vec_arg(v4),
4025               tcgv_vec_arg(v3), tcgv_vec_arg(t));
4026     tcg_temp_free_vec(t);
4027 }
4028
4029 void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
4030                        TCGArg a0, ...)
4031 {
4032     va_list va;
4033     TCGArg a2;
4034     TCGv_vec v0, v1, v2, v3, v4;
4035
4036     va_start(va, a0);
4037     v0 = temp_tcgv_vec(arg_temp(a0));
4038     v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4039     a2 = va_arg(va, TCGArg);
4040
4041     switch (opc) {
4042     case INDEX_op_shli_vec:
4043     case INDEX_op_shri_vec:
4044         expand_vec_shi(type, vece, opc, v0, v1, a2);
4045         break;
4046
4047     case INDEX_op_sari_vec:
4048         expand_vec_sari(type, vece, v0, v1, a2);
4049         break;
4050
4051     case INDEX_op_rotli_vec:
4052         expand_vec_rotli(type, vece, v0, v1, a2);
4053         break;
4054
4055     case INDEX_op_rotls_vec:
4056         expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
4057         break;
4058
4059     case INDEX_op_rotlv_vec:
4060         v2 = temp_tcgv_vec(arg_temp(a2));
4061         expand_vec_rotv(type, vece, v0, v1, v2, false);
4062         break;
4063     case INDEX_op_rotrv_vec:
4064         v2 = temp_tcgv_vec(arg_temp(a2));
4065         expand_vec_rotv(type, vece, v0, v1, v2, true);
4066         break;
4067
4068     case INDEX_op_mul_vec:
4069         v2 = temp_tcgv_vec(arg_temp(a2));
4070         expand_vec_mul(type, vece, v0, v1, v2);
4071         break;
4072
4073     case INDEX_op_cmp_vec:
4074         v2 = temp_tcgv_vec(arg_temp(a2));
4075         expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
4076         break;
4077
4078     case INDEX_op_cmpsel_vec:
4079         v2 = temp_tcgv_vec(arg_temp(a2));
4080         v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4081         v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4082         expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
4083         break;
4084
4085     default:
4086         break;
4087     }
4088
4089     va_end(va);
4090 }
4091
4092 static const int tcg_target_callee_save_regs[] = {
4093 #if TCG_TARGET_REG_BITS == 64
4094     TCG_REG_RBP,
4095     TCG_REG_RBX,
4096 #if defined(_WIN64)
4097     TCG_REG_RDI,
4098     TCG_REG_RSI,
4099 #endif
4100     TCG_REG_R12,
4101     TCG_REG_R13,
4102     TCG_REG_R14, /* Currently used for the global env. */
4103     TCG_REG_R15,
4104 #else
4105     TCG_REG_EBP, /* Currently used for the global env. */
4106     TCG_REG_EBX,
4107     TCG_REG_ESI,
4108     TCG_REG_EDI,
4109 #endif
4110 };
4111
/*
 * The frame size is computed with macros so that the same value is
 * available to both tcg_target_qemu_prologue and tcg_register_jit.
 */

/* Bytes taken by the callee-saved registers plus one more slot. */
#define PUSH_SIZE \
    ((TCG_TARGET_REG_BITS / 8) \
     * (ARRAY_SIZE(tcg_target_callee_save_regs) + 1))

/*
 * Pushed slots, static call-argument area and TCG temp buffer,
 * rounded up to the stack alignment.
 */
#define FRAME_SIZE \
    (((PUSH_SIZE \
       + TCG_STATIC_CALL_ARGS_SIZE \
       + CPU_TEMP_BUF_NLONGS * sizeof(long)) \
      + (TCG_TARGET_STACK_ALIGN - 1)) \
     & ~(TCG_TARGET_STACK_ALIGN - 1))
4125
4126 /* Generate global QEMU prologue and epilogue code */
4127 static void tcg_target_qemu_prologue(TCGContext *s)
4128 {
4129     int i, stack_addend;
4130
4131     /* TB prologue */
4132
4133     /* Reserve some stack space, also for TCG temps.  */
4134     stack_addend = FRAME_SIZE - PUSH_SIZE;
4135     tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
4136                   CPU_TEMP_BUF_NLONGS * sizeof(long));
4137
4138     /* Save all callee saved registers.  */
4139     for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
4140         tcg_out_push(s, tcg_target_callee_save_regs[i]);
4141     }
4142
4143 #if TCG_TARGET_REG_BITS == 32
4144     tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
4145                (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
4146     tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4147     /* jmp *tb.  */
4148     tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
4149                          (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
4150                          + stack_addend);
4151 #else
4152 # if !defined(CONFIG_SOFTMMU)
4153     if (guest_base) {
4154         int seg = setup_guest_base_seg();
4155         if (seg != 0) {
4156             x86_guest_base.seg = seg;
4157         } else if (guest_base == (int32_t)guest_base) {
4158             x86_guest_base.ofs = guest_base;
4159         } else {
4160             /* Choose R12 because, as a base, it requires a SIB byte. */
4161             x86_guest_base.index = TCG_REG_R12;
4162             tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base);
4163             tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index);
4164         }
4165     }
4166 # endif
4167     tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
4168     tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4169     /* jmp *tb.  */
4170     tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
4171 #endif
4172
4173     /*
4174      * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
4175      * and fall through to the rest of the epilogue.
4176      */
4177     tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
4178     tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
4179
4180     /* TB epilogue */
4181     tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
4182
4183     tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
4184
4185     if (have_avx2) {
4186         tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
4187     }
4188     for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
4189         tcg_out_pop(s, tcg_target_callee_save_regs[i]);
4190     }
4191     tcg_out_opc(s, OPC_RET, 0, 0, 0);
4192 }
4193
4194 static void tcg_out_tb_start(TCGContext *s)
4195 {
4196     /* nothing to do */
4197 }
4198
4199 static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
4200 {
4201     memset(p, 0x90, count);
4202 }
4203
4204 static void tcg_target_init(TCGContext *s)
4205 {
4206     tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
4207     if (TCG_TARGET_REG_BITS == 64) {
4208         tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
4209     }
4210     if (have_avx1) {
4211         tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
4212         tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
4213     }
4214     if (have_avx2) {
4215         tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
4216     }
4217
4218     tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
4219     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
4220     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
4221     tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
4222     if (TCG_TARGET_REG_BITS == 64) {
4223 #if !defined(_WIN64)
4224         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
4225         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
4226 #endif
4227         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
4228         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
4229         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
4230         tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
4231     }
4232
4233     s->reserved_regs = 0;
4234     tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
4235     tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC);
4236 #ifdef _WIN64
4237     /* These are call saved, and we don't save them, so don't use them. */
4238     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
4239     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
4240     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
4241     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
4242     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
4243     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
4244     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
4245     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
4246     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
4247     tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
4248 #endif
4249 }
4250
4251 typedef struct {
4252     DebugFrameHeader h;
4253     uint8_t fde_def_cfa[4];
4254     uint8_t fde_reg_ofs[14];
4255 } DebugFrame;
4256
4257 /* We're expecting a 2 byte uleb128 encoded value.  */
4258 QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
4259
4260 #if !defined(__ELF__)
4261     /* Host machine without ELF. */
4262 #elif TCG_TARGET_REG_BITS == 64
4263 #define ELF_HOST_MACHINE EM_X86_64
4264 static const DebugFrame debug_frame = {
4265     .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4266     .h.cie.id = -1,
4267     .h.cie.version = 1,
4268     .h.cie.code_align = 1,
4269     .h.cie.data_align = 0x78,             /* sleb128 -8 */
4270     .h.cie.return_column = 16,
4271
4272     /* Total FDE size does not include the "len" member.  */
4273     .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4274
4275     .fde_def_cfa = {
4276         12, 7,                          /* DW_CFA_def_cfa %rsp, ... */
4277         (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
4278         (FRAME_SIZE >> 7)
4279     },
4280     .fde_reg_ofs = {
4281         0x90, 1,                        /* DW_CFA_offset, %rip, -8 */
4282         /* The following ordering must match tcg_target_callee_save_regs.  */
4283         0x86, 2,                        /* DW_CFA_offset, %rbp, -16 */
4284         0x83, 3,                        /* DW_CFA_offset, %rbx, -24 */
4285         0x8c, 4,                        /* DW_CFA_offset, %r12, -32 */
4286         0x8d, 5,                        /* DW_CFA_offset, %r13, -40 */
4287         0x8e, 6,                        /* DW_CFA_offset, %r14, -48 */
4288         0x8f, 7,                        /* DW_CFA_offset, %r15, -56 */
4289     }
4290 };
4291 #else
4292 #define ELF_HOST_MACHINE EM_386
4293 static const DebugFrame debug_frame = {
4294     .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4295     .h.cie.id = -1,
4296     .h.cie.version = 1,
4297     .h.cie.code_align = 1,
4298     .h.cie.data_align = 0x7c,             /* sleb128 -4 */
4299     .h.cie.return_column = 8,
4300
4301     /* Total FDE size does not include the "len" member.  */
4302     .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4303
4304     .fde_def_cfa = {
4305         12, 4,                          /* DW_CFA_def_cfa %esp, ... */
4306         (FRAME_SIZE & 0x7f) | 0x80,     /* ... uleb128 FRAME_SIZE */
4307         (FRAME_SIZE >> 7)
4308     },
4309     .fde_reg_ofs = {
4310         0x88, 1,                        /* DW_CFA_offset, %eip, -4 */
4311         /* The following ordering must match tcg_target_callee_save_regs.  */
4312         0x85, 2,                        /* DW_CFA_offset, %ebp, -8 */
4313         0x83, 3,                        /* DW_CFA_offset, %ebx, -12 */
4314         0x86, 4,                        /* DW_CFA_offset, %esi, -16 */
4315         0x87, 5,                        /* DW_CFA_offset, %edi, -20 */
4316     }
4317 };
4318 #endif
4319
#ifdef ELF_HOST_MACHINE
/*
 * Register the code buffer @buf of @buf_size bytes together with the
 * static debug frame description.
 */
void tcg_register_jit(const void *buf, size_t buf_size)
{
    tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
}
#endif