1 /*
2 * Tiny Code Generator for QEMU
3 *
4 * Copyright (c) 2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25 #include "../tcg-ldst.c.inc"
26 #include "../tcg-pool.c.inc"
27
28 #ifdef CONFIG_DEBUG_TCG
29 static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
30 #if TCG_TARGET_REG_BITS == 64
31 "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
32 #else
33 "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
34 #endif
35 "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
36 "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
37 #if TCG_TARGET_REG_BITS == 64
38 "%xmm8", "%xmm9", "%xmm10", "%xmm11",
39 "%xmm12", "%xmm13", "%xmm14", "%xmm15",
40 #endif
41 };
42 #endif
43
44 static const int tcg_target_reg_alloc_order[] = {
45 #if TCG_TARGET_REG_BITS == 64
46 TCG_REG_RBP,
47 TCG_REG_RBX,
48 TCG_REG_R12,
49 TCG_REG_R13,
50 TCG_REG_R14,
51 TCG_REG_R15,
52 TCG_REG_R10,
53 TCG_REG_R11,
54 TCG_REG_R9,
55 TCG_REG_R8,
56 TCG_REG_RCX,
57 TCG_REG_RDX,
58 TCG_REG_RSI,
59 TCG_REG_RDI,
60 TCG_REG_RAX,
61 #else
62 TCG_REG_EBX,
63 TCG_REG_ESI,
64 TCG_REG_EDI,
65 TCG_REG_EBP,
66 TCG_REG_ECX,
67 TCG_REG_EDX,
68 TCG_REG_EAX,
69 #endif
70 TCG_REG_XMM0,
71 TCG_REG_XMM1,
72 TCG_REG_XMM2,
73 TCG_REG_XMM3,
74 TCG_REG_XMM4,
75 TCG_REG_XMM5,
76 #ifndef _WIN64
77 /* The Win64 ABI has xmm6-xmm15 as callee-saves, and we do not save
78 any of them. Therefore only allow xmm0-xmm5 to be allocated. */
79 TCG_REG_XMM6,
80 TCG_REG_XMM7,
81 #if TCG_TARGET_REG_BITS == 64
82 TCG_REG_XMM8,
83 TCG_REG_XMM9,
84 TCG_REG_XMM10,
85 TCG_REG_XMM11,
86 TCG_REG_XMM12,
87 TCG_REG_XMM13,
88 TCG_REG_XMM14,
89 TCG_REG_XMM15,
90 #endif
91 #endif
92 };
93
94 #define TCG_TMP_VEC TCG_REG_XMM5
95
96 static const int tcg_target_call_iarg_regs[] = {
97 #if TCG_TARGET_REG_BITS == 64
98 #if defined(_WIN64)
99 TCG_REG_RCX,
100 TCG_REG_RDX,
101 #else
102 TCG_REG_RDI,
103 TCG_REG_RSI,
104 TCG_REG_RDX,
105 TCG_REG_RCX,
106 #endif
107 TCG_REG_R8,
108 TCG_REG_R9,
109 #else
110 /* 32 bit mode uses stack based calling convention (GCC default). */
111 #endif
112 };
113
114 static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
115 {
116 switch (kind) {
117 case TCG_CALL_RET_NORMAL:
118 tcg_debug_assert(slot >= 0 && slot <= 1);
119 return slot ? TCG_REG_EDX : TCG_REG_EAX;
120 #ifdef _WIN64
121 case TCG_CALL_RET_BY_VEC:
122 tcg_debug_assert(slot == 0);
123 return TCG_REG_XMM0;
124 #endif
125 default:
126 g_assert_not_reached();
127 }
128 }
129
130 /* Constants we accept. */
131 #define TCG_CT_CONST_S32 0x100
132 #define TCG_CT_CONST_U32 0x200
133 #define TCG_CT_CONST_I32 0x400
134 #define TCG_CT_CONST_WSZ 0x800
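/*
 * Informally (see tcg_target_const_match below): S32 accepts a value equal
 * to its own 32-bit sign extension, U32 a value equal to its 32-bit zero
 * extension, I32 a value whose bitwise inverse is a sign-extended 32-bit
 * immediate, and WSZ only the operation width itself (32 or 64).
 */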
135
136 /* Registers used with L constraint, which are the first argument
137 registers on x86_64, and two random call clobbered registers on
138 i386. */
139 #if TCG_TARGET_REG_BITS == 64
140 # define TCG_REG_L0 tcg_target_call_iarg_regs[0]
141 # define TCG_REG_L1 tcg_target_call_iarg_regs[1]
142 #else
143 # define TCG_REG_L0 TCG_REG_EAX
144 # define TCG_REG_L1 TCG_REG_EDX
145 #endif
146
147 #if TCG_TARGET_REG_BITS == 64
148 # define ALL_GENERAL_REGS 0x0000ffffu
149 # define ALL_VECTOR_REGS 0xffff0000u
150 # define ALL_BYTEL_REGS ALL_GENERAL_REGS
151 #else
152 # define ALL_GENERAL_REGS 0x000000ffu
153 # define ALL_VECTOR_REGS 0x00ff0000u
154 # define ALL_BYTEL_REGS 0x0000000fu
155 #endif
156 #define SOFTMMU_RESERVE_REGS \
157 (tcg_use_softmmu ? (1 << TCG_REG_L0) | (1 << TCG_REG_L1) : 0)
158
159 /* For 64-bit, we always know that CMOV is available. */
160 #if TCG_TARGET_REG_BITS == 64
161 # define have_cmov true
162 #else
163 # define have_cmov (cpuinfo & CPUINFO_CMOV)
164 #endif
165 #define have_bmi2 (cpuinfo & CPUINFO_BMI2)
166 #define have_lzcnt (cpuinfo & CPUINFO_LZCNT)
167
168 static const tcg_insn_unit *tb_ret_addr;
169
170 static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
171 intptr_t value, intptr_t addend)
172 {
173 value += addend;
174 switch(type) {
175 case R_386_PC32:
176 value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
177 if (value != (int32_t)value) {
178 return false;
179 }
180 /* FALLTHRU */
181 case R_386_32:
182 tcg_patch32(code_ptr, value);
183 break;
184 case R_386_PC8:
185 value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
186 if (value != (int8_t)value) {
187 return false;
188 }
189 tcg_patch8(code_ptr, value);
190 break;
191 default:
192 g_assert_not_reached();
193 }
194 return true;
195 }
196
197 /* test if a constant matches the constraint */
198 static bool tcg_target_const_match(int64_t val, TCGType type, int ct, int vece)
199 {
200 if (ct & TCG_CT_CONST) {
201 return 1;
202 }
203 if (type == TCG_TYPE_I32) {
204 if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | TCG_CT_CONST_I32)) {
205 return 1;
206 }
207 } else {
208 if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
209 return 1;
210 }
211 if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
212 return 1;
213 }
214 if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
215 return 1;
216 }
217 }
218 if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
219 return 1;
220 }
221 return 0;
222 }
223
224 # define LOWREGMASK(x) ((x) & 7)
225
226 #define P_EXT 0x100 /* 0x0f opcode prefix */
227 #define P_EXT38 0x200 /* 0x0f 0x38 opcode prefix */
228 #define P_DATA16 0x400 /* 0x66 opcode prefix */
229 #define P_VEXW 0x1000 /* Set VEX.W = 1 */
230 #if TCG_TARGET_REG_BITS == 64
231 # define P_REXW P_VEXW /* Set REX.W = 1; match VEXW */
232 # define P_REXB_R 0x2000 /* REG field as byte register */
233 # define P_REXB_RM 0x4000 /* R/M field as byte register */
234 # define P_GS 0x8000 /* gs segment override */
235 #else
236 # define P_REXW 0
237 # define P_REXB_R 0
238 # define P_REXB_RM 0
239 # define P_GS 0
240 #endif
241 #define P_EXT3A 0x10000 /* 0x0f 0x3a opcode prefix */
242 #define P_SIMDF3 0x20000 /* 0xf3 opcode prefix */
243 #define P_SIMDF2 0x40000 /* 0xf2 opcode prefix */
244 #define P_VEXL 0x80000 /* Set VEX.L = 1 */
245 #define P_EVEX 0x100000 /* Requires EVEX encoding */
246
247 #define OPC_ARITH_EvIz (0x81)
248 #define OPC_ARITH_EvIb (0x83)
249 #define OPC_ARITH_GvEv (0x03) /* ... plus (ARITH_FOO << 3) */
250 #define OPC_ANDN (0xf2 | P_EXT38)
251 #define OPC_ADD_GvEv (OPC_ARITH_GvEv | (ARITH_ADD << 3))
252 #define OPC_AND_GvEv (OPC_ARITH_GvEv | (ARITH_AND << 3))
253 #define OPC_BLENDPS (0x0c | P_EXT3A | P_DATA16)
254 #define OPC_BSF (0xbc | P_EXT)
255 #define OPC_BSR (0xbd | P_EXT)
256 #define OPC_BSWAP (0xc8 | P_EXT)
257 #define OPC_CALL_Jz (0xe8)
258 #define OPC_CMOVCC (0x40 | P_EXT) /* ... plus condition code */
259 #define OPC_CMP_GvEv (OPC_ARITH_GvEv | (ARITH_CMP << 3))
260 #define OPC_DEC_r32 (0x48)
261 #define OPC_IMUL_GvEv (0xaf | P_EXT)
262 #define OPC_IMUL_GvEvIb (0x6b)
263 #define OPC_IMUL_GvEvIz (0x69)
264 #define OPC_INC_r32 (0x40)
265 #define OPC_JCC_long (0x80 | P_EXT) /* ... plus condition code */
266 #define OPC_JCC_short (0x70) /* ... plus condition code */
267 #define OPC_JMP_long (0xe9)
268 #define OPC_JMP_short (0xeb)
269 #define OPC_LEA (0x8d)
270 #define OPC_LZCNT (0xbd | P_EXT | P_SIMDF3)
271 #define OPC_MOVB_EvGv (0x88) /* stores, more or less */
272 #define OPC_MOVL_EvGv (0x89) /* stores, more or less */
273 #define OPC_MOVL_GvEv (0x8b) /* loads, more or less */
274 #define OPC_MOVB_EvIz (0xc6)
275 #define OPC_MOVL_EvIz (0xc7)
276 #define OPC_MOVB_Ib (0xb0)
277 #define OPC_MOVL_Iv (0xb8)
278 #define OPC_MOVBE_GyMy (0xf0 | P_EXT38)
279 #define OPC_MOVBE_MyGy (0xf1 | P_EXT38)
280 #define OPC_MOVD_VyEy (0x6e | P_EXT | P_DATA16)
281 #define OPC_MOVD_EyVy (0x7e | P_EXT | P_DATA16)
282 #define OPC_MOVDDUP (0x12 | P_EXT | P_SIMDF2)
283 #define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
284 #define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
285 #define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
286 #define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
287 #define OPC_MOVQ_VqWq (0x7e | P_EXT | P_SIMDF3)
288 #define OPC_MOVQ_WqVq (0xd6 | P_EXT | P_DATA16)
289 #define OPC_MOVSBL (0xbe | P_EXT)
290 #define OPC_MOVSWL (0xbf | P_EXT)
291 #define OPC_MOVSLQ (0x63 | P_REXW)
292 #define OPC_MOVZBL (0xb6 | P_EXT)
293 #define OPC_MOVZWL (0xb7 | P_EXT)
294 #define OPC_PABSB (0x1c | P_EXT38 | P_DATA16)
295 #define OPC_PABSW (0x1d | P_EXT38 | P_DATA16)
296 #define OPC_PABSD (0x1e | P_EXT38 | P_DATA16)
297 #define OPC_VPABSQ (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
298 #define OPC_PACKSSDW (0x6b | P_EXT | P_DATA16)
299 #define OPC_PACKSSWB (0x63 | P_EXT | P_DATA16)
300 #define OPC_PACKUSDW (0x2b | P_EXT38 | P_DATA16)
301 #define OPC_PACKUSWB (0x67 | P_EXT | P_DATA16)
302 #define OPC_PADDB (0xfc | P_EXT | P_DATA16)
303 #define OPC_PADDW (0xfd | P_EXT | P_DATA16)
304 #define OPC_PADDD (0xfe | P_EXT | P_DATA16)
305 #define OPC_PADDQ (0xd4 | P_EXT | P_DATA16)
306 #define OPC_PADDSB (0xec | P_EXT | P_DATA16)
307 #define OPC_PADDSW (0xed | P_EXT | P_DATA16)
308 #define OPC_PADDUB (0xdc | P_EXT | P_DATA16)
309 #define OPC_PADDUW (0xdd | P_EXT | P_DATA16)
310 #define OPC_PAND (0xdb | P_EXT | P_DATA16)
311 #define OPC_PANDN (0xdf | P_EXT | P_DATA16)
312 #define OPC_PBLENDW (0x0e | P_EXT3A | P_DATA16)
313 #define OPC_PCMPEQB (0x74 | P_EXT | P_DATA16)
314 #define OPC_PCMPEQW (0x75 | P_EXT | P_DATA16)
315 #define OPC_PCMPEQD (0x76 | P_EXT | P_DATA16)
316 #define OPC_PCMPEQQ (0x29 | P_EXT38 | P_DATA16)
317 #define OPC_PCMPGTB (0x64 | P_EXT | P_DATA16)
318 #define OPC_PCMPGTW (0x65 | P_EXT | P_DATA16)
319 #define OPC_PCMPGTD (0x66 | P_EXT | P_DATA16)
320 #define OPC_PCMPGTQ (0x37 | P_EXT38 | P_DATA16)
321 #define OPC_PEXTRD (0x16 | P_EXT3A | P_DATA16)
322 #define OPC_PINSRD (0x22 | P_EXT3A | P_DATA16)
323 #define OPC_PMAXSB (0x3c | P_EXT38 | P_DATA16)
324 #define OPC_PMAXSW (0xee | P_EXT | P_DATA16)
325 #define OPC_PMAXSD (0x3d | P_EXT38 | P_DATA16)
326 #define OPC_VPMAXSQ (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
327 #define OPC_PMAXUB (0xde | P_EXT | P_DATA16)
328 #define OPC_PMAXUW (0x3e | P_EXT38 | P_DATA16)
329 #define OPC_PMAXUD (0x3f | P_EXT38 | P_DATA16)
330 #define OPC_VPMAXUQ (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
331 #define OPC_PMINSB (0x38 | P_EXT38 | P_DATA16)
332 #define OPC_PMINSW (0xea | P_EXT | P_DATA16)
333 #define OPC_PMINSD (0x39 | P_EXT38 | P_DATA16)
334 #define OPC_VPMINSQ (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
335 #define OPC_PMINUB (0xda | P_EXT | P_DATA16)
336 #define OPC_PMINUW (0x3a | P_EXT38 | P_DATA16)
337 #define OPC_PMINUD (0x3b | P_EXT38 | P_DATA16)
338 #define OPC_VPMINUQ (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
339 #define OPC_PMOVSXBW (0x20 | P_EXT38 | P_DATA16)
340 #define OPC_PMOVSXWD (0x23 | P_EXT38 | P_DATA16)
341 #define OPC_PMOVSXDQ (0x25 | P_EXT38 | P_DATA16)
342 #define OPC_PMOVZXBW (0x30 | P_EXT38 | P_DATA16)
343 #define OPC_PMOVZXWD (0x33 | P_EXT38 | P_DATA16)
344 #define OPC_PMOVZXDQ (0x35 | P_EXT38 | P_DATA16)
345 #define OPC_PMULLW (0xd5 | P_EXT | P_DATA16)
346 #define OPC_PMULLD (0x40 | P_EXT38 | P_DATA16)
347 #define OPC_VPMULLQ (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
348 #define OPC_POR (0xeb | P_EXT | P_DATA16)
349 #define OPC_PSHUFB (0x00 | P_EXT38 | P_DATA16)
350 #define OPC_PSHUFD (0x70 | P_EXT | P_DATA16)
351 #define OPC_PSHUFLW (0x70 | P_EXT | P_SIMDF2)
352 #define OPC_PSHUFHW (0x70 | P_EXT | P_SIMDF3)
353 #define OPC_PSHIFTW_Ib (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
354 #define OPC_PSHIFTD_Ib (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
355 #define OPC_PSHIFTQ_Ib (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
356 #define OPC_PSLLW (0xf1 | P_EXT | P_DATA16)
357 #define OPC_PSLLD (0xf2 | P_EXT | P_DATA16)
358 #define OPC_PSLLQ (0xf3 | P_EXT | P_DATA16)
359 #define OPC_PSRAW (0xe1 | P_EXT | P_DATA16)
360 #define OPC_PSRAD (0xe2 | P_EXT | P_DATA16)
361 #define OPC_VPSRAQ (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX)
362 #define OPC_PSRLW (0xd1 | P_EXT | P_DATA16)
363 #define OPC_PSRLD (0xd2 | P_EXT | P_DATA16)
364 #define OPC_PSRLQ (0xd3 | P_EXT | P_DATA16)
365 #define OPC_PSUBB (0xf8 | P_EXT | P_DATA16)
366 #define OPC_PSUBW (0xf9 | P_EXT | P_DATA16)
367 #define OPC_PSUBD (0xfa | P_EXT | P_DATA16)
368 #define OPC_PSUBQ (0xfb | P_EXT | P_DATA16)
369 #define OPC_PSUBSB (0xe8 | P_EXT | P_DATA16)
370 #define OPC_PSUBSW (0xe9 | P_EXT | P_DATA16)
371 #define OPC_PSUBUB (0xd8 | P_EXT | P_DATA16)
372 #define OPC_PSUBUW (0xd9 | P_EXT | P_DATA16)
373 #define OPC_PUNPCKLBW (0x60 | P_EXT | P_DATA16)
374 #define OPC_PUNPCKLWD (0x61 | P_EXT | P_DATA16)
375 #define OPC_PUNPCKLDQ (0x62 | P_EXT | P_DATA16)
376 #define OPC_PUNPCKLQDQ (0x6c | P_EXT | P_DATA16)
377 #define OPC_PUNPCKHBW (0x68 | P_EXT | P_DATA16)
378 #define OPC_PUNPCKHWD (0x69 | P_EXT | P_DATA16)
379 #define OPC_PUNPCKHDQ (0x6a | P_EXT | P_DATA16)
380 #define OPC_PUNPCKHQDQ (0x6d | P_EXT | P_DATA16)
381 #define OPC_PXOR (0xef | P_EXT | P_DATA16)
382 #define OPC_POP_r32 (0x58)
383 #define OPC_POPCNT (0xb8 | P_EXT | P_SIMDF3)
384 #define OPC_PUSH_r32 (0x50)
385 #define OPC_PUSH_Iv (0x68)
386 #define OPC_PUSH_Ib (0x6a)
387 #define OPC_RET (0xc3)
388 #define OPC_SETCC (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
389 #define OPC_SHIFT_1 (0xd1)
390 #define OPC_SHIFT_Ib (0xc1)
391 #define OPC_SHIFT_cl (0xd3)
392 #define OPC_SARX (0xf7 | P_EXT38 | P_SIMDF3)
393 #define OPC_SHUFPS (0xc6 | P_EXT)
394 #define OPC_SHLX (0xf7 | P_EXT38 | P_DATA16)
395 #define OPC_SHRX (0xf7 | P_EXT38 | P_SIMDF2)
396 #define OPC_SHRD_Ib (0xac | P_EXT)
397 #define OPC_TESTL (0x85)
398 #define OPC_TZCNT (0xbc | P_EXT | P_SIMDF3)
399 #define OPC_UD2 (0x0b | P_EXT)
400 #define OPC_VPBLENDD (0x02 | P_EXT3A | P_DATA16)
401 #define OPC_VPBLENDVB (0x4c | P_EXT3A | P_DATA16)
402 #define OPC_VPINSRB (0x20 | P_EXT3A | P_DATA16)
403 #define OPC_VPINSRW (0xc4 | P_EXT | P_DATA16)
404 #define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
405 #define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
406 #define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
407 #define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
408 #define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
409 #define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
410 #define OPC_VPERMQ (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
411 #define OPC_VPERM2I128 (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
412 #define OPC_VPROLVD (0x15 | P_EXT38 | P_DATA16 | P_EVEX)
413 #define OPC_VPROLVQ (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
414 #define OPC_VPRORVD (0x14 | P_EXT38 | P_DATA16 | P_EVEX)
415 #define OPC_VPRORVQ (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
416 #define OPC_VPSHLDW (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
417 #define OPC_VPSHLDD (0x71 | P_EXT3A | P_DATA16 | P_EVEX)
418 #define OPC_VPSHLDQ (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
419 #define OPC_VPSHLDVW (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
420 #define OPC_VPSHLDVD (0x71 | P_EXT38 | P_DATA16 | P_EVEX)
421 #define OPC_VPSHLDVQ (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
422 #define OPC_VPSHRDVW (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
423 #define OPC_VPSHRDVD (0x73 | P_EXT38 | P_DATA16 | P_EVEX)
424 #define OPC_VPSHRDVQ (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
425 #define OPC_VPSLLVW (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
426 #define OPC_VPSLLVD (0x47 | P_EXT38 | P_DATA16)
427 #define OPC_VPSLLVQ (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
428 #define OPC_VPSRAVW (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
429 #define OPC_VPSRAVD (0x46 | P_EXT38 | P_DATA16)
430 #define OPC_VPSRAVQ (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
431 #define OPC_VPSRLVW (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
432 #define OPC_VPSRLVD (0x45 | P_EXT38 | P_DATA16)
433 #define OPC_VPSRLVQ (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
434 #define OPC_VPTERNLOGQ (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
435 #define OPC_VZEROUPPER (0x77 | P_EXT)
436 #define OPC_XCHG_ax_r32 (0x90)
437 #define OPC_XCHG_EvGv (0x87)
438
439 #define OPC_GRP3_Eb (0xf6)
440 #define OPC_GRP3_Ev (0xf7)
441 #define OPC_GRP5 (0xff)
442 #define OPC_GRP14 (0x73 | P_EXT | P_DATA16)
443
444 /* Group 1 opcode extensions for 0x80-0x83.
445 These are also used as modifiers for OPC_ARITH. */
446 #define ARITH_ADD 0
447 #define ARITH_OR 1
448 #define ARITH_ADC 2
449 #define ARITH_SBB 3
450 #define ARITH_AND 4
451 #define ARITH_SUB 5
452 #define ARITH_XOR 6
453 #define ARITH_CMP 7
454
455 /* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3. */
456 #define SHIFT_ROL 0
457 #define SHIFT_ROR 1
458 #define SHIFT_SHL 4
459 #define SHIFT_SHR 5
460 #define SHIFT_SAR 7
461
462 /* Group 3 opcode extensions for 0xf6, 0xf7. To be used with OPC_GRP3. */
463 #define EXT3_TESTi 0
464 #define EXT3_NOT 2
465 #define EXT3_NEG 3
466 #define EXT3_MUL 4
467 #define EXT3_IMUL 5
468 #define EXT3_DIV 6
469 #define EXT3_IDIV 7
470
471 /* Group 5 opcode extensions for 0xff. To be used with OPC_GRP5. */
472 #define EXT5_INC_Ev 0
473 #define EXT5_DEC_Ev 1
474 #define EXT5_CALLN_Ev 2
475 #define EXT5_JMPN_Ev 4
476
477 /* Condition codes to be added to OPC_JCC_{long,short}. */
478 #define JCC_JMP (-1)
479 #define JCC_JO 0x0
480 #define JCC_JNO 0x1
481 #define JCC_JB 0x2
482 #define JCC_JAE 0x3
483 #define JCC_JE 0x4
484 #define JCC_JNE 0x5
485 #define JCC_JBE 0x6
486 #define JCC_JA 0x7
487 #define JCC_JS 0x8
488 #define JCC_JNS 0x9
489 #define JCC_JP 0xa
490 #define JCC_JNP 0xb
491 #define JCC_JL 0xc
492 #define JCC_JGE 0xd
493 #define JCC_JLE 0xe
494 #define JCC_JG 0xf
495
496 static const uint8_t tcg_cond_to_jcc[] = {
497 [TCG_COND_EQ] = JCC_JE,
498 [TCG_COND_NE] = JCC_JNE,
499 [TCG_COND_LT] = JCC_JL,
500 [TCG_COND_GE] = JCC_JGE,
501 [TCG_COND_LE] = JCC_JLE,
502 [TCG_COND_GT] = JCC_JG,
503 [TCG_COND_LTU] = JCC_JB,
504 [TCG_COND_GEU] = JCC_JAE,
505 [TCG_COND_LEU] = JCC_JBE,
506 [TCG_COND_GTU] = JCC_JA,
507 };
508
509 #if TCG_TARGET_REG_BITS == 64
510 static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
511 {
512 int rex;
513
514 if (opc & P_GS) {
515 tcg_out8(s, 0x65);
516 }
517 if (opc & P_DATA16) {
518 /* We should never be asking for both 16 and 64-bit operation. */
519 tcg_debug_assert((opc & P_REXW) == 0);
520 tcg_out8(s, 0x66);
521 }
522 if (opc & P_SIMDF3) {
523 tcg_out8(s, 0xf3);
524 } else if (opc & P_SIMDF2) {
525 tcg_out8(s, 0xf2);
526 }
527
528 rex = 0;
529 rex |= (opc & P_REXW) ? 0x8 : 0x0; /* REX.W */
530 rex |= (r & 8) >> 1; /* REX.R */
531 rex |= (x & 8) >> 2; /* REX.X */
532 rex |= (rm & 8) >> 3; /* REX.B */
533
534 /* P_REXB_{R,RM} indicates that the given register is the low byte.
535 For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
536 as otherwise the encoding indicates %[abcd]h. Note that the values
537 that are ORed in merely indicate that the REX byte must be present;
538 those bits get discarded in output. */
539 rex |= opc & (r >= 4 ? P_REXB_R : 0);
540 rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
541
542 if (rex) {
543 tcg_out8(s, (uint8_t)(rex | 0x40));
544 }
545
546 if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
547 tcg_out8(s, 0x0f);
548 if (opc & P_EXT38) {
549 tcg_out8(s, 0x38);
550 } else if (opc & P_EXT3A) {
551 tcg_out8(s, 0x3a);
552 }
553 }
554
555 tcg_out8(s, opc);
556 }
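/*
 * Worked example (x86_64, for illustration): tcg_out_ext8u() below calls
 * tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, TCG_REG_EAX, TCG_REG_ESI), which
 * works out to the bytes 40 0f b6 c6, i.e. "movzbl %sil, %eax".  The bare
 * REX prefix (0x40) carries no W/R/X/B bits but is still required so that
 * r/m = 6 selects %sil rather than %dh.
 */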
557 #else
558 static void tcg_out_opc(TCGContext *s, int opc)
559 {
560 if (opc & P_DATA16) {
561 tcg_out8(s, 0x66);
562 }
563 if (opc & P_SIMDF3) {
564 tcg_out8(s, 0xf3);
565 } else if (opc & P_SIMDF2) {
566 tcg_out8(s, 0xf2);
567 }
568 if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
569 tcg_out8(s, 0x0f);
570 if (opc & P_EXT38) {
571 tcg_out8(s, 0x38);
572 } else if (opc & P_EXT3A) {
573 tcg_out8(s, 0x3a);
574 }
575 }
576 tcg_out8(s, opc);
577 }
578 /* Discard the register arguments to tcg_out_opc early, so as not to penalize
579 the 32-bit compilation paths. This method works with all versions of gcc,
580 whereas relying on optimization may not be able to exclude them. */
581 #define tcg_out_opc(s, opc, r, rm, x) (tcg_out_opc)(s, opc)
582 #endif
583
584 static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
585 {
586 tcg_out_opc(s, opc, r, rm, 0);
587 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
588 }
589
590 static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
591 int rm, int index)
592 {
593 int tmp;
594
595 if (opc & P_GS) {
596 tcg_out8(s, 0x65);
597 }
598 /* Use the two byte form if possible, which cannot encode
599 VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT. */
600 if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
601 && ((rm | index) & 8) == 0) {
602 /* Two byte VEX prefix. */
603 tcg_out8(s, 0xc5);
604
605 tmp = (r & 8 ? 0 : 0x80); /* VEX.R */
606 } else {
607 /* Three byte VEX prefix. */
608 tcg_out8(s, 0xc4);
609
610 /* VEX.m-mmmm */
611 if (opc & P_EXT3A) {
612 tmp = 3;
613 } else if (opc & P_EXT38) {
614 tmp = 2;
615 } else if (opc & P_EXT) {
616 tmp = 1;
617 } else {
618 g_assert_not_reached();
619 }
620 tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */
621 tmp |= (index & 8 ? 0 : 0x40); /* VEX.X */
622 tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */
623 tcg_out8(s, tmp);
624
625 tmp = (opc & P_VEXW ? 0x80 : 0); /* VEX.W */
626 }
627
628 tmp |= (opc & P_VEXL ? 0x04 : 0); /* VEX.L */
629 /* VEX.pp */
630 if (opc & P_DATA16) {
631 tmp |= 1; /* 0x66 */
632 } else if (opc & P_SIMDF3) {
633 tmp |= 2; /* 0xf3 */
634 } else if (opc & P_SIMDF2) {
635 tmp |= 3; /* 0xf2 */
636 }
637 tmp |= (~v & 15) << 3; /* VEX.vvvv */
638 tcg_out8(s, tmp);
639 tcg_out8(s, opc);
640 }
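/*
 * For illustration: OPC_PXOR (0xef | P_EXT | P_DATA16) with all three
 * registers in xmm0-xmm7 qualifies for the two-byte prefix, so
 * tcg_out_vex_modrm(s, OPC_PXOR, TCG_REG_XMM0, TCG_REG_XMM1, TCG_REG_XMM2)
 * should emit c5 f1 ef c2, i.e. "vpxor %xmm2, %xmm1, %xmm0".  If the r/m or
 * index register were xmm8-xmm15, or the opcode needed VEX.W or a 0f38/0f3a
 * escape, the three-byte 0xc4 form would be used instead.
 */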
641
642 static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
643 int rm, int index)
644 {
645 /* The entire 4-byte evex prefix; with R' and V' set. */
646 uint32_t p = 0x08041062;
647 int mm, pp;
648
649 tcg_debug_assert(have_avx512vl);
650
651 /* EVEX.mm */
652 if (opc & P_EXT3A) {
653 mm = 3;
654 } else if (opc & P_EXT38) {
655 mm = 2;
656 } else if (opc & P_EXT) {
657 mm = 1;
658 } else {
659 g_assert_not_reached();
660 }
661
662 /* EVEX.pp */
663 if (opc & P_DATA16) {
664 pp = 1; /* 0x66 */
665 } else if (opc & P_SIMDF3) {
666 pp = 2; /* 0xf3 */
667 } else if (opc & P_SIMDF2) {
668 pp = 3; /* 0xf2 */
669 } else {
670 pp = 0;
671 }
672
673 p = deposit32(p, 8, 2, mm);
674 p = deposit32(p, 13, 1, (rm & 8) == 0); /* EVEX.RXB.B */
675 p = deposit32(p, 14, 1, (index & 8) == 0); /* EVEX.RXB.X */
676 p = deposit32(p, 15, 1, (r & 8) == 0); /* EVEX.RXB.R */
677 p = deposit32(p, 16, 2, pp);
678 p = deposit32(p, 19, 4, ~v);
679 p = deposit32(p, 23, 1, (opc & P_VEXW) != 0);
680 p = deposit32(p, 29, 2, (opc & P_VEXL) != 0);
681
682 tcg_out32(s, p);
683 tcg_out8(s, opc);
684 }
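/*
 * Note on the magic constant in tcg_out_evex_opc: the host is little-endian,
 * so tcg_out32 emits the bytes 0x62 (EVEX escape), 0x10 (P0 with R' preset),
 * 0x04 (P1 with its mandatory set bit) and 0x08 (P2 with V' preset); the
 * deposit32 calls fill in mm, B/X/R, pp, vvvv, W and L'L before the word
 * is written.
 */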
685
686 static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
687 {
688 if (opc & P_EVEX) {
689 tcg_out_evex_opc(s, opc, r, v, rm, 0);
690 } else {
691 tcg_out_vex_opc(s, opc, r, v, rm, 0);
692 }
693 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
694 }
695
696 /* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
697 We handle a missing RM or INDEX with a negative value. In 64-bit
698 mode for absolute addresses, ~RM is the size of the immediate operand
699 that will follow the instruction. */
700
701 static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
702 int shift, intptr_t offset)
703 {
704 int mod, len;
705
706 if (index < 0 && rm < 0) {
707 if (TCG_TARGET_REG_BITS == 64) {
708 /* Try for a rip-relative addressing mode. This has replaced
709 the 32-bit-mode absolute addressing encoding. */
710 intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
711 intptr_t disp = offset - pc;
712 if (disp == (int32_t)disp) {
713 tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
714 tcg_out32(s, disp);
715 return;
716 }
717
718 /* Try for an absolute address encoding. This requires the
719 use of the MODRM+SIB encoding and is therefore larger than
720 rip-relative addressing. */
721 if (offset == (int32_t)offset) {
722 tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
723 tcg_out8(s, (4 << 3) | 5);
724 tcg_out32(s, offset);
725 return;
726 }
727
728 /* ??? The memory isn't directly addressable. */
729 g_assert_not_reached();
730 } else {
731 /* Absolute address. */
732 tcg_out8(s, (r << 3) | 5);
733 tcg_out32(s, offset);
734 return;
735 }
736 }
737
738 /* Find the length of the immediate addend. Note that the encoding
739 that would be used for (%ebp) indicates absolute addressing. */
740 if (rm < 0) {
741 mod = 0, len = 4, rm = 5;
742 } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
743 mod = 0, len = 0;
744 } else if (offset == (int8_t)offset) {
745 mod = 0x40, len = 1;
746 } else {
747 mod = 0x80, len = 4;
748 }
749
750 /* Use a single byte MODRM format if possible. Note that the encoding
751 that would be used for %esp is the escape to the two byte form. */
752 if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
753 /* Single byte MODRM format. */
754 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
755 } else {
756 /* Two byte MODRM+SIB format. */
757
758 /* Note that the encoding that would place %esp into the index
759 field indicates no index register. In 64-bit mode, the REX.X
760 bit counts, so %r12 can be used as the index. */
761 if (index < 0) {
762 index = 4;
763 } else {
764 tcg_debug_assert(index != TCG_REG_ESP);
765 }
766
767 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
768 tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
769 }
770
771 if (len == 1) {
772 tcg_out8(s, offset);
773 } else if (len == 4) {
774 tcg_out32(s, offset);
775 }
776 }
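/*
 * For illustration: tcg_out_modrm_offset(s, OPC_MOVL_GvEv, TCG_REG_EAX,
 * TCG_REG_EBX, 8) below reaches this point with rm = %ebx, no index and an
 * 8-bit displacement, so it should emit 8b 43 08, i.e. "movl 8(%ebx), %eax"
 * (8(%rbx) on x86_64).  A zero offset with rm != %ebp drops the displacement
 * entirely, and rm = %esp forces the two-byte MODRM+SIB form.
 */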
777
778 static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
779 int index, int shift, intptr_t offset)
780 {
781 tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
782 tcg_out_sib_offset(s, r, rm, index, shift, offset);
783 }
784
785 static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
786 int rm, int index, int shift,
787 intptr_t offset)
788 {
789 tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
790 tcg_out_sib_offset(s, r, rm, index, shift, offset);
791 }
792
793 /* A simplification of the above with no index or shift. */
794 static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
795 int rm, intptr_t offset)
796 {
797 tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
798 }
799
800 static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
801 int v, int rm, intptr_t offset)
802 {
803 tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
804 }
805
806 /* Output an opcode with an expected reference to the constant pool. */
807 static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
808 {
809 tcg_out_opc(s, opc, r, 0, 0);
810 /* Absolute for 32-bit, pc-relative for 64-bit. */
811 tcg_out8(s, LOWREGMASK(r) << 3 | 5);
812 tcg_out32(s, 0);
813 }
814
815 /* Output an opcode with an expected reference to the constant pool. */
816 static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
817 {
818 tcg_out_vex_opc(s, opc, r, 0, 0, 0);
819 /* Absolute for 32-bit, pc-relative for 64-bit. */
820 tcg_out8(s, LOWREGMASK(r) << 3 | 5);
821 tcg_out32(s, 0);
822 }
823
824 /* Generate dest op= src. Uses the same ARITH_* codes as tgen_arithi. */
825 static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
826 {
827 /* Propagate an opcode prefix, such as P_REXW. */
828 int ext = subop & ~0x7;
829 subop &= 0x7;
830
831 tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
832 }
833
834 static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
835 {
836 int rexw = 0;
837
838 if (arg == ret) {
839 return true;
840 }
841 switch (type) {
842 case TCG_TYPE_I64:
843 rexw = P_REXW;
844 /* fallthru */
845 case TCG_TYPE_I32:
846 if (ret < 16) {
847 if (arg < 16) {
848 tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
849 } else {
850 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
851 }
852 } else {
853 if (arg < 16) {
854 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
855 } else {
856 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
857 }
858 }
859 break;
860
861 case TCG_TYPE_V64:
862 tcg_debug_assert(ret >= 16 && arg >= 16);
863 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
864 break;
865 case TCG_TYPE_V128:
866 tcg_debug_assert(ret >= 16 && arg >= 16);
867 tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
868 break;
869 case TCG_TYPE_V256:
870 tcg_debug_assert(ret >= 16 && arg >= 16);
871 tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
872 break;
873
874 default:
875 g_assert_not_reached();
876 }
877 return true;
878 }
879
880 static const int avx2_dup_insn[4] = {
881 OPC_VPBROADCASTB, OPC_VPBROADCASTW,
882 OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
883 };
884
885 static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
886 TCGReg r, TCGReg a)
887 {
888 if (have_avx2) {
889 int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
890 tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
891 } else {
892 switch (vece) {
893 case MO_8:
894 /* ??? With zero in a register, use PSHUFB. */
895 tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
896 a = r;
897 /* FALLTHRU */
898 case MO_16:
899 tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
900 a = r;
901 /* FALLTHRU */
902 case MO_32:
903 tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
904 /* imm8 operand: all output lanes selected from input lane 0. */
905 tcg_out8(s, 0);
906 break;
907 case MO_64:
908 tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
909 break;
910 default:
911 g_assert_not_reached();
912 }
913 }
914 return true;
915 }
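/*
 * Pre-AVX2 sketch of the fall-through cascade above, for a MO_8 dup:
 * vpunpcklbw replicates byte 0 into the low word, vpunpcklwd widens that
 * to the low dword, and pshufd $0 then broadcasts dword 0 to every lane,
 * leaving the original byte in all 16 positions of the vector.
 */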
916
917 static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
918 TCGReg r, TCGReg base, intptr_t offset)
919 {
920 if (have_avx2) {
921 int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
922 tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
923 r, 0, base, offset);
924 } else {
925 switch (vece) {
926 case MO_64:
927 tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
928 break;
929 case MO_32:
930 tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
931 break;
932 case MO_16:
933 tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
934 tcg_out8(s, 0); /* imm8 */
935 tcg_out_dup_vec(s, type, vece, r, r);
936 break;
937 case MO_8:
938 tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
939 tcg_out8(s, 0); /* imm8 */
940 tcg_out_dup_vec(s, type, vece, r, r);
941 break;
942 default:
943 g_assert_not_reached();
944 }
945 }
946 return true;
947 }
948
949 static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
950 TCGReg ret, int64_t arg)
951 {
952 int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
953
954 if (arg == 0) {
955 tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
956 return;
957 }
958 if (arg == -1) {
959 tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
960 return;
961 }
962
963 if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
964 if (have_avx2) {
965 tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
966 } else {
967 tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
968 }
969 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
970 } else {
971 if (type == TCG_TYPE_V64) {
972 tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
973 } else if (have_avx2) {
974 tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
975 } else {
976 tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
977 }
978 if (TCG_TARGET_REG_BITS == 64) {
979 new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
980 } else {
981 new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
982 }
983 }
984 }
985
986 static void tcg_out_movi_vec(TCGContext *s, TCGType type,
987 TCGReg ret, tcg_target_long arg)
988 {
989 if (arg == 0) {
990 tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
991 return;
992 }
993 if (arg == -1) {
994 tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
995 return;
996 }
997
998 int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
999 tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
1000 if (TCG_TARGET_REG_BITS == 64) {
1001 new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
1002 } else {
1003 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
1004 }
1005 }
1006
1007 static void tcg_out_movi_int(TCGContext *s, TCGType type,
1008 TCGReg ret, tcg_target_long arg)
1009 {
1010 tcg_target_long diff;
1011
1012 if (arg == 0) {
1013 tgen_arithr(s, ARITH_XOR, ret, ret);
1014 return;
1015 }
1016 if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
1017 tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
1018 tcg_out32(s, arg);
1019 return;
1020 }
1021 if (arg == (int32_t)arg) {
1022 tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
1023 tcg_out32(s, arg);
1024 return;
1025 }
1026
1027 /* Try a 7 byte pc-relative lea before the 10 byte movq. */
1028 diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
1029 if (diff == (int32_t)diff) {
1030 tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
1031 tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
1032 tcg_out32(s, diff);
1033 return;
1034 }
1035
1036 tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
1037 tcg_out64(s, arg);
1038 }
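/*
 * Rough size guide for the cases above: xor reg,reg is 2-3 bytes, the
 * zero-extending movl $imm32 is 5-6, the sign-extended c7 /0 form is 7,
 * the pc-relative lea is 7, and the full movabs is 10; hence the order
 * in which the tests are applied.
 */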
1039
1040 static void tcg_out_movi(TCGContext *s, TCGType type,
1041 TCGReg ret, tcg_target_long arg)
1042 {
1043 switch (type) {
1044 case TCG_TYPE_I32:
1045 #if TCG_TARGET_REG_BITS == 64
1046 case TCG_TYPE_I64:
1047 #endif
1048 if (ret < 16) {
1049 tcg_out_movi_int(s, type, ret, arg);
1050 } else {
1051 tcg_out_movi_vec(s, type, ret, arg);
1052 }
1053 break;
1054 default:
1055 g_assert_not_reached();
1056 }
1057 }
1058
1059 static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
1060 {
1061 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1062 tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2);
1063 return true;
1064 }
1065
1066 static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs,
1067 tcg_target_long imm)
1068 {
1069 /* This function is only used for passing structs by reference. */
1070 tcg_debug_assert(imm == (int32_t)imm);
1071 tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm);
1072 }
1073
1074 static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
1075 {
1076 if (val == (int8_t)val) {
1077 tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
1078 tcg_out8(s, val);
1079 } else if (val == (int32_t)val) {
1080 tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
1081 tcg_out32(s, val);
1082 } else {
1083 g_assert_not_reached();
1084 }
1085 }
1086
1087 static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1088 {
1089 /* Given the strength of x86 memory ordering, we only need to care about
1090 store-load ordering. Experimentally, "lock orl $0,0(%esp)" is
1091 faster than "mfence", so don't bother with the sse insn. */
1092 if (a0 & TCG_MO_ST_LD) {
1093 tcg_out8(s, 0xf0);
1094 tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1095 tcg_out8(s, 0);
1096 }
1097 }
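/*
 * For illustration, the barrier above should assemble to f0 83 0c 24 00,
 * i.e. "lock orl $0, (%esp)" (or (%rsp) in 64-bit mode): five bytes versus
 * the three-byte mfence, but, as noted above, experimentally faster.
 */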
1098
1099 static inline void tcg_out_push(TCGContext *s, int reg)
1100 {
1101 tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1102 }
1103
1104 static inline void tcg_out_pop(TCGContext *s, int reg)
1105 {
1106 tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1107 }
1108
1109 static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1110 TCGReg arg1, intptr_t arg2)
1111 {
1112 switch (type) {
1113 case TCG_TYPE_I32:
1114 if (ret < 16) {
1115 tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1116 } else {
1117 tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1118 }
1119 break;
1120 case TCG_TYPE_I64:
1121 if (ret < 16) {
1122 tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1123 break;
1124 }
1125 /* FALLTHRU */
1126 case TCG_TYPE_V64:
1127 /* There is no instruction that can validate 8-byte alignment. */
1128 tcg_debug_assert(ret >= 16);
1129 tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1130 break;
1131 case TCG_TYPE_V128:
1132 /*
1133 * The gvec infrastructure asserts that v128 vector loads
1134 * and stores use a 16-byte aligned offset. Validate that the
1135 * final pointer is aligned by using an insn that will SIGSEGV.
1136 */
1137 tcg_debug_assert(ret >= 16);
1138 tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
1139 break;
1140 case TCG_TYPE_V256:
1141 /*
1142 * The gvec infrastructure only requires 16-byte alignment,
1143 * so here we must use an unaligned load.
1144 */
1145 tcg_debug_assert(ret >= 16);
1146 tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1147 ret, 0, arg1, arg2);
1148 break;
1149 default:
1150 g_assert_not_reached();
1151 }
1152 }
1153
1154 static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1155 TCGReg arg1, intptr_t arg2)
1156 {
1157 switch (type) {
1158 case TCG_TYPE_I32:
1159 if (arg < 16) {
1160 tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1161 } else {
1162 tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1163 }
1164 break;
1165 case TCG_TYPE_I64:
1166 if (arg < 16) {
1167 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1168 break;
1169 }
1170 /* FALLTHRU */
1171 case TCG_TYPE_V64:
1172 /* There is no instruction that can validate 8-byte alignment. */
1173 tcg_debug_assert(arg >= 16);
1174 tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1175 break;
1176 case TCG_TYPE_V128:
1177 /*
1178 * The gvec infrastructure asserts that v128 vector loads
1179 * and stores use a 16-byte aligned offset. Validate that the
1180 * final pointer is aligned by using an insn that will SIGSEGV.
1181 *
1182 * This specific instance is also used by TCG_CALL_RET_BY_VEC,
1183 * for _WIN64, which must have SSE2 but may not have AVX.
1184 */
1185 tcg_debug_assert(arg >= 16);
1186 if (have_avx1) {
1187 tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
1188 } else {
1189 tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2);
1190 }
1191 break;
1192 case TCG_TYPE_V256:
1193 /*
1194 * The gvec infrastructure only requires 16-byte alignment,
1195 * so here we must use an unaligned store.
1196 */
1197 tcg_debug_assert(arg >= 16);
1198 tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1199 arg, 0, arg1, arg2);
1200 break;
1201 default:
1202 g_assert_not_reached();
1203 }
1204 }
1205
1206 static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1207 TCGReg base, intptr_t ofs)
1208 {
1209 int rexw = 0;
1210 if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1211 if (val != (int32_t)val) {
1212 return false;
1213 }
1214 rexw = P_REXW;
1215 } else if (type != TCG_TYPE_I32) {
1216 return false;
1217 }
1218 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1219 tcg_out32(s, val);
1220 return true;
1221 }
1222
1223 static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1224 {
1225 /* Propagate an opcode prefix, such as P_DATA16. */
1226 int ext = subopc & ~0x7;
1227 subopc &= 0x7;
1228
1229 if (count == 1) {
1230 tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1231 } else {
1232 tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1233 tcg_out8(s, count);
1234 }
1235 }
1236
1237 static inline void tcg_out_bswap32(TCGContext *s, int reg)
1238 {
1239 tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1240 }
1241
1242 static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1243 {
1244 tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1245 }
1246
1247 static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src)
1248 {
1249 /* movzbl */
1250 tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1251 tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1252 }
1253
1254 static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1255 {
1256 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1257 /* movsbl */
1258 tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1259 tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1260 }
1261
1262 static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src)
1263 {
1264 /* movzwl */
1265 tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1266 }
1267
1268 static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1269 {
1270 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1271 /* movsw[lq] */
1272 tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1273 }
1274
1275 static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src)
1276 {
1277 /* 32-bit mov zero extends. */
1278 tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1279 }
1280
1281 static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src)
1282 {
1283 tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
1284 tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1285 }
1286
1287 static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1288 {
1289 tcg_out_ext32s(s, dest, src);
1290 }
1291
1292 static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1293 {
1294 if (dest != src) {
1295 tcg_out_ext32u(s, dest, src);
1296 }
1297 }
1298
1299 static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src)
1300 {
1301 tcg_out_ext32u(s, dest, src);
1302 }
1303
1304 static inline void tcg_out_bswap64(TCGContext *s, int reg)
1305 {
1306 tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1307 }
1308
1309 static void tgen_arithi(TCGContext *s, int c, int r0,
1310 tcg_target_long val, int cf)
1311 {
1312 int rexw = 0;
1313
1314 if (TCG_TARGET_REG_BITS == 64) {
1315 rexw = c & -8;
1316 c &= 7;
1317 }
1318
1319 switch (c) {
1320 case ARITH_ADD:
1321 case ARITH_SUB:
1322 if (!cf) {
1323 /*
1324 * ??? While INC is 2 bytes shorter than ADDL $1, they also induce
1325 * partial flags update stalls on Pentium4 and are not recommended
1326 * by current Intel optimization manuals.
1327 */
1328 if (val == 1 || val == -1) {
1329 int is_inc = (c == ARITH_ADD) ^ (val < 0);
1330 if (TCG_TARGET_REG_BITS == 64) {
1331 /*
1332 * The single-byte increment encodings are re-tasked
1333 * as the REX prefixes. Use the MODRM encoding.
1334 */
1335 tcg_out_modrm(s, OPC_GRP5 + rexw,
1336 (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1337 } else {
1338 tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1339 }
1340 return;
1341 }
1342 if (val == 128) {
1343 /*
1344 * Facilitate using an 8-bit immediate. Carry is inverted
1345 * by this transformation, so do it only if cf == 0.
1346 */
1347 c ^= ARITH_ADD ^ ARITH_SUB;
1348 val = -128;
1349 }
1350 }
1351 break;
1352
1353 case ARITH_AND:
1354 if (TCG_TARGET_REG_BITS == 64) {
1355 if (val == 0xffffffffu) {
1356 tcg_out_ext32u(s, r0, r0);
1357 return;
1358 }
1359 if (val == (uint32_t)val) {
1360 /* AND with no high bits set can use a 32-bit operation. */
1361 rexw = 0;
1362 }
1363 }
1364 if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1365 tcg_out_ext8u(s, r0, r0);
1366 return;
1367 }
1368 if (val == 0xffffu) {
1369 tcg_out_ext16u(s, r0, r0);
1370 return;
1371 }
1372 break;
1373 }
1374
1375 if (val == (int8_t)val) {
1376 tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1377 tcg_out8(s, val);
1378 return;
1379 }
1380 if (rexw == 0 || val == (int32_t)val) {
1381 tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1382 tcg_out32(s, val);
1383 return;
1384 }
1385
1386 g_assert_not_reached();
1387 }
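/*
 * Example of the 128 -> -128 rewrite above, with r0 = %eax: "add $128"
 * would need the imm32 form 81 c0 80 00 00 00 (6 bytes), while the
 * rewritten "sub $-128" fits the imm8 form 83 e8 80 (3 bytes).  The carry
 * flag is inverted by the rewrite, which is why it is skipped when cf
 * is set.
 */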
1388
1389 static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1390 {
1391 if (val != 0) {
1392 tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1393 }
1394 }
1395
1396 /* Set SMALL to force a short forward branch. */
1397 static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small)
1398 {
1399 int32_t val, val1;
1400
1401 if (l->has_value) {
1402 val = tcg_pcrel_diff(s, l->u.value_ptr);
1403 val1 = val - 2;
1404 if ((int8_t)val1 == val1) {
1405 if (opc == -1) {
1406 tcg_out8(s, OPC_JMP_short);
1407 } else {
1408 tcg_out8(s, OPC_JCC_short + opc);
1409 }
1410 tcg_out8(s, val1);
1411 } else {
1412 tcg_debug_assert(!small);
1413 if (opc == -1) {
1414 tcg_out8(s, OPC_JMP_long);
1415 tcg_out32(s, val - 5);
1416 } else {
1417 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1418 tcg_out32(s, val - 6);
1419 }
1420 }
1421 } else if (small) {
1422 if (opc == -1) {
1423 tcg_out8(s, OPC_JMP_short);
1424 } else {
1425 tcg_out8(s, OPC_JCC_short + opc);
1426 }
1427 tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1428 s->code_ptr += 1;
1429 } else {
1430 if (opc == -1) {
1431 tcg_out8(s, OPC_JMP_long);
1432 } else {
1433 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1434 }
1435 tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1436 s->code_ptr += 4;
1437 }
1438 }
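/*
 * The adjustments above reflect instruction lengths: a short jmp/jcc is
 * 2 bytes (opcode + rel8, hence val - 2), "jmp rel32" is 5 bytes (hence
 * val - 5) and a long jcc is 6 bytes (0f 8x + rel32, hence val - 6),
 * since relative displacements are measured from the end of the insn.
 */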
1439
1440 static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
1441 int const_arg2, int rexw)
1442 {
1443 if (const_arg2) {
1444 if (arg2 == 0) {
1445 /* test r, r */
1446 tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1447 } else {
1448 tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1449 }
1450 } else {
1451 tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1452 }
1453 }
1454
1455 static void tcg_out_brcond(TCGContext *s, int rexw, TCGCond cond,
1456 TCGArg arg1, TCGArg arg2, int const_arg2,
1457 TCGLabel *label, bool small)
1458 {
1459 tcg_out_cmp(s, arg1, arg2, const_arg2, rexw);
1460 tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1461 }
1462
1463 #if TCG_TARGET_REG_BITS == 32
1464 static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
1465 const int *const_args, bool small)
1466 {
1467 TCGLabel *label_next = gen_new_label();
1468 TCGLabel *label_this = arg_label(args[5]);
1469
1470 switch(args[4]) {
1471 case TCG_COND_EQ:
1472 tcg_out_brcond(s, 0, TCG_COND_NE, args[0], args[2], const_args[2],
1473 label_next, 1);
1474 tcg_out_brcond(s, 0, TCG_COND_EQ, args[1], args[3], const_args[3],
1475 label_this, small);
1476 break;
1477 case TCG_COND_NE:
1478 tcg_out_brcond(s, 0, TCG_COND_NE, args[0], args[2], const_args[2],
1479 label_this, small);
1480 tcg_out_brcond(s, 0, TCG_COND_NE, args[1], args[3], const_args[3],
1481 label_this, small);
1482 break;
1483 case TCG_COND_LT:
1484 tcg_out_brcond(s, 0, TCG_COND_LT, args[1], args[3], const_args[3],
1485 label_this, small);
1486 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1487 tcg_out_brcond(s, 0, TCG_COND_LTU, args[0], args[2], const_args[2],
1488 label_this, small);
1489 break;
1490 case TCG_COND_LE:
1491 tcg_out_brcond(s, 0, TCG_COND_LT, args[1], args[3], const_args[3],
1492 label_this, small);
1493 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1494 tcg_out_brcond(s, 0, TCG_COND_LEU, args[0], args[2], const_args[2],
1495 label_this, small);
1496 break;
1497 case TCG_COND_GT:
1498 tcg_out_brcond(s, 0, TCG_COND_GT, args[1], args[3], const_args[3],
1499 label_this, small);
1500 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1501 tcg_out_brcond(s, 0, TCG_COND_GTU, args[0], args[2], const_args[2],
1502 label_this, small);
1503 break;
1504 case TCG_COND_GE:
1505 tcg_out_brcond(s, 0, TCG_COND_GT, args[1], args[3], const_args[3],
1506 label_this, small);
1507 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1508 tcg_out_brcond(s, 0, TCG_COND_GEU, args[0], args[2], const_args[2],
1509 label_this, small);
1510 break;
1511 case TCG_COND_LTU:
1512 tcg_out_brcond(s, 0, TCG_COND_LTU, args[1], args[3], const_args[3],
1513 label_this, small);
1514 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1515 tcg_out_brcond(s, 0, TCG_COND_LTU, args[0], args[2], const_args[2],
1516 label_this, small);
1517 break;
1518 case TCG_COND_LEU:
1519 tcg_out_brcond(s, 0, TCG_COND_LTU, args[1], args[3], const_args[3],
1520 label_this, small);
1521 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1522 tcg_out_brcond(s, 0, TCG_COND_LEU, args[0], args[2], const_args[2],
1523 label_this, small);
1524 break;
1525 case TCG_COND_GTU:
1526 tcg_out_brcond(s, 0, TCG_COND_GTU, args[1], args[3], const_args[3],
1527 label_this, small);
1528 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1529 tcg_out_brcond(s, 0, TCG_COND_GTU, args[0], args[2], const_args[2],
1530 label_this, small);
1531 break;
1532 case TCG_COND_GEU:
1533 tcg_out_brcond(s, 0, TCG_COND_GTU, args[1], args[3], const_args[3],
1534 label_this, small);
1535 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1536 tcg_out_brcond(s, 0, TCG_COND_GEU, args[0], args[2], const_args[2],
1537 label_this, small);
1538 break;
1539 default:
1540 g_assert_not_reached();
1541 }
1542 tcg_out_label(s, label_next);
1543 }
1544 #endif
1545
1546 static void tcg_out_setcond(TCGContext *s, int rexw, TCGCond cond,
1547 TCGArg dest, TCGArg arg1, TCGArg arg2,
1548 int const_arg2, bool neg)
1549 {
1550 bool inv = false;
1551 bool cleared;
1552
1553 switch (cond) {
1554 case TCG_COND_NE:
1555 inv = true;
1556 /* fall through */
1557 case TCG_COND_EQ:
1558 /* If arg2 is 0, convert to LTU/GEU vs 1: x == 0 iff x <u 1. */
1559 if (const_arg2 && arg2 == 0) {
1560 arg2 = 1;
1561 goto do_ltu;
1562 }
1563 break;
1564
1565 case TCG_COND_LEU:
1566 inv = true;
1567 /* fall through */
1568 case TCG_COND_GTU:
1569 /* If arg2 is a register, swap for LTU/GEU. */
1570 if (!const_arg2) {
1571 TCGReg t = arg1;
1572 arg1 = arg2;
1573 arg2 = t;
1574 goto do_ltu;
1575 }
1576 break;
1577
1578 case TCG_COND_GEU:
1579 inv = true;
1580 /* fall through */
1581 case TCG_COND_LTU:
1582 do_ltu:
1583 /*
1584 * Relying on the carry bit, use SBB to produce -1 if LTU, 0 if GEU.
1585 * We can then use NEG or INC to produce the desired result.
1586 * This is always smaller than the SETCC expansion.
1587 */
1588 tcg_out_cmp(s, arg1, arg2, const_arg2, rexw);
1589
1590 /* X - X - C = -C = (C ? -1 : 0) */
1591 tgen_arithr(s, ARITH_SBB + (neg ? rexw : 0), dest, dest);
1592 if (inv && neg) {
1593 /* ~(C ? -1 : 0) = (C ? 0 : -1) */
1594 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
1595 } else if (inv) {
1596 /* (C ? -1 : 0) + 1 = (C ? 0 : 1) */
1597 tgen_arithi(s, ARITH_ADD, dest, 1, 0);
1598 } else if (!neg) {
1599 /* -(C ? -1 : 0) = (C ? 1 : 0) */
1600 tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_NEG, dest);
1601 }
1602 return;
1603
1604 case TCG_COND_GE:
1605 inv = true;
1606 /* fall through */
1607 case TCG_COND_LT:
1608 /* If arg2 is 0, extract the sign bit. */
1609 if (const_arg2 && arg2 == 0) {
1610 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, dest, arg1);
1611 if (inv) {
1612 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
1613 }
1614 tcg_out_shifti(s, (neg ? SHIFT_SAR : SHIFT_SHR) + rexw,
1615 dest, rexw ? 63 : 31);
1616 return;
1617 }
1618 break;
1619
1620 default:
1621 break;
1622 }
1623
1624 /*
1625 * If dest does not overlap the inputs, clearing it first is preferred.
1626 * The XOR breaks any false dependency for the low-byte write to dest,
1627 * and is also one byte smaller than MOVZBL.
1628 */
1629 cleared = false;
1630 if (dest != arg1 && (const_arg2 || dest != arg2)) {
1631 tgen_arithr(s, ARITH_XOR, dest, dest);
1632 cleared = true;
1633 }
1634
1635 tcg_out_cmp(s, arg1, arg2, const_arg2, rexw);
1636 tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1637
1638 if (!cleared) {
1639 tcg_out_ext8u(s, dest, dest);
1640 }
1641 if (neg) {
1642 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, dest);
1643 }
1644 }
1645
1646 #if TCG_TARGET_REG_BITS == 32
1647 static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1648 const int *const_args)
1649 {
1650 TCGArg new_args[6];
1651 TCGLabel *label_true, *label_over;
1652
1653 memcpy(new_args, args+1, 5*sizeof(TCGArg));
1654
1655 if (args[0] == args[1] || args[0] == args[2]
1656 || (!const_args[3] && args[0] == args[3])
1657 || (!const_args[4] && args[0] == args[4])) {
1658 /* When the destination overlaps with one of the argument
1659 registers, don't do anything tricky. */
1660 label_true = gen_new_label();
1661 label_over = gen_new_label();
1662
1663 new_args[5] = label_arg(label_true);
1664 tcg_out_brcond2(s, new_args, const_args+1, 1);
1665
1666 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1667 tcg_out_jxx(s, JCC_JMP, label_over, 1);
1668 tcg_out_label(s, label_true);
1669
1670 tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1671 tcg_out_label(s, label_over);
1672 } else {
1673 /* When the destination does not overlap one of the arguments,
1674 clear the destination first, jump if cond false, and emit an
1675 increment in the true case. This results in smaller code. */
1676
1677 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1678
1679 label_over = gen_new_label();
1680 new_args[4] = tcg_invert_cond(new_args[4]);
1681 new_args[5] = label_arg(label_over);
1682 tcg_out_brcond2(s, new_args, const_args+1, 1);
1683
1684 tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1685 tcg_out_label(s, label_over);
1686 }
1687 }
1688 #endif
1689
1690 static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1691 TCGReg dest, TCGReg v1)
1692 {
1693 if (have_cmov) {
1694 tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1695 } else {
1696 TCGLabel *over = gen_new_label();
1697 tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1698 tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1699 tcg_out_label(s, over);
1700 }
1701 }
1702
1703 static void tcg_out_movcond(TCGContext *s, int rexw, TCGCond cond,
1704 TCGReg dest, TCGReg c1, TCGArg c2, int const_c2,
1705 TCGReg v1)
1706 {
1707 tcg_out_cmp(s, c1, c2, const_c2, rexw);
1708 tcg_out_cmov(s, cond, rexw, dest, v1);
1709 }
1710
1711 static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1712 TCGArg arg2, bool const_a2)
1713 {
1714 if (have_bmi1) {
1715 tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1716 if (const_a2) {
1717 tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1718 } else {
1719 tcg_debug_assert(dest != arg2);
1720 tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1721 }
1722 } else {
1723 tcg_debug_assert(dest != arg2);
1724 tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1725 tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1726 }
1727 }
1728
1729 static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1730 TCGArg arg2, bool const_a2)
1731 {
1732 if (have_lzcnt) {
1733 tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1734 if (const_a2) {
1735 tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1736 } else {
1737 tcg_debug_assert(dest != arg2);
1738 tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1739 }
1740 } else {
1741 tcg_debug_assert(!const_a2);
1742 tcg_debug_assert(dest != arg1);
1743 tcg_debug_assert(dest != arg2);
1744
1745 /* Recall that the output of BSR is the index not the count. */
1746 tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1747 tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1748
1749 /* Since we have destroyed the flags from BSR, we have to re-test. */
1750 tcg_out_cmp(s, arg1, 0, 1, rexw);
1751 tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1752 }
1753 }
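/*
 * A note on the BSR path above: for nonzero input, clz(x) equals
 * (31 or 63) - bsr(x), and because that constant is all ones the
 * subtraction is equivalent to the XOR emitted above.
 */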
1754
1755 static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
1756 {
1757 intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1758
1759 if (disp == (int32_t)disp) {
1760 tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1761 tcg_out32(s, disp);
1762 } else {
1763 /* rip-relative addressing into the constant pool.
1764 This is 6 + 8 = 14 bytes, as compared to using an
1765 immediate load 10 + 6 = 16 bytes, plus we may
1766 be able to re-use the pool constant for more calls. */
1767 tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1768 tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1769 new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1770 tcg_out32(s, 0);
1771 }
1772 }
1773
1774 static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest,
1775 const TCGHelperInfo *info)
1776 {
1777 tcg_out_branch(s, 1, dest);
1778
1779 #ifndef _WIN32
1780 if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) {
1781 /*
1782 * The sysv i386 abi for struct return places a reference as the
1783 * first argument on the stack, and pops that argument with the
1784 * return statement. Since we want to retain the aligned stack
1785 * pointer for the callee, we do not want to actually push that
1786 * argument before the call but rely on the normal store to the
1787 * stack slot. But we do need to compensate for the pop in order
1788 * to reset our correct stack pointer value.
1789 * Pushing a garbage value back onto the stack is quickest.
1790 */
1791 tcg_out_push(s, TCG_REG_EAX);
1792 }
1793 #endif
1794 }
1795
1796 static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
1797 {
1798 tcg_out_branch(s, 0, dest);
1799 }
1800
1801 static void tcg_out_nopn(TCGContext *s, int n)
1802 {
1803 int i;
1804 /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1805 * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1806 * duplicate prefix, and all of the interesting recent cores can
1807 * decode and discard the duplicates in a single cycle.
1808 */
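/* For example, n == 3 emits 66 66 90, i.e. "xchg %ax,%ax" with one
 * redundant operand size prefix.
 */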
1809 tcg_debug_assert(n >= 1);
1810 for (i = 1; i < n; ++i) {
1811 tcg_out8(s, 0x66);
1812 }
1813 tcg_out8(s, 0x90);
1814 }
1815
1816 /* Test register R vs immediate bits I, setting Z flag for EQ/NE. */
1817 static void __attribute__((unused))
1818 tcg_out_testi(TCGContext *s, TCGReg r, uint32_t i)
1819 {
1820 /*
1821 * This is used for testing alignment, so we can usually use testb.
1822 * For i686, we have to use testl for %esi/%edi.
1823 */
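/*
 * Without a REX prefix, byte-operand register codes 4..7 name
 * %ah/%ch/%dh/%bh rather than the low bytes of %esp/%ebp/%esi/%edi,
 * hence the r < 4 check below on i686.
 */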
1824 if (i <= 0xff && (TCG_TARGET_REG_BITS == 64 || r < 4)) {
1825 tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, r);
1826 tcg_out8(s, i);
1827 } else {
1828 tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_TESTi, r);
1829 tcg_out32(s, i);
1830 }
1831 }
1832
1833 typedef struct {
1834 TCGReg base;
1835 int index;
1836 int ofs;
1837 int seg;
1838 TCGAtomAlign aa;
1839 } HostAddress;
1840
1841 bool tcg_target_has_memory_bswap(MemOp memop)
1842 {
1843 TCGAtomAlign aa;
1844
1845 if (!have_movbe) {
1846 return false;
1847 }
1848 if ((memop & MO_SIZE) < MO_128) {
1849 return true;
1850 }
1851
1852 /*
1853 * Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA,
1854 * but do allow a pair of 64-bit operations, i.e. MOVBEQ.
1855 */
1856 aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true);
1857 return aa.atom < MO_128;
1858 }
1859
1860 /*
1861 * Because i686 has no register parameters and because x86_64 has xchg
1862 * to handle addr/data register overlap, we have placed all input arguments
1863 * before we might need a scratch reg.
1864 *
1865 * Even then, a scratch is only needed for l->raddr. Rather than expose
1866 * a general-purpose scratch when we don't actually know it's available,
1867 * use the ra_gen hook to load into RAX if needed.
1868 */
1869 #if TCG_TARGET_REG_BITS == 64
1870 static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg)
1871 {
1872 if (arg < 0) {
1873 arg = TCG_REG_RAX;
1874 }
1875 tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr);
1876 return arg;
1877 }
1878 static const TCGLdstHelperParam ldst_helper_param = {
1879 .ra_gen = ldst_ra_gen
1880 };
1881 #else
1882 static const TCGLdstHelperParam ldst_helper_param = { };
1883 #endif
1884
1885 static void tcg_out_vec_to_pair(TCGContext *s, TCGType type,
1886 TCGReg l, TCGReg h, TCGReg v)
1887 {
1888 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1889
1890 /* vpmov{d,q} %v, %l */
1891 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, v, 0, l);
1892 /* vpextr{d,q} $1, %v, %h */
1893 tcg_out_vex_modrm(s, OPC_PEXTRD + rexw, v, 0, h);
1894 tcg_out8(s, 1);
1895 }
1896
1897 static void tcg_out_pair_to_vec(TCGContext *s, TCGType type,
1898 TCGReg v, TCGReg l, TCGReg h)
1899 {
1900 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1901
1902 /* vmov{d,q} %l, %v */
1903 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, v, 0, l);
1904 /* vpinsr{d,q} $1, %h, %v, %v */
1905 tcg_out_vex_modrm(s, OPC_PINSRD + rexw, v, v, h);
1906 tcg_out8(s, 1);
1907 }
1908
1909 /*
1910 * Generate code for the slow path for a load at the end of block
1911 */
1912 static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1913 {
1914 MemOp opc = get_memop(l->oi);
1915 tcg_insn_unit **label_ptr = &l->label_ptr[0];
1916
1917 /* resolve label address */
1918 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1919 if (label_ptr[1]) {
1920 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1921 }
1922
1923 tcg_out_ld_helper_args(s, l, &ldst_helper_param);
1924 tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]);
1925 tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param);
1926
1927 tcg_out_jmp(s, l->raddr);
1928 return true;
1929 }
1930
1931 /*
1932 * Generate code for the slow path for a store at the end of block
1933 */
1934 static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1935 {
1936 MemOp opc = get_memop(l->oi);
1937 tcg_insn_unit **label_ptr = &l->label_ptr[0];
1938
1939 /* resolve label address */
1940 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1941 if (label_ptr[1]) {
1942 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1943 }
1944
1945 tcg_out_st_helper_args(s, l, &ldst_helper_param);
1946 tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]);
1947
1948 tcg_out_jmp(s, l->raddr);
1949 return true;
1950 }
1951
1952 #ifdef CONFIG_USER_ONLY
1953 static HostAddress x86_guest_base = {
1954 .index = -1
1955 };
1956
1957 #if defined(__x86_64__) && defined(__linux__)
1958 # include <asm/prctl.h>
1959 # include <sys/prctl.h>
1960 int arch_prctl(int code, unsigned long addr);
1961 static inline int setup_guest_base_seg(void)
1962 {
1963 if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
1964 return P_GS;
1965 }
1966 return 0;
1967 }
1968 #define setup_guest_base_seg setup_guest_base_seg
1969 #elif defined(__x86_64__) && \
1970 (defined (__FreeBSD__) || defined (__FreeBSD_kernel__))
1971 # include <machine/sysarch.h>
1972 static inline int setup_guest_base_seg(void)
1973 {
1974 if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
1975 return P_GS;
1976 }
1977 return 0;
1978 }
1979 #define setup_guest_base_seg setup_guest_base_seg
1980 #endif
1981 #else
1982 # define x86_guest_base (*(HostAddress *)({ qemu_build_not_reached(); NULL; }))
1983 #endif /* CONFIG_USER_ONLY */
1984 #ifndef setup_guest_base_seg
1985 # define setup_guest_base_seg() 0
1986 #endif
1987
1988 #define MIN_TLB_MASK_TABLE_OFS INT_MIN
1989
1990 /*
1991 * For softmmu, perform the TLB load and compare.
1992 * For useronly, perform any required alignment tests.
1993 * In both cases, return a TCGLabelQemuLdst structure if the slow path
1994 * is required and fill in @h with the host address for the fast path.
1995 */
1996 static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
1997 TCGReg addrlo, TCGReg addrhi,
1998 MemOpIdx oi, bool is_ld)
1999 {
2000 TCGLabelQemuLdst *ldst = NULL;
2001 MemOp opc = get_memop(oi);
2002 MemOp s_bits = opc & MO_SIZE;
2003 unsigned a_mask;
2004
2005 if (tcg_use_softmmu) {
2006 h->index = TCG_REG_L0;
2007 h->ofs = 0;
2008 h->seg = 0;
2009 } else {
2010 *h = x86_guest_base;
2011 }
2012 h->base = addrlo;
2013 h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128);
2014 a_mask = (1 << h->aa.align) - 1;
2015
2016 if (tcg_use_softmmu) {
2017 int cmp_ofs = is_ld ? offsetof(CPUTLBEntry, addr_read)
2018 : offsetof(CPUTLBEntry, addr_write);
2019 TCGType ttype = TCG_TYPE_I32;
2020 TCGType tlbtype = TCG_TYPE_I32;
2021 int trexw = 0, hrexw = 0, tlbrexw = 0;
2022 unsigned mem_index = get_mmuidx(oi);
2023 unsigned s_mask = (1 << s_bits) - 1;
2024 int fast_ofs = tlb_mask_table_ofs(s, mem_index);
2025 int tlb_mask;
2026
2027 ldst = new_ldst_label(s);
2028 ldst->is_ld = is_ld;
2029 ldst->oi = oi;
2030 ldst->addrlo_reg = addrlo;
2031 ldst->addrhi_reg = addrhi;
2032
2033 if (TCG_TARGET_REG_BITS == 64) {
2034 ttype = s->addr_type;
2035 trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW);
2036 if (TCG_TYPE_PTR == TCG_TYPE_I64) {
2037 hrexw = P_REXW;
2038 if (s->page_bits + s->tlb_dyn_max_bits > 32) {
2039 tlbtype = TCG_TYPE_I64;
2040 tlbrexw = P_REXW;
2041 }
2042 }
2043 }
2044
2045 tcg_out_mov(s, tlbtype, TCG_REG_L0, addrlo);
2046 tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0,
2047 s->page_bits - CPU_TLB_ENTRY_BITS);
2048
2049 tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0,
2050 fast_ofs + offsetof(CPUTLBDescFast, mask));
2051
2052 tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0,
2053 fast_ofs + offsetof(CPUTLBDescFast, table));
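/*
 * TCG_REG_L0 now points to the CPUTLBEntry for this address: the shift
 * leaves the page number scaled by the entry size, the mask field is
 * (n_entries - 1) << CPU_TLB_ENTRY_BITS, and the add supplies the
 * table base.
 */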
2054
2055 /*
2056 * If the required alignment is at least as large as the access,
2057 * simply copy the address and mask. For lesser alignments,
2058 * check that we don't cross pages for the complete access.
2059 */
2060 if (a_mask >= s_mask) {
2061 tcg_out_mov(s, ttype, TCG_REG_L1, addrlo);
2062 } else {
2063 tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1,
2064 addrlo, s_mask - a_mask);
2065 }
2066 tlb_mask = s->page_mask | a_mask;
2067 tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0);
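/*
 * For example, an 8-byte load that only requires byte alignment has
 * s_mask = 7 and a_mask = 0: L1 = (addr + 7) & page_mask, so an access
 * that crosses a page yields the next page number, the comparison
 * below fails, and we take the slow path.
 */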
2068
2069 /* cmp 0(TCG_REG_L0), TCG_REG_L1 */
2070 tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw,
2071 TCG_REG_L1, TCG_REG_L0, cmp_ofs);
2072
2073 /* jne slow_path */
2074 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2075 ldst->label_ptr[0] = s->code_ptr;
2076 s->code_ptr += 4;
2077
2078 if (TCG_TARGET_REG_BITS == 32 && s->addr_type == TCG_TYPE_I64) {
2079 /* cmp 4(TCG_REG_L0), addrhi */
2080 tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi,
2081 TCG_REG_L0, cmp_ofs + 4);
2082
2083 /* jne slow_path */
2084 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2085 ldst->label_ptr[1] = s->code_ptr;
2086 s->code_ptr += 4;
2087 }
2088
2089 /* TLB Hit. */
2090 tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0,
2091 offsetof(CPUTLBEntry, addend));
2092 } else if (a_mask) {
2093 ldst = new_ldst_label(s);
2094
2095 ldst->is_ld = is_ld;
2096 ldst->oi = oi;
2097 ldst->addrlo_reg = addrlo;
2098 ldst->addrhi_reg = addrhi;
2099
2100 tcg_out_testi(s, addrlo, a_mask);
2101 /* jne slow_path */
2102 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2103 ldst->label_ptr[0] = s->code_ptr;
2104 s->code_ptr += 4;
2105 }
2106
2107 return ldst;
2108 }
2109
2110 static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2111 HostAddress h, TCGType type, MemOp memop)
2112 {
2113 bool use_movbe = false;
2114 int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
2115 int movop = OPC_MOVL_GvEv;
2116
2117 /* Do big-endian loads with movbe. */
2118 if (memop & MO_BSWAP) {
2119 tcg_debug_assert(have_movbe);
2120 use_movbe = true;
2121 movop = OPC_MOVBE_GyMy;
2122 }
2123
2124 switch (memop & MO_SSIZE) {
2125 case MO_UB:
2126 tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo,
2127 h.base, h.index, 0, h.ofs);
2128 break;
2129 case MO_SB:
2130 tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo,
2131 h.base, h.index, 0, h.ofs);
2132 break;
2133 case MO_UW:
2134 if (use_movbe) {
2135 /* There is no extending movbe; only low 16-bits are modified. */
2136 if (datalo != h.base && datalo != h.index) {
2137 /* XOR breaks dependency chains. */
2138 tgen_arithr(s, ARITH_XOR, datalo, datalo);
2139 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2140 datalo, h.base, h.index, 0, h.ofs);
2141 } else {
2142 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2143 datalo, h.base, h.index, 0, h.ofs);
2144 tcg_out_ext16u(s, datalo, datalo);
2145 }
2146 } else {
2147 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo,
2148 h.base, h.index, 0, h.ofs);
2149 }
2150 break;
2151 case MO_SW:
2152 if (use_movbe) {
2153 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2154 datalo, h.base, h.index, 0, h.ofs);
2155 tcg_out_ext16s(s, type, datalo, datalo);
2156 } else {
2157 tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg,
2158 datalo, h.base, h.index, 0, h.ofs);
2159 }
2160 break;
2161 case MO_UL:
2162 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2163 h.base, h.index, 0, h.ofs);
2164 break;
2165 #if TCG_TARGET_REG_BITS == 64
2166 case MO_SL:
2167 if (use_movbe) {
2168 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo,
2169 h.base, h.index, 0, h.ofs);
2170 tcg_out_ext32s(s, datalo, datalo);
2171 } else {
2172 tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo,
2173 h.base, h.index, 0, h.ofs);
2174 }
2175 break;
2176 #endif
2177 case MO_UQ:
2178 if (TCG_TARGET_REG_BITS == 64) {
2179 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2180 h.base, h.index, 0, h.ofs);
2181 break;
2182 }
2183 if (use_movbe) {
2184 TCGReg t = datalo;
2185 datalo = datahi;
2186 datahi = t;
2187 }
2188 if (h.base == datalo || h.index == datalo) {
2189 tcg_out_modrm_sib_offset(s, OPC_LEA, datahi,
2190 h.base, h.index, 0, h.ofs);
2191 tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0);
2192 tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4);
2193 } else {
2194 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2195 h.base, h.index, 0, h.ofs);
2196 tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
2197 h.base, h.index, 0, h.ofs + 4);
2198 }
2199 break;
2200
2201 case MO_128:
2202 tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2203
2204 /*
2205 * Without 16-byte atomicity, use integer regs.
2206 * That is where we want the data, and it allows bswaps.
2207 */
2208 if (h.aa.atom < MO_128) {
2209 if (use_movbe) {
2210 TCGReg t = datalo;
2211 datalo = datahi;
2212 datahi = t;
2213 }
2214 if (h.base == datalo || h.index == datalo) {
2215 tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, datahi,
2216 h.base, h.index, 0, h.ofs);
2217 tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
2218 datalo, datahi, 0);
2219 tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
2220 datahi, datahi, 8);
2221 } else {
2222 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2223 h.base, h.index, 0, h.ofs);
2224 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
2225 h.base, h.index, 0, h.ofs + 8);
2226 }
2227 break;
2228 }
2229
2230 /*
2231 * With 16-byte atomicity, a vector load is required.
2232 * If we already have 16-byte alignment, then VMOVDQA always works.
2233 * Else if VMOVDQU has atomicity with dynamic alignment, use that.
2234 * Else we require a runtime test for alignment for VMOVDQA;
2235 * use VMOVDQU on the unaligned nonatomic path for simplicity.
2236 */
2237 if (h.aa.align >= MO_128) {
2238 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
2239 TCG_TMP_VEC, 0,
2240 h.base, h.index, 0, h.ofs);
2241 } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
2242 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
2243 TCG_TMP_VEC, 0,
2244 h.base, h.index, 0, h.ofs);
2245 } else {
2246 TCGLabel *l1 = gen_new_label();
2247 TCGLabel *l2 = gen_new_label();
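/*
 * Emit, in effect:
 *     test $15, base
 *     jne  1f
 *     vmovdqa disp(base,index), TCG_TMP_VEC
 *     jmp  2f
 * 1:  vmovdqu disp(base,index), TCG_TMP_VEC
 * 2:
 */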
2248
2249 tcg_out_testi(s, h.base, 15);
2250 tcg_out_jxx(s, JCC_JNE, l1, true);
2251
2252 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
2253 TCG_TMP_VEC, 0,
2254 h.base, h.index, 0, h.ofs);
2255 tcg_out_jxx(s, JCC_JMP, l2, true);
2256
2257 tcg_out_label(s, l1);
2258 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
2259 TCG_TMP_VEC, 0,
2260 h.base, h.index, 0, h.ofs);
2261 tcg_out_label(s, l2);
2262 }
2263 tcg_out_vec_to_pair(s, TCG_TYPE_I64, datalo, datahi, TCG_TMP_VEC);
2264 break;
2265
2266 default:
2267 g_assert_not_reached();
2268 }
2269 }
2270
2271 static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi,
2272 TCGReg addrlo, TCGReg addrhi,
2273 MemOpIdx oi, TCGType data_type)
2274 {
2275 TCGLabelQemuLdst *ldst;
2276 HostAddress h;
2277
2278 ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, true);
2279 tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi));
2280
2281 if (ldst) {
2282 ldst->type = data_type;
2283 ldst->datalo_reg = datalo;
2284 ldst->datahi_reg = datahi;
2285 ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2286 }
2287 }
2288
2289 static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2290 HostAddress h, MemOp memop)
2291 {
2292 bool use_movbe = false;
2293 int movop = OPC_MOVL_EvGv;
2294
2295 /*
2296 * Do big-endian stores with movbe or system-mode.
2297 * User-only without movbe will have its swapping done generically.
2298 */
2299 if (memop & MO_BSWAP) {
2300 tcg_debug_assert(have_movbe);
2301 use_movbe = true;
2302 movop = OPC_MOVBE_MyGy;
2303 }
2304
2305 switch (memop & MO_SIZE) {
2306 case MO_8:
2307 /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
2308 tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
2309 tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg,
2310 datalo, h.base, h.index, 0, h.ofs);
2311 break;
2312 case MO_16:
2313 tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo,
2314 h.base, h.index, 0, h.ofs);
2315 break;
2316 case MO_32:
2317 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2318 h.base, h.index, 0, h.ofs);
2319 break;
2320 case MO_64:
2321 if (TCG_TARGET_REG_BITS == 64) {
2322 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2323 h.base, h.index, 0, h.ofs);
2324 } else {
2325 if (use_movbe) {
2326 TCGReg t = datalo;
2327 datalo = datahi;
2328 datahi = t;
2329 }
2330 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2331 h.base, h.index, 0, h.ofs);
2332 tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
2333 h.base, h.index, 0, h.ofs + 4);
2334 }
2335 break;
2336
2337 case MO_128:
2338 tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2339
2340 /*
2341 * Without 16-byte atomicity, use integer regs.
2342 * That is where we have the data, and it allows bswaps.
2343 */
2344 if (h.aa.atom < MO_128) {
2345 if (use_movbe) {
2346 TCGReg t = datalo;
2347 datalo = datahi;
2348 datahi = t;
2349 }
2350 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2351 h.base, h.index, 0, h.ofs);
2352 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
2353 h.base, h.index, 0, h.ofs + 8);
2354 break;
2355 }
2356
2357 /*
2358 * With 16-byte atomicity, a vector store is required.
2359 * If we already have 16-byte alignment, then VMOVDQA always works.
2360 * Else if VMOVDQU has atomicity with dynamic alignment, use that.
2361 * Else we require a runtime test for alignment for VMOVDQA;
2362 * use VMOVDQU on the unaligned nonatomic path for simplicity.
2363 */
2364 tcg_out_pair_to_vec(s, TCG_TYPE_I64, TCG_TMP_VEC, datalo, datahi);
2365 if (h.aa.align >= MO_128) {
2366 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
2367 TCG_TMP_VEC, 0,
2368 h.base, h.index, 0, h.ofs);
2369 } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
2370 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
2371 TCG_TMP_VEC, 0,
2372 h.base, h.index, 0, h.ofs);
2373 } else {
2374 TCGLabel *l1 = gen_new_label();
2375 TCGLabel *l2 = gen_new_label();
2376
2377 tcg_out_testi(s, h.base, 15);
2378 tcg_out_jxx(s, JCC_JNE, l1, true);
2379
2380 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
2381 TCG_TMP_VEC, 0,
2382 h.base, h.index, 0, h.ofs);
2383 tcg_out_jxx(s, JCC_JMP, l2, true);
2384
2385 tcg_out_label(s, l1);
2386 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
2387 TCG_TMP_VEC, 0,
2388 h.base, h.index, 0, h.ofs);
2389 tcg_out_label(s, l2);
2390 }
2391 break;
2392
2393 default:
2394 g_assert_not_reached();
2395 }
2396 }
2397
2398 static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi,
2399 TCGReg addrlo, TCGReg addrhi,
2400 MemOpIdx oi, TCGType data_type)
2401 {
2402 TCGLabelQemuLdst *ldst;
2403 HostAddress h;
2404
2405 ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, false);
2406 tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi));
2407
2408 if (ldst) {
2409 ldst->type = data_type;
2410 ldst->datalo_reg = datalo;
2411 ldst->datahi_reg = datahi;
2412 ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2413 }
2414 }
2415
2416 static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
2417 {
2418 /* Reuse the zeroing that exists for goto_ptr. */
2419 if (a0 == 0) {
2420 tcg_out_jmp(s, tcg_code_gen_epilogue);
2421 } else {
2422 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2423 tcg_out_jmp(s, tb_ret_addr);
2424 }
2425 }
2426
2427 static void tcg_out_goto_tb(TCGContext *s, int which)
2428 {
2429 /*
2430 * Jump displacement must be aligned for atomic patching;
2431 * see if we need to add extra nops before jump
2432 */
2433 int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
2434 if (gap != 1) {
2435 tcg_out_nopn(s, gap - 1);
2436 }
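/*
 * The JMP opcode is a single byte, so aligning s->code_ptr + 1 aligns
 * the 32-bit displacement that tb_target_set_jmp_target patches; e.g.
 * with code_ptr % 4 == 0 the gap is 4 and a 3-byte nop is emitted.
 */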
2437 tcg_out8(s, OPC_JMP_long); /* jmp im */
2438 set_jmp_insn_offset(s, which);
2439 tcg_out32(s, 0);
2440 set_jmp_reset_offset(s, which);
2441 }
2442
2443 void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
2444 uintptr_t jmp_rx, uintptr_t jmp_rw)
2445 {
2446 /* patch the branch destination */
2447 uintptr_t addr = tb->jmp_target_addr[n];
2448 qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4));
2449 /* no need to flush icache explicitly */
2450 }
2451
2452 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2453 const TCGArg args[TCG_MAX_OP_ARGS],
2454 const int const_args[TCG_MAX_OP_ARGS])
2455 {
2456 TCGArg a0, a1, a2;
2457 int c, const_a2, vexop, rexw = 0;
2458
2459 #if TCG_TARGET_REG_BITS == 64
2460 # define OP_32_64(x) \
2461 case glue(glue(INDEX_op_, x), _i64): \
2462 rexw = P_REXW; /* FALLTHRU */ \
2463 case glue(glue(INDEX_op_, x), _i32)
2464 #else
2465 # define OP_32_64(x) \
2466 case glue(glue(INDEX_op_, x), _i32)
2467 #endif
2468
2469 /* Hoist the loads of the most common arguments. */
2470 a0 = args[0];
2471 a1 = args[1];
2472 a2 = args[2];
2473 const_a2 = const_args[2];
2474
2475 switch (opc) {
2476 case INDEX_op_goto_ptr:
2477 /* jmp to the given host address (could be epilogue) */
2478 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2479 break;
2480 case INDEX_op_br:
2481 tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2482 break;
2483 OP_32_64(ld8u):
2484 /* Note that we can ignore REXW for the zero-extend to 64-bit. */
2485 tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2486 break;
2487 OP_32_64(ld8s):
2488 tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2489 break;
2490 OP_32_64(ld16u):
2491 /* Note that we can ignore REXW for the zero-extend to 64-bit. */
2492 tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2493 break;
2494 OP_32_64(ld16s):
2495 tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2496 break;
2497 #if TCG_TARGET_REG_BITS == 64
2498 case INDEX_op_ld32u_i64:
2499 #endif
2500 case INDEX_op_ld_i32:
2501 tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2502 break;
2503
2504 OP_32_64(st8):
2505 if (const_args[0]) {
2506 tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2507 tcg_out8(s, a0);
2508 } else {
2509 tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2510 }
2511 break;
2512 OP_32_64(st16):
2513 if (const_args[0]) {
2514 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2515 tcg_out16(s, a0);
2516 } else {
2517 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2518 }
2519 break;
2520 #if TCG_TARGET_REG_BITS == 64
2521 case INDEX_op_st32_i64:
2522 #endif
2523 case INDEX_op_st_i32:
2524 if (const_args[0]) {
2525 tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2526 tcg_out32(s, a0);
2527 } else {
2528 tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2529 }
2530 break;
2531
2532 OP_32_64(add):
2533 /* For 3-operand addition, use LEA. */
2534 if (a0 != a1) {
2535 TCGArg c3 = 0;
2536 if (const_a2) {
2537 c3 = a2, a2 = -1;
2538 } else if (a0 == a2) {
2539 /* Watch out for dest = src + dest, since we've removed
2540 the matching constraint on the add. */
2541 tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2542 break;
2543 }
2544
2545 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2546 break;
2547 }
2548 c = ARITH_ADD;
2549 goto gen_arith;
2550 OP_32_64(sub):
2551 c = ARITH_SUB;
2552 goto gen_arith;
2553 OP_32_64(and):
2554 c = ARITH_AND;
2555 goto gen_arith;
2556 OP_32_64(or):
2557 c = ARITH_OR;
2558 goto gen_arith;
2559 OP_32_64(xor):
2560 c = ARITH_XOR;
2561 goto gen_arith;
2562 gen_arith:
2563 if (const_a2) {
2564 tgen_arithi(s, c + rexw, a0, a2, 0);
2565 } else {
2566 tgen_arithr(s, c + rexw, a0, a2);
2567 }
2568 break;
2569
2570 OP_32_64(andc):
2571 if (const_a2) {
2572 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2573 tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2574 } else {
2575 tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2576 }
2577 break;
2578
2579 OP_32_64(mul):
2580 if (const_a2) {
2581 int32_t val;
2582 val = a2;
2583 if (val == (int8_t)val) {
2584 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2585 tcg_out8(s, val);
2586 } else {
2587 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2588 tcg_out32(s, val);
2589 }
2590 } else {
2591 tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2592 }
2593 break;
2594
2595 OP_32_64(div2):
2596 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2597 break;
2598 OP_32_64(divu2):
2599 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2600 break;
2601
2602 OP_32_64(shl):
2603 /* For small constant 3-operand shift, use LEA. */
2604 if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2605 if (a2 - 1 == 0) {
2606 /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2607 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2608 } else {
2609 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2610 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2611 }
2612 break;
2613 }
2614 c = SHIFT_SHL;
2615 vexop = OPC_SHLX;
2616 goto gen_shift_maybe_vex;
2617 OP_32_64(shr):
2618 c = SHIFT_SHR;
2619 vexop = OPC_SHRX;
2620 goto gen_shift_maybe_vex;
2621 OP_32_64(sar):
2622 c = SHIFT_SAR;
2623 vexop = OPC_SARX;
2624 goto gen_shift_maybe_vex;
2625 OP_32_64(rotl):
2626 c = SHIFT_ROL;
2627 goto gen_shift;
2628 OP_32_64(rotr):
2629 c = SHIFT_ROR;
2630 goto gen_shift;
2631 gen_shift_maybe_vex:
2632 if (have_bmi2) {
2633 if (!const_a2) {
2634 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2635 break;
2636 }
2637 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2638 }
2639 /* FALLTHRU */
2640 gen_shift:
2641 if (const_a2) {
2642 tcg_out_shifti(s, c + rexw, a0, a2);
2643 } else {
2644 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2645 }
2646 break;
2647
2648 OP_32_64(ctz):
2649 tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2650 break;
2651 OP_32_64(clz):
2652 tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2653 break;
2654 OP_32_64(ctpop):
2655 tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2656 break;
2657
2658 OP_32_64(brcond):
2659 tcg_out_brcond(s, rexw, a2, a0, a1, const_args[1],
2660 arg_label(args[3]), 0);
2661 break;
2662 OP_32_64(setcond):
2663 tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, false);
2664 break;
2665 OP_32_64(negsetcond):
2666 tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, true);
2667 break;
2668 OP_32_64(movcond):
2669 tcg_out_movcond(s, rexw, args[5], a0, a1, a2, const_a2, args[3]);
2670 break;
2671
2672 OP_32_64(bswap16):
2673 if (a2 & TCG_BSWAP_OS) {
2674 /* Output must be sign-extended. */
2675 if (rexw) {
2676 tcg_out_bswap64(s, a0);
2677 tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48);
2678 } else {
2679 tcg_out_bswap32(s, a0);
2680 tcg_out_shifti(s, SHIFT_SAR, a0, 16);
2681 }
2682 } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
2683 /* Output must be zero-extended, but input isn't. */
2684 tcg_out_bswap32(s, a0);
2685 tcg_out_shifti(s, SHIFT_SHR, a0, 16);
2686 } else {
2687 tcg_out_rolw_8(s, a0);
2688 }
2689 break;
2690 OP_32_64(bswap32):
2691 tcg_out_bswap32(s, a0);
2692 if (rexw && (a2 & TCG_BSWAP_OS)) {
2693 tcg_out_ext32s(s, a0, a0);
2694 }
2695 break;
2696
2697 OP_32_64(neg):
2698 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2699 break;
2700 OP_32_64(not):
2701 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2702 break;
2703
2704 case INDEX_op_qemu_ld_a64_i32:
2705 if (TCG_TARGET_REG_BITS == 32) {
2706 tcg_out_qemu_ld(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2707 break;
2708 }
2709 /* fall through */
2710 case INDEX_op_qemu_ld_a32_i32:
2711 tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2712 break;
2713 case INDEX_op_qemu_ld_a32_i64:
2714 if (TCG_TARGET_REG_BITS == 64) {
2715 tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2716 } else {
2717 tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2718 }
2719 break;
2720 case INDEX_op_qemu_ld_a64_i64:
2721 if (TCG_TARGET_REG_BITS == 64) {
2722 tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2723 } else {
2724 tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2725 }
2726 break;
2727 case INDEX_op_qemu_ld_a32_i128:
2728 case INDEX_op_qemu_ld_a64_i128:
2729 tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2730 tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
2731 break;
2732
2733 case INDEX_op_qemu_st_a64_i32:
2734 case INDEX_op_qemu_st8_a64_i32:
2735 if (TCG_TARGET_REG_BITS == 32) {
2736 tcg_out_qemu_st(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2737 break;
2738 }
2739 /* fall through */
2740 case INDEX_op_qemu_st_a32_i32:
2741 case INDEX_op_qemu_st8_a32_i32:
2742 tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2743 break;
2744 case INDEX_op_qemu_st_a32_i64:
2745 if (TCG_TARGET_REG_BITS == 64) {
2746 tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2747 } else {
2748 tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2749 }
2750 break;
2751 case INDEX_op_qemu_st_a64_i64:
2752 if (TCG_TARGET_REG_BITS == 64) {
2753 tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2754 } else {
2755 tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2756 }
2757 break;
2758 case INDEX_op_qemu_st_a32_i128:
2759 case INDEX_op_qemu_st_a64_i128:
2760 tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2761 tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
2762 break;
2763
2764 OP_32_64(mulu2):
2765 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2766 break;
2767 OP_32_64(muls2):
2768 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2769 break;
2770 OP_32_64(add2):
2771 if (const_args[4]) {
2772 tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2773 } else {
2774 tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2775 }
2776 if (const_args[5]) {
2777 tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2778 } else {
2779 tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2780 }
2781 break;
2782 OP_32_64(sub2):
2783 if (const_args[4]) {
2784 tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2785 } else {
2786 tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2787 }
2788 if (const_args[5]) {
2789 tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2790 } else {
2791 tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2792 }
2793 break;
2794
2795 #if TCG_TARGET_REG_BITS == 32
2796 case INDEX_op_brcond2_i32:
2797 tcg_out_brcond2(s, args, const_args, 0);
2798 break;
2799 case INDEX_op_setcond2_i32:
2800 tcg_out_setcond2(s, args, const_args);
2801 break;
2802 #else /* TCG_TARGET_REG_BITS == 64 */
2803 case INDEX_op_ld32s_i64:
2804 tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2805 break;
2806 case INDEX_op_ld_i64:
2807 tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2808 break;
2809 case INDEX_op_st_i64:
2810 if (const_args[0]) {
2811 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2812 tcg_out32(s, a0);
2813 } else {
2814 tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2815 }
2816 break;
2817
2818 case INDEX_op_bswap64_i64:
2819 tcg_out_bswap64(s, a0);
2820 break;
2821 case INDEX_op_extrh_i64_i32:
2822 tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2823 break;
2824 #endif
2825
2826 OP_32_64(deposit):
2827 if (args[3] == 0 && args[4] == 8) {
2828 /* load bits 0..7 */
2829 if (const_a2) {
2830 tcg_out_opc(s, OPC_MOVB_Ib | P_REXB_RM | LOWREGMASK(a0),
2831 0, a0, 0);
2832 tcg_out8(s, a2);
2833 } else {
2834 tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2835 }
2836 } else if (TCG_TARGET_REG_BITS == 32 && args[3] == 8 && args[4] == 8) {
2837 /* load bits 8..15 */
2838 if (const_a2) {
2839 tcg_out8(s, OPC_MOVB_Ib + a0 + 4);
2840 tcg_out8(s, a2);
2841 } else {
2842 tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2843 }
2844 } else if (args[3] == 0 && args[4] == 16) {
2845 /* load bits 0..15 */
2846 if (const_a2) {
2847 tcg_out_opc(s, OPC_MOVL_Iv | P_DATA16 | LOWREGMASK(a0),
2848 0, a0, 0);
2849 tcg_out16(s, a2);
2850 } else {
2851 tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2852 }
2853 } else {
2854 g_assert_not_reached();
2855 }
2856 break;
2857
2858 case INDEX_op_extract_i64:
2859 if (a2 + args[3] == 32) {
2860 /* This is a 32-bit zero-extending right shift. */
2861 tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2862 tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2863 break;
2864 }
2865 /* FALLTHRU */
2866 case INDEX_op_extract_i32:
2867 /* On the off-chance that we can use the high-byte registers, do so.
2868 Otherwise we emit the same ext16 + shift pattern that we
2869 would have gotten from the normal tcg-op.c expansion. */
2870 tcg_debug_assert(a2 == 8 && args[3] == 8);
2871 if (a1 < 4 && a0 < 8) {
2872 tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2873 } else {
2874 tcg_out_ext16u(s, a0, a1);
2875 tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2876 }
2877 break;
2878
2879 case INDEX_op_sextract_i32:
2880 /* We don't implement sextract_i64, as we cannot sign-extend to
2881 64-bits without using the REX prefix that explicitly excludes
2882 access to the high-byte registers. */
2883 tcg_debug_assert(a2 == 8 && args[3] == 8);
2884 if (a1 < 4 && a0 < 8) {
2885 tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2886 } else {
2887 tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1);
2888 tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2889 }
2890 break;
2891
2892 OP_32_64(extract2):
2893 /* Note that SHRD outputs to the r/m operand. */
2894 tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
2895 tcg_out8(s, args[3]);
2896 break;
2897
2898 case INDEX_op_mb:
2899 tcg_out_mb(s, a0);
2900 break;
2901 case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
2902 case INDEX_op_mov_i64:
2903 case INDEX_op_call: /* Always emitted via tcg_out_call. */
2904 case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
2905 case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */
2906 case INDEX_op_ext8s_i32: /* Always emitted via tcg_reg_alloc_op. */
2907 case INDEX_op_ext8s_i64:
2908 case INDEX_op_ext8u_i32:
2909 case INDEX_op_ext8u_i64:
2910 case INDEX_op_ext16s_i32:
2911 case INDEX_op_ext16s_i64:
2912 case INDEX_op_ext16u_i32:
2913 case INDEX_op_ext16u_i64:
2914 case INDEX_op_ext32s_i64:
2915 case INDEX_op_ext32u_i64:
2916 case INDEX_op_ext_i32_i64:
2917 case INDEX_op_extu_i32_i64:
2918 case INDEX_op_extrl_i64_i32:
2919 default:
2920 g_assert_not_reached();
2921 }
2922
2923 #undef OP_32_64
2924 }
2925
2926 static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2927 unsigned vecl, unsigned vece,
2928 const TCGArg args[TCG_MAX_OP_ARGS],
2929 const int const_args[TCG_MAX_OP_ARGS])
2930 {
2931 static int const add_insn[4] = {
2932 OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2933 };
2934 static int const ssadd_insn[4] = {
2935 OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
2936 };
2937 static int const usadd_insn[4] = {
2938 OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
2939 };
2940 static int const sub_insn[4] = {
2941 OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2942 };
2943 static int const sssub_insn[4] = {
2944 OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
2945 };
2946 static int const ussub_insn[4] = {
2947 OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
2948 };
2949 static int const mul_insn[4] = {
2950 OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ
2951 };
2952 static int const shift_imm_insn[4] = {
2953 OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2954 };
2955 static int const cmpeq_insn[4] = {
2956 OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2957 };
2958 static int const cmpgt_insn[4] = {
2959 OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2960 };
2961 static int const punpckl_insn[4] = {
2962 OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2963 };
2964 static int const punpckh_insn[4] = {
2965 OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2966 };
2967 static int const packss_insn[4] = {
2968 OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2969 };
2970 static int const packus_insn[4] = {
2971 OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2972 };
2973 static int const smin_insn[4] = {
2974 OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ
2975 };
2976 static int const smax_insn[4] = {
2977 OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ
2978 };
2979 static int const umin_insn[4] = {
2980 OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ
2981 };
2982 static int const umax_insn[4] = {
2983 OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ
2984 };
2985 static int const rotlv_insn[4] = {
2986 OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ
2987 };
2988 static int const rotrv_insn[4] = {
2989 OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ
2990 };
2991 static int const shlv_insn[4] = {
2992 OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ
2993 };
2994 static int const shrv_insn[4] = {
2995 OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ
2996 };
2997 static int const sarv_insn[4] = {
2998 OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ
2999 };
3000 static int const shls_insn[4] = {
3001 OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
3002 };
3003 static int const shrs_insn[4] = {
3004 OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
3005 };
3006 static int const sars_insn[4] = {
3007 OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ
3008 };
3009 static int const vpshldi_insn[4] = {
3010 OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ
3011 };
3012 static int const vpshldv_insn[4] = {
3013 OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ
3014 };
3015 static int const vpshrdv_insn[4] = {
3016 OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ
3017 };
3018 static int const abs_insn[4] = {
3019 OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ
3020 };
3021
3022 TCGType type = vecl + TCG_TYPE_V64;
3023 int insn, sub;
3024 TCGArg a0, a1, a2, a3;
3025
3026 a0 = args[0];
3027 a1 = args[1];
3028 a2 = args[2];
3029
3030 switch (opc) {
3031 case INDEX_op_add_vec:
3032 insn = add_insn[vece];
3033 goto gen_simd;
3034 case INDEX_op_ssadd_vec:
3035 insn = ssadd_insn[vece];
3036 goto gen_simd;
3037 case INDEX_op_usadd_vec:
3038 insn = usadd_insn[vece];
3039 goto gen_simd;
3040 case INDEX_op_sub_vec:
3041 insn = sub_insn[vece];
3042 goto gen_simd;
3043 case INDEX_op_sssub_vec:
3044 insn = sssub_insn[vece];
3045 goto gen_simd;
3046 case INDEX_op_ussub_vec:
3047 insn = ussub_insn[vece];
3048 goto gen_simd;
3049 case INDEX_op_mul_vec:
3050 insn = mul_insn[vece];
3051 goto gen_simd;
3052 case INDEX_op_and_vec:
3053 insn = OPC_PAND;
3054 goto gen_simd;
3055 case INDEX_op_or_vec:
3056 insn = OPC_POR;
3057 goto gen_simd;
3058 case INDEX_op_xor_vec:
3059 insn = OPC_PXOR;
3060 goto gen_simd;
3061 case INDEX_op_smin_vec:
3062 insn = smin_insn[vece];
3063 goto gen_simd;
3064 case INDEX_op_umin_vec:
3065 insn = umin_insn[vece];
3066 goto gen_simd;
3067 case INDEX_op_smax_vec:
3068 insn = smax_insn[vece];
3069 goto gen_simd;
3070 case INDEX_op_umax_vec:
3071 insn = umax_insn[vece];
3072 goto gen_simd;
3073 case INDEX_op_shlv_vec:
3074 insn = shlv_insn[vece];
3075 goto gen_simd;
3076 case INDEX_op_shrv_vec:
3077 insn = shrv_insn[vece];
3078 goto gen_simd;
3079 case INDEX_op_sarv_vec:
3080 insn = sarv_insn[vece];
3081 goto gen_simd;
3082 case INDEX_op_rotlv_vec:
3083 insn = rotlv_insn[vece];
3084 goto gen_simd;
3085 case INDEX_op_rotrv_vec:
3086 insn = rotrv_insn[vece];
3087 goto gen_simd;
3088 case INDEX_op_shls_vec:
3089 insn = shls_insn[vece];
3090 goto gen_simd;
3091 case INDEX_op_shrs_vec:
3092 insn = shrs_insn[vece];
3093 goto gen_simd;
3094 case INDEX_op_sars_vec:
3095 insn = sars_insn[vece];
3096 goto gen_simd;
3097 case INDEX_op_x86_punpckl_vec:
3098 insn = punpckl_insn[vece];
3099 goto gen_simd;
3100 case INDEX_op_x86_punpckh_vec:
3101 insn = punpckh_insn[vece];
3102 goto gen_simd;
3103 case INDEX_op_x86_packss_vec:
3104 insn = packss_insn[vece];
3105 goto gen_simd;
3106 case INDEX_op_x86_packus_vec:
3107 insn = packus_insn[vece];
3108 goto gen_simd;
3109 case INDEX_op_x86_vpshldv_vec:
3110 insn = vpshldv_insn[vece];
3111 a1 = a2;
3112 a2 = args[3];
3113 goto gen_simd;
3114 case INDEX_op_x86_vpshrdv_vec:
3115 insn = vpshrdv_insn[vece];
3116 a1 = a2;
3117 a2 = args[3];
3118 goto gen_simd;
3119 #if TCG_TARGET_REG_BITS == 32
3120 case INDEX_op_dup2_vec:
3121 /* First merge the two 32-bit inputs to a single 64-bit element. */
3122 tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
3123 /* Then replicate the 64-bit elements across the rest of the vector. */
3124 if (type != TCG_TYPE_V64) {
3125 tcg_out_dup_vec(s, type, MO_64, a0, a0);
3126 }
3127 break;
3128 #endif
3129 case INDEX_op_abs_vec:
3130 insn = abs_insn[vece];
3131 a2 = a1;
3132 a1 = 0;
3133 goto gen_simd;
3134 gen_simd:
3135 tcg_debug_assert(insn != OPC_UD2);
3136 if (type == TCG_TYPE_V256) {
3137 insn |= P_VEXL;
3138 }
3139 tcg_out_vex_modrm(s, insn, a0, a1, a2);
3140 break;
3141
3142 case INDEX_op_cmp_vec:
3143 sub = args[3];
3144 if (sub == TCG_COND_EQ) {
3145 insn = cmpeq_insn[vece];
3146 } else if (sub == TCG_COND_GT) {
3147 insn = cmpgt_insn[vece];
3148 } else {
3149 g_assert_not_reached();
3150 }
3151 goto gen_simd;
3152
3153 case INDEX_op_andc_vec:
3154 insn = OPC_PANDN;
3155 if (type == TCG_TYPE_V256) {
3156 insn |= P_VEXL;
3157 }
3158 tcg_out_vex_modrm(s, insn, a0, a2, a1);
3159 break;
3160
3161 case INDEX_op_shli_vec:
3162 insn = shift_imm_insn[vece];
3163 sub = 6;
3164 goto gen_shift;
3165 case INDEX_op_shri_vec:
3166 insn = shift_imm_insn[vece];
3167 sub = 2;
3168 goto gen_shift;
3169 case INDEX_op_sari_vec:
3170 if (vece == MO_64) {
3171 insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX;
3172 } else {
3173 insn = shift_imm_insn[vece];
3174 }
3175 sub = 4;
3176 goto gen_shift;
3177 case INDEX_op_rotli_vec:
3178 insn = OPC_PSHIFTD_Ib | P_EVEX; /* VPROL[DQ] */
3179 if (vece == MO_64) {
3180 insn |= P_VEXW;
3181 }
3182 sub = 1;
3183 goto gen_shift;
3184 gen_shift:
3185 tcg_debug_assert(vece != MO_8);
3186 if (type == TCG_TYPE_V256) {
3187 insn |= P_VEXL;
3188 }
3189 tcg_out_vex_modrm(s, insn, sub, a0, a1);
3190 tcg_out8(s, a2);
3191 break;
3192
3193 case INDEX_op_ld_vec:
3194 tcg_out_ld(s, type, a0, a1, a2);
3195 break;
3196 case INDEX_op_st_vec:
3197 tcg_out_st(s, type, a0, a1, a2);
3198 break;
3199 case INDEX_op_dupm_vec:
3200 tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
3201 break;
3202
3203 case INDEX_op_x86_shufps_vec:
3204 insn = OPC_SHUFPS;
3205 sub = args[3];
3206 goto gen_simd_imm8;
3207 case INDEX_op_x86_blend_vec:
3208 if (vece == MO_16) {
3209 insn = OPC_PBLENDW;
3210 } else if (vece == MO_32) {
3211 insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
3212 } else {
3213 g_assert_not_reached();
3214 }
3215 sub = args[3];
3216 goto gen_simd_imm8;
3217 case INDEX_op_x86_vperm2i128_vec:
3218 insn = OPC_VPERM2I128;
3219 sub = args[3];
3220 goto gen_simd_imm8;
3221 case INDEX_op_x86_vpshldi_vec:
3222 insn = vpshldi_insn[vece];
3223 sub = args[3];
3224 goto gen_simd_imm8;
3225
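/*
 * The VPTERNLOG immediate is a 3-input truth table applied bitwise:
 * bit (a*4 + b*2 + c) of the immediate is the result for operand bits
 * a (destination), b and c.  E.g. 0x33 = 00110011b is set exactly when
 * b is clear (NOT b), and 0x99 = 10011001b when b == c (XNOR).
 */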
3226 case INDEX_op_not_vec:
3227 insn = OPC_VPTERNLOGQ;
3228 a2 = a1;
3229 sub = 0x33; /* !B */
3230 goto gen_simd_imm8;
3231 case INDEX_op_nor_vec:
3232 insn = OPC_VPTERNLOGQ;
3233 sub = 0x11; /* norCB */
3234 goto gen_simd_imm8;
3235 case INDEX_op_nand_vec:
3236 insn = OPC_VPTERNLOGQ;
3237 sub = 0x77; /* nandCB */
3238 goto gen_simd_imm8;
3239 case INDEX_op_eqv_vec:
3240 insn = OPC_VPTERNLOGQ;
3241 sub = 0x99; /* xnorCB */
3242 goto gen_simd_imm8;
3243 case INDEX_op_orc_vec:
3244 insn = OPC_VPTERNLOGQ;
3245 sub = 0xdd; /* orB!C */
3246 goto gen_simd_imm8;
3247
3248 case INDEX_op_bitsel_vec:
3249 insn = OPC_VPTERNLOGQ;
3250 a3 = args[3];
3251 if (a0 == a1) {
3252 a1 = a2;
3253 a2 = a3;
3254 sub = 0xca; /* A?B:C */
3255 } else if (a0 == a2) {
3256 a2 = a3;
3257 sub = 0xe2; /* B?A:C */
3258 } else {
3259 tcg_out_mov(s, type, a0, a3);
3260 sub = 0xb8; /* B?C:A */
3261 }
3262 goto gen_simd_imm8;
3263
3264 gen_simd_imm8:
3265 tcg_debug_assert(insn != OPC_UD2);
3266 if (type == TCG_TYPE_V256) {
3267 insn |= P_VEXL;
3268 }
3269 tcg_out_vex_modrm(s, insn, a0, a1, a2);
3270 tcg_out8(s, sub);
3271 break;
3272
3273 case INDEX_op_x86_vpblendvb_vec:
3274 insn = OPC_VPBLENDVB;
3275 if (type == TCG_TYPE_V256) {
3276 insn |= P_VEXL;
3277 }
3278 tcg_out_vex_modrm(s, insn, a0, a1, a2);
3279 tcg_out8(s, args[3] << 4);
3280 break;
3281
3282 case INDEX_op_x86_psrldq_vec:
3283 tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
3284 tcg_out8(s, a2);
3285 break;
3286
3287 case INDEX_op_mov_vec: /* Always emitted via tcg_out_mov. */
3288 case INDEX_op_dup_vec: /* Always emitted via tcg_out_dup_vec. */
3289 default:
3290 g_assert_not_reached();
3291 }
3292 }
3293
3294 static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
3295 {
3296 switch (op) {
3297 case INDEX_op_goto_ptr:
3298 return C_O0_I1(r);
3299
3300 case INDEX_op_ld8u_i32:
3301 case INDEX_op_ld8u_i64:
3302 case INDEX_op_ld8s_i32:
3303 case INDEX_op_ld8s_i64:
3304 case INDEX_op_ld16u_i32:
3305 case INDEX_op_ld16u_i64:
3306 case INDEX_op_ld16s_i32:
3307 case INDEX_op_ld16s_i64:
3308 case INDEX_op_ld_i32:
3309 case INDEX_op_ld32u_i64:
3310 case INDEX_op_ld32s_i64:
3311 case INDEX_op_ld_i64:
3312 return C_O1_I1(r, r);
3313
3314 case INDEX_op_st8_i32:
3315 case INDEX_op_st8_i64:
3316 return C_O0_I2(qi, r);
3317
3318 case INDEX_op_st16_i32:
3319 case INDEX_op_st16_i64:
3320 case INDEX_op_st_i32:
3321 case INDEX_op_st32_i64:
3322 return C_O0_I2(ri, r);
3323
3324 case INDEX_op_st_i64:
3325 return C_O0_I2(re, r);
3326
3327 case INDEX_op_add_i32:
3328 case INDEX_op_add_i64:
3329 return C_O1_I2(r, r, re);
3330
3331 case INDEX_op_sub_i32:
3332 case INDEX_op_sub_i64:
3333 case INDEX_op_mul_i32:
3334 case INDEX_op_mul_i64:
3335 case INDEX_op_or_i32:
3336 case INDEX_op_or_i64:
3337 case INDEX_op_xor_i32:
3338 case INDEX_op_xor_i64:
3339 return C_O1_I2(r, 0, re);
3340
3341 case INDEX_op_and_i32:
3342 case INDEX_op_and_i64:
3343 return C_O1_I2(r, 0, reZ);
3344
3345 case INDEX_op_andc_i32:
3346 case INDEX_op_andc_i64:
3347 return C_O1_I2(r, r, rI);
3348
3349 case INDEX_op_shl_i32:
3350 case INDEX_op_shl_i64:
3351 case INDEX_op_shr_i32:
3352 case INDEX_op_shr_i64:
3353 case INDEX_op_sar_i32:
3354 case INDEX_op_sar_i64:
3355 return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);
3356
3357 case INDEX_op_rotl_i32:
3358 case INDEX_op_rotl_i64:
3359 case INDEX_op_rotr_i32:
3360 case INDEX_op_rotr_i64:
3361 return C_O1_I2(r, 0, ci);
3362
3363 case INDEX_op_brcond_i32:
3364 case INDEX_op_brcond_i64:
3365 return C_O0_I2(r, re);
3366
3367 case INDEX_op_bswap16_i32:
3368 case INDEX_op_bswap16_i64:
3369 case INDEX_op_bswap32_i32:
3370 case INDEX_op_bswap32_i64:
3371 case INDEX_op_bswap64_i64:
3372 case INDEX_op_neg_i32:
3373 case INDEX_op_neg_i64:
3374 case INDEX_op_not_i32:
3375 case INDEX_op_not_i64:
3376 case INDEX_op_extrh_i64_i32:
3377 return C_O1_I1(r, 0);
3378
3379 case INDEX_op_ext8s_i32:
3380 case INDEX_op_ext8s_i64:
3381 case INDEX_op_ext8u_i32:
3382 case INDEX_op_ext8u_i64:
3383 return C_O1_I1(r, q);
3384
3385 case INDEX_op_ext16s_i32:
3386 case INDEX_op_ext16s_i64:
3387 case INDEX_op_ext16u_i32:
3388 case INDEX_op_ext16u_i64:
3389 case INDEX_op_ext32s_i64:
3390 case INDEX_op_ext32u_i64:
3391 case INDEX_op_ext_i32_i64:
3392 case INDEX_op_extu_i32_i64:
3393 case INDEX_op_extrl_i64_i32:
3394 case INDEX_op_extract_i32:
3395 case INDEX_op_extract_i64:
3396 case INDEX_op_sextract_i32:
3397 case INDEX_op_ctpop_i32:
3398 case INDEX_op_ctpop_i64:
3399 return C_O1_I1(r, r);
3400
3401 case INDEX_op_extract2_i32:
3402 case INDEX_op_extract2_i64:
3403 return C_O1_I2(r, 0, r);
3404
3405 case INDEX_op_deposit_i32:
3406 case INDEX_op_deposit_i64:
3407 return C_O1_I2(q, 0, qi);
3408
3409 case INDEX_op_setcond_i32:
3410 case INDEX_op_setcond_i64:
3411 case INDEX_op_negsetcond_i32:
3412 case INDEX_op_negsetcond_i64:
3413 return C_O1_I2(q, r, re);
3414
3415 case INDEX_op_movcond_i32:
3416 case INDEX_op_movcond_i64:
3417 return C_O1_I4(r, r, re, r, 0);
3418
3419 case INDEX_op_div2_i32:
3420 case INDEX_op_div2_i64:
3421 case INDEX_op_divu2_i32:
3422 case INDEX_op_divu2_i64:
3423 return C_O2_I3(a, d, 0, 1, r);
3424
3425 case INDEX_op_mulu2_i32:
3426 case INDEX_op_mulu2_i64:
3427 case INDEX_op_muls2_i32:
3428 case INDEX_op_muls2_i64:
3429 return C_O2_I2(a, d, a, r);
3430
3431 case INDEX_op_add2_i32:
3432 case INDEX_op_add2_i64:
3433 case INDEX_op_sub2_i32:
3434 case INDEX_op_sub2_i64:
3435 return C_N1_O1_I4(r, r, 0, 1, re, re);
3436
3437 case INDEX_op_ctz_i32:
3438 case INDEX_op_ctz_i64:
3439 return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3440
3441 case INDEX_op_clz_i32:
3442 case INDEX_op_clz_i64:
3443 return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3444
3445 case INDEX_op_qemu_ld_a32_i32:
3446 return C_O1_I1(r, L);
3447 case INDEX_op_qemu_ld_a64_i32:
3448 return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O1_I2(r, L, L);
3449
3450 case INDEX_op_qemu_st_a32_i32:
3451 return C_O0_I2(L, L);
3452 case INDEX_op_qemu_st_a64_i32:
3453 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3454 case INDEX_op_qemu_st8_a32_i32:
3455 return C_O0_I2(s, L);
3456 case INDEX_op_qemu_st8_a64_i32:
3457 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(s, L) : C_O0_I3(s, L, L);
3458
3459 case INDEX_op_qemu_ld_a32_i64:
3460 return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I1(r, r, L);
3461 case INDEX_op_qemu_ld_a64_i64:
3462 return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I2(r, r, L, L);
3463
3464 case INDEX_op_qemu_st_a32_i64:
3465 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3466 case INDEX_op_qemu_st_a64_i64:
3467 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I4(L, L, L, L);
3468
3469 case INDEX_op_qemu_ld_a32_i128:
3470 case INDEX_op_qemu_ld_a64_i128:
3471 tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
3472 return C_O2_I1(r, r, L);
3473 case INDEX_op_qemu_st_a32_i128:
3474 case INDEX_op_qemu_st_a64_i128:
3475 tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
3476 return C_O0_I3(L, L, L);
3477
3478 case INDEX_op_brcond2_i32:
3479 return C_O0_I4(r, r, ri, ri);
3480
3481 case INDEX_op_setcond2_i32:
3482 return C_O1_I4(r, r, r, ri, ri);
3483
3484 case INDEX_op_ld_vec:
3485 case INDEX_op_dupm_vec:
3486 return C_O1_I1(x, r);
3487
3488 case INDEX_op_st_vec:
3489 return C_O0_I2(x, r);
3490
3491 case INDEX_op_add_vec:
3492 case INDEX_op_sub_vec:
3493 case INDEX_op_mul_vec:
3494 case INDEX_op_and_vec:
3495 case INDEX_op_or_vec:
3496 case INDEX_op_xor_vec:
3497 case INDEX_op_andc_vec:
3498 case INDEX_op_orc_vec:
3499 case INDEX_op_nand_vec:
3500 case INDEX_op_nor_vec:
3501 case INDEX_op_eqv_vec:
3502 case INDEX_op_ssadd_vec:
3503 case INDEX_op_usadd_vec:
3504 case INDEX_op_sssub_vec:
3505 case INDEX_op_ussub_vec:
3506 case INDEX_op_smin_vec:
3507 case INDEX_op_umin_vec:
3508 case INDEX_op_smax_vec:
3509 case INDEX_op_umax_vec:
3510 case INDEX_op_shlv_vec:
3511 case INDEX_op_shrv_vec:
3512 case INDEX_op_sarv_vec:
3513 case INDEX_op_rotlv_vec:
3514 case INDEX_op_rotrv_vec:
3515 case INDEX_op_shls_vec:
3516 case INDEX_op_shrs_vec:
3517 case INDEX_op_sars_vec:
3518 case INDEX_op_cmp_vec:
3519 case INDEX_op_x86_shufps_vec:
3520 case INDEX_op_x86_blend_vec:
3521 case INDEX_op_x86_packss_vec:
3522 case INDEX_op_x86_packus_vec:
3523 case INDEX_op_x86_vperm2i128_vec:
3524 case INDEX_op_x86_punpckl_vec:
3525 case INDEX_op_x86_punpckh_vec:
3526 case INDEX_op_x86_vpshldi_vec:
3527 #if TCG_TARGET_REG_BITS == 32
3528 case INDEX_op_dup2_vec:
3529 #endif
3530 return C_O1_I2(x, x, x);
3531
3532 case INDEX_op_abs_vec:
3533 case INDEX_op_dup_vec:
3534 case INDEX_op_not_vec:
3535 case INDEX_op_shli_vec:
3536 case INDEX_op_shri_vec:
3537 case INDEX_op_sari_vec:
3538 case INDEX_op_rotli_vec:
3539 case INDEX_op_x86_psrldq_vec:
3540 return C_O1_I1(x, x);
3541
3542 case INDEX_op_x86_vpshldv_vec:
3543 case INDEX_op_x86_vpshrdv_vec:
3544 return C_O1_I3(x, 0, x, x);
3545
3546 case INDEX_op_bitsel_vec:
3547 case INDEX_op_x86_vpblendvb_vec:
3548 return C_O1_I3(x, x, x, x);
3549
3550 default:
3551 g_assert_not_reached();
3552 }
3553 }
3554
3555 int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3556 {
3557 switch (opc) {
3558 case INDEX_op_add_vec:
3559 case INDEX_op_sub_vec:
3560 case INDEX_op_and_vec:
3561 case INDEX_op_or_vec:
3562 case INDEX_op_xor_vec:
3563 case INDEX_op_andc_vec:
3564 case INDEX_op_orc_vec:
3565 case INDEX_op_nand_vec:
3566 case INDEX_op_nor_vec:
3567 case INDEX_op_eqv_vec:
3568 case INDEX_op_not_vec:
3569 case INDEX_op_bitsel_vec:
3570 return 1;
3571 case INDEX_op_cmp_vec:
3572 case INDEX_op_cmpsel_vec:
3573 return -1;
3574
3575 case INDEX_op_rotli_vec:
3576 return have_avx512vl && vece >= MO_32 ? 1 : -1;
3577
3578 case INDEX_op_shli_vec:
3579 case INDEX_op_shri_vec:
3580 /* We must expand the operation for MO_8. */
3581 return vece == MO_8 ? -1 : 1;
3582
3583 case INDEX_op_sari_vec:
3584 switch (vece) {
3585 case MO_8:
3586 return -1;
3587 case MO_16:
3588 case MO_32:
3589 return 1;
3590 case MO_64:
3591 if (have_avx512vl) {
3592 return 1;
3593 }
3594 /*
3595 * We can emulate this for MO_64, but it does not pay off
3596 * unless we're producing at least 4 values.
3597 */
3598 return type >= TCG_TYPE_V256 ? -1 : 0;
3599 }
3600 return 0;
3601
3602 case INDEX_op_shls_vec:
3603 case INDEX_op_shrs_vec:
3604 return vece >= MO_16;
3605 case INDEX_op_sars_vec:
3606 switch (vece) {
3607 case MO_16:
3608 case MO_32:
3609 return 1;
3610 case MO_64:
3611 return have_avx512vl;
3612 }
3613 return 0;
3614 case INDEX_op_rotls_vec:
3615 return vece >= MO_16 ? -1 : 0;
3616
3617 case INDEX_op_shlv_vec:
3618 case INDEX_op_shrv_vec:
3619 switch (vece) {
3620 case MO_16:
3621 return have_avx512bw;
3622 case MO_32:
3623 case MO_64:
3624 return have_avx2;
3625 }
3626 return 0;
3627 case INDEX_op_sarv_vec:
3628 switch (vece) {
3629 case MO_16:
3630 return have_avx512bw;
3631 case MO_32:
3632 return have_avx2;
3633 case MO_64:
3634 return have_avx512vl;
3635 }
3636 return 0;
3637 case INDEX_op_rotlv_vec:
3638 case INDEX_op_rotrv_vec:
3639 switch (vece) {
3640 case MO_16:
3641 return have_avx512vbmi2 ? -1 : 0;
3642 case MO_32:
3643 case MO_64:
3644 return have_avx512vl ? 1 : have_avx2 ? -1 : 0;
3645 }
3646 return 0;
3647
3648 case INDEX_op_mul_vec:
3649 switch (vece) {
3650 case MO_8:
3651 return -1;
3652 case MO_64:
3653 return have_avx512dq;
3654 }
3655 return 1;
3656
3657 case INDEX_op_ssadd_vec:
3658 case INDEX_op_usadd_vec:
3659 case INDEX_op_sssub_vec:
3660 case INDEX_op_ussub_vec:
3661 return vece <= MO_16;
3662 case INDEX_op_smin_vec:
3663 case INDEX_op_smax_vec:
3664 case INDEX_op_umin_vec:
3665 case INDEX_op_umax_vec:
3666 case INDEX_op_abs_vec:
3667 return vece <= MO_32 || have_avx512vl;
3668
3669 default:
3670 return 0;
3671 }
3672 }
3673
3674 static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
3675 TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3676 {
3677 TCGv_vec t1, t2;
3678
3679 tcg_debug_assert(vece == MO_8);
3680
3681 t1 = tcg_temp_new_vec(type);
3682 t2 = tcg_temp_new_vec(type);
3683
3684 /*
3685 * Unpack to W, shift, and repack. Tricky bits:
3686 * (1) Use punpck*bw x,x to produce DDCCBBAA,
3687 * i.e. duplicate in other half of the 16-bit lane.
3688 * (2) For right-shift, add 8 so that the high half of the lane
3689 * becomes zero. For left-shift and left-rotate, we must
3690 * shift up and down again.
3691 * (3) Step 2 leaves high half zero such that PACKUSWB
3692 * (pack with unsigned saturation) does not modify
3693 * the quantity.
3694 */
3695 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3696 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3697 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3698 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3699
3700 if (opc != INDEX_op_rotli_vec) {
3701 imm += 8;
3702 }
3703 if (opc == INDEX_op_shri_vec) {
3704 tcg_gen_shri_vec(MO_16, t1, t1, imm);
3705 tcg_gen_shri_vec(MO_16, t2, t2, imm);
3706 } else {
3707 tcg_gen_shli_vec(MO_16, t1, t1, imm);
3708 tcg_gen_shli_vec(MO_16, t2, t2, imm);
3709 tcg_gen_shri_vec(MO_16, t1, t1, 8);
3710 tcg_gen_shri_vec(MO_16, t2, t2, 8);
3711 }
3712
3713 vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3714 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3715 tcg_temp_free_vec(t1);
3716 tcg_temp_free_vec(t2);
3717 }
3718
3719 static void expand_vec_sari(TCGType type, unsigned vece,
3720 TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3721 {
3722 TCGv_vec t1, t2;
3723
3724 switch (vece) {
3725 case MO_8:
3726 /* Unpack to W, shift, and repack, as in expand_vec_shi. */
3727 t1 = tcg_temp_new_vec(type);
3728 t2 = tcg_temp_new_vec(type);
3729 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3730 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3731 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3732 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3733 tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3734 tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3735 vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3736 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3737 tcg_temp_free_vec(t1);
3738 tcg_temp_free_vec(t2);
3739 break;
3740
3741 case MO_64:
3742 t1 = tcg_temp_new_vec(type);
3743 if (imm <= 32) {
3744 /*
3745 * We can emulate a small sign extend by performing an arithmetic
3746 * 32-bit shift and overwriting the high half of a 64-bit logical
3747 * shift. Note that the ISA says shift of 32 is valid, but TCG
3748 * does not, so we have to bound the smaller shift -- we get the
3749 * same result in the high half either way.
3750 */
3751 tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
3752 tcg_gen_shri_vec(MO_64, v0, v1, imm);
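/* The 0xaa mask selects the odd dword elements, i.e. the high half of
 * each 64-bit lane, from the arithmetic shift result, keeping the
 * logical shift result in the low half.
 */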
3753 vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3754 tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3755 tcgv_vec_arg(t1), 0xaa);
3756 } else {
3757 /* Otherwise we need a compare against 0 to produce the sign
3758 * bits, then shift them into place and merge.
3759 */
3760 tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1,
3761 tcg_constant_vec(type, MO_64, 0), v1);
3762 tcg_gen_shri_vec(MO_64, v0, v1, imm);
3763 tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3764 tcg_gen_or_vec(MO_64, v0, v0, t1);
3765 }
3766 tcg_temp_free_vec(t1);
3767 break;
3768
3769 default:
3770 g_assert_not_reached();
3771 }
3772 }
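
/*
 * An illustrative stand-alone sketch of the imm > 32 path above; the
 * helper name model_sari_mo64_big is hypothetical and not part of the
 * backend.  The arithmetic shift is rebuilt from the logical shift plus
 * the sign bits produced by the compare against zero.  (The imm <= 32
 * path instead takes its high 32 bits from a 32-bit arithmetic shift,
 * merged in by the blend.)
 */
#if 0
#include <stdint.h>

static uint64_t model_sari_mo64_big(uint64_t v, unsigned imm) /* 32 < imm < 64 */
{
    /* cmp_vec(GT, 0, v): all-ones where v is negative, else zero */
    uint64_t sign = (int64_t)v < 0 ? ~UINT64_C(0) : 0;

    /* shri + shli + or */
    return (v >> imm) | (sign << (64 - imm));
}
#endif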
3773
3774 static void expand_vec_rotli(TCGType type, unsigned vece,
3775 TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3776 {
3777 TCGv_vec t;
3778
3779 if (vece == MO_8) {
3780 expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
3781 return;
3782 }
3783
3784 if (have_avx512vbmi2) {
3785 vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
3786 tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
3787 return;
3788 }
3789
3790 t = tcg_temp_new_vec(type);
3791 tcg_gen_shli_vec(vece, t, v1, imm);
3792 tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
3793 tcg_gen_or_vec(vece, v0, v0, t);
3794 tcg_temp_free_vec(t);
3795 }
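
/*
 * Note on the AVX512VBMI2 path above: VPSHLDI concatenates the two source
 * elements into a double-width value, shifts it left by imm, and returns
 * the high half; with both sources equal to v1 that is exactly a rotate
 * left of v1 by imm.
 */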
3796
3797 static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
3798 TCGv_vec v1, TCGv_vec sh, bool right)
3799 {
3800 TCGv_vec t;
3801
3802 if (have_avx512vbmi2) {
3803 vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec,
3804 type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1),
3805 tcgv_vec_arg(v1), tcgv_vec_arg(sh));
3806 return;
3807 }
3808
3809 t = tcg_temp_new_vec(type);
3810 tcg_gen_dupi_vec(vece, t, 8 << vece);
3811 tcg_gen_sub_vec(vece, t, t, sh);
3812 if (right) {
3813 tcg_gen_shlv_vec(vece, t, v1, t);
3814 tcg_gen_shrv_vec(vece, v0, v1, sh);
3815 } else {
3816 tcg_gen_shrv_vec(vece, t, v1, t);
3817 tcg_gen_shlv_vec(vece, v0, v1, sh);
3818 }
3819 tcg_gen_or_vec(vece, v0, v0, t);
3820 tcg_temp_free_vec(t);
3821 }
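
/*
 * A stand-alone sketch of one MO_32 lane of the expansion above, for
 * illustration only (model_rotrv_mo32 is hypothetical).  The rotate is
 * built from two variable shifts; the x86 variable-shift instructions
 * zero a lane whose count is >= the element width, so a shift count of
 * zero (t == 32) still produces the right answer.
 */
#if 0
#include <stdint.h>

static uint32_t model_rotrv_mo32(uint32_t x, uint32_t sh) /* 0 <= sh < 32 */
{
    uint32_t t = 32 - sh;                      /* dupi(32) ; sub */
    uint32_t hi = t >= 32 ? 0 : x << t;        /* shlv: count >= 32 gives 0 */

    return (x >> sh) | hi;                     /* shrv ... or */
}
#endif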
3822
3823 static void expand_vec_rotls(TCGType type, unsigned vece,
3824 TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
3825 {
3826 TCGv_vec t = tcg_temp_new_vec(type);
3827
3828 tcg_debug_assert(vece != MO_8);
3829
3830 if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) {
3831 tcg_gen_dup_i32_vec(vece, t, lsh);
3832 if (vece >= MO_32) {
3833 tcg_gen_rotlv_vec(vece, v0, v1, t);
3834 } else {
3835 expand_vec_rotv(type, vece, v0, v1, t, false);
3836 }
3837 } else {
3838 TCGv_i32 rsh = tcg_temp_new_i32();
3839
3840 tcg_gen_neg_i32(rsh, lsh);
3841 tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
3842 tcg_gen_shls_vec(vece, t, v1, lsh);
3843 tcg_gen_shrs_vec(vece, v0, v1, rsh);
3844 tcg_gen_or_vec(vece, v0, v0, t);
3845
3846 tcg_temp_free_i32(rsh);
3847 }
3848
3849 tcg_temp_free_vec(t);
3850 }
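
/*
 * The fallback above uses the usual safe-rotate idiom: the right-shift
 * count is computed as (-lsh) & (bits - 1), i.e. (bits - lsh) mod bits,
 * so a rotate by zero degenerates to (x << 0) | (x >> 0) == x instead of
 * requiring an out-of-range shift by `bits`.
 */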
3851
3852 static void expand_vec_mul(TCGType type, unsigned vece,
3853 TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3854 {
3855 TCGv_vec t1, t2, t3, t4, zero;
3856
3857 tcg_debug_assert(vece == MO_8);
3858
3859 /*
3860 * Unpack v1 bytes to words, 0 | x.
3861 * Unpack v2 bytes to words, y | 0.
3862 * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3863 * Shift logical right by 8 bits to clear the high 8 bits before
3864 * using an unsigned saturated pack.
3865 *
3866 * The difference between the V64, V128 and V256 cases is merely how
3867 * we distribute the expansion between temporaries.
3868 */
3869 switch (type) {
3870 case TCG_TYPE_V64:
3871 t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3872 t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3873 zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3874 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3875 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3876 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3877 tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3878 tcg_gen_mul_vec(MO_16, t1, t1, t2);
3879 tcg_gen_shri_vec(MO_16, t1, t1, 8);
3880 vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3881 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3882 tcg_temp_free_vec(t1);
3883 tcg_temp_free_vec(t2);
3884 break;
3885
3886 case TCG_TYPE_V128:
3887 case TCG_TYPE_V256:
3888 t1 = tcg_temp_new_vec(type);
3889 t2 = tcg_temp_new_vec(type);
3890 t3 = tcg_temp_new_vec(type);
3891 t4 = tcg_temp_new_vec(type);
3892 zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3893 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3894 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3895 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3896 tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3897 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3898 tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3899 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3900 tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3901 tcg_gen_mul_vec(MO_16, t1, t1, t2);
3902 tcg_gen_mul_vec(MO_16, t3, t3, t4);
3903 tcg_gen_shri_vec(MO_16, t1, t1, 8);
3904 tcg_gen_shri_vec(MO_16, t3, t3, 8);
3905 vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3906 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3907 tcg_temp_free_vec(t1);
3908 tcg_temp_free_vec(t2);
3909 tcg_temp_free_vec(t3);
3910 tcg_temp_free_vec(t4);
3911 break;
3912
3913 default:
3914 g_assert_not_reached();
3915 }
3916 }
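
/*
 * A stand-alone sketch of one 16-bit lane of the expansion above, for
 * illustration only (model_mul_mo8 is hypothetical).  With x placed in
 * the low byte and y in the high byte, the 16-bit product is (x * y) << 8
 * modulo 2^16, so a logical right shift by 8 recovers the low 8 bits of
 * x * y with the high byte clear.
 */
#if 0
#include <stdint.h>

static uint8_t model_mul_mo8(uint8_t x, uint8_t y)
{
    uint16_t a = x;                     /* punpck with zero: 0 | x */
    uint16_t b = (uint16_t)y << 8;      /* punpck with zero: y | 0 */
    uint16_t p = a * b;                 /* pmullw, truncated to 16 bits */

    return (uint8_t)(p >> 8);           /* psrlw 8, then packuswb */
}
#endif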
3917
3918 static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
3919 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3920 {
3921 enum {
3922 NEED_INV = 1,
3923 NEED_SWAP = 2,
3924 NEED_BIAS = 4,
3925 NEED_UMIN = 8,
3926 NEED_UMAX = 16,
3927 };
3928 TCGv_vec t1, t2, t3;
3929 uint8_t fixup;
3930
3931 switch (cond) {
3932 case TCG_COND_EQ:
3933 case TCG_COND_GT:
3934 fixup = 0;
3935 break;
3936 case TCG_COND_NE:
3937 case TCG_COND_LE:
3938 fixup = NEED_INV;
3939 break;
3940 case TCG_COND_LT:
3941 fixup = NEED_SWAP;
3942 break;
3943 case TCG_COND_GE:
3944 fixup = NEED_SWAP | NEED_INV;
3945 break;
3946 case TCG_COND_LEU:
3947 if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
3948 fixup = NEED_UMIN;
3949 } else {
3950 fixup = NEED_BIAS | NEED_INV;
3951 }
3952 break;
3953 case TCG_COND_GTU:
3954 if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
3955 fixup = NEED_UMIN | NEED_INV;
3956 } else {
3957 fixup = NEED_BIAS;
3958 }
3959 break;
3960 case TCG_COND_GEU:
3961 if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
3962 fixup = NEED_UMAX;
3963 } else {
3964 fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
3965 }
3966 break;
3967 case TCG_COND_LTU:
3968 if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
3969 fixup = NEED_UMAX | NEED_INV;
3970 } else {
3971 fixup = NEED_BIAS | NEED_SWAP;
3972 }
3973 break;
3974 default:
3975 g_assert_not_reached();
3976 }
3977
3978 if (fixup & NEED_INV) {
3979 cond = tcg_invert_cond(cond);
3980 }
3981 if (fixup & NEED_SWAP) {
3982 t1 = v1, v1 = v2, v2 = t1;
3983 cond = tcg_swap_cond(cond);
3984 }
3985
3986 t1 = t2 = NULL;
3987 if (fixup & (NEED_UMIN | NEED_UMAX)) {
3988 t1 = tcg_temp_new_vec(type);
3989 if (fixup & NEED_UMIN) {
3990 tcg_gen_umin_vec(vece, t1, v1, v2);
3991 } else {
3992 tcg_gen_umax_vec(vece, t1, v1, v2);
3993 }
3994 v2 = t1;
3995 cond = TCG_COND_EQ;
3996 } else if (fixup & NEED_BIAS) {
3997 t1 = tcg_temp_new_vec(type);
3998 t2 = tcg_temp_new_vec(type);
3999 t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
4000 tcg_gen_sub_vec(vece, t1, v1, t3);
4001 tcg_gen_sub_vec(vece, t2, v2, t3);
4002 v1 = t1;
4003 v2 = t2;
4004 cond = tcg_signed_cond(cond);
4005 }
4006
4007 tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
4008 /* Expand directly; do not recurse. */
4009 vec_gen_4(INDEX_op_cmp_vec, type, vece,
4010 tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
4011
4012 if (t1) {
4013 tcg_temp_free_vec(t1);
4014 if (t2) {
4015 tcg_temp_free_vec(t2);
4016 }
4017 }
4018 return fixup & NEED_INV;
4019 }
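
/*
 * A stand-alone sketch of the NEED_BIAS case for one MO_8 lane, for
 * illustration only (model_gtu_mo8 is hypothetical).  SSE/AVX provide
 * only signed greater-than (PCMPGT*) and equality element compares;
 * subtracting the bias 1 << 7 maps 0..255 monotonically onto -128..127,
 * which turns the unsigned comparison into the signed one.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>

static bool model_gtu_mo8(uint8_t a, uint8_t b)
{
    int8_t sa = (int8_t)(a - 0x80);     /* sub_vec with dupi(0x80) */
    int8_t sb = (int8_t)(b - 0x80);

    return sa > sb;                     /* pcmpgtb */
}
#endif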
4020
4021 static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
4022 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
4023 {
4024 if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
4025 tcg_gen_not_vec(vece, v0, v0);
4026 }
4027 }
4028
4029 static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
4030 TCGv_vec c1, TCGv_vec c2,
4031 TCGv_vec v3, TCGv_vec v4, TCGCond cond)
4032 {
4033 TCGv_vec t = tcg_temp_new_vec(type);
4034
4035 if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
4036 /* The compare result is inverted; compensate by swapping the data args. */
4037 TCGv_vec x;
4038 x = v3, v3 = v4, v4 = x;
4039 }
4040 vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
4041 tcgv_vec_arg(v0), tcgv_vec_arg(v4),
4042 tcgv_vec_arg(v3), tcgv_vec_arg(t));
4043 tcg_temp_free_vec(t);
4044 }
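
/*
 * Note: VPBLENDVB selects between its two data operands byte by byte,
 * based on the most significant bit of the corresponding mask byte.
 * Since the compare above produces all-zeros or all-ones per element,
 * this yields the element-wide select, with v3 chosen where the
 * (possibly swapped) condition holds and v4 otherwise.
 */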
4045
4046 void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
4047 TCGArg a0, ...)
4048 {
4049 va_list va;
4050 TCGArg a2;
4051 TCGv_vec v0, v1, v2, v3, v4;
4052
4053 va_start(va, a0);
4054 v0 = temp_tcgv_vec(arg_temp(a0));
4055 v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4056 a2 = va_arg(va, TCGArg);
4057
4058 switch (opc) {
4059 case INDEX_op_shli_vec:
4060 case INDEX_op_shri_vec:
4061 expand_vec_shi(type, vece, opc, v0, v1, a2);
4062 break;
4063
4064 case INDEX_op_sari_vec:
4065 expand_vec_sari(type, vece, v0, v1, a2);
4066 break;
4067
4068 case INDEX_op_rotli_vec:
4069 expand_vec_rotli(type, vece, v0, v1, a2);
4070 break;
4071
4072 case INDEX_op_rotls_vec:
4073 expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
4074 break;
4075
4076 case INDEX_op_rotlv_vec:
4077 v2 = temp_tcgv_vec(arg_temp(a2));
4078 expand_vec_rotv(type, vece, v0, v1, v2, false);
4079 break;
4080 case INDEX_op_rotrv_vec:
4081 v2 = temp_tcgv_vec(arg_temp(a2));
4082 expand_vec_rotv(type, vece, v0, v1, v2, true);
4083 break;
4084
4085 case INDEX_op_mul_vec:
4086 v2 = temp_tcgv_vec(arg_temp(a2));
4087 expand_vec_mul(type, vece, v0, v1, v2);
4088 break;
4089
4090 case INDEX_op_cmp_vec:
4091 v2 = temp_tcgv_vec(arg_temp(a2));
4092 expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
4093 break;
4094
4095 case INDEX_op_cmpsel_vec:
4096 v2 = temp_tcgv_vec(arg_temp(a2));
4097 v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4098 v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4099 expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
4100 break;
4101
4102 default:
4103 break;
4104 }
4105
4106 va_end(va);
4107 }
4108
4109 static const int tcg_target_callee_save_regs[] = {
4110 #if TCG_TARGET_REG_BITS == 64
4111 TCG_REG_RBP,
4112 TCG_REG_RBX,
4113 #if defined(_WIN64)
4114 TCG_REG_RDI,
4115 TCG_REG_RSI,
4116 #endif
4117 TCG_REG_R12,
4118 TCG_REG_R13,
4119 TCG_REG_R14, /* Currently used for the global env. */
4120 TCG_REG_R15,
4121 #else
4122 TCG_REG_EBP, /* Currently used for the global env. */
4123 TCG_REG_EBX,
4124 TCG_REG_ESI,
4125 TCG_REG_EDI,
4126 #endif
4127 };
4128
4129 /* Compute frame size via macros, to share between tcg_target_qemu_prologue
4130 and tcg_register_jit. */
4131
4132 #define PUSH_SIZE \
4133 ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
4134 * (TCG_TARGET_REG_BITS / 8))
4135
4136 #define FRAME_SIZE \
4137 ((PUSH_SIZE \
4138 + TCG_STATIC_CALL_ARGS_SIZE \
4139 + CPU_TEMP_BUF_NLONGS * sizeof(long) \
4140 + TCG_TARGET_STACK_ALIGN - 1) \
4141 & ~(TCG_TARGET_STACK_ALIGN - 1))
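
/*
 * The last two lines of FRAME_SIZE are the usual round-up-to-alignment
 * idiom, (x + align - 1) & ~(align - 1).  For example, a hypothetical
 * unaligned total of 0x17c with a 16-byte TCG_TARGET_STACK_ALIGN rounds
 * up to (0x17c + 15) & ~15 = 0x180.
 */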
4142
4143 /* Generate global QEMU prologue and epilogue code */
4144 static void tcg_target_qemu_prologue(TCGContext *s)
4145 {
4146 int i, stack_addend;
4147
4148 /* TB prologue */
4149
4150 /* Reserve some stack space, also for TCG temps. */
4151 stack_addend = FRAME_SIZE - PUSH_SIZE;
4152 tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
4153 CPU_TEMP_BUF_NLONGS * sizeof(long));
4154
4155 /* Save all callee saved registers. */
4156 for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
4157 tcg_out_push(s, tcg_target_callee_save_regs[i]);
4158 }
4159
4160 if (!tcg_use_softmmu && guest_base) {
4161 int seg = setup_guest_base_seg();
4162 if (seg != 0) {
4163 x86_guest_base.seg = seg;
4164 } else if (guest_base == (int32_t)guest_base) {
4165 x86_guest_base.ofs = guest_base;
4166 } else {
4167 assert(TCG_TARGET_REG_BITS == 64);
4168 /* Choose R12 because, as a base, it requires a SIB byte. */
4169 x86_guest_base.index = TCG_REG_R12;
4170 tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base);
4171 tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index);
4172 }
4173 }
4174
4175 if (TCG_TARGET_REG_BITS == 32) {
4176 tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
4177 (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
4178 tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4179 /* jmp *tb. */
4180 tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
4181 (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
4182 + stack_addend);
4183 } else {
4184 tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
4185 tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4186 /* jmp *tb. */
4187 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
4188 }
4189
4190 /*
4191 * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
4192 * and fall through to the rest of the epilogue.
4193 */
4194 tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
4195 tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
4196
4197 /* TB epilogue */
4198 tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
4199
4200 tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
4201
4202 if (have_avx2) {
4203 tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
4204 }
4205 for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
4206 tcg_out_pop(s, tcg_target_callee_save_regs[i]);
4207 }
4208 tcg_out_opc(s, OPC_RET, 0, 0, 0);
4209 }
4210
4211 static void tcg_out_tb_start(TCGContext *s)
4212 {
4213 /* nothing to do */
4214 }
4215
4216 static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
4217 {
4218 memset(p, 0x90, count);    /* 0x90 is the one-byte NOP */
4219 }
4220
4221 static void tcg_target_init(TCGContext *s)
4222 {
4223 tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
4224 if (TCG_TARGET_REG_BITS == 64) {
4225 tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
4226 }
4227 if (have_avx1) {
4228 tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
4229 tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
4230 }
4231 if (have_avx2) {
4232 tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
4233 }
4234
4235 tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
4236 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
4237 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
4238 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
4239 if (TCG_TARGET_REG_BITS == 64) {
4240 #if !defined(_WIN64)
4241 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
4242 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
4243 #endif
4244 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
4245 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
4246 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
4247 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
4248 }
4249
4250 s->reserved_regs = 0;
4251 tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
4252 tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC);
4253 #ifdef _WIN64
4254 /* These are callee-saved, and we don't save them, so don't use them. */
4255 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
4256 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
4257 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
4258 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
4259 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
4260 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
4261 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
4262 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
4263 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
4264 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
4265 #endif
4266 }
4267
4268 typedef struct {
4269 DebugFrameHeader h;
4270 uint8_t fde_def_cfa[4];
4271 uint8_t fde_reg_ofs[14];
4272 } DebugFrame;
4273
4274 /* We're expecting a 2-byte uleb128 encoded value. */
4275 QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
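
/*
 * fde_def_cfa below emits FRAME_SIZE in exactly that form: the low seven
 * bits with the continuation bit set, then the remaining bits.  For
 * example, a hypothetical FRAME_SIZE of 0x188 encodes as 0x88, 0x03.
 */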
4276
4277 #if !defined(__ELF__)
4278 /* Host machine without ELF. */
4279 #elif TCG_TARGET_REG_BITS == 64
4280 #define ELF_HOST_MACHINE EM_X86_64
4281 static const DebugFrame debug_frame = {
4282 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4283 .h.cie.id = -1,
4284 .h.cie.version = 1,
4285 .h.cie.code_align = 1,
4286 .h.cie.data_align = 0x78, /* sleb128 -8 */
4287 .h.cie.return_column = 16,
4288
4289 /* Total FDE size does not include the "len" member. */
4290 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4291
4292 .fde_def_cfa = {
4293 12, 7, /* DW_CFA_def_cfa %rsp, ... */
4294 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */
4295 (FRAME_SIZE >> 7)
4296 },
4297 .fde_reg_ofs = {
4298 0x90, 1, /* DW_CFA_offset, %rip, -8 */
4299 /* The following ordering must match tcg_target_callee_save_regs. */
4300 0x86, 2, /* DW_CFA_offset, %rbp, -16 */
4301 0x83, 3, /* DW_CFA_offset, %rbx, -24 */
4302 0x8c, 4, /* DW_CFA_offset, %r12, -32 */
4303 0x8d, 5, /* DW_CFA_offset, %r13, -40 */
4304 0x8e, 6, /* DW_CFA_offset, %r14, -48 */
4305 0x8f, 7, /* DW_CFA_offset, %r15, -56 */
4306 }
4307 };
4308 #else
4309 #define ELF_HOST_MACHINE EM_386
4310 static const DebugFrame debug_frame = {
4311 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4312 .h.cie.id = -1,
4313 .h.cie.version = 1,
4314 .h.cie.code_align = 1,
4315 .h.cie.data_align = 0x7c, /* sleb128 -4 */
4316 .h.cie.return_column = 8,
4317
4318 /* Total FDE size does not include the "len" member. */
4319 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4320
4321 .fde_def_cfa = {
4322 12, 4, /* DW_CFA_def_cfa %esp, ... */
4323 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */
4324 (FRAME_SIZE >> 7)
4325 },
4326 .fde_reg_ofs = {
4327 0x88, 1, /* DW_CFA_offset, %eip, -4 */
4328 /* The following ordering must match tcg_target_callee_save_regs. */
4329 0x85, 2, /* DW_CFA_offset, %ebp, -8 */
4330 0x83, 3, /* DW_CFA_offset, %ebx, -12 */
4331 0x86, 4, /* DW_CFA_offset, %esi, -16 */
4332 0x87, 5, /* DW_CFA_offset, %edi, -20 */
4333 }
4334 };
4335 #endif
4336
4337 #if defined(ELF_HOST_MACHINE)
4338 void tcg_register_jit(const void *buf, size_t buf_size)
4339 {
4340 tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
4341 }
4342 #endif