/*
 * Tiny Code Generator for QEMU
 *
 * Copyright (c) 2008 Fabrice Bellard
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */

#include "../tcg-pool.inc.c"

#ifdef CONFIG_DEBUG_TCG
static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
#if TCG_TARGET_REG_BITS == 64
    "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
#else
    "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
#endif
    "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
    "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
#if TCG_TARGET_REG_BITS == 64
    "%xmm8", "%xmm9", "%xmm10", "%xmm11",
    "%xmm12", "%xmm13", "%xmm14", "%xmm15",
#endif
};
#endif

static const int tcg_target_reg_alloc_order[] = {
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_RBP,
    TCG_REG_RBX,
    TCG_REG_R12,
    TCG_REG_R13,
    TCG_REG_R14,
    TCG_REG_R15,
    TCG_REG_R10,
    TCG_REG_R11,
    TCG_REG_R9,
    TCG_REG_R8,
    TCG_REG_RCX,
    TCG_REG_RDX,
    TCG_REG_RSI,
    TCG_REG_RDI,
    TCG_REG_RAX,
#else
    TCG_REG_EBX,
    TCG_REG_ESI,
    TCG_REG_EDI,
    TCG_REG_EBP,
    TCG_REG_ECX,
    TCG_REG_EDX,
    TCG_REG_EAX,
#endif
    TCG_REG_XMM0,
    TCG_REG_XMM1,
    TCG_REG_XMM2,
    TCG_REG_XMM3,
    TCG_REG_XMM4,
    TCG_REG_XMM5,
#ifndef _WIN64
    /* The Win64 ABI has xmm6-xmm15 as callee-saves, and we do not save
       any of them.  Therefore only allow xmm0-xmm5 to be allocated.  */
    TCG_REG_XMM6,
    TCG_REG_XMM7,
#if TCG_TARGET_REG_BITS == 64
    TCG_REG_XMM8,
    TCG_REG_XMM9,
    TCG_REG_XMM10,
    TCG_REG_XMM11,
    TCG_REG_XMM12,
    TCG_REG_XMM13,
    TCG_REG_XMM14,
    TCG_REG_XMM15,
#endif
#endif
};

static const int tcg_target_call_iarg_regs[] = {
#if TCG_TARGET_REG_BITS == 64
#if defined(_WIN64)
    TCG_REG_RCX,
    TCG_REG_RDX,
#else
    TCG_REG_RDI,
    TCG_REG_RSI,
    TCG_REG_RDX,
    TCG_REG_RCX,
#endif
    TCG_REG_R8,
    TCG_REG_R9,
#else
    /* 32-bit mode uses a stack-based calling convention (GCC default). */
#endif
};

static const int tcg_target_call_oarg_regs[] = {
    TCG_REG_EAX,
#if TCG_TARGET_REG_BITS == 32
    TCG_REG_EDX
#endif
};

/* Constants we accept.  */
#define TCG_CT_CONST_S32 0x100
#define TCG_CT_CONST_U32 0x200
#define TCG_CT_CONST_I32 0x400
#define TCG_CT_CONST_WSZ 0x800

/* Registers used with L constraint, which are the first argument
   registers on x86_64, and two arbitrary call-clobbered registers on
   i386.  */
#if TCG_TARGET_REG_BITS == 64
# define TCG_REG_L0 tcg_target_call_iarg_regs[0]
# define TCG_REG_L1 tcg_target_call_iarg_regs[1]
#else
# define TCG_REG_L0 TCG_REG_EAX
# define TCG_REG_L1 TCG_REG_EDX
#endif

/* The host compiler should supply <cpuid.h> to enable runtime feature
   detection, as we're not going to go so far as our own inline assembly.
   If not available, default values will be assumed.  */
#if defined(CONFIG_CPUID_H)
#include "qemu/cpuid.h"
#endif

/* For 64-bit, we always know that CMOV is available.  */
#if TCG_TARGET_REG_BITS == 64
# define have_cmov 1
#elif defined(CONFIG_CPUID_H)
static bool have_cmov;
#else
# define have_cmov 0
#endif

/* We need these symbols in tcg-target.h, and we can't properly conditionalize
   it there.  Therefore we always define the variable.  */
bool have_bmi1;
bool have_popcnt;
bool have_avx1;
bool have_avx2;

#ifdef CONFIG_CPUID_H
static bool have_movbe;
static bool have_bmi2;
static bool have_lzcnt;
#else
# define have_movbe 0
# define have_bmi2 0
# define have_lzcnt 0
#endif

static tcg_insn_unit *tb_ret_addr;

static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
                        intptr_t value, intptr_t addend)
{
    value += addend;
    switch(type) {
    case R_386_PC32:
        value -= (uintptr_t)code_ptr;
        if (value != (int32_t)value) {
            return false;
        }
        /* FALLTHRU */
    case R_386_32:
        tcg_patch32(code_ptr, value);
        break;
    case R_386_PC8:
        value -= (uintptr_t)code_ptr;
        if (value != (int8_t)value) {
            return false;
        }
        tcg_patch8(code_ptr, value);
        break;
    default:
        tcg_abort();
    }
    return true;
}

#if TCG_TARGET_REG_BITS == 64
#define ALL_GENERAL_REGS   0x0000ffffu
#define ALL_VECTOR_REGS    0xffff0000u
#else
#define ALL_GENERAL_REGS   0x000000ffu
#define ALL_VECTOR_REGS    0x00ff0000u
#endif

/* parse target specific constraints */
static const char *target_parse_constraint(TCGArgConstraint *ct,
                                           const char *ct_str, TCGType type)
{
    switch(*ct_str++) {
    case 'a':
        ct->ct |= TCG_CT_REG;
        tcg_regset_set_reg(ct->u.regs, TCG_REG_EAX);
        break;
    case 'b':
        ct->ct |= TCG_CT_REG;
        tcg_regset_set_reg(ct->u.regs, TCG_REG_EBX);
        break;
    case 'c':
        ct->ct |= TCG_CT_REG;
        tcg_regset_set_reg(ct->u.regs, TCG_REG_ECX);
        break;
    case 'd':
        ct->ct |= TCG_CT_REG;
        tcg_regset_set_reg(ct->u.regs, TCG_REG_EDX);
        break;
    case 'S':
        ct->ct |= TCG_CT_REG;
        tcg_regset_set_reg(ct->u.regs, TCG_REG_ESI);
        break;
    case 'D':
        ct->ct |= TCG_CT_REG;
        tcg_regset_set_reg(ct->u.regs, TCG_REG_EDI);
        break;
    case 'q':
        /* A register that can be used as a byte operand.  */
        ct->ct |= TCG_CT_REG;
        ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xf;
        break;
    case 'Q':
        /* A register with an addressable second byte (e.g. %ah).  */
        ct->ct |= TCG_CT_REG;
        ct->u.regs = 0xf;
        break;
    case 'r':
        /* A general register.  */
        ct->ct |= TCG_CT_REG;
        ct->u.regs |= ALL_GENERAL_REGS;
        break;
    case 'W':
        /* With TZCNT/LZCNT, we can have operand-size as an input.  */
        ct->ct |= TCG_CT_CONST_WSZ;
        break;
    case 'x':
        /* A vector register.  */
        ct->ct |= TCG_CT_REG;
        ct->u.regs |= ALL_VECTOR_REGS;
        break;

        /* qemu_ld/st address constraint */
    case 'L':
        ct->ct |= TCG_CT_REG;
        ct->u.regs = TCG_TARGET_REG_BITS == 64 ? 0xffff : 0xff;
        tcg_regset_reset_reg(ct->u.regs, TCG_REG_L0);
        tcg_regset_reset_reg(ct->u.regs, TCG_REG_L1);
        break;

    case 'e':
        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_S32);
        break;
    case 'Z':
        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_U32);
        break;
    case 'I':
        ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_I32);
        break;

    default:
        return NULL;
    }
    return ct_str;
}

/* test if a constant matches the constraint */
static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
                                         const TCGArgConstraint *arg_ct)
{
    int ct = arg_ct->ct;
    if (ct & TCG_CT_CONST) {
        return 1;
    }
    if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
        return 1;
    }
    if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
        return 1;
    }
    if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
        return 1;
    }
    if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
        return 1;
    }
    return 0;
}

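/* As an illustration of the TCG_CT_CONST_I32 ('I') test above: a value
   such as 0xfffffffffffffeff matches, since its complement 0x100 fits in
   an int32_t; these are 64-bit constants with all high bits set.  */
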
# define LOWREGMASK(x)  ((x) & 7)

#define P_EXT           0x100       /* 0x0f opcode prefix */
#define P_EXT38         0x200       /* 0x0f 0x38 opcode prefix */
#define P_DATA16        0x400       /* 0x66 opcode prefix */
#if TCG_TARGET_REG_BITS == 64
# define P_REXW         0x1000      /* Set REX.W = 1 */
# define P_REXB_R       0x2000      /* REG field as byte register */
# define P_REXB_RM      0x4000      /* R/M field as byte register */
# define P_GS           0x8000      /* gs segment override */
#else
# define P_REXW         0
# define P_REXB_R       0
# define P_REXB_RM      0
# define P_GS           0
#endif
#define P_EXT3A         0x10000     /* 0x0f 0x3a opcode prefix */
#define P_SIMDF3        0x20000     /* 0xf3 opcode prefix */
#define P_SIMDF2        0x40000     /* 0xf2 opcode prefix */
#define P_VEXL          0x80000     /* Set VEX.L = 1 */

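/* Illustration: these flags combine with the base opcode byte below, e.g.
   OPC_PXOR is (0xef | P_EXT | P_DATA16) and so is emitted as the byte
   sequence 66 0f ef -- the legacy SSE2 PXOR encoding.  */
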
#define OPC_ARITH_EvIz      (0x81)
#define OPC_ARITH_EvIb      (0x83)
#define OPC_ARITH_GvEv      (0x03)          /* ... plus (ARITH_FOO << 3) */
#define OPC_ANDN            (0xf2 | P_EXT38)
#define OPC_ADD_GvEv        (OPC_ARITH_GvEv | (ARITH_ADD << 3))
#define OPC_AND_GvEv        (OPC_ARITH_GvEv | (ARITH_AND << 3))
#define OPC_BLENDPS         (0x0c | P_EXT3A | P_DATA16)
#define OPC_BSF             (0xbc | P_EXT)
#define OPC_BSR             (0xbd | P_EXT)
#define OPC_BSWAP           (0xc8 | P_EXT)
#define OPC_CALL_Jz         (0xe8)
#define OPC_CMOVCC          (0x40 | P_EXT)  /* ... plus condition code */
#define OPC_CMP_GvEv        (OPC_ARITH_GvEv | (ARITH_CMP << 3))
#define OPC_DEC_r32         (0x48)
#define OPC_IMUL_GvEv       (0xaf | P_EXT)
#define OPC_IMUL_GvEvIb     (0x6b)
#define OPC_IMUL_GvEvIz     (0x69)
#define OPC_INC_r32         (0x40)
#define OPC_JCC_long        (0x80 | P_EXT)  /* ... plus condition code */
#define OPC_JCC_short       (0x70)          /* ... plus condition code */
#define OPC_JMP_long        (0xe9)
#define OPC_JMP_short       (0xeb)
#define OPC_LEA             (0x8d)
#define OPC_LZCNT           (0xbd | P_EXT | P_SIMDF3)
#define OPC_MOVB_EvGv       (0x88)          /* stores, more or less */
#define OPC_MOVL_EvGv       (0x89)          /* stores, more or less */
#define OPC_MOVL_GvEv       (0x8b)          /* loads, more or less */
#define OPC_MOVB_EvIz       (0xc6)
#define OPC_MOVL_EvIz       (0xc7)
#define OPC_MOVL_Iv         (0xb8)
#define OPC_MOVBE_GyMy      (0xf0 | P_EXT38)
#define OPC_MOVBE_MyGy      (0xf1 | P_EXT38)
#define OPC_MOVD_VyEy       (0x6e | P_EXT | P_DATA16)
#define OPC_MOVD_EyVy       (0x7e | P_EXT | P_DATA16)
#define OPC_MOVDDUP         (0x12 | P_EXT | P_SIMDF2)
#define OPC_MOVDQA_VxWx     (0x6f | P_EXT | P_DATA16)
#define OPC_MOVDQA_WxVx     (0x7f | P_EXT | P_DATA16)
#define OPC_MOVDQU_VxWx     (0x6f | P_EXT | P_SIMDF3)
#define OPC_MOVDQU_WxVx     (0x7f | P_EXT | P_SIMDF3)
#define OPC_MOVQ_VqWq       (0x7e | P_EXT | P_SIMDF3)
#define OPC_MOVQ_WqVq       (0xd6 | P_EXT | P_DATA16)
#define OPC_MOVSBL          (0xbe | P_EXT)
#define OPC_MOVSWL          (0xbf | P_EXT)
#define OPC_MOVSLQ          (0x63 | P_REXW)
#define OPC_MOVZBL          (0xb6 | P_EXT)
#define OPC_MOVZWL          (0xb7 | P_EXT)
#define OPC_PABSB           (0x1c | P_EXT38 | P_DATA16)
#define OPC_PABSW           (0x1d | P_EXT38 | P_DATA16)
#define OPC_PABSD           (0x1e | P_EXT38 | P_DATA16)
#define OPC_PACKSSDW        (0x6b | P_EXT | P_DATA16)
#define OPC_PACKSSWB        (0x63 | P_EXT | P_DATA16)
#define OPC_PACKUSDW        (0x2b | P_EXT38 | P_DATA16)
#define OPC_PACKUSWB        (0x67 | P_EXT | P_DATA16)
#define OPC_PADDB           (0xfc | P_EXT | P_DATA16)
#define OPC_PADDW           (0xfd | P_EXT | P_DATA16)
#define OPC_PADDD           (0xfe | P_EXT | P_DATA16)
#define OPC_PADDQ           (0xd4 | P_EXT | P_DATA16)
#define OPC_PADDSB          (0xec | P_EXT | P_DATA16)
#define OPC_PADDSW          (0xed | P_EXT | P_DATA16)
#define OPC_PADDUB          (0xdc | P_EXT | P_DATA16)
#define OPC_PADDUW          (0xdd | P_EXT | P_DATA16)
#define OPC_PAND            (0xdb | P_EXT | P_DATA16)
#define OPC_PANDN           (0xdf | P_EXT | P_DATA16)
#define OPC_PBLENDW         (0x0e | P_EXT3A | P_DATA16)
#define OPC_PCMPEQB         (0x74 | P_EXT | P_DATA16)
#define OPC_PCMPEQW         (0x75 | P_EXT | P_DATA16)
#define OPC_PCMPEQD         (0x76 | P_EXT | P_DATA16)
#define OPC_PCMPEQQ         (0x29 | P_EXT38 | P_DATA16)
#define OPC_PCMPGTB         (0x64 | P_EXT | P_DATA16)
#define OPC_PCMPGTW         (0x65 | P_EXT | P_DATA16)
#define OPC_PCMPGTD         (0x66 | P_EXT | P_DATA16)
#define OPC_PCMPGTQ         (0x37 | P_EXT38 | P_DATA16)
#define OPC_PMAXSB          (0x3c | P_EXT38 | P_DATA16)
#define OPC_PMAXSW          (0xee | P_EXT | P_DATA16)
#define OPC_PMAXSD          (0x3d | P_EXT38 | P_DATA16)
#define OPC_PMAXUB          (0xde | P_EXT | P_DATA16)
#define OPC_PMAXUW          (0x3e | P_EXT38 | P_DATA16)
#define OPC_PMAXUD          (0x3f | P_EXT38 | P_DATA16)
#define OPC_PMINSB          (0x38 | P_EXT38 | P_DATA16)
#define OPC_PMINSW          (0xea | P_EXT | P_DATA16)
#define OPC_PMINSD          (0x39 | P_EXT38 | P_DATA16)
#define OPC_PMINUB          (0xda | P_EXT | P_DATA16)
#define OPC_PMINUW          (0x3a | P_EXT38 | P_DATA16)
#define OPC_PMINUD          (0x3b | P_EXT38 | P_DATA16)
#define OPC_PMOVSXBW        (0x20 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXWD        (0x23 | P_EXT38 | P_DATA16)
#define OPC_PMOVSXDQ        (0x25 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXBW        (0x30 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXWD        (0x33 | P_EXT38 | P_DATA16)
#define OPC_PMOVZXDQ        (0x35 | P_EXT38 | P_DATA16)
#define OPC_PMULLW          (0xd5 | P_EXT | P_DATA16)
#define OPC_PMULLD          (0x40 | P_EXT38 | P_DATA16)
#define OPC_POR             (0xeb | P_EXT | P_DATA16)
#define OPC_PSHUFB          (0x00 | P_EXT38 | P_DATA16)
#define OPC_PSHUFD          (0x70 | P_EXT | P_DATA16)
#define OPC_PSHUFLW         (0x70 | P_EXT | P_SIMDF2)
#define OPC_PSHUFHW         (0x70 | P_EXT | P_SIMDF3)
#define OPC_PSHIFTW_Ib      (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTD_Ib      (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSHIFTQ_Ib      (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
#define OPC_PSLLW           (0xf1 | P_EXT | P_DATA16)
#define OPC_PSLLD           (0xf2 | P_EXT | P_DATA16)
#define OPC_PSLLQ           (0xf3 | P_EXT | P_DATA16)
#define OPC_PSRAW           (0xe1 | P_EXT | P_DATA16)
#define OPC_PSRAD           (0xe2 | P_EXT | P_DATA16)
#define OPC_PSRLW           (0xd1 | P_EXT | P_DATA16)
#define OPC_PSRLD           (0xd2 | P_EXT | P_DATA16)
#define OPC_PSRLQ           (0xd3 | P_EXT | P_DATA16)
#define OPC_PSUBB           (0xf8 | P_EXT | P_DATA16)
#define OPC_PSUBW           (0xf9 | P_EXT | P_DATA16)
#define OPC_PSUBD           (0xfa | P_EXT | P_DATA16)
#define OPC_PSUBQ           (0xfb | P_EXT | P_DATA16)
#define OPC_PSUBSB          (0xe8 | P_EXT | P_DATA16)
#define OPC_PSUBSW          (0xe9 | P_EXT | P_DATA16)
#define OPC_PSUBUB          (0xd8 | P_EXT | P_DATA16)
#define OPC_PSUBUW          (0xd9 | P_EXT | P_DATA16)
#define OPC_PUNPCKLBW       (0x60 | P_EXT | P_DATA16)
#define OPC_PUNPCKLWD       (0x61 | P_EXT | P_DATA16)
#define OPC_PUNPCKLDQ       (0x62 | P_EXT | P_DATA16)
#define OPC_PUNPCKLQDQ      (0x6c | P_EXT | P_DATA16)
#define OPC_PUNPCKHBW       (0x68 | P_EXT | P_DATA16)
#define OPC_PUNPCKHWD       (0x69 | P_EXT | P_DATA16)
#define OPC_PUNPCKHDQ       (0x6a | P_EXT | P_DATA16)
#define OPC_PUNPCKHQDQ      (0x6d | P_EXT | P_DATA16)
#define OPC_PXOR            (0xef | P_EXT | P_DATA16)
#define OPC_POP_r32         (0x58)
#define OPC_POPCNT          (0xb8 | P_EXT | P_SIMDF3)
#define OPC_PUSH_r32        (0x50)
#define OPC_PUSH_Iv         (0x68)
#define OPC_PUSH_Ib         (0x6a)
#define OPC_RET             (0xc3)
#define OPC_SETCC           (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
#define OPC_SHIFT_1         (0xd1)
#define OPC_SHIFT_Ib        (0xc1)
#define OPC_SHIFT_cl        (0xd3)
#define OPC_SARX            (0xf7 | P_EXT38 | P_SIMDF3)
#define OPC_SHUFPS          (0xc6 | P_EXT)
#define OPC_SHLX            (0xf7 | P_EXT38 | P_DATA16)
#define OPC_SHRX            (0xf7 | P_EXT38 | P_SIMDF2)
#define OPC_SHRD_Ib         (0xac | P_EXT)
#define OPC_TESTL           (0x85)
#define OPC_TZCNT           (0xbc | P_EXT | P_SIMDF3)
#define OPC_UD2             (0x0b | P_EXT)
#define OPC_VPBLENDD        (0x02 | P_EXT3A | P_DATA16)
#define OPC_VPBLENDVB       (0x4c | P_EXT3A | P_DATA16)
#define OPC_VPINSRB         (0x20 | P_EXT3A | P_DATA16)
#define OPC_VPINSRW         (0xc4 | P_EXT | P_DATA16)
#define OPC_VBROADCASTSS    (0x18 | P_EXT38 | P_DATA16)
#define OPC_VBROADCASTSD    (0x19 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTB    (0x78 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTW    (0x79 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTD    (0x58 | P_EXT38 | P_DATA16)
#define OPC_VPBROADCASTQ    (0x59 | P_EXT38 | P_DATA16)
#define OPC_VPERMQ          (0x00 | P_EXT3A | P_DATA16 | P_REXW)
#define OPC_VPERM2I128      (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
#define OPC_VPSLLVD         (0x47 | P_EXT38 | P_DATA16)
#define OPC_VPSLLVQ         (0x47 | P_EXT38 | P_DATA16 | P_REXW)
#define OPC_VPSRAVD         (0x46 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVD         (0x45 | P_EXT38 | P_DATA16)
#define OPC_VPSRLVQ         (0x45 | P_EXT38 | P_DATA16 | P_REXW)
#define OPC_VZEROUPPER      (0x77 | P_EXT)
#define OPC_XCHG_ax_r32     (0x90)

#define OPC_GRP3_Ev         (0xf7)
#define OPC_GRP5            (0xff)
#define OPC_GRP14           (0x73 | P_EXT | P_DATA16)

/* Group 1 opcode extensions for 0x80-0x83.
   These are also used as modifiers for OPC_ARITH.  */
#define ARITH_ADD 0
#define ARITH_OR  1
#define ARITH_ADC 2
#define ARITH_SBB 3
#define ARITH_AND 4
#define ARITH_SUB 5
#define ARITH_XOR 6
#define ARITH_CMP 7

/* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3.  */
#define SHIFT_ROL 0
#define SHIFT_ROR 1
#define SHIFT_SHL 4
#define SHIFT_SHR 5
#define SHIFT_SAR 7

/* Group 3 opcode extensions for 0xf6, 0xf7.  To be used with OPC_GRP3.  */
#define EXT3_NOT   2
#define EXT3_NEG   3
#define EXT3_MUL   4
#define EXT3_IMUL  5
#define EXT3_DIV   6
#define EXT3_IDIV  7

/* Group 5 opcode extensions for 0xff.  To be used with OPC_GRP5.  */
#define EXT5_INC_Ev    0
#define EXT5_DEC_Ev    1
#define EXT5_CALLN_Ev  2
#define EXT5_JMPN_Ev   4

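/* Illustration: a group extension is placed in the ModRM REG field, so
   tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_NEG, reg) emits f7 /3 (NEG r/m32),
   and OPC_GRP5 with EXT5_CALLN_Ev emits ff /2 (indirect CALL).  */
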
/* Condition codes to be added to OPC_JCC_{long,short}.  */
#define JCC_JMP (-1)
#define JCC_JO  0x0
#define JCC_JNO 0x1
#define JCC_JB  0x2
#define JCC_JAE 0x3
#define JCC_JE  0x4
#define JCC_JNE 0x5
#define JCC_JBE 0x6
#define JCC_JA  0x7
#define JCC_JS  0x8
#define JCC_JNS 0x9
#define JCC_JP  0xa
#define JCC_JNP 0xb
#define JCC_JL  0xc
#define JCC_JGE 0xd
#define JCC_JLE 0xe
#define JCC_JG  0xf

static const uint8_t tcg_cond_to_jcc[] = {
    [TCG_COND_EQ] = JCC_JE,
    [TCG_COND_NE] = JCC_JNE,
    [TCG_COND_LT] = JCC_JL,
    [TCG_COND_GE] = JCC_JGE,
    [TCG_COND_LE] = JCC_JLE,
    [TCG_COND_GT] = JCC_JG,
    [TCG_COND_LTU] = JCC_JB,
    [TCG_COND_GEU] = JCC_JAE,
    [TCG_COND_LEU] = JCC_JBE,
    [TCG_COND_GTU] = JCC_JA,
};

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
{
    int rex;

    if (opc & P_GS) {
        tcg_out8(s, 0x65);
    }
    if (opc & P_DATA16) {
        /* We should never be asking for both 16 and 64-bit operation.  */
        tcg_debug_assert((opc & P_REXW) == 0);
        tcg_out8(s, 0x66);
    }
    if (opc & P_SIMDF3) {
        tcg_out8(s, 0xf3);
    } else if (opc & P_SIMDF2) {
        tcg_out8(s, 0xf2);
    }

    rex = 0;
    rex |= (opc & P_REXW) ? 0x8 : 0x0;  /* REX.W */
    rex |= (r & 8) >> 1;                /* REX.R */
    rex |= (x & 8) >> 2;                /* REX.X */
    rex |= (rm & 8) >> 3;               /* REX.B */

    /* P_REXB_{R,RM} indicates that the given register is the low byte.
       For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
       as otherwise the encoding indicates %[abcd]h.  Note that the values
       that are ORed in merely indicate that the REX byte must be present;
       those bits get discarded in output.  */
    rex |= opc & (r >= 4 ? P_REXB_R : 0);
    rex |= opc & (rm >= 4 ? P_REXB_RM : 0);

    if (rex) {
        tcg_out8(s, (uint8_t)(rex | 0x40));
    }

    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
        tcg_out8(s, 0x0f);
        if (opc & P_EXT38) {
            tcg_out8(s, 0x38);
        } else if (opc & P_EXT3A) {
            tcg_out8(s, 0x3a);
        }
    }

    tcg_out8(s, opc);
}
#else
static void tcg_out_opc(TCGContext *s, int opc)
{
    if (opc & P_DATA16) {
        tcg_out8(s, 0x66);
    }
    if (opc & P_SIMDF3) {
        tcg_out8(s, 0xf3);
    } else if (opc & P_SIMDF2) {
        tcg_out8(s, 0xf2);
    }
    if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
        tcg_out8(s, 0x0f);
        if (opc & P_EXT38) {
            tcg_out8(s, 0x38);
        } else if (opc & P_EXT3A) {
            tcg_out8(s, 0x3a);
        }
    }
    tcg_out8(s, opc);
}
/* Discard the register arguments to tcg_out_opc early, so as not to penalize
   the 32-bit compilation paths.  This method works with all versions of gcc,
   whereas relying on optimization may not be able to exclude them.  */
#define tcg_out_opc(s, opc, r, rm, x)  (tcg_out_opc)(s, opc)
#endif

static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
{
    tcg_out_opc(s, opc, r, rm, 0);
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}

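/* Worked example: tcg_out_modrm(s, OPC_ADD_GvEv + P_REXW, TCG_REG_RAX,
   TCG_REG_RBX) emits 48 03 c3: the REX.W prefix, opcode 0x03, then
   ModRM 0xc0 | (0 << 3) | 3 -- "add %rbx, %rax".  The 0xc0 mod field
   selects register-direct addressing.  */
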
static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
                            int rm, int index)
{
    int tmp;

    /* Use the two byte form if possible, which cannot encode
       VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT.  */
    if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_REXW)) == P_EXT
        && ((rm | index) & 8) == 0) {
        /* Two byte VEX prefix.  */
        tcg_out8(s, 0xc5);

        tmp = (r & 8 ? 0 : 0x80);              /* VEX.R */
    } else {
        /* Three byte VEX prefix.  */
        tcg_out8(s, 0xc4);

        /* VEX.m-mmmm */
        if (opc & P_EXT3A) {
            tmp = 3;
        } else if (opc & P_EXT38) {
            tmp = 2;
        } else if (opc & P_EXT) {
            tmp = 1;
        } else {
            g_assert_not_reached();
        }
        tmp |= (r & 8 ? 0 : 0x80);             /* VEX.R */
        tmp |= (index & 8 ? 0 : 0x40);         /* VEX.X */
        tmp |= (rm & 8 ? 0 : 0x20);            /* VEX.B */
        tcg_out8(s, tmp);

        tmp = (opc & P_REXW ? 0x80 : 0);       /* VEX.W */
    }

    tmp |= (opc & P_VEXL ? 0x04 : 0);          /* VEX.L */
    /* VEX.pp */
    if (opc & P_DATA16) {
        tmp |= 1;                              /* 0x66 */
    } else if (opc & P_SIMDF3) {
        tmp |= 2;                              /* 0xf3 */
    } else if (opc & P_SIMDF2) {
        tmp |= 3;                              /* 0xf2 */
    }
    tmp |= (~v & 15) << 3;                     /* VEX.vvvv */
    tcg_out8(s, tmp);
    tcg_out8(s, opc);
}

static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
{
    tcg_out_vex_opc(s, opc, r, v, rm, 0);
    tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
}

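/* Worked example: tcg_out_vex_modrm(s, OPC_PXOR, 0, 0, 0), i.e. all
   operands %xmm0, qualifies for the two byte form and emits c5 f9 ef c0:
   0xc5, then VEX.R | inverted vvvv | pp=1 (0x66) = 0xf9, opcode 0xef,
   ModRM 0xc0 -- "vpxor %xmm0, %xmm0, %xmm0".  */
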
/* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
   A negative value for RM or INDEX means that register is absent.  In 64-bit
   mode for absolute addresses, ~RM is the size of the immediate operand
   that will follow the instruction.  */

static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
                               int shift, intptr_t offset)
{
    int mod, len;

    if (index < 0 && rm < 0) {
        if (TCG_TARGET_REG_BITS == 64) {
            /* Try for a rip-relative addressing mode.  This has replaced
               the 32-bit-mode absolute addressing encoding.  */
            intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
            intptr_t disp = offset - pc;
            if (disp == (int32_t)disp) {
                tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
                tcg_out32(s, disp);
                return;
            }

            /* Try for an absolute address encoding.  This requires the
               use of the MODRM+SIB encoding and is therefore larger than
               rip-relative addressing.  */
            if (offset == (int32_t)offset) {
                tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
                tcg_out8(s, (4 << 3) | 5);
                tcg_out32(s, offset);
                return;
            }

            /* ??? The memory isn't directly addressable.  */
            g_assert_not_reached();
        } else {
            /* Absolute address.  */
            tcg_out8(s, (r << 3) | 5);
            tcg_out32(s, offset);
            return;
        }
    }

    /* Find the length of the immediate addend.  Note that the encoding
       that would be used for (%ebp) indicates absolute addressing.  */
    if (rm < 0) {
        mod = 0, len = 4, rm = 5;
    } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
        mod = 0, len = 0;
    } else if (offset == (int8_t)offset) {
        mod = 0x40, len = 1;
    } else {
        mod = 0x80, len = 4;
    }

    /* Use a single byte MODRM format if possible.  Note that the encoding
       that would be used for %esp is the escape to the two byte form.  */
    if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
        /* Single byte MODRM format.  */
        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
    } else {
        /* Two byte MODRM+SIB format.  */

        /* Note that the encoding that would place %esp into the index
           field indicates no index register.  In 64-bit mode, the REX.X
           bit counts, so %r12 can be used as the index.  */
        if (index < 0) {
            index = 4;
        } else {
            tcg_debug_assert(index != TCG_REG_ESP);
        }

        tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
        tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
    }

    if (len == 1) {
        tcg_out8(s, offset);
    } else if (len == 4) {
        tcg_out32(s, offset);
    }
}

static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
                                     int index, int shift, intptr_t offset)
{
    tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
    tcg_out_sib_offset(s, r, rm, index, shift, offset);
}

static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
                                         int rm, int index, int shift,
                                         intptr_t offset)
{
    tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
    tcg_out_sib_offset(s, r, rm, index, shift, offset);
}

/* A simplification of the above with no index or shift.  */
static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
                                        int rm, intptr_t offset)
{
    tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
}

static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
                                            int v, int rm, intptr_t offset)
{
    tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
}

/* Output an opcode with an expected reference to the constant pool.  */
static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
{
    tcg_out_opc(s, opc, r, 0, 0);
    /* Absolute for 32-bit, pc-relative for 64-bit.  */
    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
    tcg_out32(s, 0);
}

/* Output an opcode with an expected reference to the constant pool.  */
static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
{
    tcg_out_vex_opc(s, opc, r, 0, 0, 0);
    /* Absolute for 32-bit, pc-relative for 64-bit.  */
    tcg_out8(s, LOWREGMASK(r) << 3 | 5);
    tcg_out32(s, 0);
}

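/* Worked example: tcg_out_modrm_offset(s, OPC_MOVL_GvEv, TCG_REG_EAX,
   TCG_REG_EBX, 8) emits 8b 43 08: mod=01 selects an 8-bit displacement,
   giving "movl 8(%ebx), %eax".  */
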
/* Generate dest op= src.  Uses the same ARITH_* codes as tgen_arithi.  */
static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
{
    /* Propagate an opcode prefix, such as P_REXW.  */
    int ext = subop & ~0x7;
    subop &= 0x7;

    tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
}

static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
{
    int rexw = 0;

    if (arg == ret) {
        return true;
    }
    switch (type) {
    case TCG_TYPE_I64:
        rexw = P_REXW;
        /* fallthru */
    case TCG_TYPE_I32:
        if (ret < 16) {
            if (arg < 16) {
                tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
            } else {
                tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
            }
        } else {
            if (arg < 16) {
                tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
            } else {
                tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
            }
        }
        break;

    case TCG_TYPE_V64:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
        break;
    case TCG_TYPE_V128:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
        break;
    case TCG_TYPE_V256:
        tcg_debug_assert(ret >= 16 && arg >= 16);
        tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
        break;

    default:
        g_assert_not_reached();
    }
    return true;
}

static const int avx2_dup_insn[4] = {
    OPC_VPBROADCASTB, OPC_VPBROADCASTW,
    OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
};

static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
                            TCGReg r, TCGReg a)
{
    if (have_avx2) {
        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
        tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
    } else {
        switch (vece) {
        case MO_8:
            /* ??? With zero in a register, use PSHUFB.  */
            tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
            a = r;
            /* FALLTHRU */
        case MO_16:
            tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
            a = r;
            /* FALLTHRU */
        case MO_32:
            tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
            /* imm8 operand: all output lanes selected from input lane 0.  */
            tcg_out8(s, 0);
            break;
        case MO_64:
            tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
            break;
        default:
            g_assert_not_reached();
        }
    }
    return true;
}

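/* In the pre-AVX2 path above, the narrow cases widen step by step:
   PUNPCKLBW doubles each byte into a word, PUNPCKLWD doubles the word
   into a dword, and the fall-through into the MO_32 case finishes with
   PSHUFD imm8=0, which broadcasts dword 0 to every lane.  */
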
static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
                             TCGReg r, TCGReg base, intptr_t offset)
{
    if (have_avx2) {
        int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
        tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
                                 r, 0, base, offset);
    } else {
        switch (vece) {
        case MO_64:
            tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
            break;
        case MO_32:
            tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
            break;
        case MO_16:
            tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
            tcg_out8(s, 0); /* imm8 */
            tcg_out_dup_vec(s, type, vece, r, r);
            break;
        case MO_8:
            tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
            tcg_out8(s, 0); /* imm8 */
            tcg_out_dup_vec(s, type, vece, r, r);
            break;
        default:
            g_assert_not_reached();
        }
    }
    return true;
}

static void tcg_out_dupi_vec(TCGContext *s, TCGType type,
                             TCGReg ret, tcg_target_long arg)
{
    int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);

    if (arg == 0) {
        tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
        return;
    }
    if (arg == -1) {
        tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
        return;
    }

    if (TCG_TARGET_REG_BITS == 64) {
        if (type == TCG_TYPE_V64) {
            tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
        } else if (have_avx2) {
            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
        } else {
            tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
        }
        new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
    } else {
        if (have_avx2) {
            /* The 32-bit constant needs a dword broadcast, matching the
               VBROADCASTSS fallback and the 4-byte pool entry below.  */
            tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
        } else {
            tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
        }
        new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
    }
}

static void tcg_out_movi(TCGContext *s, TCGType type,
                         TCGReg ret, tcg_target_long arg)
{
    tcg_target_long diff;

    switch (type) {
    case TCG_TYPE_I32:
#if TCG_TARGET_REG_BITS == 64
    case TCG_TYPE_I64:
#endif
        if (ret < 16) {
            break;
        }
        /* fallthru */
    case TCG_TYPE_V64:
    case TCG_TYPE_V128:
    case TCG_TYPE_V256:
        tcg_debug_assert(ret >= 16);
        tcg_out_dupi_vec(s, type, ret, arg);
        return;
    default:
        g_assert_not_reached();
    }

    if (arg == 0) {
        tgen_arithr(s, ARITH_XOR, ret, ret);
        return;
    }
    if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
        tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
        tcg_out32(s, arg);
        return;
    }
    if (arg == (int32_t)arg) {
        tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
        tcg_out32(s, arg);
        return;
    }

    /* Try a 7 byte pc-relative lea before the 10 byte movq.  */
    diff = arg - ((uintptr_t)s->code_ptr + 7);
    if (diff == (int32_t)diff) {
        tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
        tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
        tcg_out32(s, diff);
        return;
    }

    tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
    tcg_out64(s, arg);
}

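/* Encoding sizes chosen above, smallest first: the xor via tgen_arithr
   is 2 bytes (plus REX for high registers), movl $imm32 is 5 bytes, the
   sign-extending REX.W movl is 7, the pc-relative lea is 7, and the
   full movq $imm64 is 10.  */
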
static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
{
    if (val == (int8_t)val) {
        tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
        tcg_out8(s, val);
    } else if (val == (int32_t)val) {
        tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
        tcg_out32(s, val);
    } else {
        tcg_abort();
    }
}

1044
a7d00d4e
PK
1045static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1046{
1047 /* Given the strength of x86 memory ordering, we only need care for
1048 store-load ordering. Experimentally, "lock orl $0,0(%esp)" is
1049 faster than "mfence", so don't bother with the sse insn. */
1050 if (a0 & TCG_MO_ST_LD) {
1051 tcg_out8(s, 0xf0);
1052 tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1053 tcg_out8(s, 0);
1054 }
1055}
1056
static inline void tcg_out_push(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_pop(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
}

static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (ret < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment.  */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned load.
         */
        tcg_debug_assert(ret >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
                                 ret, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
                       TCGReg arg1, intptr_t arg2)
{
    switch (type) {
    case TCG_TYPE_I32:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
        } else {
            tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
        }
        break;
    case TCG_TYPE_I64:
        if (arg < 16) {
            tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
            break;
        }
        /* FALLTHRU */
    case TCG_TYPE_V64:
        /* There is no instruction that can validate 8-byte alignment.  */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
        break;
    case TCG_TYPE_V128:
        /*
         * The gvec infrastructure asserts that v128 vector loads
         * and stores use a 16-byte aligned offset.  Validate that the
         * final pointer is aligned by using an insn that will SIGSEGV.
         */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
        break;
    case TCG_TYPE_V256:
        /*
         * The gvec infrastructure only requires 16-byte alignment,
         * so here we must use an unaligned store.
         */
        tcg_debug_assert(arg >= 16);
        tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
                                 arg, 0, arg1, arg2);
        break;
    default:
        g_assert_not_reached();
    }
}

static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
                        TCGReg base, intptr_t ofs)
{
    int rexw = 0;
    if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
        if (val != (int32_t)val) {
            return false;
        }
        rexw = P_REXW;
    } else if (type != TCG_TYPE_I32) {
        return false;
    }
    tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
    tcg_out32(s, val);
    return true;
}

static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
{
    /* Propagate an opcode prefix, such as P_DATA16.  */
    int ext = subopc & ~0x7;
    subopc &= 0x7;

    if (count == 1) {
        tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
    } else {
        tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
        tcg_out8(s, count);
    }
}

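/* Worked example: tcg_out_shifti(s, SHIFT_SHL, TCG_REG_EAX, 3) emits
   c1 e0 03 -- "shll $3, %eax"; a count of 1 instead uses the one byte
   shorter d1 /4 form.  */
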
static inline void tcg_out_bswap32(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
}

static inline void tcg_out_rolw_8(TCGContext *s, int reg)
{
    tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
}

static inline void tcg_out_ext8u(TCGContext *s, int dest, int src)
{
    /* movzbl */
    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
}

static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw)
{
    /* movsbl */
    tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
    tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
}

static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
{
    /* movzwl */
    tcg_out_modrm(s, OPC_MOVZWL, dest, src);
}

static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw)
{
    /* movsw[lq] */
    tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
}

static inline void tcg_out_ext32u(TCGContext *s, int dest, int src)
{
    /* 32-bit mov zero extends.  */
    tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
}

static inline void tcg_out_ext32s(TCGContext *s, int dest, int src)
{
    tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
}

static inline void tcg_out_bswap64(TCGContext *s, int reg)
{
    tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
}

static void tgen_arithi(TCGContext *s, int c, int r0,
                        tcg_target_long val, int cf)
{
    int rexw = 0;

    if (TCG_TARGET_REG_BITS == 64) {
        rexw = c & -8;
        c &= 7;
    }

    /* ??? While INC is 2 bytes shorter than ADDL $1, they also induce
       partial flags update stalls on Pentium4 and are not recommended
       by current Intel optimization manuals.  */
    if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
        int is_inc = (c == ARITH_ADD) ^ (val < 0);
        if (TCG_TARGET_REG_BITS == 64) {
            /* The single-byte increment encodings are re-tasked as the
               REX prefixes.  Use the MODRM encoding.  */
            tcg_out_modrm(s, OPC_GRP5 + rexw,
                          (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
        } else {
            tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
        }
        return;
    }

    if (c == ARITH_AND) {
        if (TCG_TARGET_REG_BITS == 64) {
            if (val == 0xffffffffu) {
                tcg_out_ext32u(s, r0, r0);
                return;
            }
            if (val == (uint32_t)val) {
                /* AND with no high bits set can use a 32-bit operation.  */
                rexw = 0;
            }
        }
        if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
            tcg_out_ext8u(s, r0, r0);
            return;
        }
        if (val == 0xffffu) {
            tcg_out_ext16u(s, r0, r0);
            return;
        }
    }

    if (val == (int8_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
        tcg_out8(s, val);
        return;
    }
    if (rexw == 0 || val == (int32_t)val) {
        tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
        tcg_out32(s, val);
        return;
    }

    tcg_abort();
}

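/* Worked example: tgen_arithi(s, ARITH_ADD + P_REXW, TCG_REG_RSP, 8, 0)
   takes the sign-extended imm8 path and emits 48 83 c4 08 --
   "addq $8, %rsp".  */
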
static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
{
    if (val != 0) {
        tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
    }
}

/* Use SMALL != 0 to force a short forward branch.  */
static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
{
    int32_t val, val1;

    if (l->has_value) {
        val = tcg_pcrel_diff(s, l->u.value_ptr);
        val1 = val - 2;
        if ((int8_t)val1 == val1) {
            if (opc == -1) {
                tcg_out8(s, OPC_JMP_short);
            } else {
                tcg_out8(s, OPC_JCC_short + opc);
            }
            tcg_out8(s, val1);
        } else {
            if (small) {
                tcg_abort();
            }
            if (opc == -1) {
                tcg_out8(s, OPC_JMP_long);
                tcg_out32(s, val - 5);
            } else {
                tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
                tcg_out32(s, val - 6);
            }
        }
    } else if (small) {
        if (opc == -1) {
            tcg_out8(s, OPC_JMP_short);
        } else {
            tcg_out8(s, OPC_JCC_short + opc);
        }
        tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
        s->code_ptr += 1;
    } else {
        if (opc == -1) {
            tcg_out8(s, OPC_JMP_long);
        } else {
            tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
        }
        tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
        s->code_ptr += 4;
    }
}

static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
                        int const_arg2, int rexw)
{
    if (const_arg2) {
        if (arg2 == 0) {
            /* test r, r */
            tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
        } else {
            tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
        }
    } else {
        tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
    }
}

static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
                             TCGArg arg1, TCGArg arg2, int const_arg2,
                             TCGLabel *label, int small)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
}

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
                             TCGArg arg1, TCGArg arg2, int const_arg2,
                             TCGLabel *label, int small)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
    tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
}
#else
/* XXX: we implement it at the target level to avoid having to
   handle cross-basic-block temporaries */
static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
                            const int *const_args, int small)
{
    TCGLabel *label_next = gen_new_label();
    TCGLabel *label_this = arg_label(args[5]);

    switch(args[4]) {
    case TCG_COND_EQ:
        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
                         label_next, 1);
        tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
                         label_this, small);
        break;
    case TCG_COND_NE:
        tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
                         label_this, small);
        tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
                         label_this, small);
        break;
    case TCG_COND_LT:
        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_LE:
        tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GT:
        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GE:
        tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_LTU:
        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_LEU:
        tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GTU:
        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    case TCG_COND_GEU:
        tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
                         label_this, small);
        tcg_out_jxx(s, JCC_JNE, label_next, 1);
        tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
                         label_this, small);
        break;
    default:
        tcg_abort();
    }
    tcg_out_label(s, label_next, s->code_ptr);
}
#endif

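/* The double-word comparison above follows the usual decomposition: for
   ordering tests, first compare the high words with the strict (signed
   or unsigned, as appropriate) variant of the condition; if the high
   words are unequal the JNE to label_next settles the result, otherwise
   fall through and compare the low words unsigned.  EQ and NE simply
   test both halves.  */
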
static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
                              TCGArg arg1, TCGArg arg2, int const_arg2)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
    tcg_out_ext8u(s, dest, dest);
}

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
                              TCGArg arg1, TCGArg arg2, int const_arg2)
{
    tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
    tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
    tcg_out_ext8u(s, dest, dest);
}
#else
static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
                             const int *const_args)
{
    TCGArg new_args[6];
    TCGLabel *label_true, *label_over;

    memcpy(new_args, args+1, 5*sizeof(TCGArg));

    if (args[0] == args[1] || args[0] == args[2]
        || (!const_args[3] && args[0] == args[3])
        || (!const_args[4] && args[0] == args[4])) {
        /* When the destination overlaps with one of the argument
           registers, don't do anything tricky.  */
        label_true = gen_new_label();
        label_over = gen_new_label();

        new_args[5] = label_arg(label_true);
        tcg_out_brcond2(s, new_args, const_args+1, 1);

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
        tcg_out_jxx(s, JCC_JMP, label_over, 1);
        tcg_out_label(s, label_true, s->code_ptr);

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
        tcg_out_label(s, label_over, s->code_ptr);
    } else {
        /* When the destination does not overlap one of the arguments,
           clear the destination first, jump if cond false, and emit an
           increment in the true case.  This results in smaller code.  */

        tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);

        label_over = gen_new_label();
        new_args[4] = tcg_invert_cond(new_args[4]);
        new_args[5] = label_arg(label_over);
        tcg_out_brcond2(s, new_args, const_args+1, 1);

        tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
        tcg_out_label(s, label_over, s->code_ptr);
    }
}
#endif

static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
                         TCGReg dest, TCGReg v1)
{
    if (have_cmov) {
        tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
    } else {
        TCGLabel *over = gen_new_label();
        tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
        tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
        tcg_out_label(s, over, s->code_ptr);
    }
}

static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
                              TCGReg c1, TCGArg c2, int const_c2,
                              TCGReg v1)
{
    tcg_out_cmp(s, c1, c2, const_c2, 0);
    tcg_out_cmov(s, cond, 0, dest, v1);
}

#if TCG_TARGET_REG_BITS == 64
static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
                              TCGReg c1, TCGArg c2, int const_c2,
                              TCGReg v1)
{
    tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
    tcg_out_cmov(s, cond, P_REXW, dest, v1);
}
#endif

static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
                        TCGArg arg2, bool const_a2)
{
    if (have_bmi1) {
        tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
        if (const_a2) {
            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
        } else {
            tcg_debug_assert(dest != arg2);
            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
        }
    } else {
        tcg_debug_assert(dest != arg2);
        tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
    }
}

static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
                        TCGArg arg2, bool const_a2)
{
    if (have_lzcnt) {
        tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
        if (const_a2) {
            tcg_debug_assert(arg2 == (rexw ? 64 : 32));
        } else {
            tcg_debug_assert(dest != arg2);
            tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
        }
    } else {
        tcg_debug_assert(!const_a2);
        tcg_debug_assert(dest != arg1);
        tcg_debug_assert(dest != arg2);

        /* Recall that the output of BSR is the index not the count.  */
        tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
        tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
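        /* For 0 <= i < 32 (or 64), i ^ 31 (or 63) equals 31 - i, so the
           XOR above converts BSR's bit index into the leading-zero count.  */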

        /* Since we have destroyed the flags from BSR, we have to re-test.  */
        tcg_out_cmp(s, arg1, 0, 1, rexw);
        tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
    }
}

f6bff89d 1605static void tcg_out_branch(TCGContext *s, int call, tcg_insn_unit *dest)
5d8a4f8f 1606{
f6bff89d 1607 intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
5d8a4f8f
RH
1608
1609 if (disp == (int32_t)disp) {
1610 tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1611 tcg_out32(s, disp);
1612 } else {
4e45f239
RH
1613 /* rip-relative addressing into the constant pool.
1614 This is 6 + 8 = 14 bytes, as compared to using an
1615 an immediate load 10 + 6 = 16 bytes, plus we may
1616 be able to re-use the pool constant for more calls. */
1617 tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1618 tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1619 new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1620 tcg_out32(s, 0);
5d8a4f8f
RH
1621 }
1622}
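/*
 * Worked example for the "- 5" above, using hypothetical addresses:
 * if s->code_ptr is 0x1000 and dest is 0x2000, the 5-byte call/jmp
 * instruction ends at 0x1005, and rel32 operands are relative to the
 * end of the instruction, so the stored displacement must be
 * 0x2000 - 0x1005 = 0xffb, i.e. tcg_pcrel_diff(s, dest) - 5.
 */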
1623
6bf3e997 1624static inline void tcg_out_call(TCGContext *s, tcg_insn_unit *dest)
5d8a4f8f
RH
1625{
1626 tcg_out_branch(s, 1, dest);
1627}
1d2699ae 1628
f6bff89d 1629static void tcg_out_jmp(TCGContext *s, tcg_insn_unit *dest)
aadb21a4 1630{
5d8a4f8f 1631 tcg_out_branch(s, 0, dest);
aadb21a4
RH
1632}
1633
0d07abf0
SF
1634static void tcg_out_nopn(TCGContext *s, int n)
1635{
1636 int i;
 1637 /* Emit 1 or 2 operand-size prefixes for the standard one-byte nop,
1638 * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1639 * duplicate prefix, and all of the interesting recent cores can
1640 * decode and discard the duplicates in a single cycle.
1641 */
1642 tcg_debug_assert(n >= 1);
1643 for (i = 1; i < n; ++i) {
1644 tcg_out8(s, 0x66);
1645 }
1646 tcg_out8(s, 0x90);
1647}
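/*
 * Byte sequences produced for the sizes actually used when aligning
 * the goto_tb jump (shown for illustration):
 *   tcg_out_nopn(s, 1)  ->  90          xchg %eax,%eax
 *   tcg_out_nopn(s, 2)  ->  66 90       xchg %ax,%ax
 *   tcg_out_nopn(s, 3)  ->  66 66 90    duplicated prefix
 */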
1648
c896fe29 1649#if defined(CONFIG_SOFTMMU)
2b434dd1 1650#include "../tcg-ldst.inc.c"
659ef5cb 1651
401c227b
RH
 1652/* helper signature: helper_ret_ld_mmu(CPUArchState *env, target_ulong addr,
 1653 * TCGMemOpIdx oi, uintptr_t ra)
1654 */
f6bff89d 1655static void * const qemu_ld_helpers[16] = {
8221a267
RH
1656 [MO_UB] = helper_ret_ldub_mmu,
1657 [MO_LEUW] = helper_le_lduw_mmu,
1658 [MO_LEUL] = helper_le_ldul_mmu,
1659 [MO_LEQ] = helper_le_ldq_mmu,
1660 [MO_BEUW] = helper_be_lduw_mmu,
1661 [MO_BEUL] = helper_be_ldul_mmu,
1662 [MO_BEQ] = helper_be_ldq_mmu,
e141ab52
BS
1663};
1664
401c227b
RH
 1665/* helper signature: helper_ret_st_mmu(CPUArchState *env, target_ulong addr,
 1666 * uintxx_t val, TCGMemOpIdx oi, uintptr_t ra)
1667 */
f6bff89d 1668static void * const qemu_st_helpers[16] = {
8221a267
RH
1669 [MO_UB] = helper_ret_stb_mmu,
1670 [MO_LEUW] = helper_le_stw_mmu,
1671 [MO_LEUL] = helper_le_stl_mmu,
1672 [MO_LEQ] = helper_le_stq_mmu,
1673 [MO_BEUW] = helper_be_stw_mmu,
1674 [MO_BEUL] = helper_be_stl_mmu,
1675 [MO_BEQ] = helper_be_stq_mmu,
e141ab52 1676};
8516a044
RH
1677
1678/* Perform the TLB load and compare.
1679
1680 Inputs:
7352ee54 1681 ADDRLO and ADDRHI contain the low and high part of the address.
8516a044
RH
1682
 1683 MEM_INDEX and OPC are the memory context and memop of the access;
 the access size and alignment are derived from OPC.
1684
1685 WHICH is the offset into the CPUTLBEntry structure of the slot to read.
1686 This should be offsetof addr_read or addr_write.
1687
1688 Outputs:
1689 LABEL_PTRS is filled with 1 (32-bit addresses) or 2 (64-bit addresses)
1690 positions of the displacements of forward jumps to the TLB miss case.
1691
166792f7 1692 Second argument register is loaded with the low part of the address.
5d8a4f8f
RH
1693 In the TLB hit case, it has been adjusted as indicated by the TLB
1694 and so is a host address. In the TLB miss case, it continues to
1695 hold a guest address.
8516a044 1696
166792f7 1697 First argument register is clobbered. */
8516a044 1698
7352ee54 1699static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
14776ab5 1700 int mem_index, MemOp opc,
f6bff89d 1701 tcg_insn_unit **label_ptr, int which)
8516a044 1702{
7352ee54
RH
1703 const TCGReg r0 = TCG_REG_L0;
1704 const TCGReg r1 = TCG_REG_L1;
d5dad3be 1705 TCGType ttype = TCG_TYPE_I32;
08b0b23b
AJ
1706 TCGType tlbtype = TCG_TYPE_I32;
1707 int trexw = 0, hrexw = 0, tlbrexw = 0;
85aa8081
RH
1708 unsigned a_bits = get_alignment_bits(opc);
1709 unsigned s_bits = opc & MO_SIZE;
1710 unsigned a_mask = (1 << a_bits) - 1;
1711 unsigned s_mask = (1 << s_bits) - 1;
1f00b27f 1712 target_ulong tlb_mask;
5d8a4f8f 1713
d5dad3be
RH
1714 if (TCG_TARGET_REG_BITS == 64) {
1715 if (TARGET_LONG_BITS == 64) {
1716 ttype = TCG_TYPE_I64;
1717 trexw = P_REXW;
1718 }
1719 if (TCG_TYPE_PTR == TCG_TYPE_I64) {
d5dad3be 1720 hrexw = P_REXW;
54eaf40b 1721 if (TARGET_PAGE_BITS + CPU_TLB_DYN_MAX_BITS > 32) {
08b0b23b
AJ
1722 tlbtype = TCG_TYPE_I64;
1723 tlbrexw = P_REXW;
1724 }
d5dad3be 1725 }
5d8a4f8f 1726 }
8516a044 1727
08b0b23b 1728 tcg_out_mov(s, tlbtype, r0, addrlo);
54eaf40b
EC
1729 tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
1730 TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
1731
1732 tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, r0, TCG_AREG0,
269bd5d8
RH
1733 TLB_MASK_TABLE_OFS(mem_index) +
1734 offsetof(CPUTLBDescFast, mask));
54eaf40b
EC
1735
1736 tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r0, TCG_AREG0,
269bd5d8
RH
1737 TLB_MASK_TABLE_OFS(mem_index) +
1738 offsetof(CPUTLBDescFast, table));
54eaf40b 1739
85aa8081
RH
1740 /* If the required alignment is at least as large as the access, simply
1741 copy the address and mask. For lesser alignments, check that we don't
1742 cross pages for the complete access. */
1743 if (a_bits >= s_bits) {
8cc580f6
AJ
1744 tcg_out_mov(s, ttype, r1, addrlo);
1745 } else {
85aa8081 1746 tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
8cc580f6 1747 }
ebb90a00 1748 tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
1f00b27f 1749 tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);
8516a044 1750
166792f7 1751 /* cmp 0(r0), r1 */
54eaf40b 1752 tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, which);
8516a044 1753
d5dad3be 1754 /* Prepare for both the fast path add of the tlb addend, and the slow
4810d96f 1755 path function argument setup. */
d5dad3be 1756 tcg_out_mov(s, ttype, r1, addrlo);
8516a044 1757
b76f0d8c
YL
1758 /* jne slow_path */
1759 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
8516a044 1760 label_ptr[0] = s->code_ptr;
b76f0d8c 1761 s->code_ptr += 4;
8516a044 1762
5d8a4f8f 1763 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
166792f7 1764 /* cmp 4(r0), addrhi */
54eaf40b 1765 tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, which + 4);
8516a044 1766
b76f0d8c
YL
1767 /* jne slow_path */
1768 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
8516a044 1769 label_ptr[1] = s->code_ptr;
b76f0d8c 1770 s->code_ptr += 4;
8516a044
RH
1771 }
1772
1773 /* TLB Hit. */
1774
166792f7 1775 /* add addend(r0), r1 */
d5dad3be 1776 tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
54eaf40b 1777 offsetof(CPUTLBEntry, addend));
8516a044 1778}
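/*
 * A sketch of the fast-path code emitted above, for a 64-bit host and
 * 64-bit guest (r0 = TCG_REG_L0, r1 = TCG_REG_L1; offsets abbreviated):
 *   mov  addrlo, r0
 *   shr  $(TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS), r0
 *   and  mask_ofs(env), r0
 *   add  table_ofs(env), r0
 *   lea  s_mask-a_mask(addrlo), r1     ; or mov, if a_bits >= s_bits
 *   and  $(TARGET_PAGE_MASK | a_mask), r1
 *   cmp  which(r0), r1
 *   mov  addrlo, r1
 *   jne  slow_path
 *   add  addend(r0), r1                ; r1 is now the host address
 */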
7352ee54
RH
1779
1780/*
 1781 * Record the context of a call to the out-of-line helper code for the slow
 1782 * path of a load or store, so that we can later generate the correct helper.
1783 */
3dbc8c61
RH
1784static void add_qemu_ldst_label(TCGContext *s, bool is_ld, bool is_64,
1785 TCGMemOpIdx oi,
7352ee54
RH
1786 TCGReg datalo, TCGReg datahi,
1787 TCGReg addrlo, TCGReg addrhi,
3972ef6f 1788 tcg_insn_unit *raddr,
f6bff89d 1789 tcg_insn_unit **label_ptr)
7352ee54
RH
1790{
1791 TCGLabelQemuLdst *label = new_ldst_label(s);
1792
1793 label->is_ld = is_ld;
3972ef6f 1794 label->oi = oi;
3dbc8c61 1795 label->type = is_64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
7352ee54
RH
1796 label->datalo_reg = datalo;
1797 label->datahi_reg = datahi;
1798 label->addrlo_reg = addrlo;
1799 label->addrhi_reg = addrhi;
7352ee54
RH
1800 label->raddr = raddr;
1801 label->label_ptr[0] = label_ptr[0];
1802 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1803 label->label_ptr[1] = label_ptr[1];
1804 }
1805}
1806
1807/*
1808 * Generate code for the slow path for a load at the end of block
1809 */
aeee05f5 1810static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
7352ee54 1811{
3972ef6f 1812 TCGMemOpIdx oi = l->oi;
14776ab5 1813 MemOp opc = get_memop(oi);
7352ee54 1814 TCGReg data_reg;
f6bff89d 1815 tcg_insn_unit **label_ptr = &l->label_ptr[0];
3dbc8c61 1816 int rexw = (l->type == TCG_TYPE_I64 ? P_REXW : 0);
7352ee54
RH
1817
1818 /* resolve label address */
5c53bb81 1819 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
7352ee54 1820 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
5c53bb81 1821 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
7352ee54
RH
1822 }
1823
1824 if (TCG_TARGET_REG_BITS == 32) {
1825 int ofs = 0;
1826
1827 tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1828 ofs += 4;
1829
1830 tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1831 ofs += 4;
1832
1833 if (TARGET_LONG_BITS == 64) {
1834 tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1835 ofs += 4;
1836 }
1837
59d7c14e 1838 tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
7352ee54
RH
1839 ofs += 4;
1840
59d7c14e 1841 tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
7352ee54
RH
1842 } else {
1843 tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1844 /* The second argument is already loaded with addrlo. */
3972ef6f 1845 tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
7352ee54
RH
1846 tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
1847 (uintptr_t)l->raddr);
1848 }
1849
2b7ec66f 1850 tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
7352ee54
RH
1851
1852 data_reg = l->datalo_reg;
1853 switch (opc & MO_SSIZE) {
1854 case MO_SB:
3dbc8c61 1855 tcg_out_ext8s(s, data_reg, TCG_REG_EAX, rexw);
7352ee54
RH
1856 break;
1857 case MO_SW:
3dbc8c61 1858 tcg_out_ext16s(s, data_reg, TCG_REG_EAX, rexw);
7352ee54
RH
1859 break;
1860#if TCG_TARGET_REG_BITS == 64
1861 case MO_SL:
1862 tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
1863 break;
1864#endif
1865 case MO_UB:
1866 case MO_UW:
1867 /* Note that the helpers have zero-extended to tcg_target_long. */
1868 case MO_UL:
1869 tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1870 break;
1871 case MO_Q:
1872 if (TCG_TARGET_REG_BITS == 64) {
1873 tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
1874 } else if (data_reg == TCG_REG_EDX) {
1875 /* xchg %edx, %eax */
1876 tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
1877 tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
1878 } else {
1879 tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1880 tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
1881 }
1882 break;
1883 default:
1884 tcg_abort();
1885 }
1886
 1887 /* Jump to the code corresponding to the next IR of qemu_ld. */
f6bff89d 1888 tcg_out_jmp(s, l->raddr);
aeee05f5 1889 return true;
7352ee54
RH
1890}
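/*
 * Note on the MO_Q case above: on a 32-bit host the helper returns the
 * 64-bit value in EDX:EAX.  If datalo happens to be EDX, copying EAX
 * into it first would clobber the high half before it is saved, so a
 * single XCHG swaps the halves into place instead.
 */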
1891
1892/*
1893 * Generate code for the slow path for a store at the end of block
1894 */
aeee05f5 1895static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
7352ee54 1896{
3972ef6f 1897 TCGMemOpIdx oi = l->oi;
14776ab5
TN
1898 MemOp opc = get_memop(oi);
1899 MemOp s_bits = opc & MO_SIZE;
f6bff89d 1900 tcg_insn_unit **label_ptr = &l->label_ptr[0];
7352ee54
RH
1901 TCGReg retaddr;
1902
1903 /* resolve label address */
5c53bb81 1904 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
7352ee54 1905 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
5c53bb81 1906 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
7352ee54
RH
1907 }
1908
1909 if (TCG_TARGET_REG_BITS == 32) {
1910 int ofs = 0;
1911
1912 tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1913 ofs += 4;
1914
1915 tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1916 ofs += 4;
1917
1918 if (TARGET_LONG_BITS == 64) {
1919 tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1920 ofs += 4;
1921 }
1922
1923 tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
1924 ofs += 4;
1925
1926 if (s_bits == MO_64) {
1927 tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
1928 ofs += 4;
1929 }
1930
59d7c14e 1931 tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
7352ee54
RH
1932 ofs += 4;
1933
1934 retaddr = TCG_REG_EAX;
3972ef6f
RH
1935 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1936 tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
7352ee54
RH
1937 } else {
1938 tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1939 /* The second argument is already loaded with addrlo. */
1940 tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
1941 tcg_target_call_iarg_regs[2], l->datalo_reg);
3972ef6f 1942 tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
7352ee54
RH
1943
1944 if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
1945 retaddr = tcg_target_call_iarg_regs[4];
1946 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1947 } else {
1948 retaddr = TCG_REG_RAX;
1949 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
0b919667
RH
1950 tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP,
1951 TCG_TARGET_CALL_STACK_OFFSET);
7352ee54
RH
1952 }
1953 }
1954
1955 /* "Tail call" to the helper, with the return address back inline. */
1956 tcg_out_push(s, retaddr);
2b7ec66f 1957 tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
aeee05f5 1958 return true;
7352ee54 1959}
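/*
 * The push + jmp pair above is a tail call: the pushed l->raddr becomes
 * the helper's return address, so the helper's own RET resumes directly
 * at the instruction following the original store, and this stub never
 * executes a CALL of its own.
 */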
913c2bdd
RH
1960#elif TCG_TARGET_REG_BITS == 32
1961# define x86_guest_base_seg 0
1962# define x86_guest_base_index -1
1963# define x86_guest_base_offset guest_base
1964#else
1965static int x86_guest_base_seg;
1966static int x86_guest_base_index = -1;
1967static int32_t x86_guest_base_offset;
1968# if defined(__x86_64__) && defined(__linux__)
1969# include <asm/prctl.h>
1970# include <sys/prctl.h>
44b37ace 1971int arch_prctl(int code, unsigned long addr);
913c2bdd 1972static inline int setup_guest_base_seg(void)
44b37ace 1973{
b76f21a7 1974 if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
913c2bdd 1975 return P_GS;
44b37ace 1976 }
913c2bdd 1977 return 0;
44b37ace 1978}
5785c17f
RH
1979# elif defined (__FreeBSD__) || defined (__FreeBSD_kernel__)
1980# include <machine/sysarch.h>
1981static inline int setup_guest_base_seg(void)
1982{
1983 if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
1984 return P_GS;
1985 }
1986 return 0;
1987}
913c2bdd
RH
1988# else
1989static inline int setup_guest_base_seg(void)
1990{
1991 return 0;
1992}
1993# endif
44b37ace 1994#endif /* SOFTMMU */
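/*
 * For user-only emulation the guest base can be hidden in a segment
 * base instead of being added explicitly: when arch_prctl(ARCH_SET_GS)
 * (or sysarch on the BSDs) succeeds, guest accesses can be emitted with
 * a %gs prefix -- e.g. "movl %gs:(%rdi), %eax" for a hypothetical
 * 4-byte load -- so no separate add of guest_base is needed.
 */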
c896fe29 1995
37c5d0d5 1996static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
ee8ba9e4 1997 TCGReg base, int index, intptr_t ofs,
14776ab5 1998 int seg, bool is64, MemOp memop)
be5a4eb7 1999{
14776ab5
TN
2000 const MemOp real_bswap = memop & MO_BSWAP;
2001 MemOp bswap = real_bswap;
1d21d95b 2002 int rexw = is64 * P_REXW;
085bb5bb
AJ
2003 int movop = OPC_MOVL_GvEv;
2004
2005 if (have_movbe && real_bswap) {
2006 bswap = 0;
2007 movop = OPC_MOVBE_GyMy;
2008 }
37c5d0d5
RH
2009
2010 switch (memop & MO_SSIZE) {
2011 case MO_UB:
ee8ba9e4
RH
2012 tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo,
2013 base, index, 0, ofs);
be5a4eb7 2014 break;
37c5d0d5 2015 case MO_SB:
1d21d95b 2016 tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + seg, datalo,
ee8ba9e4 2017 base, index, 0, ofs);
be5a4eb7 2018 break;
37c5d0d5 2019 case MO_UW:
ee8ba9e4
RH
2020 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
2021 base, index, 0, ofs);
085bb5bb 2022 if (real_bswap) {
be5a4eb7
RH
2023 tcg_out_rolw_8(s, datalo);
2024 }
2025 break;
37c5d0d5 2026 case MO_SW:
085bb5bb
AJ
2027 if (real_bswap) {
2028 if (have_movbe) {
ee8ba9e4
RH
2029 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
2030 datalo, base, index, 0, ofs);
085bb5bb 2031 } else {
ee8ba9e4
RH
2032 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
2033 base, index, 0, ofs);
085bb5bb
AJ
2034 tcg_out_rolw_8(s, datalo);
2035 }
1d21d95b 2036 tcg_out_modrm(s, OPC_MOVSWL + rexw, datalo, datalo);
5d8a4f8f 2037 } else {
1d21d95b 2038 tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + seg,
ee8ba9e4 2039 datalo, base, index, 0, ofs);
be5a4eb7
RH
2040 }
2041 break;
37c5d0d5 2042 case MO_UL:
ee8ba9e4 2043 tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
be5a4eb7
RH
2044 if (bswap) {
2045 tcg_out_bswap32(s, datalo);
2046 }
2047 break;
5d8a4f8f 2048#if TCG_TARGET_REG_BITS == 64
37c5d0d5 2049 case MO_SL:
085bb5bb 2050 if (real_bswap) {
ee8ba9e4
RH
2051 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2052 base, index, 0, ofs);
085bb5bb
AJ
2053 if (bswap) {
2054 tcg_out_bswap32(s, datalo);
2055 }
5d8a4f8f 2056 tcg_out_ext32s(s, datalo, datalo);
be5a4eb7 2057 } else {
ee8ba9e4
RH
2058 tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo,
2059 base, index, 0, ofs);
be5a4eb7 2060 }
5d8a4f8f
RH
2061 break;
2062#endif
37c5d0d5 2063 case MO_Q:
5d8a4f8f 2064 if (TCG_TARGET_REG_BITS == 64) {
ee8ba9e4
RH
2065 tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2066 base, index, 0, ofs);
5d8a4f8f
RH
2067 if (bswap) {
2068 tcg_out_bswap64(s, datalo);
2069 }
2070 } else {
085bb5bb 2071 if (real_bswap) {
5d8a4f8f
RH
2072 int t = datalo;
2073 datalo = datahi;
2074 datahi = t;
2075 }
2076 if (base != datalo) {
ee8ba9e4
RH
2077 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2078 base, index, 0, ofs);
2079 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2080 base, index, 0, ofs + 4);
5d8a4f8f 2081 } else {
ee8ba9e4
RH
2082 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2083 base, index, 0, ofs + 4);
2084 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2085 base, index, 0, ofs);
5d8a4f8f
RH
2086 }
2087 if (bswap) {
2088 tcg_out_bswap32(s, datalo);
2089 tcg_out_bswap32(s, datahi);
2090 }
be5a4eb7
RH
2091 }
2092 break;
2093 default:
2094 tcg_abort();
2095 }
2096}
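/*
 * When the host has MOVBE, the byte-swapped loads above fold the swap
 * into the load itself (bswap is cleared and movop becomes
 * OPC_MOVBE_GyMy): for illustration, a single "movbe (%rsi), %eax"
 * replaces the "movl (%rsi), %eax; bswapl %eax" pair that would
 * otherwise be emitted.
 */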
379f6698 2097
c896fe29
FB
2098/* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and
 2099 EAX. It will be useful once fixed-register globals are less
2100 common. */
8221a267 2101static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
c896fe29 2102{
7352ee54 2103 TCGReg datalo, datahi, addrlo;
8221a267 2104 TCGReg addrhi __attribute__((unused));
59227d5d 2105 TCGMemOpIdx oi;
14776ab5 2106 MemOp opc;
c896fe29 2107#if defined(CONFIG_SOFTMMU)
37c5d0d5 2108 int mem_index;
f6bff89d 2109 tcg_insn_unit *label_ptr[2];
c896fe29
FB
2110#endif
2111
7352ee54 2112 datalo = *args++;
8221a267 2113 datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
7352ee54 2114 addrlo = *args++;
8221a267 2115 addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
59227d5d
RH
2116 oi = *args++;
2117 opc = get_memop(oi);
c896fe29
FB
2118
2119#if defined(CONFIG_SOFTMMU)
59227d5d 2120 mem_index = get_mmuidx(oi);
1a6dc1e4 2121
8cc580f6 2122 tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
8516a044 2123 label_ptr, offsetof(CPUTLBEntry, addr_read));
1a6dc1e4
RH
2124
2125 /* TLB Hit. */
1d21d95b 2126 tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, is64, opc);
c896fe29 2127
b76f0d8c 2128 /* Record the current context of a load into ldst label */
3dbc8c61 2129 add_qemu_ldst_label(s, true, is64, oi, datalo, datahi, addrlo, addrhi,
3972ef6f 2130 s->code_ptr, label_ptr);
c896fe29 2131#else
913c2bdd
RH
2132 tcg_out_qemu_ld_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2133 x86_guest_base_offset, x86_guest_base_seg,
2134 is64, opc);
c896fe29 2135#endif
be5a4eb7 2136}
c896fe29 2137
37c5d0d5 2138static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
4810d96f 2139 TCGReg base, int index, intptr_t ofs,
14776ab5 2140 int seg, MemOp memop)
be5a4eb7 2141{
be5a4eb7
RH
2142 /* ??? Ideally we wouldn't need a scratch register. For user-only,
2143 we could perform the bswap twice to restore the original value
2144 instead of moving to the scratch. But as it is, the L constraint
166792f7 2145 means that TCG_REG_L0 is definitely free here. */
37c5d0d5 2146 const TCGReg scratch = TCG_REG_L0;
14776ab5
TN
2147 const MemOp real_bswap = memop & MO_BSWAP;
2148 MemOp bswap = real_bswap;
085bb5bb
AJ
2149 int movop = OPC_MOVL_EvGv;
2150
2151 if (have_movbe && real_bswap) {
2152 bswap = 0;
2153 movop = OPC_MOVBE_MyGy;
2154 }
be5a4eb7 2155
37c5d0d5
RH
2156 switch (memop & MO_SIZE) {
2157 case MO_8:
8589467f 2158 /* In 32-bit mode, 8-bit stores can only happen from [abcd]x.
b3e2bc50
RH
2159 Use the scratch register if necessary. */
2160 if (TCG_TARGET_REG_BITS == 32 && datalo >= 4) {
2161 tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
2162 datalo = scratch;
2163 }
4810d96f
RH
2164 tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
2165 datalo, base, index, 0, ofs);
c896fe29 2166 break;
37c5d0d5 2167 case MO_16:
c896fe29 2168 if (bswap) {
3b6dac34 2169 tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
be5a4eb7
RH
2170 tcg_out_rolw_8(s, scratch);
2171 datalo = scratch;
c896fe29 2172 }
4810d96f
RH
2173 tcg_out_modrm_sib_offset(s, movop + P_DATA16 + seg, datalo,
2174 base, index, 0, ofs);
c896fe29 2175 break;
37c5d0d5 2176 case MO_32:
c896fe29 2177 if (bswap) {
3b6dac34 2178 tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
be5a4eb7
RH
2179 tcg_out_bswap32(s, scratch);
2180 datalo = scratch;
c896fe29 2181 }
4810d96f 2182 tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
c896fe29 2183 break;
37c5d0d5 2184 case MO_64:
5d8a4f8f
RH
2185 if (TCG_TARGET_REG_BITS == 64) {
2186 if (bswap) {
2187 tcg_out_mov(s, TCG_TYPE_I64, scratch, datalo);
2188 tcg_out_bswap64(s, scratch);
2189 datalo = scratch;
2190 }
4810d96f
RH
2191 tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2192 base, index, 0, ofs);
5d8a4f8f 2193 } else if (bswap) {
3b6dac34 2194 tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi);
be5a4eb7 2195 tcg_out_bswap32(s, scratch);
4810d96f
RH
2196 tcg_out_modrm_sib_offset(s, OPC_MOVL_EvGv + seg, scratch,
2197 base, index, 0, ofs);
3b6dac34 2198 tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
be5a4eb7 2199 tcg_out_bswap32(s, scratch);
4810d96f
RH
2200 tcg_out_modrm_sib_offset(s, OPC_MOVL_EvGv + seg, scratch,
2201 base, index, 0, ofs + 4);
c896fe29 2202 } else {
085bb5bb
AJ
2203 if (real_bswap) {
2204 int t = datalo;
2205 datalo = datahi;
2206 datahi = t;
2207 }
4810d96f
RH
2208 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2209 base, index, 0, ofs);
2210 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2211 base, index, 0, ofs + 4);
c896fe29
FB
2212 }
2213 break;
2214 default:
2215 tcg_abort();
2216 }
c896fe29
FB
2217}
2218
8221a267 2219static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
c896fe29 2220{
7352ee54 2221 TCGReg datalo, datahi, addrlo;
8221a267 2222 TCGReg addrhi __attribute__((unused));
59227d5d 2223 TCGMemOpIdx oi;
14776ab5 2224 MemOp opc;
c896fe29 2225#if defined(CONFIG_SOFTMMU)
37c5d0d5 2226 int mem_index;
f6bff89d 2227 tcg_insn_unit *label_ptr[2];
c896fe29
FB
2228#endif
2229
7352ee54 2230 datalo = *args++;
8221a267 2231 datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
7352ee54 2232 addrlo = *args++;
8221a267 2233 addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
59227d5d
RH
2234 oi = *args++;
2235 opc = get_memop(oi);
c896fe29
FB
2236
2237#if defined(CONFIG_SOFTMMU)
59227d5d 2238 mem_index = get_mmuidx(oi);
1a6dc1e4 2239
8cc580f6 2240 tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
8516a044 2241 label_ptr, offsetof(CPUTLBEntry, addr_write));
1a6dc1e4
RH
2242
2243 /* TLB Hit. */
4810d96f 2244 tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);
c896fe29 2245
b76f0d8c 2246 /* Record the current context of a store into ldst label */
3dbc8c61 2247 add_qemu_ldst_label(s, false, is64, oi, datalo, datahi, addrlo, addrhi,
3972ef6f 2248 s->code_ptr, label_ptr);
b76f0d8c 2249#else
913c2bdd
RH
2250 tcg_out_qemu_st_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2251 x86_guest_base_offset, x86_guest_base_seg, opc);
b76f0d8c 2252#endif
b76f0d8c 2253}
c896fe29 2254
a9751609 2255static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
c896fe29
FB
2256 const TCGArg *args, const int *const_args)
2257{
42d5b514
RH
2258 TCGArg a0, a1, a2;
2259 int c, const_a2, vexop, rexw = 0;
5d8a4f8f
RH
2260
2261#if TCG_TARGET_REG_BITS == 64
2262# define OP_32_64(x) \
2263 case glue(glue(INDEX_op_, x), _i64): \
2264 rexw = P_REXW; /* FALLTHRU */ \
2265 case glue(glue(INDEX_op_, x), _i32)
2266#else
2267# define OP_32_64(x) \
2268 case glue(glue(INDEX_op_, x), _i32)
2269#endif
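/*
 * Expansion example on a 64-bit host (for illustration): OP_32_64(add)
 * becomes
 *     case INDEX_op_add_i64: rexw = P_REXW;   ... falls through ...
 *     case INDEX_op_add_i32:
 * so a single case body serves both widths, with rexw selecting the
 * REX.W prefix for the 64-bit form.
 */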
78686523 2270
42d5b514
RH
2271 /* Hoist the loads of the most common arguments. */
2272 a0 = args[0];
2273 a1 = args[1];
2274 a2 = args[2];
2275 const_a2 = const_args[2];
2276
2277 switch (opc) {
c896fe29 2278 case INDEX_op_exit_tb:
5cb4ef80
EC
2279 /* Reuse the zeroing that exists for goto_ptr. */
2280 if (a0 == 0) {
2281 tcg_out_jmp(s, s->code_gen_epilogue);
2282 } else {
2283 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2284 tcg_out_jmp(s, tb_ret_addr);
2285 }
c896fe29
FB
2286 break;
2287 case INDEX_op_goto_tb:
f309101c 2288 if (s->tb_jmp_insn_offset) {
c896fe29 2289 /* direct jump method */
0d07abf0
SF
2290 int gap;
2291 /* jump displacement must be aligned for atomic patching;
 2292 * see if we need to add extra nops before the jump.
 2293 */
2294 gap = tcg_pcrel_diff(s, QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4));
2295 if (gap != 1) {
2296 tcg_out_nopn(s, gap - 1);
2297 }
da441cff 2298 tcg_out8(s, OPC_JMP_long); /* jmp im */
42d5b514 2299 s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
c896fe29
FB
2300 tcg_out32(s, 0);
2301 } else {
2302 /* indirect jump method */
9363dedb 2303 tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
42d5b514 2304 (intptr_t)(s->tb_jmp_target_addr + a0));
c896fe29 2305 }
9f754620 2306 set_jmp_reset_offset(s, a0);
c896fe29 2307 break;
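/*
 * Alignment example for the direct-jump case above: the rel32 operand
 * starts one byte after the E9 opcode, so if code_ptr % 4 == 3 the
 * operand is already 4-byte aligned and gap == 1 (no padding);
 * otherwise 1-3 nop bytes are inserted so that the later atomic 4-byte
 * rewrite of the displacement cannot tear.
 */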
5cb4ef80
EC
2308 case INDEX_op_goto_ptr:
2309 /* jmp to the given host address (could be epilogue) */
2310 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2311 break;
c896fe29 2312 case INDEX_op_br:
42d5b514 2313 tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
c896fe29 2314 break;
5d8a4f8f
RH
2315 OP_32_64(ld8u):
2316 /* Note that we can ignore REXW for the zero-extend to 64-bit. */
42d5b514 2317 tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
c896fe29 2318 break;
5d8a4f8f 2319 OP_32_64(ld8s):
42d5b514 2320 tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
c896fe29 2321 break;
5d8a4f8f
RH
2322 OP_32_64(ld16u):
2323 /* Note that we can ignore REXW for the zero-extend to 64-bit. */
42d5b514 2324 tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
c896fe29 2325 break;
5d8a4f8f 2326 OP_32_64(ld16s):
42d5b514 2327 tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
c896fe29 2328 break;
5d8a4f8f
RH
2329#if TCG_TARGET_REG_BITS == 64
2330 case INDEX_op_ld32u_i64:
2331#endif
c896fe29 2332 case INDEX_op_ld_i32:
42d5b514 2333 tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
c896fe29 2334 break;
5d8a4f8f
RH
2335
2336 OP_32_64(st8):
5c2d2a9e 2337 if (const_args[0]) {
42d5b514
RH
2338 tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2339 tcg_out8(s, a0);
5c2d2a9e 2340 } else {
42d5b514 2341 tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
5c2d2a9e 2342 }
c896fe29 2343 break;
5d8a4f8f 2344 OP_32_64(st16):
5c2d2a9e 2345 if (const_args[0]) {
42d5b514
RH
2346 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2347 tcg_out16(s, a0);
5c2d2a9e 2348 } else {
42d5b514 2349 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
5c2d2a9e 2350 }
c896fe29 2351 break;
5d8a4f8f
RH
2352#if TCG_TARGET_REG_BITS == 64
2353 case INDEX_op_st32_i64:
2354#endif
c896fe29 2355 case INDEX_op_st_i32:
5c2d2a9e 2356 if (const_args[0]) {
42d5b514
RH
2357 tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2358 tcg_out32(s, a0);
5c2d2a9e 2359 } else {
42d5b514 2360 tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
5c2d2a9e 2361 }
c896fe29 2362 break;
5d8a4f8f
RH
2363
2364 OP_32_64(add):
5d1e4e85 2365 /* For 3-operand addition, use LEA. */
42d5b514
RH
2366 if (a0 != a1) {
2367 TCGArg c3 = 0;
2368 if (const_a2) {
5d1e4e85
RH
2369 c3 = a2, a2 = -1;
2370 } else if (a0 == a2) {
2371 /* Watch out for dest = src + dest, since we've removed
2372 the matching constraint on the add. */
5d8a4f8f 2373 tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
5d1e4e85
RH
2374 break;
2375 }
2376
5d8a4f8f 2377 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
5d1e4e85
RH
2378 break;
2379 }
2380 c = ARITH_ADD;
2381 goto gen_arith;
5d8a4f8f 2382 OP_32_64(sub):
c896fe29
FB
2383 c = ARITH_SUB;
2384 goto gen_arith;
5d8a4f8f 2385 OP_32_64(and):
c896fe29
FB
2386 c = ARITH_AND;
2387 goto gen_arith;
5d8a4f8f 2388 OP_32_64(or):
c896fe29
FB
2389 c = ARITH_OR;
2390 goto gen_arith;
5d8a4f8f 2391 OP_32_64(xor):
c896fe29
FB
2392 c = ARITH_XOR;
2393 goto gen_arith;
c896fe29 2394 gen_arith:
42d5b514
RH
2395 if (const_a2) {
2396 tgen_arithi(s, c + rexw, a0, a2, 0);
c896fe29 2397 } else {
42d5b514 2398 tgen_arithr(s, c + rexw, a0, a2);
c896fe29
FB
2399 }
2400 break;
5d8a4f8f 2401
9d2eec20 2402 OP_32_64(andc):
42d5b514
RH
2403 if (const_a2) {
2404 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2405 tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
9d2eec20 2406 } else {
42d5b514 2407 tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
9d2eec20
RH
2408 }
2409 break;
2410
5d8a4f8f 2411 OP_32_64(mul):
42d5b514 2412 if (const_a2) {
c896fe29 2413 int32_t val;
42d5b514 2414 val = a2;
c896fe29 2415 if (val == (int8_t)val) {
42d5b514 2416 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
c896fe29
FB
2417 tcg_out8(s, val);
2418 } else {
42d5b514 2419 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
c896fe29
FB
2420 tcg_out32(s, val);
2421 }
2422 } else {
42d5b514 2423 tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
c896fe29
FB
2424 }
2425 break;
5d8a4f8f
RH
2426
2427 OP_32_64(div2):
2428 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
c896fe29 2429 break;
5d8a4f8f
RH
2430 OP_32_64(divu2):
2431 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
c896fe29 2432 break;
5d8a4f8f
RH
2433
2434 OP_32_64(shl):
6a5aed4b
RH
2435 /* For small constant 3-operand shift, use LEA. */
2436 if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2437 if (a2 - 1 == 0) {
2438 /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2439 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2440 } else {
2441 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2442 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2443 }
2444 break;
2445 }
c896fe29 2446 c = SHIFT_SHL;
6399ab33
RH
2447 vexop = OPC_SHLX;
2448 goto gen_shift_maybe_vex;
5d8a4f8f 2449 OP_32_64(shr):
c896fe29 2450 c = SHIFT_SHR;
6399ab33
RH
2451 vexop = OPC_SHRX;
2452 goto gen_shift_maybe_vex;
5d8a4f8f 2453 OP_32_64(sar):
c896fe29 2454 c = SHIFT_SAR;
6399ab33
RH
2455 vexop = OPC_SARX;
2456 goto gen_shift_maybe_vex;
5d8a4f8f 2457 OP_32_64(rotl):
9619376c 2458 c = SHIFT_ROL;
5d8a4f8f
RH
2459 goto gen_shift;
2460 OP_32_64(rotr):
9619376c 2461 c = SHIFT_ROR;
5d8a4f8f 2462 goto gen_shift;
6399ab33 2463 gen_shift_maybe_vex:
6a5aed4b
RH
2464 if (have_bmi2) {
2465 if (!const_a2) {
2466 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2467 break;
2468 }
2469 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
6399ab33
RH
2470 }
2471 /* FALLTHRU */
5d8a4f8f 2472 gen_shift:
42d5b514
RH
2473 if (const_a2) {
2474 tcg_out_shifti(s, c + rexw, a0, a2);
81570a70 2475 } else {
42d5b514 2476 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
81570a70 2477 }
c896fe29 2478 break;
5d8a4f8f 2479
bbf25f90
RH
2480 OP_32_64(ctz):
2481 tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2482 break;
2483 OP_32_64(clz):
2484 tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2485 break;
993508e4
RH
2486 OP_32_64(ctpop):
2487 tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2488 break;
bbf25f90 2489
c896fe29 2490 case INDEX_op_brcond_i32:
42d5b514 2491 tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
c896fe29 2492 break;
5d8a4f8f 2493 case INDEX_op_setcond_i32:
42d5b514 2494 tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
c896fe29 2495 break;
d0a16297 2496 case INDEX_op_movcond_i32:
42d5b514 2497 tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
d0a16297 2498 break;
c896fe29 2499
5d8a4f8f 2500 OP_32_64(bswap16):
42d5b514 2501 tcg_out_rolw_8(s, a0);
5d40cd63 2502 break;
5d8a4f8f 2503 OP_32_64(bswap32):
42d5b514 2504 tcg_out_bswap32(s, a0);
9619376c
AJ
2505 break;
2506
5d8a4f8f 2507 OP_32_64(neg):
42d5b514 2508 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
9619376c 2509 break;
5d8a4f8f 2510 OP_32_64(not):
42d5b514 2511 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
9619376c
AJ
2512 break;
2513
5d8a4f8f 2514 OP_32_64(ext8s):
42d5b514 2515 tcg_out_ext8s(s, a0, a1, rexw);
9619376c 2516 break;
5d8a4f8f 2517 OP_32_64(ext16s):
42d5b514 2518 tcg_out_ext16s(s, a0, a1, rexw);
9619376c 2519 break;
5d8a4f8f 2520 OP_32_64(ext8u):
42d5b514 2521 tcg_out_ext8u(s, a0, a1);
5f0ce17f 2522 break;
5d8a4f8f 2523 OP_32_64(ext16u):
42d5b514 2524 tcg_out_ext16u(s, a0, a1);
5f0ce17f 2525 break;
9619376c 2526
8221a267
RH
2527 case INDEX_op_qemu_ld_i32:
2528 tcg_out_qemu_ld(s, args, 0);
c896fe29 2529 break;
8221a267
RH
2530 case INDEX_op_qemu_ld_i64:
2531 tcg_out_qemu_ld(s, args, 1);
c896fe29 2532 break;
8221a267
RH
2533 case INDEX_op_qemu_st_i32:
2534 tcg_out_qemu_st(s, args, 0);
c896fe29 2535 break;
8221a267
RH
2536 case INDEX_op_qemu_st_i64:
2537 tcg_out_qemu_st(s, args, 1);
c896fe29
FB
2538 break;
2539
624988a5
RH
2540 OP_32_64(mulu2):
2541 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
5d8a4f8f 2542 break;
624988a5
RH
2543 OP_32_64(muls2):
2544 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2545 break;
2546 OP_32_64(add2):
5d8a4f8f 2547 if (const_args[4]) {
42d5b514 2548 tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
5d8a4f8f 2549 } else {
42d5b514 2550 tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
5d8a4f8f
RH
2551 }
2552 if (const_args[5]) {
42d5b514 2553 tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
5d8a4f8f 2554 } else {
42d5b514 2555 tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
5d8a4f8f
RH
2556 }
2557 break;
624988a5 2558 OP_32_64(sub2):
5d8a4f8f 2559 if (const_args[4]) {
42d5b514 2560 tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
5d8a4f8f 2561 } else {
42d5b514 2562 tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
5d8a4f8f
RH
2563 }
2564 if (const_args[5]) {
42d5b514 2565 tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
5d8a4f8f 2566 } else {
42d5b514 2567 tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
5d8a4f8f
RH
2568 }
2569 break;
bbc863bf
RH
2570
2571#if TCG_TARGET_REG_BITS == 32
2572 case INDEX_op_brcond2_i32:
2573 tcg_out_brcond2(s, args, const_args, 0);
2574 break;
2575 case INDEX_op_setcond2_i32:
2576 tcg_out_setcond2(s, args, const_args);
2577 break;
5d8a4f8f 2578#else /* TCG_TARGET_REG_BITS == 64 */
5d8a4f8f 2579 case INDEX_op_ld32s_i64:
42d5b514 2580 tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
5d8a4f8f
RH
2581 break;
2582 case INDEX_op_ld_i64:
42d5b514 2583 tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
5d8a4f8f
RH
2584 break;
2585 case INDEX_op_st_i64:
5c2d2a9e 2586 if (const_args[0]) {
42d5b514
RH
2587 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2588 tcg_out32(s, a0);
5c2d2a9e 2589 } else {
42d5b514 2590 tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
5c2d2a9e 2591 }
5d8a4f8f 2592 break;
5d8a4f8f
RH
2593
2594 case INDEX_op_brcond_i64:
42d5b514 2595 tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
5d8a4f8f
RH
2596 break;
2597 case INDEX_op_setcond_i64:
42d5b514 2598 tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
5d8a4f8f 2599 break;
d0a16297 2600 case INDEX_op_movcond_i64:
42d5b514 2601 tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
d0a16297 2602 break;
5d8a4f8f
RH
2603
2604 case INDEX_op_bswap64_i64:
42d5b514 2605 tcg_out_bswap64(s, a0);
5d8a4f8f 2606 break;
4f2331e5 2607 case INDEX_op_extu_i32_i64:
5d8a4f8f 2608 case INDEX_op_ext32u_i64:
75478279 2609 case INDEX_op_extrl_i64_i32:
42d5b514 2610 tcg_out_ext32u(s, a0, a1);
5d8a4f8f 2611 break;
4f2331e5 2612 case INDEX_op_ext_i32_i64:
5d8a4f8f 2613 case INDEX_op_ext32s_i64:
42d5b514 2614 tcg_out_ext32s(s, a0, a1);
5d8a4f8f 2615 break;
75478279
RH
2616 case INDEX_op_extrh_i64_i32:
2617 tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2618 break;
5d8a4f8f
RH
2619#endif
2620
a4773324
JK
2621 OP_32_64(deposit):
2622 if (args[3] == 0 && args[4] == 8) {
2623 /* load bits 0..7 */
42d5b514 2624 tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
a4773324
JK
2625 } else if (args[3] == 8 && args[4] == 8) {
2626 /* load bits 8..15 */
42d5b514 2627 tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
a4773324
JK
2628 } else if (args[3] == 0 && args[4] == 16) {
2629 /* load bits 0..15 */
42d5b514 2630 tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
a4773324
JK
2631 } else {
2632 tcg_abort();
2633 }
2634 break;
2635
78fdbfb9 2636 case INDEX_op_extract_i64:
42d5b514 2637 if (a2 + args[3] == 32) {
78fdbfb9 2638 /* This is a 32-bit zero-extending right shift. */
42d5b514
RH
2639 tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2640 tcg_out_shifti(s, SHIFT_SHR, a0, a2);
78fdbfb9
RH
2641 break;
2642 }
2643 /* FALLTHRU */
2644 case INDEX_op_extract_i32:
 2645 /* Use the high-byte registers when we can; otherwise we emit
 2646 the same ext16 + shift pattern that we would have gotten
 2647 from the normal tcg-op.c expansion. */
42d5b514
RH
2648 tcg_debug_assert(a2 == 8 && args[3] == 8);
2649 if (a1 < 4 && a0 < 8) {
2650 tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
78fdbfb9 2651 } else {
42d5b514
RH
2652 tcg_out_ext16u(s, a0, a1);
2653 tcg_out_shifti(s, SHIFT_SHR, a0, 8);
78fdbfb9
RH
2654 }
2655 break;
2656
2657 case INDEX_op_sextract_i32:
2658 /* We don't implement sextract_i64, as we cannot sign-extend to
2659 64-bits without using the REX prefix that explicitly excludes
2660 access to the high-byte registers. */
42d5b514
RH
2661 tcg_debug_assert(a2 == 8 && args[3] == 8);
2662 if (a1 < 4 && a0 < 8) {
2663 tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
78fdbfb9 2664 } else {
42d5b514
RH
2665 tcg_out_ext16s(s, a0, a1, 0);
2666 tcg_out_shifti(s, SHIFT_SAR, a0, 8);
78fdbfb9
RH
2667 }
2668 break;
2669
c6fb8c0c
RH
2670 OP_32_64(extract2):
2671 /* Note that SHRD outputs to the r/m operand. */
2672 tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
2673 tcg_out8(s, args[3]);
2674 break;
2675
a7d00d4e 2676 case INDEX_op_mb:
42d5b514 2677 tcg_out_mb(s, a0);
a7d00d4e 2678 break;
96d0ee7f
RH
2679 case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
2680 case INDEX_op_mov_i64:
2681 case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi. */
2682 case INDEX_op_movi_i64:
2683 case INDEX_op_call: /* Always emitted via tcg_out_call. */
c896fe29
FB
2684 default:
2685 tcg_abort();
2686 }
5d8a4f8f
RH
2687
2688#undef OP_32_64
c896fe29
FB
2689}
2690
770c2fc7
RH
2691static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2692 unsigned vecl, unsigned vece,
2693 const TCGArg *args, const int *const_args)
2694{
2695 static int const add_insn[4] = {
2696 OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2697 };
8ffafbce
RH
2698 static int const ssadd_insn[4] = {
2699 OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
2700 };
2701 static int const usadd_insn[4] = {
3115584d 2702 OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
8ffafbce 2703 };
770c2fc7
RH
2704 static int const sub_insn[4] = {
2705 OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2706 };
8ffafbce
RH
2707 static int const sssub_insn[4] = {
2708 OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
2709 };
2710 static int const ussub_insn[4] = {
3115584d 2711 OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
8ffafbce 2712 };
770c2fc7
RH
2713 static int const mul_insn[4] = {
2714 OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2
2715 };
2716 static int const shift_imm_insn[4] = {
2717 OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2718 };
2719 static int const cmpeq_insn[4] = {
2720 OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2721 };
2722 static int const cmpgt_insn[4] = {
2723 OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2724 };
2725 static int const punpckl_insn[4] = {
2726 OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2727 };
2728 static int const punpckh_insn[4] = {
2729 OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2730 };
2731 static int const packss_insn[4] = {
2732 OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2733 };
2734 static int const packus_insn[4] = {
2735 OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2736 };
bc37faf4
RH
2737 static int const smin_insn[4] = {
2738 OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_UD2
2739 };
2740 static int const smax_insn[4] = {
2741 OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_UD2
2742 };
2743 static int const umin_insn[4] = {
2744 OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_UD2
2745 };
2746 static int const umax_insn[4] = {
2747 OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_UD2
2748 };
a2ce146a
RH
2749 static int const shlv_insn[4] = {
2750 /* TODO: AVX512 adds support for MO_16. */
2751 OPC_UD2, OPC_UD2, OPC_VPSLLVD, OPC_VPSLLVQ
2752 };
2753 static int const shrv_insn[4] = {
2754 /* TODO: AVX512 adds support for MO_16. */
2755 OPC_UD2, OPC_UD2, OPC_VPSRLVD, OPC_VPSRLVQ
2756 };
2757 static int const sarv_insn[4] = {
2758 /* TODO: AVX512 adds support for MO_16, MO_64. */
2759 OPC_UD2, OPC_UD2, OPC_VPSRAVD, OPC_UD2
2760 };
0a8d7a3b
RH
2761 static int const shls_insn[4] = {
2762 OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
2763 };
2764 static int const shrs_insn[4] = {
2765 OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
2766 };
2767 static int const sars_insn[4] = {
2768 OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_UD2
2769 };
18f9b65f
RH
2770 static int const abs_insn[4] = {
2771 /* TODO: AVX512 adds support for MO_64. */
2772 OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_UD2
2773 };
770c2fc7
RH
2774
2775 TCGType type = vecl + TCG_TYPE_V64;
2776 int insn, sub;
2777 TCGArg a0, a1, a2;
2778
2779 a0 = args[0];
2780 a1 = args[1];
2781 a2 = args[2];
2782
2783 switch (opc) {
2784 case INDEX_op_add_vec:
2785 insn = add_insn[vece];
2786 goto gen_simd;
8ffafbce
RH
2787 case INDEX_op_ssadd_vec:
2788 insn = ssadd_insn[vece];
2789 goto gen_simd;
2790 case INDEX_op_usadd_vec:
2791 insn = usadd_insn[vece];
2792 goto gen_simd;
770c2fc7
RH
2793 case INDEX_op_sub_vec:
2794 insn = sub_insn[vece];
2795 goto gen_simd;
8ffafbce
RH
2796 case INDEX_op_sssub_vec:
2797 insn = sssub_insn[vece];
2798 goto gen_simd;
2799 case INDEX_op_ussub_vec:
2800 insn = ussub_insn[vece];
2801 goto gen_simd;
770c2fc7
RH
2802 case INDEX_op_mul_vec:
2803 insn = mul_insn[vece];
2804 goto gen_simd;
2805 case INDEX_op_and_vec:
2806 insn = OPC_PAND;
2807 goto gen_simd;
2808 case INDEX_op_or_vec:
2809 insn = OPC_POR;
2810 goto gen_simd;
2811 case INDEX_op_xor_vec:
2812 insn = OPC_PXOR;
2813 goto gen_simd;
bc37faf4
RH
2814 case INDEX_op_smin_vec:
2815 insn = smin_insn[vece];
2816 goto gen_simd;
2817 case INDEX_op_umin_vec:
2818 insn = umin_insn[vece];
2819 goto gen_simd;
2820 case INDEX_op_smax_vec:
2821 insn = smax_insn[vece];
2822 goto gen_simd;
2823 case INDEX_op_umax_vec:
2824 insn = umax_insn[vece];
2825 goto gen_simd;
a2ce146a
RH
2826 case INDEX_op_shlv_vec:
2827 insn = shlv_insn[vece];
2828 goto gen_simd;
2829 case INDEX_op_shrv_vec:
2830 insn = shrv_insn[vece];
2831 goto gen_simd;
2832 case INDEX_op_sarv_vec:
2833 insn = sarv_insn[vece];
2834 goto gen_simd;
0a8d7a3b
RH
2835 case INDEX_op_shls_vec:
2836 insn = shls_insn[vece];
2837 goto gen_simd;
2838 case INDEX_op_shrs_vec:
2839 insn = shrs_insn[vece];
2840 goto gen_simd;
2841 case INDEX_op_sars_vec:
2842 insn = sars_insn[vece];
2843 goto gen_simd;
770c2fc7
RH
2844 case INDEX_op_x86_punpckl_vec:
2845 insn = punpckl_insn[vece];
2846 goto gen_simd;
2847 case INDEX_op_x86_punpckh_vec:
2848 insn = punpckh_insn[vece];
2849 goto gen_simd;
2850 case INDEX_op_x86_packss_vec:
2851 insn = packss_insn[vece];
2852 goto gen_simd;
2853 case INDEX_op_x86_packus_vec:
2854 insn = packus_insn[vece];
2855 goto gen_simd;
7f34ed4b
RH
2856#if TCG_TARGET_REG_BITS == 32
2857 case INDEX_op_dup2_vec:
2858 /* Constraints have already placed both 32-bit inputs in xmm regs. */
2859 insn = OPC_PUNPCKLDQ;
2860 goto gen_simd;
2861#endif
18f9b65f
RH
2862 case INDEX_op_abs_vec:
2863 insn = abs_insn[vece];
2864 a2 = a1;
2865 a1 = 0;
2866 goto gen_simd;
770c2fc7
RH
2867 gen_simd:
2868 tcg_debug_assert(insn != OPC_UD2);
2869 if (type == TCG_TYPE_V256) {
2870 insn |= P_VEXL;
2871 }
2872 tcg_out_vex_modrm(s, insn, a0, a1, a2);
2873 break;
2874
2875 case INDEX_op_cmp_vec:
2876 sub = args[3];
2877 if (sub == TCG_COND_EQ) {
2878 insn = cmpeq_insn[vece];
2879 } else if (sub == TCG_COND_GT) {
2880 insn = cmpgt_insn[vece];
2881 } else {
2882 g_assert_not_reached();
2883 }
2884 goto gen_simd;
2885
2886 case INDEX_op_andc_vec:
2887 insn = OPC_PANDN;
2888 if (type == TCG_TYPE_V256) {
2889 insn |= P_VEXL;
2890 }
2891 tcg_out_vex_modrm(s, insn, a0, a2, a1);
2892 break;
2893
2894 case INDEX_op_shli_vec:
2895 sub = 6;
2896 goto gen_shift;
2897 case INDEX_op_shri_vec:
2898 sub = 2;
2899 goto gen_shift;
2900 case INDEX_op_sari_vec:
2901 tcg_debug_assert(vece != MO_64);
2902 sub = 4;
2903 gen_shift:
2904 tcg_debug_assert(vece != MO_8);
2905 insn = shift_imm_insn[vece];
2906 if (type == TCG_TYPE_V256) {
2907 insn |= P_VEXL;
2908 }
2909 tcg_out_vex_modrm(s, insn, sub, a0, a1);
2910 tcg_out8(s, a2);
2911 break;
2912
2913 case INDEX_op_ld_vec:
2914 tcg_out_ld(s, type, a0, a1, a2);
2915 break;
2916 case INDEX_op_st_vec:
2917 tcg_out_st(s, type, a0, a1, a2);
2918 break;
37ee55a0
RH
2919 case INDEX_op_dupm_vec:
2920 tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
2921 break;
770c2fc7
RH
2922
2923 case INDEX_op_x86_shufps_vec:
2924 insn = OPC_SHUFPS;
2925 sub = args[3];
2926 goto gen_simd_imm8;
2927 case INDEX_op_x86_blend_vec:
2928 if (vece == MO_16) {
2929 insn = OPC_PBLENDW;
2930 } else if (vece == MO_32) {
2931 insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
2932 } else {
2933 g_assert_not_reached();
2934 }
2935 sub = args[3];
2936 goto gen_simd_imm8;
2937 case INDEX_op_x86_vperm2i128_vec:
2938 insn = OPC_VPERM2I128;
2939 sub = args[3];
2940 goto gen_simd_imm8;
2941 gen_simd_imm8:
2942 if (type == TCG_TYPE_V256) {
2943 insn |= P_VEXL;
2944 }
2945 tcg_out_vex_modrm(s, insn, a0, a1, a2);
2946 tcg_out8(s, sub);
2947 break;
2948
2949 case INDEX_op_x86_vpblendvb_vec:
2950 insn = OPC_VPBLENDVB;
2951 if (type == TCG_TYPE_V256) {
2952 insn |= P_VEXL;
2953 }
2954 tcg_out_vex_modrm(s, insn, a0, a1, a2);
2955 tcg_out8(s, args[3] << 4);
2956 break;
2957
2958 case INDEX_op_x86_psrldq_vec:
2959 tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
2960 tcg_out8(s, a2);
2961 break;
2962
bab1671f
RH
2963 case INDEX_op_mov_vec: /* Always emitted via tcg_out_mov. */
2964 case INDEX_op_dupi_vec: /* Always emitted via tcg_out_movi. */
2965 case INDEX_op_dup_vec: /* Always emitted via tcg_out_dup_vec. */
770c2fc7
RH
2966 default:
2967 g_assert_not_reached();
2968 }
2969}
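/*
 * The OPC_UD2 entries in the tables above are sentinels for
 * element-size/operation combinations that have no SSE/AVX2 encoding;
 * tcg_can_emit_vec_op() below is expected to reject those combinations
 * first, and the tcg_debug_assert(insn != OPC_UD2) in gen_simd guards
 * that invariant.
 */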
2970
cd26449a
RH
2971static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
2972{
5cb4ef80 2973 static const TCGTargetOpDef r = { .args_ct_str = { "r" } };
cd26449a
RH
2974 static const TCGTargetOpDef ri_r = { .args_ct_str = { "ri", "r" } };
2975 static const TCGTargetOpDef re_r = { .args_ct_str = { "re", "r" } };
2976 static const TCGTargetOpDef qi_r = { .args_ct_str = { "qi", "r" } };
2977 static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
2978 static const TCGTargetOpDef r_q = { .args_ct_str = { "r", "q" } };
2979 static const TCGTargetOpDef r_re = { .args_ct_str = { "r", "re" } };
2980 static const TCGTargetOpDef r_0 = { .args_ct_str = { "r", "0" } };
6a5aed4b 2981 static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } };
cd26449a 2982 static const TCGTargetOpDef r_r_re = { .args_ct_str = { "r", "r", "re" } };
c6fb8c0c 2983 static const TCGTargetOpDef r_0_r = { .args_ct_str = { "r", "0", "r" } };
cd26449a 2984 static const TCGTargetOpDef r_0_re = { .args_ct_str = { "r", "0", "re" } };
cd26449a
RH
2985 static const TCGTargetOpDef r_0_ci = { .args_ct_str = { "r", "0", "ci" } };
2986 static const TCGTargetOpDef r_L = { .args_ct_str = { "r", "L" } };
2987 static const TCGTargetOpDef L_L = { .args_ct_str = { "L", "L" } };
2988 static const TCGTargetOpDef r_L_L = { .args_ct_str = { "r", "L", "L" } };
2989 static const TCGTargetOpDef r_r_L = { .args_ct_str = { "r", "r", "L" } };
2990 static const TCGTargetOpDef L_L_L = { .args_ct_str = { "L", "L", "L" } };
2991 static const TCGTargetOpDef r_r_L_L
2992 = { .args_ct_str = { "r", "r", "L", "L" } };
2993 static const TCGTargetOpDef L_L_L_L
2994 = { .args_ct_str = { "L", "L", "L", "L" } };
770c2fc7
RH
2995 static const TCGTargetOpDef x_x = { .args_ct_str = { "x", "x" } };
2996 static const TCGTargetOpDef x_x_x = { .args_ct_str = { "x", "x", "x" } };
2997 static const TCGTargetOpDef x_x_x_x
2998 = { .args_ct_str = { "x", "x", "x", "x" } };
2999 static const TCGTargetOpDef x_r = { .args_ct_str = { "x", "r" } };
cd26449a
RH
3000
3001 switch (op) {
5cb4ef80
EC
3002 case INDEX_op_goto_ptr:
3003 return &r;
3004
cd26449a
RH
3005 case INDEX_op_ld8u_i32:
3006 case INDEX_op_ld8u_i64:
3007 case INDEX_op_ld8s_i32:
3008 case INDEX_op_ld8s_i64:
3009 case INDEX_op_ld16u_i32:
3010 case INDEX_op_ld16u_i64:
3011 case INDEX_op_ld16s_i32:
3012 case INDEX_op_ld16s_i64:
3013 case INDEX_op_ld_i32:
3014 case INDEX_op_ld32u_i64:
3015 case INDEX_op_ld32s_i64:
3016 case INDEX_op_ld_i64:
3017 return &r_r;
a4773324 3018
cd26449a
RH
3019 case INDEX_op_st8_i32:
3020 case INDEX_op_st8_i64:
3021 return &qi_r;
3022 case INDEX_op_st16_i32:
3023 case INDEX_op_st16_i64:
3024 case INDEX_op_st_i32:
3025 case INDEX_op_st32_i64:
3026 return &ri_r;
3027 case INDEX_op_st_i64:
3028 return &re_r;
3029
3030 case INDEX_op_add_i32:
3031 case INDEX_op_add_i64:
3032 return &r_r_re;
3033 case INDEX_op_sub_i32:
3034 case INDEX_op_sub_i64:
3035 case INDEX_op_mul_i32:
3036 case INDEX_op_mul_i64:
3037 case INDEX_op_or_i32:
3038 case INDEX_op_or_i64:
3039 case INDEX_op_xor_i32:
3040 case INDEX_op_xor_i64:
3041 return &r_0_re;
3042
3043 case INDEX_op_and_i32:
3044 case INDEX_op_and_i64:
3045 {
3046 static const TCGTargetOpDef and
3047 = { .args_ct_str = { "r", "0", "reZ" } };
3048 return &and;
3049 }
3050 break;
3051 case INDEX_op_andc_i32:
3052 case INDEX_op_andc_i64:
3053 {
3054 static const TCGTargetOpDef andc
3055 = { .args_ct_str = { "r", "r", "rI" } };
3056 return &andc;
3057 }
3058 break;
bbc863bf 3059
cd26449a
RH
3060 case INDEX_op_shl_i32:
3061 case INDEX_op_shl_i64:
3062 case INDEX_op_shr_i32:
3063 case INDEX_op_shr_i64:
3064 case INDEX_op_sar_i32:
3065 case INDEX_op_sar_i64:
6a5aed4b 3066 return have_bmi2 ? &r_r_ri : &r_0_ci;
cd26449a
RH
3067 case INDEX_op_rotl_i32:
3068 case INDEX_op_rotl_i64:
3069 case INDEX_op_rotr_i32:
3070 case INDEX_op_rotr_i64:
3071 return &r_0_ci;
a7d00d4e 3072
cd26449a
RH
3073 case INDEX_op_brcond_i32:
3074 case INDEX_op_brcond_i64:
3075 return &r_re;
1d2699ae 3076
cd26449a
RH
3077 case INDEX_op_bswap16_i32:
3078 case INDEX_op_bswap16_i64:
3079 case INDEX_op_bswap32_i32:
3080 case INDEX_op_bswap32_i64:
3081 case INDEX_op_bswap64_i64:
3082 case INDEX_op_neg_i32:
3083 case INDEX_op_neg_i64:
3084 case INDEX_op_not_i32:
3085 case INDEX_op_not_i64:
75478279 3086 case INDEX_op_extrh_i64_i32:
cd26449a
RH
3087 return &r_0;
3088
3089 case INDEX_op_ext8s_i32:
3090 case INDEX_op_ext8s_i64:
3091 case INDEX_op_ext8u_i32:
3092 case INDEX_op_ext8u_i64:
3093 return &r_q;
3094 case INDEX_op_ext16s_i32:
3095 case INDEX_op_ext16s_i64:
3096 case INDEX_op_ext16u_i32:
3097 case INDEX_op_ext16u_i64:
3098 case INDEX_op_ext32s_i64:
3099 case INDEX_op_ext32u_i64:
3100 case INDEX_op_ext_i32_i64:
3101 case INDEX_op_extu_i32_i64:
75478279 3102 case INDEX_op_extrl_i64_i32:
cd26449a
RH
3103 case INDEX_op_extract_i32:
3104 case INDEX_op_extract_i64:
3105 case INDEX_op_sextract_i32:
993508e4
RH
3106 case INDEX_op_ctpop_i32:
3107 case INDEX_op_ctpop_i64:
cd26449a 3108 return &r_r;
c6fb8c0c
RH
3109 case INDEX_op_extract2_i32:
3110 case INDEX_op_extract2_i64:
3111 return &r_0_r;
cd26449a
RH
3112
3113 case INDEX_op_deposit_i32:
3114 case INDEX_op_deposit_i64:
3115 {
3116 static const TCGTargetOpDef dep
3117 = { .args_ct_str = { "Q", "0", "Q" } };
3118 return &dep;
3119 }
3120 case INDEX_op_setcond_i32:
3121 case INDEX_op_setcond_i64:
3122 {
3123 static const TCGTargetOpDef setc
3124 = { .args_ct_str = { "q", "r", "re" } };
3125 return &setc;
3126 }
3127 case INDEX_op_movcond_i32:
3128 case INDEX_op_movcond_i64:
3129 {
3130 static const TCGTargetOpDef movc
3131 = { .args_ct_str = { "r", "r", "re", "r", "0" } };
3132 return &movc;
3133 }
3134 case INDEX_op_div2_i32:
3135 case INDEX_op_div2_i64:
3136 case INDEX_op_divu2_i32:
3137 case INDEX_op_divu2_i64:
3138 {
3139 static const TCGTargetOpDef div2
3140 = { .args_ct_str = { "a", "d", "0", "1", "r" } };
3141 return &div2;
3142 }
3143 case INDEX_op_mulu2_i32:
3144 case INDEX_op_mulu2_i64:
3145 case INDEX_op_muls2_i32:
3146 case INDEX_op_muls2_i64:
3147 {
3148 static const TCGTargetOpDef mul2
3149 = { .args_ct_str = { "a", "d", "a", "r" } };
3150 return &mul2;
3151 }
3152 case INDEX_op_add2_i32:
3153 case INDEX_op_add2_i64:
3154 case INDEX_op_sub2_i32:
3155 case INDEX_op_sub2_i64:
3156 {
3157 static const TCGTargetOpDef arith2
3158 = { .args_ct_str = { "r", "r", "0", "1", "re", "re" } };
3159 return &arith2;
3160 }
bbf25f90
RH
3161 case INDEX_op_ctz_i32:
3162 case INDEX_op_ctz_i64:
3163 {
3164 static const TCGTargetOpDef ctz[2] = {
9bf38308 3165 { .args_ct_str = { "&r", "r", "r" } },
bbf25f90
RH
3166 { .args_ct_str = { "&r", "r", "rW" } },
3167 };
3168 return &ctz[have_bmi1];
3169 }
3170 case INDEX_op_clz_i32:
3171 case INDEX_op_clz_i64:
3172 {
3173 static const TCGTargetOpDef clz[2] = {
9bf38308 3174 { .args_ct_str = { "&r", "r", "r" } },
bbf25f90
RH
3175 { .args_ct_str = { "&r", "r", "rW" } },
3176 };
3177 return &clz[have_lzcnt];
3178 }
c896fe29 3179
cd26449a
RH
3180 case INDEX_op_qemu_ld_i32:
3181 return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_L : &r_L_L;
3182 case INDEX_op_qemu_st_i32:
3183 return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L : &L_L_L;
3184 case INDEX_op_qemu_ld_i64:
3185 return (TCG_TARGET_REG_BITS == 64 ? &r_L
3186 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_r_L
3187 : &r_r_L_L);
3188 case INDEX_op_qemu_st_i64:
3189 return (TCG_TARGET_REG_BITS == 64 ? &L_L
3190 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L_L
3191 : &L_L_L_L);
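/*
 * Example of the selection above: a 64-bit guest on a 32-bit host
 * needs two registers for the address and two more for 64-bit data,
 * so qemu_ld_i64 gets the four-operand "r", "r", "L", "L" form in
 * that configuration, while a 64-bit host needs only "r", "L".
 */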
f69d277e 3192
cd26449a
RH
3193 case INDEX_op_brcond2_i32:
3194 {
3195 static const TCGTargetOpDef b2
3196 = { .args_ct_str = { "r", "r", "ri", "ri" } };
3197 return &b2;
3198 }
3199 case INDEX_op_setcond2_i32:
3200 {
3201 static const TCGTargetOpDef s2
3202 = { .args_ct_str = { "r", "r", "r", "ri", "ri" } };
3203 return &s2;
f69d277e 3204 }
cd26449a 3205
770c2fc7
RH
3206 case INDEX_op_ld_vec:
3207 case INDEX_op_st_vec:
37ee55a0 3208 case INDEX_op_dupm_vec:
770c2fc7
RH
3209 return &x_r;
3210
3211 case INDEX_op_add_vec:
3212 case INDEX_op_sub_vec:
3213 case INDEX_op_mul_vec:
3214 case INDEX_op_and_vec:
3215 case INDEX_op_or_vec:
3216 case INDEX_op_xor_vec:
3217 case INDEX_op_andc_vec:
8ffafbce
RH
3218 case INDEX_op_ssadd_vec:
3219 case INDEX_op_usadd_vec:
3220 case INDEX_op_sssub_vec:
3221 case INDEX_op_ussub_vec:
bc37faf4
RH
3222 case INDEX_op_smin_vec:
3223 case INDEX_op_umin_vec:
3224 case INDEX_op_smax_vec:
3225 case INDEX_op_umax_vec:
a2ce146a
RH
3226 case INDEX_op_shlv_vec:
3227 case INDEX_op_shrv_vec:
3228 case INDEX_op_sarv_vec:
0a8d7a3b
RH
3229 case INDEX_op_shls_vec:
3230 case INDEX_op_shrs_vec:
3231 case INDEX_op_sars_vec:
770c2fc7
RH
3232 case INDEX_op_cmp_vec:
3233 case INDEX_op_x86_shufps_vec:
3234 case INDEX_op_x86_blend_vec:
3235 case INDEX_op_x86_packss_vec:
3236 case INDEX_op_x86_packus_vec:
3237 case INDEX_op_x86_vperm2i128_vec:
3238 case INDEX_op_x86_punpckl_vec:
3239 case INDEX_op_x86_punpckh_vec:
7f34ed4b
RH
3240#if TCG_TARGET_REG_BITS == 32
3241 case INDEX_op_dup2_vec:
3242#endif
770c2fc7 3243 return &x_x_x;
18f9b65f 3244 case INDEX_op_abs_vec:
770c2fc7
RH
3245 case INDEX_op_dup_vec:
3246 case INDEX_op_shli_vec:
3247 case INDEX_op_shri_vec:
3248 case INDEX_op_sari_vec:
3249 case INDEX_op_x86_psrldq_vec:
3250 return &x_x;
3251 case INDEX_op_x86_vpblendvb_vec:
3252 return &x_x_x_x;
3253
cd26449a
RH
3254 default:
3255 break;
f69d277e
RH
3256 }
3257 return NULL;
3258}
3259
770c2fc7
RH
3260int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3261{
3262 switch (opc) {
3263 case INDEX_op_add_vec:
3264 case INDEX_op_sub_vec:
3265 case INDEX_op_and_vec:
3266 case INDEX_op_or_vec:
3267 case INDEX_op_xor_vec:
3268 case INDEX_op_andc_vec:
3269 return 1;
3270 case INDEX_op_cmp_vec:
904c5e19 3271 case INDEX_op_cmpsel_vec:
770c2fc7
RH
3272 return -1;
3273
3274 case INDEX_op_shli_vec:
3275 case INDEX_op_shri_vec:
3276 /* We must expand the operation for MO_8. */
3277 return vece == MO_8 ? -1 : 1;
3278
3279 case INDEX_op_sari_vec:
3280 /* We must expand the operation for MO_8. */
3281 if (vece == MO_8) {
3282 return -1;
3283 }
3284 /* We can emulate this for MO_64, but it does not pay off
3285 unless we're producing at least 4 values. */
3286 if (vece == MO_64) {
3287 return type >= TCG_TYPE_V256 ? -1 : 0;
3288 }
3289 return 1;
3290
0a8d7a3b
RH
3291 case INDEX_op_shls_vec:
3292 case INDEX_op_shrs_vec:
3293 return vece >= MO_16;
3294 case INDEX_op_sars_vec:
3295 return vece >= MO_16 && vece <= MO_32;
3296
a2ce146a
RH
3297 case INDEX_op_shlv_vec:
3298 case INDEX_op_shrv_vec:
3299 return have_avx2 && vece >= MO_32;
3300 case INDEX_op_sarv_vec:
3301 return have_avx2 && vece == MO_32;
3302
770c2fc7
RH
3303 case INDEX_op_mul_vec:
3304 if (vece == MO_8) {
3305 /* We can expand the operation for MO_8. */
3306 return -1;
3307 }
3308 if (vece == MO_64) {
3309 return 0;
3310 }
3311 return 1;
3312
8ffafbce
RH
3313 case INDEX_op_ssadd_vec:
3314 case INDEX_op_usadd_vec:
3315 case INDEX_op_sssub_vec:
3316 case INDEX_op_ussub_vec:
3317 return vece <= MO_16;
bc37faf4
RH
3318 case INDEX_op_smin_vec:
3319 case INDEX_op_smax_vec:
3320 case INDEX_op_umin_vec:
3321 case INDEX_op_umax_vec:
18f9b65f
RH
3322 case INDEX_op_abs_vec:
3323 return vece <= MO_32;
8ffafbce 3324
770c2fc7
RH
3325 default:
3326 return 0;
3327 }
3328}
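/*
 * [Editor's sketch, not part of the upstream file] The return values
 * above follow the TCG contract: 1 means the backend emits the opcode
 * directly, 0 means it is unsupported for this type/element size, and
 * -1 means the middle-end must call tcg_expand_vec_op() so the backend
 * can expand it from other opcodes. A minimal caller honoring that
 * contract might look like this (emit_or_expand is a hypothetical name;
 * the real dispatch lives in the TCG middle-end):
 */
static void emit_or_expand(TCGOpcode opc, TCGType type, unsigned vece,
                           TCGArg a0, TCGArg a1, TCGArg a2)
{
    switch (tcg_can_emit_vec_op(opc, type, vece)) {
    case 1:   /* supported natively: emit as-is */
        vec_gen_3(opc, type, vece, a0, a1, a2);
        break;
    case -1:  /* supported via expansion */
        tcg_expand_vec_op(opc, type, vece, a0, a1, a2);
        break;
    default:  /* unsupported: the caller must not request it */
        g_assert_not_reached();
    }
}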
3329
44f1441d
RH
3330static void expand_vec_shi(TCGType type, unsigned vece, bool shr,
3331 TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3332{
3333 TCGv_vec t1, t2;
3334
3335 tcg_debug_assert(vece == MO_8);
3336
3337 t1 = tcg_temp_new_vec(type);
3338 t2 = tcg_temp_new_vec(type);
3339
3340 /* Unpack to W, shift, and repack. Tricky bits:
3341 (1) Use punpck*bw x,x to produce DDCCBBAA,
3342 i.e. duplicate in other half of the 16-bit lane.
3343 (2) For right-shift, add 8 so that the high half of
3344 the lane becomes zero. For left-shift, we must
3345 shift up and down again.
3346 (3) Step 2 leaves high half zero such that PACKUSWB
3347 (pack with unsigned saturation) does not modify
3348 the quantity. */
3349 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3350 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3351 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3352 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3353
3354 if (shr) {
3355 tcg_gen_shri_vec(MO_16, t1, t1, imm + 8);
3356 tcg_gen_shri_vec(MO_16, t2, t2, imm + 8);
3357 } else {
3358 tcg_gen_shli_vec(MO_16, t1, t1, imm + 8);
3359 tcg_gen_shli_vec(MO_16, t2, t2, imm + 8);
3360 tcg_gen_shri_vec(MO_16, t1, t1, 8);
3361 tcg_gen_shri_vec(MO_16, t2, t2, 8);
3362 }
770c2fc7 3363
44f1441d
RH
3364 vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3365 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3366 tcg_temp_free_vec(t1);
3367 tcg_temp_free_vec(t2);
3368}
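/*
 * [Editor's sketch, not part of the upstream file] A scalar model of
 * one byte lane of the expansion above, self-contained and independent
 * of TCG. punpck*bw x,x widens byte A to the 16-bit value A:A; the
 * imm+8 shift then leaves the result in the low byte with a zero high
 * byte, so the final PACKUSWB (unsigned saturating pack) returns it
 * unchanged.
 */
#include <stdint.h>

static uint8_t model_byte_shr(uint8_t a, unsigned imm)   /* imm in [0,7] */
{
    uint16_t lane = (uint16_t)a << 8 | a;        /* punpck*bw a,a */
    uint16_t w = lane >> (imm + 8);              /* high byte now zero */
    return w <= 0xff ? (uint8_t)w : 0xff;        /* packuswb, a no-op here */
}

static uint8_t model_byte_shl(uint8_t a, unsigned imm)   /* imm in [0,7] */
{
    uint16_t lane = (uint16_t)a << 8 | a;
    uint16_t w = (uint16_t)(lane << (imm + 8)) >> 8;  /* shift up, then down */
    return w <= 0xff ? (uint8_t)w : 0xff;
}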
770c2fc7 3369
44f1441d
RH
3370static void expand_vec_sari(TCGType type, unsigned vece,
3371 TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3372{
3373 TCGv_vec t1, t2;
3374
3375 switch (vece) {
3376 case MO_8:
3377 /* Unpack to W, shift, and repack, as in expand_vec_shi. */
770c2fc7
RH
3378 t1 = tcg_temp_new_vec(type);
3379 t2 = tcg_temp_new_vec(type);
3380 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
44f1441d 3381 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
770c2fc7 3382 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
44f1441d
RH
3383 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3384 tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3385 tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3386 vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3387 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
770c2fc7
RH
3388 tcg_temp_free_vec(t1);
3389 tcg_temp_free_vec(t2);
3390 break;
3391
44f1441d
RH
3392 case MO_64:
3393 if (imm <= 32) {
312b426f
RH
3394 /*
3395 * We can emulate a small sign extend by performing an arithmetic
44f1441d 3396 * 32-bit shift and overwriting the high half of a 64-bit logical
312b426f
RH
3397 * shift. Note that the ISA says shift of 32 is valid, but TCG
3398 * does not, so we have to bound the smaller shift -- we get the
3399 * same result in the high half either way.
44f1441d 3400 */
770c2fc7 3401 t1 = tcg_temp_new_vec(type);
312b426f 3402 tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
44f1441d 3403 tcg_gen_shri_vec(MO_64, v0, v1, imm);
770c2fc7 3404 vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
44f1441d
RH
3405 tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3406 tcgv_vec_arg(t1), 0xaa);
3407 tcg_temp_free_vec(t1);
3408 } else {
3409 /* Otherwise we will need to use a compare vs 0 to produce
3410 * the sign-extend, shift and merge.
3411 */
3412 t1 = tcg_const_zeros_vec(type);
3413 tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, t1, v1);
3414 tcg_gen_shri_vec(MO_64, v0, v1, imm);
3415 tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3416 tcg_gen_or_vec(MO_64, v0, v0, t1);
770c2fc7 3417 tcg_temp_free_vec(t1);
770c2fc7 3418 }
770c2fc7
RH
3419 break;
3420
44f1441d
RH
3421 default:
3422 g_assert_not_reached();
3423 }
3424}
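/*
 * [Editor's sketch, not part of the upstream file] A scalar model of
 * the MO_64 path above for imm <= 32, independent of TCG. The 32-bit
 * arithmetic shift of the high half supplies the sign bits, the 64-bit
 * logical shift supplies everything else, and the 0xaa blend keeps the
 * odd (high) 32-bit elements of the first and the even (low) elements
 * of the second. Assumes >> on signed values is arithmetic, as on GCC
 * and Clang.
 */
#include <stdint.h>

static int64_t model_sar64_small(int64_t v, unsigned imm)  /* imm in [1,32] */
{
    uint64_t lo = (uint64_t)v >> imm;              /* shri_vec at MO_64 */
    int32_t hi = (int32_t)((uint64_t)v >> 32);     /* high half of v */
    unsigned s = imm < 31 ? imm : 31;              /* MIN(imm, 31) */
    uint64_t merged = (lo & 0xffffffffull)         /* blend with mask 0xaa */
                    | ((uint64_t)(uint32_t)(hi >> s) << 32);
    return (int64_t)merged;
}
/* For imm == 32 both shift amounts produce the same high half, which is
   why bounding the smaller shift to 31 is harmless, exactly as the
   comment above notes. */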
770c2fc7 3425
44f1441d
RH
3426static void expand_vec_mul(TCGType type, unsigned vece,
3427 TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3428{
3429 TCGv_vec t1, t2, t3, t4;
770c2fc7 3430
44f1441d 3431 tcg_debug_assert(vece == MO_8);
770c2fc7 3432
44f1441d
RH
3433 /*
3434 * Unpack v1 bytes to words, 0 | x.
3435 * Unpack v2 bytes to words, y | 0.
3436 * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3437 * Shift logical right by 8 bits to clear the high 8 bits before
3438 * using an unsigned saturated pack.
3439 *
3440 * The difference between the V64, V128 and V256 cases is merely how
3441 * we distribute the expansion between temporaries.
3442 */
3443 switch (type) {
3444 case TCG_TYPE_V64:
3445 t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3446 t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3447 tcg_gen_dup16i_vec(t2, 0);
3448 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3449 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t2));
3450 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3451 tcgv_vec_arg(t2), tcgv_vec_arg(t2), tcgv_vec_arg(v2));
3452 tcg_gen_mul_vec(MO_16, t1, t1, t2);
3453 tcg_gen_shri_vec(MO_16, t1, t1, 8);
3454 vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3455 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3456 tcg_temp_free_vec(t1);
3457 tcg_temp_free_vec(t2);
770c2fc7
RH
3458 break;
3459
44f1441d
RH
3460 case TCG_TYPE_V128:
3461 case TCG_TYPE_V256:
3462 t1 = tcg_temp_new_vec(type);
3463 t2 = tcg_temp_new_vec(type);
3464 t3 = tcg_temp_new_vec(type);
3465 t4 = tcg_temp_new_vec(type);
3466 tcg_gen_dup16i_vec(t4, 0);
3467 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3468 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(t4));
3469 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3470 tcgv_vec_arg(t2), tcgv_vec_arg(t4), tcgv_vec_arg(v2));
3471 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3472 tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(t4));
3473 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3474 tcgv_vec_arg(t4), tcgv_vec_arg(t4), tcgv_vec_arg(v2));
3475 tcg_gen_mul_vec(MO_16, t1, t1, t2);
3476 tcg_gen_mul_vec(MO_16, t3, t3, t4);
3477 tcg_gen_shri_vec(MO_16, t1, t1, 8);
3478 tcg_gen_shri_vec(MO_16, t3, t3, 8);
3479 vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3480 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3481 tcg_temp_free_vec(t1);
3482 tcg_temp_free_vec(t2);
3483 tcg_temp_free_vec(t3);
3484 tcg_temp_free_vec(t4);
3485 break;
770c2fc7 3486
44f1441d
RH
3487 default:
3488 g_assert_not_reached();
3489 }
3490}
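/*
 * [Editor's sketch, not part of the upstream file] A scalar model of
 * one byte lane of the multiply above, independent of TCG. Widening x
 * as 0:x and y as y:0 makes the 16-bit product (x*y & 0xff) << 8; the
 * logical shift then moves it to the low byte with a zero high byte,
 * which the unsigned saturating pack passes through unchanged.
 */
#include <stdint.h>

static uint8_t model_byte_mul(uint8_t x, uint8_t y)
{
    uint16_t wx = x;                       /* punpck*bw v1, zeros: 0:x */
    uint16_t wy = (uint16_t)y << 8;        /* punpck*bw zeros, v2: y:0 */
    uint16_t prod = (uint16_t)(wx * wy);   /* == (uint8_t)(x * y) << 8 */
    uint16_t w = prod >> 8;                /* clear the high 8 bits */
    return w <= 0xff ? (uint8_t)w : 0xff;  /* packuswb, a no-op here */
}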
770c2fc7 3491
904c5e19
RH
3492static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
3493 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
44f1441d
RH
3494{
3495 enum {
ebcfb91a
RH
3496 NEED_INV = 1,
3497 NEED_SWAP = 2,
3498 NEED_BIAS = 4,
3499 NEED_UMIN = 8,
3500 NEED_UMAX = 16,
44f1441d
RH
3501 };
3502 TCGv_vec t1, t2;
3503 uint8_t fixup;
770c2fc7 3504
ebcfb91a
RH
3505 switch (cond) {
3506 case TCG_COND_EQ:
3507 case TCG_COND_GT:
3508 fixup = 0;
3509 break;
3510 case TCG_COND_NE:
3511 case TCG_COND_LE:
3512 fixup = NEED_INV;
3513 break;
3514 case TCG_COND_LT:
3515 fixup = NEED_SWAP;
3516 break;
3517 case TCG_COND_GE:
3518 fixup = NEED_SWAP | NEED_INV;
3519 break;
3520 case TCG_COND_LEU:
3521 if (vece <= MO_32) {
3522 fixup = NEED_UMIN;
3523 } else {
3524 fixup = NEED_BIAS | NEED_INV;
3525 }
3526 break;
3527 case TCG_COND_GTU:
3528 if (vece <= MO_32) {
3529 fixup = NEED_UMIN | NEED_INV;
3530 } else {
3531 fixup = NEED_BIAS;
3532 }
3533 break;
3534 case TCG_COND_GEU:
3535 if (vece <= MO_32) {
3536 fixup = NEED_UMAX;
3537 } else {
3538 fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
3539 }
3540 break;
3541 case TCG_COND_LTU:
3542 if (vece <= MO_32) {
3543 fixup = NEED_UMAX | NEED_INV;
3544 } else {
3545 fixup = NEED_BIAS | NEED_SWAP;
3546 }
3547 break;
3548 default:
3549 g_assert_not_reached();
3550 }
770c2fc7 3551
44f1441d
RH
3552 if (fixup & NEED_INV) {
3553 cond = tcg_invert_cond(cond);
3554 }
3555 if (fixup & NEED_SWAP) {
3556 t1 = v1, v1 = v2, v2 = t1;
3557 cond = tcg_swap_cond(cond);
3558 }
3559
3560 t1 = t2 = NULL;
ebcfb91a
RH
3561 if (fixup & (NEED_UMIN | NEED_UMAX)) {
3562 t1 = tcg_temp_new_vec(type);
3563 if (fixup & NEED_UMIN) {
3564 tcg_gen_umin_vec(vece, t1, v1, v2);
3565 } else {
3566 tcg_gen_umax_vec(vece, t1, v1, v2);
3567 }
3568 v2 = t1;
3569 cond = TCG_COND_EQ;
3570 } else if (fixup & NEED_BIAS) {
44f1441d
RH
3571 t1 = tcg_temp_new_vec(type);
3572 t2 = tcg_temp_new_vec(type);
3573 tcg_gen_dupi_vec(vece, t2, 1ull << ((8 << vece) - 1));
3574 tcg_gen_sub_vec(vece, t1, v1, t2);
3575 tcg_gen_sub_vec(vece, t2, v2, t2);
3576 v1 = t1;
3577 v2 = t2;
3578 cond = tcg_signed_cond(cond);
3579 }
770c2fc7 3580
44f1441d
RH
3581 tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
3582 /* Expand directly; do not recurse. */
3583 vec_gen_4(INDEX_op_cmp_vec, type, vece,
3584 tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
770c2fc7 3585
44f1441d
RH
3586 if (t1) {
3587 tcg_temp_free_vec(t1);
3588 if (t2) {
3589 tcg_temp_free_vec(t2);
770c2fc7 3590 }
44f1441d 3591 }
904c5e19
RH
3592 return fixup & NEED_INV;
3593}
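/*
 * [Editor's sketch, not part of the upstream file] Scalar models of the
 * two rewrites used above for unsigned conditions, which x86 vector
 * compares lack. NEED_UMIN: a <= b iff a == umin(a, b). NEED_BIAS:
 * subtracting the sign bit from both operands maps unsigned order onto
 * signed order, so LTU(a, b) == LT(a - bias, b - bias).
 */
#include <stdint.h>

static int model_leu_via_umin(uint32_t a, uint32_t b)
{
    uint32_t m = a < b ? a : b;        /* tcg_gen_umin_vec */
    return a == m;                     /* then compare with TCG_COND_EQ */
}

static int model_ltu_via_bias(uint64_t a, uint64_t b)
{
    const uint64_t bias = 1ull << 63;  /* 1ull << ((8 << MO_64) - 1) */
    return (int64_t)(a - bias) < (int64_t)(b - bias);
}
/* e.g. model_ltu_via_bias(1, UINT64_MAX) is true, although a raw signed
   compare of the same bit patterns (1 vs -1) would say the opposite. */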
3594
3595static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
3596 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3597{
3598 if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
44f1441d
RH
3599 tcg_gen_not_vec(vece, v0, v0);
3600 }
3601}
3602
904c5e19
RH
3603static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
3604 TCGv_vec c1, TCGv_vec c2,
3605 TCGv_vec v3, TCGv_vec v4, TCGCond cond)
3606{
3607 TCGv_vec t = tcg_temp_new_vec(type);
3608
3609 if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
3610 /* Invert the sense of the compare by swapping arguments. */
3611 TCGv_vec x;
3612 x = v3, v3 = v4, v4 = x;
3613 }
3614 vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
3615 tcgv_vec_arg(v0), tcgv_vec_arg(v4),
3616 tcgv_vec_arg(v3), tcgv_vec_arg(t));
3617 tcg_temp_free_vec(t);
3618}
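/*
 * [Editor's sketch, not part of the upstream file] The vpblendvb above
 * implements cmpsel as a byte select: the compare leaves each lane as
 * all-ones or all-zeros, and the blend takes its second source where
 * the mask byte's top bit is set. A scalar model of one lane:
 */
#include <stdint.h>

static uint64_t model_cmpsel_eq(uint64_t c1, uint64_t c2,
                                uint64_t v3, uint64_t v4)
{
    uint64_t mask = (c1 == c2) ? ~0ull : 0;  /* cmp lane, TCG_COND_EQ */
    return (v3 & mask) | (v4 & ~mask);       /* vpblendvb v0, v4, v3, mask */
}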
3619
44f1441d
RH
3620void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
3621 TCGArg a0, ...)
3622{
3623 va_list va;
3624 TCGArg a2;
904c5e19 3625 TCGv_vec v0, v1, v2, v3, v4;
44f1441d
RH
3626
3627 va_start(va, a0);
3628 v0 = temp_tcgv_vec(arg_temp(a0));
3629 v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3630 a2 = va_arg(va, TCGArg);
3631
3632 switch (opc) {
3633 case INDEX_op_shli_vec:
3634 case INDEX_op_shri_vec:
3635 expand_vec_shi(type, vece, opc == INDEX_op_shri_vec, v0, v1, a2);
3636 break;
3637
3638 case INDEX_op_sari_vec:
3639 expand_vec_sari(type, vece, v0, v1, a2);
3640 break;
3641
3642 case INDEX_op_mul_vec:
3643 v2 = temp_tcgv_vec(arg_temp(a2));
3644 expand_vec_mul(type, vece, v0, v1, v2);
3645 break;
3646
3647 case INDEX_op_cmp_vec:
3648 v2 = temp_tcgv_vec(arg_temp(a2));
3649 expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
770c2fc7 3650 break;
bc37faf4 3651
904c5e19
RH
3652 case INDEX_op_cmpsel_vec:
3653 v2 = temp_tcgv_vec(arg_temp(a2));
3654 v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3655 v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3656 expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
3657 break;
770c2fc7
RH
3658
3659 default:
3660 break;
3661 }
3662
3663 va_end(va);
3664}
3665
e268f4c0 3666static const int tcg_target_callee_save_regs[] = {
5d8a4f8f
RH
3667#if TCG_TARGET_REG_BITS == 64
3668 TCG_REG_RBP,
3669 TCG_REG_RBX,
8d918718
SW
3670#if defined(_WIN64)
3671 TCG_REG_RDI,
3672 TCG_REG_RSI,
3673#endif
5d8a4f8f
RH
3674 TCG_REG_R12,
3675 TCG_REG_R13,
cea5f9a2 3676 TCG_REG_R14, /* Currently used for the global env. */
5d8a4f8f
RH
3677 TCG_REG_R15,
3678#else
cea5f9a2 3679 TCG_REG_EBP, /* Currently used for the global env. */
b03cce8e
FB
3680 TCG_REG_EBX,
3681 TCG_REG_ESI,
3682 TCG_REG_EDI,
5d8a4f8f 3683#endif
b03cce8e
FB
3684};
3685
813da627
RH
3686/* Compute frame size via macros, to share between tcg_target_qemu_prologue
3687 and tcg_register_jit. */
3688
3689#define PUSH_SIZE \
3690 ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
3691 * (TCG_TARGET_REG_BITS / 8))
3692
3693#define FRAME_SIZE \
3694 ((PUSH_SIZE \
3695 + TCG_STATIC_CALL_ARGS_SIZE \
3696 + CPU_TEMP_BUF_NLONGS * sizeof(long) \
3697 + TCG_TARGET_STACK_ALIGN - 1) \
3698 & ~(TCG_TARGET_STACK_ALIGN - 1))
3699
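/*
 * [Editor's sketch, not part of the upstream file] FRAME_SIZE is the
 * usual round-up-to-alignment idiom, (n + align - 1) & ~(align - 1),
 * applied to the pushed registers plus the static call-argument and
 * temp-buffer areas:
 */
static long round_up(long n, long align)   /* align is a power of two */
{
    return (n + align - 1) & ~(align - 1);
}
/* e.g. round_up(0xea, 16) == 0xf0, while round_up(0xe0, 16) == 0xe0. */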
b03cce8e 3700/* Generate global QEMU prologue and epilogue code */
e4d58b41 3701static void tcg_target_qemu_prologue(TCGContext *s)
b03cce8e 3702{
813da627 3703 int i, stack_addend;
78686523 3704
b03cce8e 3705 /* TB prologue */
5d8a4f8f 3706
ac0275dc 3707 /* Reserve some stack space, also for TCG temps. */
813da627 3708 stack_addend = FRAME_SIZE - PUSH_SIZE;
ac0275dc
BS
3709 tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
3710 CPU_TEMP_BUF_NLONGS * sizeof(long));
3711
3712 /* Save all callee saved registers. */
3713 for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
3714 tcg_out_push(s, tcg_target_callee_save_regs[i]);
3715 }
3716
6a18ae2d
BS
3717#if TCG_TARGET_REG_BITS == 32
3718 tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
3719 (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
b18212c6
SW
3720 tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3721 /* jmp *tb. */
3722 tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
7d37435b
PB
3723 (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
3724 + stack_addend);
6a18ae2d 3725#else
913c2bdd
RH
3726# if !defined(CONFIG_SOFTMMU) && TCG_TARGET_REG_BITS == 64
3727 if (guest_base) {
3728 int seg = setup_guest_base_seg();
3729 if (seg != 0) {
3730 x86_guest_base_seg = seg;
3731 } else if (guest_base == (int32_t)guest_base) {
3732 x86_guest_base_offset = guest_base;
3733 } else {
3734 /* Choose R12 because, as a base, it requires a SIB byte. */
3735 x86_guest_base_index = TCG_REG_R12;
3736 tcg_out_mov(s, TCG_TYPE_PTR, x86_guest_base_index, guest_base);
3737 tcg_regset_set_reg(s->reserved_regs, x86_guest_base_index);
3738 }
3739 }
3740# endif
cea5f9a2 3741 tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
6a18ae2d 3742 tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
5d8a4f8f 3743 /* jmp *tb. */
cea5f9a2 3744 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
b18212c6 3745#endif
78686523 3746
5cb4ef80
EC
3747 /*
3748 * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
3749 * and fall through to the rest of the epilogue.
3750 */
3751 s->code_gen_epilogue = s->code_ptr;
3752 tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
3753
b03cce8e
FB
3754 /* TB epilogue */
3755 tb_ret_addr = s->code_ptr;
5d8a4f8f 3756
e83c80f7 3757 tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
5d8a4f8f 3758
770c2fc7
RH
3759 if (have_avx2) {
3760 tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
3761 }
5d8a4f8f 3762 for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
b03cce8e
FB
3763 tcg_out_pop(s, tcg_target_callee_save_regs[i]);
3764 }
5d8a4f8f 3765 tcg_out_opc(s, OPC_RET, 0, 0, 0);
b03cce8e
FB
3766}
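/*
 * [Editor's note, with a sketch that is not part of the upstream file]
 * The magic offsets in the 32-bit path above follow from the cdecl
 * frame at prologue entry: after N = ARRAY_SIZE(tcg_target_callee_save_regs)
 * pushes, the return address sits at N*4 from %esp, the first stack
 * argument (env) at (N+1)*4, and the second (tb) at (N+2)*4; once the
 * frame has been lowered by stack_addend, that amount must be added
 * back. A model of the address arithmetic (hypothetical helper; pass
 * lowered = 0 before the stack adjustment):
 */
static int stack_arg_offset(int n_pushed, int arg_index, int lowered)
{
    return (n_pushed + 1 + arg_index) * 4 + lowered;
}
/* env load: stack_arg_offset(N, 0, 0); tb jump: stack_arg_offset(N, 1,
   stack_addend) -- matching the (N + 1) * 4 and (N + 2) * 4 + stack_addend
   expressions above. */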
3767
4e45f239
RH
3768static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
3769{
3770 memset(p, 0x90, count);
3771}
3772
e4d58b41 3773static void tcg_target_init(TCGContext *s)
c896fe29 3774{
774d566c 3775#ifdef CONFIG_CPUID_H
770c2fc7 3776 unsigned a, b, c, d, b7 = 0;
9d2eec20 3777 int max = __get_cpuid_max(0, 0);
085bb5bb 3778
770c2fc7
RH
3779 if (max >= 7) {
3780 /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs. */
3781 __cpuid_count(7, 0, a, b7, c, d);
3782 have_bmi1 = (b7 & bit_BMI) != 0;
3783 have_bmi2 = (b7 & bit_BMI2) != 0;
3784 }
3785
9d2eec20
RH
3786 if (max >= 1) {
3787 __cpuid(1, a, b, c, d);
3788#ifndef have_cmov
085bb5bb
AJ
3789 /* For 32-bit, 99% certainty that we're running on hardware that
3790 supports cmov, but we still need to check. In case cmov is not
3791 available, we'll use a small forward branch. */
9d2eec20
RH
3792 have_cmov = (d & bit_CMOV) != 0;
3793#endif
770c2fc7 3794
085bb5bb
AJ
3795 /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
3796 need to probe for it. */
9d2eec20 3797 have_movbe = (c & bit_MOVBE) != 0;
993508e4 3798 have_popcnt = (c & bit_POPCNT) != 0;
9d2eec20 3799
770c2fc7
RH
3800 /* There are a number of things we must check before we can be
3801 sure of not hitting an invalid opcode. */
3802 if (c & bit_OSXSAVE) {
3803 unsigned xcrl, xcrh;
1019242a
JA
3804 /* The xgetbv instruction is not available to older versions of
3805 * the assembler, so we encode the instruction manually.
3806 */
3807 asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
770c2fc7
RH
3808 if ((xcrl & 6) == 6) {
3809 have_avx1 = (c & bit_AVX) != 0;
3810 have_avx2 = (b7 & bit_AVX2) != 0;
3811 }
3812 }
9d2eec20 3813 }
76a347e1 3814
bbf25f90
RH
3815 max = __get_cpuid_max(0x80000000, 0);
3816 if (max >= 1) {
3817 __cpuid(0x80000001, a, b, c, d);
3818 /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs. */
3819 have_lzcnt = (c & bit_LZCNT) != 0;
3820 }
5dd89908 3821#endif /* CONFIG_CPUID_H */
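/*
 * [Editor's sketch, not part of the upstream file] The "(xcrl & 6) == 6"
 * test above reads XCR0 and requires bit 1 (XMM state) and bit 2 (YMM
 * state) to be OS-enabled; without both, AVX instructions fault even
 * when CPUID advertises them, which is why the OSXSAVE check gates the
 * whole block. The same test with named bits (only valid to execute
 * after CPUID has reported OSXSAVE):
 */
#include <stdint.h>

#define XCR0_XMM  (1u << 1)   /* SSE state enabled in XCR0 */
#define XCR0_YMM  (1u << 2)   /* AVX state enabled in XCR0 */

static int os_enabled_avx_state(void)
{
    uint32_t lo, hi;
    /* xgetbv with %ecx = 0 reads XCR0; byte-encoded for old assemblers */
    asm(".byte 0x0f, 0x01, 0xd0" : "=a"(lo), "=d"(hi) : "c"(0));
    return (lo & (XCR0_XMM | XCR0_YMM)) == (XCR0_XMM | XCR0_YMM);
}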
bbf25f90 3822
770c2fc7 3823 tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
5d8a4f8f 3824 if (TCG_TARGET_REG_BITS == 64) {
770c2fc7
RH
3825 tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
3826 }
3827 if (have_avx1) {
3828 tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
3829 tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
3830 }
3831 if (have_avx2) {
3832 tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
5d8a4f8f 3833 }
4ab50ccf 3834
672189cd 3835 tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
4ab50ccf
RH
3836 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
3837 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
3838 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
5d8a4f8f 3839 if (TCG_TARGET_REG_BITS == 64) {
8d918718 3840#if !defined(_WIN64)
5d8a4f8f
RH
3841 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
3842 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
8d918718 3843#endif
5d8a4f8f
RH
3844 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
3845 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
3846 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
3847 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
3848 }
4ab50ccf 3849
ccb1bb66 3850 s->reserved_regs = 0;
e83c80f7 3851 tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
c896fe29 3852}
813da627 3853
813da627 3854typedef struct {
e9a9a5b6 3855 DebugFrameHeader h;
497a22eb
RH
3856 uint8_t fde_def_cfa[4];
3857 uint8_t fde_reg_ofs[14];
813da627
RH
3858} DebugFrame;
3859
b5cc476d
RH
3860/* We're expecting a 2-byte uleb128 encoded value. */
3861QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
3862
c170cb66
SW
3863#if !defined(__ELF__)
3864 /* Host machine without ELF. */
3865#elif TCG_TARGET_REG_BITS == 64
813da627 3866#define ELF_HOST_MACHINE EM_X86_64
e9a9a5b6
RH
3867static const DebugFrame debug_frame = {
3868 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3869 .h.cie.id = -1,
3870 .h.cie.version = 1,
3871 .h.cie.code_align = 1,
3872 .h.cie.data_align = 0x78, /* sleb128 -8 */
3873 .h.cie.return_column = 16,
813da627 3874
497a22eb 3875 /* Total FDE size does not include the "len" member. */
e9a9a5b6 3876 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
497a22eb
RH
3877
3878 .fde_def_cfa = {
813da627
RH
3879 12, 7, /* DW_CFA_def_cfa %rsp, ... */
3880 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */
3881 (FRAME_SIZE >> 7)
3882 },
497a22eb 3883 .fde_reg_ofs = {
813da627
RH
3884 0x90, 1, /* DW_CFA_offset, %rip, -8 */
3885 /* The following ordering must match tcg_target_callee_save_regs. */
3886 0x86, 2, /* DW_CFA_offset, %rbp, -16 */
3887 0x83, 3, /* DW_CFA_offset, %rbx, -24 */
3888 0x8c, 4, /* DW_CFA_offset, %r12, -32 */
3889 0x8d, 5, /* DW_CFA_offset, %r13, -40 */
3890 0x8e, 6, /* DW_CFA_offset, %r14, -48 */
3891 0x8f, 7, /* DW_CFA_offset, %r15, -56 */
3892 }
3893};
3894#else
3895#define ELF_HOST_MACHINE EM_386
e9a9a5b6
RH
3896static const DebugFrame debug_frame = {
3897 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3898 .h.cie.id = -1,
3899 .h.cie.version = 1,
3900 .h.cie.code_align = 1,
3901 .h.cie.data_align = 0x7c, /* sleb128 -4 */
3902 .h.cie.return_column = 8,
813da627 3903
497a22eb 3904 /* Total FDE size does not include the "len" member. */
e9a9a5b6 3905 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
497a22eb
RH
3906
3907 .fde_def_cfa = {
813da627
RH
3908 12, 4, /* DW_CFA_def_cfa %esp, ... */
3909 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */
3910 (FRAME_SIZE >> 7)
3911 },
497a22eb 3912 .fde_reg_ofs = {
813da627
RH
3913 0x88, 1, /* DW_CFA_offset, %eip, -4 */
3914 /* The following ordering must match tcg_target_callee_save_regs. */
3915 0x85, 2, /* DW_CFA_offset, %ebp, -8 */
3916 0x83, 3, /* DW_CFA_offset, %ebx, -12 */
3917 0x86, 4, /* DW_CFA_offset, %esi, -16 */
3918 0x87, 5, /* DW_CFA_offset, %edi, -20 */
3919 }
3920};
3921#endif
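/*
 * [Editor's sketch, not part of the upstream file] The two bytes that
 * follow DW_CFA_def_cfa in fde_def_cfa hand-encode FRAME_SIZE as a
 * two-byte uleb128, which is what the QEMU_BUILD_BUG_ON above
 * (FRAME_SIZE < (1 << 14)) guarantees is sufficient:
 */
#include <stdint.h>

static void uleb128_2byte(uint32_t value, uint8_t out[2])  /* value < 1<<14 */
{
    out[0] = (value & 0x7f) | 0x80;   /* low 7 bits, continuation bit set */
    out[1] = value >> 7;              /* next 7 bits, continuation clear */
}
/* e.g. 0x98 encodes as { 0x98, 0x01 }; a reader computes 0x18 | (1 << 7). */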
3922
c170cb66 3923#if defined(ELF_HOST_MACHINE)
813da627
RH
3924void tcg_register_jit(void *buf, size_t buf_size)
3925{
813da627
RH
3926 tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
3927}
c170cb66 3928#endif