1 /*
2 * Tiny Code Generator for QEMU
3 *
4 * Copyright (c) 2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25 #include "../tcg-ldst.c.inc"
26 #include "../tcg-pool.c.inc"
27
28 #ifdef CONFIG_DEBUG_TCG
29 static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
30 #if TCG_TARGET_REG_BITS == 64
31 "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
32 #else
33 "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
34 #endif
35 "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
36 "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
37 #if TCG_TARGET_REG_BITS == 64
38 "%xmm8", "%xmm9", "%xmm10", "%xmm11",
39 "%xmm12", "%xmm13", "%xmm14", "%xmm15",
40 #endif
41 };
42 #endif
43
44 static const int tcg_target_reg_alloc_order[] = {
45 #if TCG_TARGET_REG_BITS == 64
46 TCG_REG_RBP,
47 TCG_REG_RBX,
48 TCG_REG_R12,
49 TCG_REG_R13,
50 TCG_REG_R14,
51 TCG_REG_R15,
52 TCG_REG_R10,
53 TCG_REG_R11,
54 TCG_REG_R9,
55 TCG_REG_R8,
56 TCG_REG_RCX,
57 TCG_REG_RDX,
58 TCG_REG_RSI,
59 TCG_REG_RDI,
60 TCG_REG_RAX,
61 #else
62 TCG_REG_EBX,
63 TCG_REG_ESI,
64 TCG_REG_EDI,
65 TCG_REG_EBP,
66 TCG_REG_ECX,
67 TCG_REG_EDX,
68 TCG_REG_EAX,
69 #endif
70 TCG_REG_XMM0,
71 TCG_REG_XMM1,
72 TCG_REG_XMM2,
73 TCG_REG_XMM3,
74 TCG_REG_XMM4,
75 TCG_REG_XMM5,
76 #ifndef _WIN64
77 /* The Win64 ABI has xmm6-xmm15 as callee-saves, and we do not save
78 any of them. Therefore only allow xmm0-xmm5 to be allocated. */
79 TCG_REG_XMM6,
80 TCG_REG_XMM7,
81 #if TCG_TARGET_REG_BITS == 64
82 TCG_REG_XMM8,
83 TCG_REG_XMM9,
84 TCG_REG_XMM10,
85 TCG_REG_XMM11,
86 TCG_REG_XMM12,
87 TCG_REG_XMM13,
88 TCG_REG_XMM14,
89 TCG_REG_XMM15,
90 #endif
91 #endif
92 };
93
94 #define TCG_TMP_VEC TCG_REG_XMM5
95
96 static const int tcg_target_call_iarg_regs[] = {
97 #if TCG_TARGET_REG_BITS == 64
98 #if defined(_WIN64)
99 TCG_REG_RCX,
100 TCG_REG_RDX,
101 #else
102 TCG_REG_RDI,
103 TCG_REG_RSI,
104 TCG_REG_RDX,
105 TCG_REG_RCX,
106 #endif
107 TCG_REG_R8,
108 TCG_REG_R9,
109 #else
110 /* 32 bit mode uses stack based calling convention (GCC default). */
111 #endif
112 };
113
114 static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
115 {
116 switch (kind) {
117 case TCG_CALL_RET_NORMAL:
118 tcg_debug_assert(slot >= 0 && slot <= 1);
119 return slot ? TCG_REG_EDX : TCG_REG_EAX;
120 #ifdef _WIN64
121 case TCG_CALL_RET_BY_VEC:
122 tcg_debug_assert(slot == 0);
123 return TCG_REG_XMM0;
124 #endif
125 default:
126 g_assert_not_reached();
127 }
128 }
129
130 /* Constants we accept. */
131 #define TCG_CT_CONST_S32 0x100
132 #define TCG_CT_CONST_U32 0x200
133 #define TCG_CT_CONST_I32 0x400
134 #define TCG_CT_CONST_WSZ 0x800
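/*
 * Informally (see tcg_target_const_match below): S32 accepts a value equal
 * to its own 32-bit sign extension, U32 a value equal to its 32-bit zero
 * extension, I32 a value whose bitwise inverse is a sign-extended 32-bit
 * immediate, and WSZ only the operation width itself (32 or 64).
 */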
135
136 /* Registers used with L constraint, which are the first argument
137 registers on x86_64, and two random call clobbered registers on
138 i386. */
139 #if TCG_TARGET_REG_BITS == 64
140 # define TCG_REG_L0 tcg_target_call_iarg_regs[0]
141 # define TCG_REG_L1 tcg_target_call_iarg_regs[1]
142 #else
143 # define TCG_REG_L0 TCG_REG_EAX
144 # define TCG_REG_L1 TCG_REG_EDX
145 #endif
146
147 #if TCG_TARGET_REG_BITS == 64
148 # define ALL_GENERAL_REGS 0x0000ffffu
149 # define ALL_VECTOR_REGS 0xffff0000u
150 # define ALL_BYTEL_REGS ALL_GENERAL_REGS
151 #else
152 # define ALL_GENERAL_REGS 0x000000ffu
153 # define ALL_VECTOR_REGS 0x00ff0000u
154 # define ALL_BYTEL_REGS 0x0000000fu
155 #endif
156 #define SOFTMMU_RESERVE_REGS \
157 (tcg_use_softmmu ? (1 << TCG_REG_L0) | (1 << TCG_REG_L1) : 0)
158
159 /* For 64-bit, we always know that CMOV is available. */
160 #if TCG_TARGET_REG_BITS == 64
161 # define have_cmov true
162 #else
163 # define have_cmov (cpuinfo & CPUINFO_CMOV)
164 #endif
165 #define have_bmi2 (cpuinfo & CPUINFO_BMI2)
166 #define have_lzcnt (cpuinfo & CPUINFO_LZCNT)
167
168 static const tcg_insn_unit *tb_ret_addr;
169
170 static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
171 intptr_t value, intptr_t addend)
172 {
173 value += addend;
174 switch(type) {
175 case R_386_PC32:
176 value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
177 if (value != (int32_t)value) {
178 return false;
179 }
180 /* FALLTHRU */
181 case R_386_32:
182 tcg_patch32(code_ptr, value);
183 break;
184 case R_386_PC8:
185 value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
186 if (value != (int8_t)value) {
187 return false;
188 }
189 tcg_patch8(code_ptr, value);
190 break;
191 default:
192 g_assert_not_reached();
193 }
194 return true;
195 }
196
197 /* test if a constant matches the constraint */
198 static bool tcg_target_const_match(int64_t val, TCGType type, int ct, int vece)
199 {
200 if (ct & TCG_CT_CONST) {
201 return 1;
202 }
203 if (type == TCG_TYPE_I32) {
204 if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | TCG_CT_CONST_I32)) {
205 return 1;
206 }
207 } else {
208 if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
209 return 1;
210 }
211 if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
212 return 1;
213 }
214 if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
215 return 1;
216 }
217 }
218 if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
219 return 1;
220 }
221 return 0;
222 }
223
224 # define LOWREGMASK(x) ((x) & 7)
225
226 #define P_EXT 0x100 /* 0x0f opcode prefix */
227 #define P_EXT38 0x200 /* 0x0f 0x38 opcode prefix */
228 #define P_DATA16 0x400 /* 0x66 opcode prefix */
229 #define P_VEXW 0x1000 /* Set VEX.W = 1 */
230 #if TCG_TARGET_REG_BITS == 64
231 # define P_REXW P_VEXW /* Set REX.W = 1; match VEXW */
232 # define P_REXB_R 0x2000 /* REG field as byte register */
233 # define P_REXB_RM 0x4000 /* R/M field as byte register */
234 # define P_GS 0x8000 /* gs segment override */
235 #else
236 # define P_REXW 0
237 # define P_REXB_R 0
238 # define P_REXB_RM 0
239 # define P_GS 0
240 #endif
241 #define P_EXT3A 0x10000 /* 0x0f 0x3a opcode prefix */
242 #define P_SIMDF3 0x20000 /* 0xf3 opcode prefix */
243 #define P_SIMDF2 0x40000 /* 0xf2 opcode prefix */
244 #define P_VEXL 0x80000 /* Set VEX.L = 1 */
245 #define P_EVEX 0x100000 /* Requires EVEX encoding */
246
247 #define OPC_ARITH_EvIz (0x81)
248 #define OPC_ARITH_EvIb (0x83)
249 #define OPC_ARITH_GvEv (0x03) /* ... plus (ARITH_FOO << 3) */
250 #define OPC_ANDN (0xf2 | P_EXT38)
251 #define OPC_ADD_GvEv (OPC_ARITH_GvEv | (ARITH_ADD << 3))
252 #define OPC_AND_GvEv (OPC_ARITH_GvEv | (ARITH_AND << 3))
253 #define OPC_BLENDPS (0x0c | P_EXT3A | P_DATA16)
254 #define OPC_BSF (0xbc | P_EXT)
255 #define OPC_BSR (0xbd | P_EXT)
256 #define OPC_BSWAP (0xc8 | P_EXT)
257 #define OPC_CALL_Jz (0xe8)
258 #define OPC_CMOVCC (0x40 | P_EXT) /* ... plus condition code */
259 #define OPC_CMP_GvEv (OPC_ARITH_GvEv | (ARITH_CMP << 3))
260 #define OPC_DEC_r32 (0x48)
261 #define OPC_IMUL_GvEv (0xaf | P_EXT)
262 #define OPC_IMUL_GvEvIb (0x6b)
263 #define OPC_IMUL_GvEvIz (0x69)
264 #define OPC_INC_r32 (0x40)
265 #define OPC_JCC_long (0x80 | P_EXT) /* ... plus condition code */
266 #define OPC_JCC_short (0x70) /* ... plus condition code */
267 #define OPC_JMP_long (0xe9)
268 #define OPC_JMP_short (0xeb)
269 #define OPC_LEA (0x8d)
270 #define OPC_LZCNT (0xbd | P_EXT | P_SIMDF3)
271 #define OPC_MOVB_EvGv (0x88) /* stores, more or less */
272 #define OPC_MOVL_EvGv (0x89) /* stores, more or less */
273 #define OPC_MOVL_GvEv (0x8b) /* loads, more or less */
274 #define OPC_MOVB_EvIz (0xc6)
275 #define OPC_MOVL_EvIz (0xc7)
276 #define OPC_MOVB_Ib (0xb0)
277 #define OPC_MOVL_Iv (0xb8)
278 #define OPC_MOVBE_GyMy (0xf0 | P_EXT38)
279 #define OPC_MOVBE_MyGy (0xf1 | P_EXT38)
280 #define OPC_MOVD_VyEy (0x6e | P_EXT | P_DATA16)
281 #define OPC_MOVD_EyVy (0x7e | P_EXT | P_DATA16)
282 #define OPC_MOVDDUP (0x12 | P_EXT | P_SIMDF2)
283 #define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
284 #define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
285 #define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
286 #define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
287 #define OPC_MOVQ_VqWq (0x7e | P_EXT | P_SIMDF3)
288 #define OPC_MOVQ_WqVq (0xd6 | P_EXT | P_DATA16)
289 #define OPC_MOVSBL (0xbe | P_EXT)
290 #define OPC_MOVSWL (0xbf | P_EXT)
291 #define OPC_MOVSLQ (0x63 | P_REXW)
292 #define OPC_MOVZBL (0xb6 | P_EXT)
293 #define OPC_MOVZWL (0xb7 | P_EXT)
294 #define OPC_PABSB (0x1c | P_EXT38 | P_DATA16)
295 #define OPC_PABSW (0x1d | P_EXT38 | P_DATA16)
296 #define OPC_PABSD (0x1e | P_EXT38 | P_DATA16)
297 #define OPC_VPABSQ (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
298 #define OPC_PACKSSDW (0x6b | P_EXT | P_DATA16)
299 #define OPC_PACKSSWB (0x63 | P_EXT | P_DATA16)
300 #define OPC_PACKUSDW (0x2b | P_EXT38 | P_DATA16)
301 #define OPC_PACKUSWB (0x67 | P_EXT | P_DATA16)
302 #define OPC_PADDB (0xfc | P_EXT | P_DATA16)
303 #define OPC_PADDW (0xfd | P_EXT | P_DATA16)
304 #define OPC_PADDD (0xfe | P_EXT | P_DATA16)
305 #define OPC_PADDQ (0xd4 | P_EXT | P_DATA16)
306 #define OPC_PADDSB (0xec | P_EXT | P_DATA16)
307 #define OPC_PADDSW (0xed | P_EXT | P_DATA16)
308 #define OPC_PADDUB (0xdc | P_EXT | P_DATA16)
309 #define OPC_PADDUW (0xdd | P_EXT | P_DATA16)
310 #define OPC_PAND (0xdb | P_EXT | P_DATA16)
311 #define OPC_PANDN (0xdf | P_EXT | P_DATA16)
312 #define OPC_PBLENDW (0x0e | P_EXT3A | P_DATA16)
313 #define OPC_PCMPEQB (0x74 | P_EXT | P_DATA16)
314 #define OPC_PCMPEQW (0x75 | P_EXT | P_DATA16)
315 #define OPC_PCMPEQD (0x76 | P_EXT | P_DATA16)
316 #define OPC_PCMPEQQ (0x29 | P_EXT38 | P_DATA16)
317 #define OPC_PCMPGTB (0x64 | P_EXT | P_DATA16)
318 #define OPC_PCMPGTW (0x65 | P_EXT | P_DATA16)
319 #define OPC_PCMPGTD (0x66 | P_EXT | P_DATA16)
320 #define OPC_PCMPGTQ (0x37 | P_EXT38 | P_DATA16)
321 #define OPC_PEXTRD (0x16 | P_EXT3A | P_DATA16)
322 #define OPC_PINSRD (0x22 | P_EXT3A | P_DATA16)
323 #define OPC_PMAXSB (0x3c | P_EXT38 | P_DATA16)
324 #define OPC_PMAXSW (0xee | P_EXT | P_DATA16)
325 #define OPC_PMAXSD (0x3d | P_EXT38 | P_DATA16)
326 #define OPC_VPMAXSQ (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
327 #define OPC_PMAXUB (0xde | P_EXT | P_DATA16)
328 #define OPC_PMAXUW (0x3e | P_EXT38 | P_DATA16)
329 #define OPC_PMAXUD (0x3f | P_EXT38 | P_DATA16)
330 #define OPC_VPMAXUQ (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
331 #define OPC_PMINSB (0x38 | P_EXT38 | P_DATA16)
332 #define OPC_PMINSW (0xea | P_EXT | P_DATA16)
333 #define OPC_PMINSD (0x39 | P_EXT38 | P_DATA16)
334 #define OPC_VPMINSQ (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
335 #define OPC_PMINUB (0xda | P_EXT | P_DATA16)
336 #define OPC_PMINUW (0x3a | P_EXT38 | P_DATA16)
337 #define OPC_PMINUD (0x3b | P_EXT38 | P_DATA16)
338 #define OPC_VPMINUQ (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
339 #define OPC_PMOVSXBW (0x20 | P_EXT38 | P_DATA16)
340 #define OPC_PMOVSXWD (0x23 | P_EXT38 | P_DATA16)
341 #define OPC_PMOVSXDQ (0x25 | P_EXT38 | P_DATA16)
342 #define OPC_PMOVZXBW (0x30 | P_EXT38 | P_DATA16)
343 #define OPC_PMOVZXWD (0x33 | P_EXT38 | P_DATA16)
344 #define OPC_PMOVZXDQ (0x35 | P_EXT38 | P_DATA16)
345 #define OPC_PMULLW (0xd5 | P_EXT | P_DATA16)
346 #define OPC_PMULLD (0x40 | P_EXT38 | P_DATA16)
347 #define OPC_VPMULLQ (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
348 #define OPC_POR (0xeb | P_EXT | P_DATA16)
349 #define OPC_PSHUFB (0x00 | P_EXT38 | P_DATA16)
350 #define OPC_PSHUFD (0x70 | P_EXT | P_DATA16)
351 #define OPC_PSHUFLW (0x70 | P_EXT | P_SIMDF2)
352 #define OPC_PSHUFHW (0x70 | P_EXT | P_SIMDF3)
353 #define OPC_PSHIFTW_Ib (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
354 #define OPC_PSHIFTD_Ib (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
355 #define OPC_PSHIFTQ_Ib (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
356 #define OPC_PSLLW (0xf1 | P_EXT | P_DATA16)
357 #define OPC_PSLLD (0xf2 | P_EXT | P_DATA16)
358 #define OPC_PSLLQ (0xf3 | P_EXT | P_DATA16)
359 #define OPC_PSRAW (0xe1 | P_EXT | P_DATA16)
360 #define OPC_PSRAD (0xe2 | P_EXT | P_DATA16)
361 #define OPC_VPSRAQ (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX)
362 #define OPC_PSRLW (0xd1 | P_EXT | P_DATA16)
363 #define OPC_PSRLD (0xd2 | P_EXT | P_DATA16)
364 #define OPC_PSRLQ (0xd3 | P_EXT | P_DATA16)
365 #define OPC_PSUBB (0xf8 | P_EXT | P_DATA16)
366 #define OPC_PSUBW (0xf9 | P_EXT | P_DATA16)
367 #define OPC_PSUBD (0xfa | P_EXT | P_DATA16)
368 #define OPC_PSUBQ (0xfb | P_EXT | P_DATA16)
369 #define OPC_PSUBSB (0xe8 | P_EXT | P_DATA16)
370 #define OPC_PSUBSW (0xe9 | P_EXT | P_DATA16)
371 #define OPC_PSUBUB (0xd8 | P_EXT | P_DATA16)
372 #define OPC_PSUBUW (0xd9 | P_EXT | P_DATA16)
373 #define OPC_PUNPCKLBW (0x60 | P_EXT | P_DATA16)
374 #define OPC_PUNPCKLWD (0x61 | P_EXT | P_DATA16)
375 #define OPC_PUNPCKLDQ (0x62 | P_EXT | P_DATA16)
376 #define OPC_PUNPCKLQDQ (0x6c | P_EXT | P_DATA16)
377 #define OPC_PUNPCKHBW (0x68 | P_EXT | P_DATA16)
378 #define OPC_PUNPCKHWD (0x69 | P_EXT | P_DATA16)
379 #define OPC_PUNPCKHDQ (0x6a | P_EXT | P_DATA16)
380 #define OPC_PUNPCKHQDQ (0x6d | P_EXT | P_DATA16)
381 #define OPC_PXOR (0xef | P_EXT | P_DATA16)
382 #define OPC_POP_r32 (0x58)
383 #define OPC_POPCNT (0xb8 | P_EXT | P_SIMDF3)
384 #define OPC_PUSH_r32 (0x50)
385 #define OPC_PUSH_Iv (0x68)
386 #define OPC_PUSH_Ib (0x6a)
387 #define OPC_RET (0xc3)
388 #define OPC_SETCC (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
389 #define OPC_SHIFT_1 (0xd1)
390 #define OPC_SHIFT_Ib (0xc1)
391 #define OPC_SHIFT_cl (0xd3)
392 #define OPC_SARX (0xf7 | P_EXT38 | P_SIMDF3)
393 #define OPC_SHUFPS (0xc6 | P_EXT)
394 #define OPC_SHLX (0xf7 | P_EXT38 | P_DATA16)
395 #define OPC_SHRX (0xf7 | P_EXT38 | P_SIMDF2)
396 #define OPC_SHRD_Ib (0xac | P_EXT)
397 #define OPC_TESTL (0x85)
398 #define OPC_TZCNT (0xbc | P_EXT | P_SIMDF3)
399 #define OPC_UD2 (0x0b | P_EXT)
400 #define OPC_VPBLENDD (0x02 | P_EXT3A | P_DATA16)
401 #define OPC_VPBLENDVB (0x4c | P_EXT3A | P_DATA16)
402 #define OPC_VPINSRB (0x20 | P_EXT3A | P_DATA16)
403 #define OPC_VPINSRW (0xc4 | P_EXT | P_DATA16)
404 #define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
405 #define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
406 #define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
407 #define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
408 #define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
409 #define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
410 #define OPC_VPERMQ (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
411 #define OPC_VPERM2I128 (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
412 #define OPC_VPROLVD (0x15 | P_EXT38 | P_DATA16 | P_EVEX)
413 #define OPC_VPROLVQ (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
414 #define OPC_VPRORVD (0x14 | P_EXT38 | P_DATA16 | P_EVEX)
415 #define OPC_VPRORVQ (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
416 #define OPC_VPSHLDW (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
417 #define OPC_VPSHLDD (0x71 | P_EXT3A | P_DATA16 | P_EVEX)
418 #define OPC_VPSHLDQ (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
419 #define OPC_VPSHLDVW (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
420 #define OPC_VPSHLDVD (0x71 | P_EXT38 | P_DATA16 | P_EVEX)
421 #define OPC_VPSHLDVQ (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
422 #define OPC_VPSHRDVW (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
423 #define OPC_VPSHRDVD (0x73 | P_EXT38 | P_DATA16 | P_EVEX)
424 #define OPC_VPSHRDVQ (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
425 #define OPC_VPSLLVW (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
426 #define OPC_VPSLLVD (0x47 | P_EXT38 | P_DATA16)
427 #define OPC_VPSLLVQ (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
428 #define OPC_VPSRAVW (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
429 #define OPC_VPSRAVD (0x46 | P_EXT38 | P_DATA16)
430 #define OPC_VPSRAVQ (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
431 #define OPC_VPSRLVW (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
432 #define OPC_VPSRLVD (0x45 | P_EXT38 | P_DATA16)
433 #define OPC_VPSRLVQ (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
434 #define OPC_VPTERNLOGQ (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
435 #define OPC_VZEROUPPER (0x77 | P_EXT)
436 #define OPC_XCHG_ax_r32 (0x90)
437 #define OPC_XCHG_EvGv (0x87)
438
439 #define OPC_GRP3_Eb (0xf6)
440 #define OPC_GRP3_Ev (0xf7)
441 #define OPC_GRP5 (0xff)
442 #define OPC_GRP14 (0x73 | P_EXT | P_DATA16)
443
444 /* Group 1 opcode extensions for 0x80-0x83.
445 These are also used as modifiers for OPC_ARITH. */
446 #define ARITH_ADD 0
447 #define ARITH_OR 1
448 #define ARITH_ADC 2
449 #define ARITH_SBB 3
450 #define ARITH_AND 4
451 #define ARITH_SUB 5
452 #define ARITH_XOR 6
453 #define ARITH_CMP 7
454
455 /* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3. */
456 #define SHIFT_ROL 0
457 #define SHIFT_ROR 1
458 #define SHIFT_SHL 4
459 #define SHIFT_SHR 5
460 #define SHIFT_SAR 7
461
462 /* Group 3 opcode extensions for 0xf6, 0xf7. To be used with OPC_GRP3. */
463 #define EXT3_TESTi 0
464 #define EXT3_NOT 2
465 #define EXT3_NEG 3
466 #define EXT3_MUL 4
467 #define EXT3_IMUL 5
468 #define EXT3_DIV 6
469 #define EXT3_IDIV 7
470
471 /* Group 5 opcode extensions for 0xff. To be used with OPC_GRP5. */
472 #define EXT5_INC_Ev 0
473 #define EXT5_DEC_Ev 1
474 #define EXT5_CALLN_Ev 2
475 #define EXT5_JMPN_Ev 4
476
477 /* Condition codes to be added to OPC_JCC_{long,short}. */
478 #define JCC_JMP (-1)
479 #define JCC_JO 0x0
480 #define JCC_JNO 0x1
481 #define JCC_JB 0x2
482 #define JCC_JAE 0x3
483 #define JCC_JE 0x4
484 #define JCC_JNE 0x5
485 #define JCC_JBE 0x6
486 #define JCC_JA 0x7
487 #define JCC_JS 0x8
488 #define JCC_JNS 0x9
489 #define JCC_JP 0xa
490 #define JCC_JNP 0xb
491 #define JCC_JL 0xc
492 #define JCC_JGE 0xd
493 #define JCC_JLE 0xe
494 #define JCC_JG 0xf
495
496 static const uint8_t tcg_cond_to_jcc[] = {
497 [TCG_COND_EQ] = JCC_JE,
498 [TCG_COND_NE] = JCC_JNE,
499 [TCG_COND_LT] = JCC_JL,
500 [TCG_COND_GE] = JCC_JGE,
501 [TCG_COND_LE] = JCC_JLE,
502 [TCG_COND_GT] = JCC_JG,
503 [TCG_COND_LTU] = JCC_JB,
504 [TCG_COND_GEU] = JCC_JAE,
505 [TCG_COND_LEU] = JCC_JBE,
506 [TCG_COND_GTU] = JCC_JA,
507 };
508
509 #if TCG_TARGET_REG_BITS == 64
510 static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
511 {
512 int rex;
513
514 if (opc & P_GS) {
515 tcg_out8(s, 0x65);
516 }
517 if (opc & P_DATA16) {
518 /* We should never be asking for both 16 and 64-bit operation. */
519 tcg_debug_assert((opc & P_REXW) == 0);
520 tcg_out8(s, 0x66);
521 }
522 if (opc & P_SIMDF3) {
523 tcg_out8(s, 0xf3);
524 } else if (opc & P_SIMDF2) {
525 tcg_out8(s, 0xf2);
526 }
527
528 rex = 0;
529 rex |= (opc & P_REXW) ? 0x8 : 0x0; /* REX.W */
530 rex |= (r & 8) >> 1; /* REX.R */
531 rex |= (x & 8) >> 2; /* REX.X */
532 rex |= (rm & 8) >> 3; /* REX.B */
533
534 /* P_REXB_{R,RM} indicates that the given register is the low byte.
535 For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
536 as otherwise the encoding indicates %[abcd]h. Note that the values
537 that are ORed in merely indicate that the REX byte must be present;
538 those bits get discarded in output. */
539 rex |= opc & (r >= 4 ? P_REXB_R : 0);
540 rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
541
542 if (rex) {
543 tcg_out8(s, (uint8_t)(rex | 0x40));
544 }
545
546 if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
547 tcg_out8(s, 0x0f);
548 if (opc & P_EXT38) {
549 tcg_out8(s, 0x38);
550 } else if (opc & P_EXT3A) {
551 tcg_out8(s, 0x3a);
552 }
553 }
554
555 tcg_out8(s, opc);
556 }
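/*
 * Worked example (x86_64, for illustration): tcg_out_ext8u() below calls
 * tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, TCG_REG_EAX, TCG_REG_ESI), which
 * works out to the bytes 40 0f b6 c6, i.e. "movzbl %sil, %eax".  The bare
 * REX prefix (0x40) carries no W/R/X/B bits but is still required so that
 * r/m = 6 selects %sil rather than %dh.
 */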
557 #else
558 static void tcg_out_opc(TCGContext *s, int opc)
559 {
560 if (opc & P_DATA16) {
561 tcg_out8(s, 0x66);
562 }
563 if (opc & P_SIMDF3) {
564 tcg_out8(s, 0xf3);
565 } else if (opc & P_SIMDF2) {
566 tcg_out8(s, 0xf2);
567 }
568 if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
569 tcg_out8(s, 0x0f);
570 if (opc & P_EXT38) {
571 tcg_out8(s, 0x38);
572 } else if (opc & P_EXT3A) {
573 tcg_out8(s, 0x3a);
574 }
575 }
576 tcg_out8(s, opc);
577 }
578 /* Discard the register arguments to tcg_out_opc early, so as not to penalize
579 the 32-bit compilation paths. This method works with all versions of gcc,
580 whereas relying on optimization may not be able to exclude them. */
581 #define tcg_out_opc(s, opc, r, rm, x) (tcg_out_opc)(s, opc)
582 #endif
583
584 static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
585 {
586 tcg_out_opc(s, opc, r, rm, 0);
587 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
588 }
589
590 static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
591 int rm, int index)
592 {
593 int tmp;
594
595 if (opc & P_GS) {
596 tcg_out8(s, 0x65);
597 }
598 /* Use the two byte form if possible, which cannot encode
599 VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT. */
600 if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
601 && ((rm | index) & 8) == 0) {
602 /* Two byte VEX prefix. */
603 tcg_out8(s, 0xc5);
604
605 tmp = (r & 8 ? 0 : 0x80); /* VEX.R */
606 } else {
607 /* Three byte VEX prefix. */
608 tcg_out8(s, 0xc4);
609
610 /* VEX.m-mmmm */
611 if (opc & P_EXT3A) {
612 tmp = 3;
613 } else if (opc & P_EXT38) {
614 tmp = 2;
615 } else if (opc & P_EXT) {
616 tmp = 1;
617 } else {
618 g_assert_not_reached();
619 }
620 tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */
621 tmp |= (index & 8 ? 0 : 0x40); /* VEX.X */
622 tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */
623 tcg_out8(s, tmp);
624
625 tmp = (opc & P_VEXW ? 0x80 : 0); /* VEX.W */
626 }
627
628 tmp |= (opc & P_VEXL ? 0x04 : 0); /* VEX.L */
629 /* VEX.pp */
630 if (opc & P_DATA16) {
631 tmp |= 1; /* 0x66 */
632 } else if (opc & P_SIMDF3) {
633 tmp |= 2; /* 0xf3 */
634 } else if (opc & P_SIMDF2) {
635 tmp |= 3; /* 0xf2 */
636 }
637 tmp |= (~v & 15) << 3; /* VEX.vvvv */
638 tcg_out8(s, tmp);
639 tcg_out8(s, opc);
640 }
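/*
 * For illustration: OPC_PXOR (0xef | P_EXT | P_DATA16) with all three
 * registers in xmm0-xmm7 qualifies for the two-byte prefix, so
 * tcg_out_vex_modrm(s, OPC_PXOR, TCG_REG_XMM0, TCG_REG_XMM1, TCG_REG_XMM2)
 * should emit c5 f1 ef c2, i.e. "vpxor %xmm2, %xmm1, %xmm0".  If the r/m or
 * index register were xmm8-xmm15, or the opcode needed VEX.W or a 0f38/0f3a
 * escape, the three-byte 0xc4 form would be used instead.
 */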
641
642 static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
643 int rm, int index)
644 {
645 /* The entire 4-byte evex prefix; with R' and V' set. */
646 uint32_t p = 0x08041062;
647 int mm, pp;
648
649 tcg_debug_assert(have_avx512vl);
650
651 /* EVEX.mm */
652 if (opc & P_EXT3A) {
653 mm = 3;
654 } else if (opc & P_EXT38) {
655 mm = 2;
656 } else if (opc & P_EXT) {
657 mm = 1;
658 } else {
659 g_assert_not_reached();
660 }
661
662 /* EVEX.pp */
663 if (opc & P_DATA16) {
664 pp = 1; /* 0x66 */
665 } else if (opc & P_SIMDF3) {
666 pp = 2; /* 0xf3 */
667 } else if (opc & P_SIMDF2) {
668 pp = 3; /* 0xf2 */
669 } else {
670 pp = 0;
671 }
672
673 p = deposit32(p, 8, 2, mm);
674 p = deposit32(p, 13, 1, (rm & 8) == 0); /* EVEX.RXB.B */
675 p = deposit32(p, 14, 1, (index & 8) == 0); /* EVEX.RXB.X */
676 p = deposit32(p, 15, 1, (r & 8) == 0); /* EVEX.RXB.R */
677 p = deposit32(p, 16, 2, pp);
678 p = deposit32(p, 19, 4, ~v);
679 p = deposit32(p, 23, 1, (opc & P_VEXW) != 0);
680 p = deposit32(p, 29, 2, (opc & P_VEXL) != 0);
681
682 tcg_out32(s, p);
683 tcg_out8(s, opc);
684 }
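/*
 * Note on the magic constant in tcg_out_evex_opc: the host is little-endian,
 * so tcg_out32 emits the bytes 0x62 (EVEX escape), 0x10 (P0 with R' preset),
 * 0x04 (P1 with its mandatory set bit) and 0x08 (P2 with V' preset); the
 * deposit32 calls fill in mm, B/X/R, pp, vvvv, W and L'L before the word
 * is written.
 */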
685
686 static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
687 {
688 if (opc & P_EVEX) {
689 tcg_out_evex_opc(s, opc, r, v, rm, 0);
690 } else {
691 tcg_out_vex_opc(s, opc, r, v, rm, 0);
692 }
693 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
694 }
695
696 /* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
697 We handle a missing RM or INDEX with a negative value. In 64-bit
698 mode for absolute addresses, ~RM is the size of the immediate operand
699 that will follow the instruction. */
700
701 static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
702 int shift, intptr_t offset)
703 {
704 int mod, len;
705
706 if (index < 0 && rm < 0) {
707 if (TCG_TARGET_REG_BITS == 64) {
708 /* Try for a rip-relative addressing mode. This has replaced
709 the 32-bit-mode absolute addressing encoding. */
710 intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
711 intptr_t disp = offset - pc;
712 if (disp == (int32_t)disp) {
713 tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
714 tcg_out32(s, disp);
715 return;
716 }
717
718 /* Try for an absolute address encoding. This requires the
719 use of the MODRM+SIB encoding and is therefore larger than
720 rip-relative addressing. */
721 if (offset == (int32_t)offset) {
722 tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
723 tcg_out8(s, (4 << 3) | 5);
724 tcg_out32(s, offset);
725 return;
726 }
727
728 /* ??? The memory isn't directly addressable. */
729 g_assert_not_reached();
730 } else {
731 /* Absolute address. */
732 tcg_out8(s, (r << 3) | 5);
733 tcg_out32(s, offset);
734 return;
735 }
736 }
737
738 /* Find the length of the immediate addend. Note that the encoding
739 that would be used for (%ebp) indicates absolute addressing. */
740 if (rm < 0) {
741 mod = 0, len = 4, rm = 5;
742 } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
743 mod = 0, len = 0;
744 } else if (offset == (int8_t)offset) {
745 mod = 0x40, len = 1;
746 } else {
747 mod = 0x80, len = 4;
748 }
749
750 /* Use a single byte MODRM format if possible. Note that the encoding
751 that would be used for %esp is the escape to the two byte form. */
752 if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
753 /* Single byte MODRM format. */
754 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
755 } else {
756 /* Two byte MODRM+SIB format. */
757
758 /* Note that the encoding that would place %esp into the index
759 field indicates no index register. In 64-bit mode, the REX.X
760 bit counts, so %r12 can be used as the index. */
761 if (index < 0) {
762 index = 4;
763 } else {
764 tcg_debug_assert(index != TCG_REG_ESP);
765 }
766
767 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
768 tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
769 }
770
771 if (len == 1) {
772 tcg_out8(s, offset);
773 } else if (len == 4) {
774 tcg_out32(s, offset);
775 }
776 }
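/*
 * For illustration: tcg_out_modrm_offset(s, OPC_MOVL_GvEv, TCG_REG_EAX,
 * TCG_REG_EBX, 8) below reaches this point with rm = %ebx, no index and an
 * 8-bit displacement, so it should emit 8b 43 08, i.e. "movl 8(%ebx), %eax"
 * (8(%rbx) on x86_64).  A zero offset with rm != %ebp drops the displacement
 * entirely, and rm = %esp forces the two-byte MODRM+SIB form.
 */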
777
778 static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
779 int index, int shift, intptr_t offset)
780 {
781 tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
782 tcg_out_sib_offset(s, r, rm, index, shift, offset);
783 }
784
785 static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
786 int rm, int index, int shift,
787 intptr_t offset)
788 {
789 tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
790 tcg_out_sib_offset(s, r, rm, index, shift, offset);
791 }
792
793 /* A simplification of the above with no index or shift. */
794 static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
795 int rm, intptr_t offset)
796 {
797 tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
798 }
799
800 static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
801 int v, int rm, intptr_t offset)
802 {
803 tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
804 }
805
806 /* Output an opcode with an expected reference to the constant pool. */
807 static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
808 {
809 tcg_out_opc(s, opc, r, 0, 0);
810 /* Absolute for 32-bit, pc-relative for 64-bit. */
811 tcg_out8(s, LOWREGMASK(r) << 3 | 5);
812 tcg_out32(s, 0);
813 }
814
815 /* Output an opcode with an expected reference to the constant pool. */
816 static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
817 {
818 tcg_out_vex_opc(s, opc, r, 0, 0, 0);
819 /* Absolute for 32-bit, pc-relative for 64-bit. */
820 tcg_out8(s, LOWREGMASK(r) << 3 | 5);
821 tcg_out32(s, 0);
822 }
823
824 /* Generate dest op= src. Uses the same ARITH_* codes as tgen_arithi. */
825 static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
826 {
827 /* Propagate an opcode prefix, such as P_REXW. */
828 int ext = subop & ~0x7;
829 subop &= 0x7;
830
831 tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
832 }
833
834 static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
835 {
836 int rexw = 0;
837
838 if (arg == ret) {
839 return true;
840 }
841 switch (type) {
842 case TCG_TYPE_I64:
843 rexw = P_REXW;
844 /* fallthru */
845 case TCG_TYPE_I32:
846 if (ret < 16) {
847 if (arg < 16) {
848 tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
849 } else {
850 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
851 }
852 } else {
853 if (arg < 16) {
854 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
855 } else {
856 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
857 }
858 }
859 break;
860
861 case TCG_TYPE_V64:
862 tcg_debug_assert(ret >= 16 && arg >= 16);
863 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
864 break;
865 case TCG_TYPE_V128:
866 tcg_debug_assert(ret >= 16 && arg >= 16);
867 tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
868 break;
869 case TCG_TYPE_V256:
870 tcg_debug_assert(ret >= 16 && arg >= 16);
871 tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
872 break;
873
874 default:
875 g_assert_not_reached();
876 }
877 return true;
878 }
879
880 static const int avx2_dup_insn[4] = {
881 OPC_VPBROADCASTB, OPC_VPBROADCASTW,
882 OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
883 };
884
885 static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
886 TCGReg r, TCGReg a)
887 {
888 if (have_avx2) {
889 int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
890 tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
891 } else {
892 switch (vece) {
893 case MO_8:
894 /* ??? With zero in a register, use PSHUFB. */
895 tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
896 a = r;
897 /* FALLTHRU */
898 case MO_16:
899 tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
900 a = r;
901 /* FALLTHRU */
902 case MO_32:
903 tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
904 /* imm8 operand: all output lanes selected from input lane 0. */
905 tcg_out8(s, 0);
906 break;
907 case MO_64:
908 tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
909 break;
910 default:
911 g_assert_not_reached();
912 }
913 }
914 return true;
915 }
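/*
 * Pre-AVX2 sketch of the fall-through cascade above, for a MO_8 dup:
 * vpunpcklbw replicates byte 0 into the low word, vpunpcklwd widens that
 * to the low dword, and pshufd $0 then broadcasts dword 0 to every lane,
 * leaving the original byte in all 16 positions of the vector.
 */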
916
917 static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
918 TCGReg r, TCGReg base, intptr_t offset)
919 {
920 if (have_avx2) {
921 int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
922 tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
923 r, 0, base, offset);
924 } else {
925 switch (vece) {
926 case MO_64:
927 tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
928 break;
929 case MO_32:
930 tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
931 break;
932 case MO_16:
933 tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
934 tcg_out8(s, 0); /* imm8 */
935 tcg_out_dup_vec(s, type, vece, r, r);
936 break;
937 case MO_8:
938 tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
939 tcg_out8(s, 0); /* imm8 */
940 tcg_out_dup_vec(s, type, vece, r, r);
941 break;
942 default:
943 g_assert_not_reached();
944 }
945 }
946 return true;
947 }
948
949 static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
950 TCGReg ret, int64_t arg)
951 {
952 int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
953
954 if (arg == 0) {
955 tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
956 return;
957 }
958 if (arg == -1) {
959 tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
960 return;
961 }
962
963 if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
964 if (have_avx2) {
965 tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
966 } else {
967 tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
968 }
969 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
970 } else {
971 if (type == TCG_TYPE_V64) {
972 tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
973 } else if (have_avx2) {
974 tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
975 } else {
976 tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
977 }
978 if (TCG_TARGET_REG_BITS == 64) {
979 new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
980 } else {
981 new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
982 }
983 }
984 }
985
986 static void tcg_out_movi_vec(TCGContext *s, TCGType type,
987 TCGReg ret, tcg_target_long arg)
988 {
989 if (arg == 0) {
990 tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
991 return;
992 }
993 if (arg == -1) {
994 tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
995 return;
996 }
997
998 int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
999 tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
1000 if (TCG_TARGET_REG_BITS == 64) {
1001 new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
1002 } else {
1003 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
1004 }
1005 }
1006
1007 static void tcg_out_movi_int(TCGContext *s, TCGType type,
1008 TCGReg ret, tcg_target_long arg)
1009 {
1010 tcg_target_long diff;
1011
1012 if (arg == 0) {
1013 tgen_arithr(s, ARITH_XOR, ret, ret);
1014 return;
1015 }
1016 if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
1017 tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
1018 tcg_out32(s, arg);
1019 return;
1020 }
1021 if (arg == (int32_t)arg) {
1022 tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
1023 tcg_out32(s, arg);
1024 return;
1025 }
1026
1027 /* Try a 7 byte pc-relative lea before the 10 byte movq. */
1028 diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
1029 if (diff == (int32_t)diff) {
1030 tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
1031 tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
1032 tcg_out32(s, diff);
1033 return;
1034 }
1035
1036 tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
1037 tcg_out64(s, arg);
1038 }
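/*
 * Rough size guide for the cases above: xor reg,reg is 2-3 bytes, the
 * zero-extending movl $imm32 is 5-6, the sign-extended c7 /0 form is 7,
 * the pc-relative lea is 7, and the full movabs is 10; hence the order
 * in which the tests are applied.
 */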
1039
1040 static void tcg_out_movi(TCGContext *s, TCGType type,
1041 TCGReg ret, tcg_target_long arg)
1042 {
1043 switch (type) {
1044 case TCG_TYPE_I32:
1045 #if TCG_TARGET_REG_BITS == 64
1046 case TCG_TYPE_I64:
1047 #endif
1048 if (ret < 16) {
1049 tcg_out_movi_int(s, type, ret, arg);
1050 } else {
1051 tcg_out_movi_vec(s, type, ret, arg);
1052 }
1053 break;
1054 default:
1055 g_assert_not_reached();
1056 }
1057 }
1058
1059 static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
1060 {
1061 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1062 tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2);
1063 return true;
1064 }
1065
1066 static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs,
1067 tcg_target_long imm)
1068 {
1069 /* This function is only used for passing structs by reference. */
1070 tcg_debug_assert(imm == (int32_t)imm);
1071 tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm);
1072 }
1073
1074 static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
1075 {
1076 if (val == (int8_t)val) {
1077 tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
1078 tcg_out8(s, val);
1079 } else if (val == (int32_t)val) {
1080 tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
1081 tcg_out32(s, val);
1082 } else {
1083 g_assert_not_reached();
1084 }
1085 }
1086
1087 static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1088 {
1089 /* Given the strength of x86 memory ordering, we only need to care about
1090 store-load ordering. Experimentally, "lock orl $0,0(%esp)" is
1091 faster than "mfence", so don't bother with the sse insn. */
1092 if (a0 & TCG_MO_ST_LD) {
1093 tcg_out8(s, 0xf0);
1094 tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1095 tcg_out8(s, 0);
1096 }
1097 }
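/*
 * For illustration, the barrier above should assemble to f0 83 0c 24 00,
 * i.e. "lock orl $0, (%esp)" (or (%rsp) in 64-bit mode): five bytes versus
 * the three-byte mfence, but, as noted above, experimentally faster.
 */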
1098
1099 static inline void tcg_out_push(TCGContext *s, int reg)
1100 {
1101 tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1102 }
1103
1104 static inline void tcg_out_pop(TCGContext *s, int reg)
1105 {
1106 tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1107 }
1108
1109 static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1110 TCGReg arg1, intptr_t arg2)
1111 {
1112 switch (type) {
1113 case TCG_TYPE_I32:
1114 if (ret < 16) {
1115 tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1116 } else {
1117 tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1118 }
1119 break;
1120 case TCG_TYPE_I64:
1121 if (ret < 16) {
1122 tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1123 break;
1124 }
1125 /* FALLTHRU */
1126 case TCG_TYPE_V64:
1127 /* There is no instruction that can validate 8-byte alignment. */
1128 tcg_debug_assert(ret >= 16);
1129 tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1130 break;
1131 case TCG_TYPE_V128:
1132 /*
1133 * The gvec infrastructure asserts that v128 vector loads
1134 * and stores use a 16-byte aligned offset. Validate that the
1135 * final pointer is aligned by using an insn that will SIGSEGV.
1136 */
1137 tcg_debug_assert(ret >= 16);
1138 tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
1139 break;
1140 case TCG_TYPE_V256:
1141 /*
1142 * The gvec infrastructure only requires 16-byte alignment,
1143 * so here we must use an unaligned load.
1144 */
1145 tcg_debug_assert(ret >= 16);
1146 tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1147 ret, 0, arg1, arg2);
1148 break;
1149 default:
1150 g_assert_not_reached();
1151 }
1152 }
1153
1154 static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1155 TCGReg arg1, intptr_t arg2)
1156 {
1157 switch (type) {
1158 case TCG_TYPE_I32:
1159 if (arg < 16) {
1160 tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1161 } else {
1162 tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1163 }
1164 break;
1165 case TCG_TYPE_I64:
1166 if (arg < 16) {
1167 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1168 break;
1169 }
1170 /* FALLTHRU */
1171 case TCG_TYPE_V64:
1172 /* There is no instruction that can validate 8-byte alignment. */
1173 tcg_debug_assert(arg >= 16);
1174 tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1175 break;
1176 case TCG_TYPE_V128:
1177 /*
1178 * The gvec infrastructure asserts that v128 vector loads
1179 * and stores use a 16-byte aligned offset. Validate that the
1180 * final pointer is aligned by using an insn that will SIGSEGV.
1181 *
1182 * This specific instance is also used by TCG_CALL_RET_BY_VEC,
1183 * for _WIN64, which must have SSE2 but may not have AVX.
1184 */
1185 tcg_debug_assert(arg >= 16);
1186 if (have_avx1) {
1187 tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
1188 } else {
1189 tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2);
1190 }
1191 break;
1192 case TCG_TYPE_V256:
1193 /*
1194 * The gvec infrastructure only requires 16-byte alignment,
1195 * so here we must use an unaligned store.
1196 */
1197 tcg_debug_assert(arg >= 16);
1198 tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1199 arg, 0, arg1, arg2);
1200 break;
1201 default:
1202 g_assert_not_reached();
1203 }
1204 }
1205
1206 static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1207 TCGReg base, intptr_t ofs)
1208 {
1209 int rexw = 0;
1210 if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1211 if (val != (int32_t)val) {
1212 return false;
1213 }
1214 rexw = P_REXW;
1215 } else if (type != TCG_TYPE_I32) {
1216 return false;
1217 }
1218 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1219 tcg_out32(s, val);
1220 return true;
1221 }
1222
1223 static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1224 {
1225 /* Propagate an opcode prefix, such as P_DATA16. */
1226 int ext = subopc & ~0x7;
1227 subopc &= 0x7;
1228
1229 if (count == 1) {
1230 tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1231 } else {
1232 tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1233 tcg_out8(s, count);
1234 }
1235 }
1236
1237 static inline void tcg_out_bswap32(TCGContext *s, int reg)
1238 {
1239 tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1240 }
1241
1242 static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1243 {
1244 tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1245 }
1246
1247 static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src)
1248 {
1249 /* movzbl */
1250 tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1251 tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1252 }
1253
1254 static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1255 {
1256 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1257 /* movsbl */
1258 tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1259 tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1260 }
1261
1262 static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src)
1263 {
1264 /* movzwl */
1265 tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1266 }
1267
1268 static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1269 {
1270 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1271 /* movsw[lq] */
1272 tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1273 }
1274
1275 static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src)
1276 {
1277 /* 32-bit mov zero extends. */
1278 tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1279 }
1280
1281 static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src)
1282 {
1283 tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
1284 tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1285 }
1286
1287 static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1288 {
1289 tcg_out_ext32s(s, dest, src);
1290 }
1291
1292 static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1293 {
1294 if (dest != src) {
1295 tcg_out_ext32u(s, dest, src);
1296 }
1297 }
1298
1299 static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src)
1300 {
1301 tcg_out_ext32u(s, dest, src);
1302 }
1303
1304 static inline void tcg_out_bswap64(TCGContext *s, int reg)
1305 {
1306 tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1307 }
1308
1309 static void tgen_arithi(TCGContext *s, int c, int r0,
1310 tcg_target_long val, int cf)
1311 {
1312 int rexw = 0;
1313
1314 if (TCG_TARGET_REG_BITS == 64) {
1315 rexw = c & -8;
1316 c &= 7;
1317 }
1318
1319 switch (c) {
1320 case ARITH_ADD:
1321 case ARITH_SUB:
1322 if (!cf) {
1323 /*
1324 * ??? While INC is 2 bytes shorter than ADDL $1, they also induce
1325 * partial flags update stalls on Pentium4 and are not recommended
1326 * by current Intel optimization manuals.
1327 */
1328 if (val == 1 || val == -1) {
1329 int is_inc = (c == ARITH_ADD) ^ (val < 0);
1330 if (TCG_TARGET_REG_BITS == 64) {
1331 /*
1332 * The single-byte increment encodings are re-tasked
1333 * as the REX prefixes. Use the MODRM encoding.
1334 */
1335 tcg_out_modrm(s, OPC_GRP5 + rexw,
1336 (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1337 } else {
1338 tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1339 }
1340 return;
1341 }
1342 if (val == 128) {
1343 /*
1344 * Facilitate using an 8-bit immediate. Carry is inverted
1345 * by this transformation, so do it only if cf == 0.
1346 */
1347 c ^= ARITH_ADD ^ ARITH_SUB;
1348 val = -128;
1349 }
1350 }
1351 break;
1352
1353 case ARITH_AND:
1354 if (TCG_TARGET_REG_BITS == 64) {
1355 if (val == 0xffffffffu) {
1356 tcg_out_ext32u(s, r0, r0);
1357 return;
1358 }
1359 if (val == (uint32_t)val) {
1360 /* AND with no high bits set can use a 32-bit operation. */
1361 rexw = 0;
1362 }
1363 }
1364 if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1365 tcg_out_ext8u(s, r0, r0);
1366 return;
1367 }
1368 if (val == 0xffffu) {
1369 tcg_out_ext16u(s, r0, r0);
1370 return;
1371 }
1372 break;
1373 }
1374
1375 if (val == (int8_t)val) {
1376 tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1377 tcg_out8(s, val);
1378 return;
1379 }
1380 if (rexw == 0 || val == (int32_t)val) {
1381 tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1382 tcg_out32(s, val);
1383 return;
1384 }
1385
1386 g_assert_not_reached();
1387 }
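/*
 * Example of the 128 -> -128 rewrite above, with r0 = %eax: "add $128"
 * would need the imm32 form 81 c0 80 00 00 00 (6 bytes), while the
 * rewritten "sub $-128" fits the imm8 form 83 e8 80 (3 bytes).  The carry
 * flag is inverted by the rewrite, which is why it is skipped when cf
 * is set.
 */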
1388
1389 static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1390 {
1391 if (val != 0) {
1392 tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1393 }
1394 }
1395
1396 /* Set SMALL to force a short forward branch. */
1397 static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small)
1398 {
1399 int32_t val, val1;
1400
1401 if (l->has_value) {
1402 val = tcg_pcrel_diff(s, l->u.value_ptr);
1403 val1 = val - 2;
1404 if ((int8_t)val1 == val1) {
1405 if (opc == -1) {
1406 tcg_out8(s, OPC_JMP_short);
1407 } else {
1408 tcg_out8(s, OPC_JCC_short + opc);
1409 }
1410 tcg_out8(s, val1);
1411 } else {
1412 tcg_debug_assert(!small);
1413 if (opc == -1) {
1414 tcg_out8(s, OPC_JMP_long);
1415 tcg_out32(s, val - 5);
1416 } else {
1417 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1418 tcg_out32(s, val - 6);
1419 }
1420 }
1421 } else if (small) {
1422 if (opc == -1) {
1423 tcg_out8(s, OPC_JMP_short);
1424 } else {
1425 tcg_out8(s, OPC_JCC_short + opc);
1426 }
1427 tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1428 s->code_ptr += 1;
1429 } else {
1430 if (opc == -1) {
1431 tcg_out8(s, OPC_JMP_long);
1432 } else {
1433 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1434 }
1435 tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1436 s->code_ptr += 4;
1437 }
1438 }
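/*
 * The adjustments above reflect instruction lengths: a short jmp/jcc is
 * 2 bytes (opcode + rel8, hence val - 2), "jmp rel32" is 5 bytes (hence
 * val - 5) and a long jcc is 6 bytes (0f 8x + rel32, hence val - 6),
 * since relative displacements are measured from the end of the insn.
 */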
1439
1440 static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
1441 int const_arg2, int rexw)
1442 {
1443 if (const_arg2) {
1444 if (arg2 == 0) {
1445 /* test r, r */
1446 tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1447 } else {
1448 tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1449 }
1450 } else {
1451 tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1452 }
1453 }
1454
1455 static void tcg_out_brcond(TCGContext *s, int rexw, TCGCond cond,
1456 TCGArg arg1, TCGArg arg2, int const_arg2,
1457 TCGLabel *label, bool small)
1458 {
1459 tcg_out_cmp(s, arg1, arg2, const_arg2, rexw);
1460 tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1461 }
1462
1463 #if TCG_TARGET_REG_BITS == 32
1464 static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
1465 const int *const_args, bool small)
1466 {
1467 TCGLabel *label_next = gen_new_label();
1468 TCGLabel *label_this = arg_label(args[5]);
1469
1470 switch(args[4]) {
1471 case TCG_COND_EQ:
1472 tcg_out_brcond(s, 0, TCG_COND_NE, args[0], args[2], const_args[2],
1473 label_next, 1);
1474 tcg_out_brcond(s, 0, TCG_COND_EQ, args[1], args[3], const_args[3],
1475 label_this, small);
1476 break;
1477 case TCG_COND_NE:
1478 tcg_out_brcond(s, 0, TCG_COND_NE, args[0], args[2], const_args[2],
1479 label_this, small);
1480 tcg_out_brcond(s, 0, TCG_COND_NE, args[1], args[3], const_args[3],
1481 label_this, small);
1482 break;
1483 case TCG_COND_LT:
1484 tcg_out_brcond(s, 0, TCG_COND_LT, args[1], args[3], const_args[3],
1485 label_this, small);
1486 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1487 tcg_out_brcond(s, 0, TCG_COND_LTU, args[0], args[2], const_args[2],
1488 label_this, small);
1489 break;
1490 case TCG_COND_LE:
1491 tcg_out_brcond(s, 0, TCG_COND_LT, args[1], args[3], const_args[3],
1492 label_this, small);
1493 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1494 tcg_out_brcond(s, 0, TCG_COND_LEU, args[0], args[2], const_args[2],
1495 label_this, small);
1496 break;
1497 case TCG_COND_GT:
1498 tcg_out_brcond(s, 0, TCG_COND_GT, args[1], args[3], const_args[3],
1499 label_this, small);
1500 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1501 tcg_out_brcond(s, 0, TCG_COND_GTU, args[0], args[2], const_args[2],
1502 label_this, small);
1503 break;
1504 case TCG_COND_GE:
1505 tcg_out_brcond(s, 0, TCG_COND_GT, args[1], args[3], const_args[3],
1506 label_this, small);
1507 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1508 tcg_out_brcond(s, 0, TCG_COND_GEU, args[0], args[2], const_args[2],
1509 label_this, small);
1510 break;
1511 case TCG_COND_LTU:
1512 tcg_out_brcond(s, 0, TCG_COND_LTU, args[1], args[3], const_args[3],
1513 label_this, small);
1514 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1515 tcg_out_brcond(s, 0, TCG_COND_LTU, args[0], args[2], const_args[2],
1516 label_this, small);
1517 break;
1518 case TCG_COND_LEU:
1519 tcg_out_brcond(s, 0, TCG_COND_LTU, args[1], args[3], const_args[3],
1520 label_this, small);
1521 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1522 tcg_out_brcond(s, 0, TCG_COND_LEU, args[0], args[2], const_args[2],
1523 label_this, small);
1524 break;
1525 case TCG_COND_GTU:
1526 tcg_out_brcond(s, 0, TCG_COND_GTU, args[1], args[3], const_args[3],
1527 label_this, small);
1528 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1529 tcg_out_brcond(s, 0, TCG_COND_GTU, args[0], args[2], const_args[2],
1530 label_this, small);
1531 break;
1532 case TCG_COND_GEU:
1533 tcg_out_brcond(s, 0, TCG_COND_GTU, args[1], args[3], const_args[3],
1534 label_this, small);
1535 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1536 tcg_out_brcond(s, 0, TCG_COND_GEU, args[0], args[2], const_args[2],
1537 label_this, small);
1538 break;
1539 default:
1540 g_assert_not_reached();
1541 }
1542 tcg_out_label(s, label_next);
1543 }
1544 #endif
1545
1546 static void tcg_out_setcond(TCGContext *s, int rexw, TCGCond cond,
1547 TCGArg dest, TCGArg arg1, TCGArg arg2,
1548 int const_arg2, bool neg)
1549 {
1550 bool inv = false;
1551 bool cleared;
1552
1553 switch (cond) {
1554 case TCG_COND_NE:
1555 inv = true;
1556 /* fall through */
1557 case TCG_COND_EQ:
1558 /* If arg2 is 0, convert to LTU/GEU vs 1: x == 0 iff x <u 1. */
1559 if (const_arg2 && arg2 == 0) {
1560 arg2 = 1;
1561 goto do_ltu;
1562 }
1563 break;
1564
1565 case TCG_COND_LEU:
1566 inv = true;
1567 /* fall through */
1568 case TCG_COND_GTU:
1569 /* If arg2 is a register, swap for LTU/GEU. */
1570 if (!const_arg2) {
1571 TCGReg t = arg1;
1572 arg1 = arg2;
1573 arg2 = t;
1574 goto do_ltu;
1575 }
1576 break;
1577
1578 case TCG_COND_GEU:
1579 inv = true;
1580 /* fall through */
1581 case TCG_COND_LTU:
1582 do_ltu:
1583 /*
1584 * Relying on the carry bit, use SBB to produce -1 if LTU, 0 if GEU.
1585 * We can then use NEG or INC to produce the desired result.
1586 * This is always smaller than the SETCC expansion.
1587 */
1588 tcg_out_cmp(s, arg1, arg2, const_arg2, rexw);
1589
1590 /* X - X - C = -C = (C ? -1 : 0) */
1591 tgen_arithr(s, ARITH_SBB + (neg ? rexw : 0), dest, dest);
1592 if (inv && neg) {
1593 /* ~(C ? -1 : 0) = (C ? 0 : -1) */
1594 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
1595 } else if (inv) {
1596 /* (C ? -1 : 0) + 1 = (C ? 0 : 1) */
1597 tgen_arithi(s, ARITH_ADD, dest, 1, 0);
1598 } else if (!neg) {
1599 /* -(C ? -1 : 0) = (C ? 1 : 0) */
1600 tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_NEG, dest);
1601 }
1602 return;
1603
1604 case TCG_COND_GE:
1605 inv = true;
1606 /* fall through */
1607 case TCG_COND_LT:
1608 /* If arg2 is 0, extract the sign bit. */
1609 if (const_arg2 && arg2 == 0) {
1610 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, dest, arg1);
1611 if (inv) {
1612 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
1613 }
1614 tcg_out_shifti(s, (neg ? SHIFT_SAR : SHIFT_SHR) + rexw,
1615 dest, rexw ? 63 : 31);
1616 return;
1617 }
1618 break;
1619
1620 default:
1621 break;
1622 }
1623
1624 /*
1625 * If dest does not overlap the inputs, clearing it first is preferred.
1626 * The XOR breaks any false dependency for the low-byte write to dest,
1627 * and is also one byte smaller than MOVZBL.
1628 */
1629 cleared = false;
1630 if (dest != arg1 && (const_arg2 || dest != arg2)) {
1631 tgen_arithr(s, ARITH_XOR, dest, dest);
1632 cleared = true;
1633 }
1634
1635 tcg_out_cmp(s, arg1, arg2, const_arg2, rexw);
1636 tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1637
1638 if (!cleared) {
1639 tcg_out_ext8u(s, dest, dest);
1640 }
1641 if (neg) {
1642 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, dest);
1643 }
1644 }
1645
1646 #if TCG_TARGET_REG_BITS == 32
1647 static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1648 const int *const_args)
1649 {
1650 TCGArg new_args[6];
1651 TCGLabel *label_true, *label_over;
1652
1653 memcpy(new_args, args+1, 5*sizeof(TCGArg));
1654
1655 if (args[0] == args[1] || args[0] == args[2]
1656 || (!const_args[3] && args[0] == args[3])
1657 || (!const_args[4] && args[0] == args[4])) {
1658 /* When the destination overlaps with one of the argument
1659 registers, don't do anything tricky. */
1660 label_true = gen_new_label();
1661 label_over = gen_new_label();
1662
1663 new_args[5] = label_arg(label_true);
1664 tcg_out_brcond2(s, new_args, const_args+1, 1);
1665
1666 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1667 tcg_out_jxx(s, JCC_JMP, label_over, 1);
1668 tcg_out_label(s, label_true);
1669
1670 tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1671 tcg_out_label(s, label_over);
1672 } else {
1673 /* When the destination does not overlap one of the arguments,
1674 clear the destination first, jump if cond false, and emit an
1675 increment in the true case. This results in smaller code. */
1676
1677 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1678
1679 label_over = gen_new_label();
1680 new_args[4] = tcg_invert_cond(new_args[4]);
1681 new_args[5] = label_arg(label_over);
1682 tcg_out_brcond2(s, new_args, const_args+1, 1);
1683
1684 tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1685 tcg_out_label(s, label_over);
1686 }
1687 }
1688 #endif
1689
1690 static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1691 TCGReg dest, TCGReg v1)
1692 {
1693 if (have_cmov) {
1694 tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1695 } else {
1696 TCGLabel *over = gen_new_label();
1697 tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1698 tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1699 tcg_out_label(s, over);
1700 }
1701 }
1702
1703 static void tcg_out_movcond(TCGContext *s, int rexw, TCGCond cond,
1704 TCGReg dest, TCGReg c1, TCGArg c2, int const_c2,
1705 TCGReg v1)
1706 {
1707 tcg_out_cmp(s, c1, c2, const_c2, rexw);
1708 tcg_out_cmov(s, cond, rexw, dest, v1);
1709 }
1710
1711 static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1712 TCGArg arg2, bool const_a2)
1713 {
1714 if (have_bmi1) {
1715 tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1716 if (const_a2) {
1717 tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1718 } else {
1719 tcg_debug_assert(dest != arg2);
1720 tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1721 }
1722 } else {
1723 tcg_debug_assert(dest != arg2);
1724 tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1725 tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1726 }
1727 }
1728
1729 static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1730 TCGArg arg2, bool const_a2)
1731 {
1732 if (have_lzcnt) {
1733 tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1734 if (const_a2) {
1735 tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1736 } else {
1737 tcg_debug_assert(dest != arg2);
1738 tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1739 }
1740 } else {
1741 tcg_debug_assert(!const_a2);
1742 tcg_debug_assert(dest != arg1);
1743 tcg_debug_assert(dest != arg2);
1744
1745 /* Recall that the output of BSR is the index not the count. */
1746 tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1747 tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1748
1749 /* Since we have destroyed the flags from BSR, we have to re-test. */
1750 tcg_out_cmp(s, arg1, 0, 1, rexw);
1751 tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1752 }
1753 }
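/*
 * A note on the BSR path above: for nonzero input, clz(x) equals
 * (31 or 63) - bsr(x), and because that constant is all ones the
 * subtraction is equivalent to the XOR emitted above.
 */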
1754
1755 static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
1756 {
1757 intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1758
1759 if (disp == (int32_t)disp) {
1760 tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1761 tcg_out32(s, disp);
1762 } else {
1763 /* rip-relative addressing into the constant pool.
1764 This is 6 + 8 = 14 bytes, as compared to using an
1765 immediate load 10 + 6 = 16 bytes, plus we may
1766 be able to re-use the pool constant for more calls. */
1767 tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1768 tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1769 new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1770 tcg_out32(s, 0);
1771 }
1772 }
1773
1774 static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest,
1775 const TCGHelperInfo *info)
1776 {
1777 tcg_out_branch(s, 1, dest);
1778
1779 #ifndef _WIN32
1780 if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) {
1781 /*
1782 * The sysv i386 abi for struct return places a reference as the
1783 * first argument on the stack, and pops that argument with the
1784 * return statement. Since we want to retain the aligned stack
1785 * pointer for the callee, we do not want to actually push that
1786 * argument before the call but rely on the normal store to the
1787 * stack slot. But we do need to compensate for the pop in order
1788 * to reset our correct stack pointer value.
1789 * Pushing a garbage value back onto the stack is quickest.
1790 */
1791 tcg_out_push(s, TCG_REG_EAX);
1792 }
1793 #endif
1794 }
1795
1796 static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
1797 {
1798 tcg_out_branch(s, 0, dest);
1799 }
1800
1801 static void tcg_out_nopn(TCGContext *s, int n)
1802 {
1803 int i;
1804 /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1805 * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1806 * duplicate prefix, and all of the interesting recent cores can
1807 * decode and discard the duplicates in a single cycle.
1808 */
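/* For example, n == 3 emits 66 66 90, i.e. "xchg %ax,%ax" with one
 * redundant operand size prefix.
 */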
1809 tcg_debug_assert(n >= 1);
1810 for (i = 1; i < n; ++i) {
1811 tcg_out8(s, 0x66);
1812 }
1813 tcg_out8(s, 0x90);
1814 }
1815
1816 /* Test register R vs immediate bits I, setting Z flag for EQ/NE. */
1817 static void __attribute__((unused))
1818 tcg_out_testi(TCGContext *s, TCGReg r, uint32_t i)
1819 {
1820 /*
1821 * This is used for testing alignment, so we can usually use testb.
1822 * For i686, we have to use testl for %esi/%edi.
1823 */
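/*
 * Without a REX prefix, byte-operand register codes 4..7 name
 * %ah/%ch/%dh/%bh rather than the low bytes of %esp/%ebp/%esi/%edi,
 * hence the r < 4 check below on i686.
 */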
1824 if (i <= 0xff && (TCG_TARGET_REG_BITS == 64 || r < 4)) {
1825 tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, r);
1826 tcg_out8(s, i);
1827 } else {
1828 tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_TESTi, r);
1829 tcg_out32(s, i);
1830 }
1831 }
1832
1833 typedef struct {
1834 TCGReg base;
1835 int index;
1836 int ofs;
1837 int seg;
1838 TCGAtomAlign aa;
1839 } HostAddress;
1840
1841 bool tcg_target_has_memory_bswap(MemOp memop)
1842 {
1843 TCGAtomAlign aa;
1844
1845 if (!have_movbe) {
1846 return false;
1847 }
1848 if ((memop & MO_SIZE) < MO_128) {
1849 return true;
1850 }
1851
1852 /*
1853 * Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA,
1854 * but do allow a pair of 64-bit operations, i.e. MOVBEQ.
1855 */
1856 aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true);
1857 return aa.atom < MO_128;
1858 }
1859
1860 /*
1861 * Because i686 has no register parameters and because x86_64 has xchg
1862 * to handle addr/data register overlap, we have placed all input arguments
1863 * before we might need a scratch reg.
1864 *
1865 * Even then, a scratch is only needed for l->raddr. Rather than expose
1866 * a general-purpose scratch when we don't actually know it's available,
1867 * use the ra_gen hook to load into RAX if needed.
1868 */
1869 #if TCG_TARGET_REG_BITS == 64
1870 static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg)
1871 {
1872 if (arg < 0) {
1873 arg = TCG_REG_RAX;
1874 }
1875 tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr);
1876 return arg;
1877 }
1878 static const TCGLdstHelperParam ldst_helper_param = {
1879 .ra_gen = ldst_ra_gen
1880 };
1881 #else
1882 static const TCGLdstHelperParam ldst_helper_param = { };
1883 #endif
1884
1885 static void tcg_out_vec_to_pair(TCGContext *s, TCGType type,
1886 TCGReg l, TCGReg h, TCGReg v)
1887 {
1888 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1889
1890 /* vpmov{d,q} %v, %l */
1891 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, v, 0, l);
1892 /* vpextr{d,q} $1, %v, %h */
1893 tcg_out_vex_modrm(s, OPC_PEXTRD + rexw, v, 0, h);
1894 tcg_out8(s, 1);
1895 }
1896
1897 static void tcg_out_pair_to_vec(TCGContext *s, TCGType type,
1898 TCGReg v, TCGReg l, TCGReg h)
1899 {
1900 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1901
1902 /* vmov{d,q} %l, %v */
1903 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, v, 0, l);
1904 /* vpinsr{d,q} $1, %h, %v, %v */
1905 tcg_out_vex_modrm(s, OPC_PINSRD + rexw, v, v, h);
1906 tcg_out8(s, 1);
1907 }
1908
1909 /*
1910 * Generate code for the slow path for a load at the end of block
1911 */
1912 static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1913 {
1914 MemOp opc = get_memop(l->oi);
1915 tcg_insn_unit **label_ptr = &l->label_ptr[0];
1916
1917 /* resolve label address */
1918 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1919 if (label_ptr[1]) {
1920 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1921 }
1922
1923 tcg_out_ld_helper_args(s, l, &ldst_helper_param);
1924 tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]);
1925 tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param);
1926
1927 tcg_out_jmp(s, l->raddr);
1928 return true;
1929 }
1930
1931 /*
1932 * Generate code for the slow path for a store at the end of block
1933 */
1934 static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1935 {
1936 MemOp opc = get_memop(l->oi);
1937 tcg_insn_unit **label_ptr = &l->label_ptr[0];
1938
1939 /* resolve label address */
1940 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1941 if (label_ptr[1]) {
1942 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1943 }
1944
1945 tcg_out_st_helper_args(s, l, &ldst_helper_param);
1946 tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]);
1947
1948 tcg_out_jmp(s, l->raddr);
1949 return true;
1950 }
1951
1952 #ifdef CONFIG_USER_ONLY
1953 static HostAddress x86_guest_base = {
1954 .index = -1
1955 };
1956
1957 #if defined(__x86_64__) && defined(__linux__)
1958 # include <asm/prctl.h>
1959 # include <sys/prctl.h>
1960 int arch_prctl(int code, unsigned long addr);
1961 static inline int setup_guest_base_seg(void)
1962 {
1963 if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
1964 return P_GS;
1965 }
1966 return 0;
1967 }
1968 #define setup_guest_base_seg setup_guest_base_seg
1969 #elif defined(__x86_64__) && \
1970 (defined (__FreeBSD__) || defined (__FreeBSD_kernel__))
1971 # include <machine/sysarch.h>
1972 static inline int setup_guest_base_seg(void)
1973 {
1974 if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
1975 return P_GS;
1976 }
1977 return 0;
1978 }
1979 #define setup_guest_base_seg setup_guest_base_seg
1980 #endif
1981 #else
1982 # define x86_guest_base (*(HostAddress *)({ qemu_build_not_reached(); NULL; }))
1983 #endif /* CONFIG_USER_ONLY */
1984 #ifndef setup_guest_base_seg
1985 # define setup_guest_base_seg() 0
1986 #endif
1987
1988 #define MIN_TLB_MASK_TABLE_OFS INT_MIN
1989
1990 /*
1991 * For softmmu, perform the TLB load and compare.
1992 * For useronly, perform any required alignment tests.
1993 * In both cases, return a TCGLabelQemuLdst structure if the slow path
1994 * is required and fill in @h with the host address for the fast path.
1995 */
1996 static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
1997 TCGReg addrlo, TCGReg addrhi,
1998 MemOpIdx oi, bool is_ld)
1999 {
2000 TCGLabelQemuLdst *ldst = NULL;
2001 MemOp opc = get_memop(oi);
2002 MemOp s_bits = opc & MO_SIZE;
2003 unsigned a_mask;
2004
2005 if (tcg_use_softmmu) {
2006 h->index = TCG_REG_L0;
2007 h->ofs = 0;
2008 h->seg = 0;
2009 } else {
2010 *h = x86_guest_base;
2011 }
2012 h->base = addrlo;
2013 h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128);
2014 a_mask = (1 << h->aa.align) - 1;
2015
2016 if (tcg_use_softmmu) {
2017 int cmp_ofs = is_ld ? offsetof(CPUTLBEntry, addr_read)
2018 : offsetof(CPUTLBEntry, addr_write);
2019 TCGType ttype = TCG_TYPE_I32;
2020 TCGType tlbtype = TCG_TYPE_I32;
2021 int trexw = 0, hrexw = 0, tlbrexw = 0;
2022 unsigned mem_index = get_mmuidx(oi);
2023 unsigned s_mask = (1 << s_bits) - 1;
2024 int fast_ofs = tlb_mask_table_ofs(s, mem_index);
2025 int tlb_mask;
2026
2027 ldst = new_ldst_label(s);
2028 ldst->is_ld = is_ld;
2029 ldst->oi = oi;
2030 ldst->addrlo_reg = addrlo;
2031 ldst->addrhi_reg = addrhi;
2032
2033 if (TCG_TARGET_REG_BITS == 64) {
2034 ttype = s->addr_type;
2035 trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW);
2036 if (TCG_TYPE_PTR == TCG_TYPE_I64) {
2037 hrexw = P_REXW;
2038 if (s->page_bits + s->tlb_dyn_max_bits > 32) {
2039 tlbtype = TCG_TYPE_I64;
2040 tlbrexw = P_REXW;
2041 }
2042 }
2043 }
2044
2045 tcg_out_mov(s, tlbtype, TCG_REG_L0, addrlo);
2046 tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0,
2047 s->page_bits - CPU_TLB_ENTRY_BITS);
2048
2049 tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0,
2050 fast_ofs + offsetof(CPUTLBDescFast, mask));
2051
2052 tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0,
2053 fast_ofs + offsetof(CPUTLBDescFast, table));
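/*
 * TCG_REG_L0 now points to the CPUTLBEntry for this address: the shift
 * leaves the page number scaled by the entry size, the mask field is
 * (n_entries - 1) << CPU_TLB_ENTRY_BITS, and the add supplies the
 * table base.
 */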
2054
2055 /*
2056 * If the required alignment is at least as large as the access,
2057 * simply copy the address and mask. For lesser alignments,
2058 * check that we don't cross pages for the complete access.
2059 */
2060 if (a_mask >= s_mask) {
2061 tcg_out_mov(s, ttype, TCG_REG_L1, addrlo);
2062 } else {
2063 tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1,
2064 addrlo, s_mask - a_mask);
2065 }
2066 tlb_mask = s->page_mask | a_mask;
2067 tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0);
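/*
 * For example, an 8-byte load that only requires byte alignment has
 * s_mask = 7 and a_mask = 0: L1 = (addr + 7) & page_mask, so an access
 * that crosses a page yields the next page number, the comparison
 * below fails, and we take the slow path.
 */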
2068
2069 /* cmp 0(TCG_REG_L0), TCG_REG_L1 */
2070 tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw,
2071 TCG_REG_L1, TCG_REG_L0, cmp_ofs);
2072
2073 /* jne slow_path */
2074 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2075 ldst->label_ptr[0] = s->code_ptr;
2076 s->code_ptr += 4;
2077
2078 if (TCG_TARGET_REG_BITS == 32 && s->addr_type == TCG_TYPE_I64) {
2079 /* cmp 4(TCG_REG_L0), addrhi */
2080 tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi,
2081 TCG_REG_L0, cmp_ofs + 4);
2082
2083 /* jne slow_path */
2084 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2085 ldst->label_ptr[1] = s->code_ptr;
2086 s->code_ptr += 4;
2087 }
2088
2089 /* TLB Hit. */
2090 tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0,
2091 offsetof(CPUTLBEntry, addend));
2092 } else if (a_mask) {
2093 ldst = new_ldst_label(s);
2094
2095 ldst->is_ld = is_ld;
2096 ldst->oi = oi;
2097 ldst->addrlo_reg = addrlo;
2098 ldst->addrhi_reg = addrhi;
2099
2100 tcg_out_testi(s, addrlo, a_mask);
2101 /* jne slow_path */
2102 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2103 ldst->label_ptr[0] = s->code_ptr;
2104 s->code_ptr += 4;
2105 }
2106
2107 return ldst;
2108 }
2109
2110 static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2111 HostAddress h, TCGType type, MemOp memop)
2112 {
2113 bool use_movbe = false;
2114 int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
2115 int movop = OPC_MOVL_GvEv;
2116
2117 /* Do big-endian loads with movbe. */
2118 if (memop & MO_BSWAP) {
2119 tcg_debug_assert(have_movbe);
2120 use_movbe = true;
2121 movop = OPC_MOVBE_GyMy;
2122 }
2123
2124 switch (memop & MO_SSIZE) {
2125 case MO_UB:
2126 tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo,
2127 h.base, h.index, 0, h.ofs);
2128 break;
2129 case MO_SB:
2130 tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo,
2131 h.base, h.index, 0, h.ofs);
2132 break;
2133 case MO_UW:
2134 if (use_movbe) {
2135 /* There is no extending movbe; only low 16-bits are modified. */
2136 if (datalo != h.base && datalo != h.index) {
2137 /* XOR breaks dependency chains. */
2138 tgen_arithr(s, ARITH_XOR, datalo, datalo);
2139 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2140 datalo, h.base, h.index, 0, h.ofs);
2141 } else {
2142 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2143 datalo, h.base, h.index, 0, h.ofs);
2144 tcg_out_ext16u(s, datalo, datalo);
2145 }
2146 } else {
2147 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo,
2148 h.base, h.index, 0, h.ofs);
2149 }
2150 break;
2151 case MO_SW:
2152 if (use_movbe) {
2153 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2154 datalo, h.base, h.index, 0, h.ofs);
2155 tcg_out_ext16s(s, type, datalo, datalo);
2156 } else {
2157 tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg,
2158 datalo, h.base, h.index, 0, h.ofs);
2159 }
2160 break;
2161 case MO_UL:
2162 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2163 h.base, h.index, 0, h.ofs);
2164 break;
2165 #if TCG_TARGET_REG_BITS == 64
2166 case MO_SL:
2167 if (use_movbe) {
2168 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo,
2169 h.base, h.index, 0, h.ofs);
2170 tcg_out_ext32s(s, datalo, datalo);
2171 } else {
2172 tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo,
2173 h.base, h.index, 0, h.ofs);
2174 }
2175 break;
2176 #endif
2177 case MO_UQ:
2178 if (TCG_TARGET_REG_BITS == 64) {
2179 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2180 h.base, h.index, 0, h.ofs);
2181 break;
2182 }
2183 if (use_movbe) {
2184 TCGReg t = datalo;
2185 datalo = datahi;
2186 datahi = t;
2187 }
2188 if (h.base == datalo || h.index == datalo) {
2189 tcg_out_modrm_sib_offset(s, OPC_LEA, datahi,
2190 h.base, h.index, 0, h.ofs);
2191 tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0);
2192 tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4);
2193 } else {
2194 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2195 h.base, h.index, 0, h.ofs);
2196 tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
2197 h.base, h.index, 0, h.ofs + 4);
2198 }
2199 break;
2200
2201 case MO_128:
2202 tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2203
2204 /*
2205 * Without 16-byte atomicity, use integer regs.
2206 * That is where we want the data, and it allows bswaps.
2207 */
2208 if (h.aa.atom < MO_128) {
2209 if (use_movbe) {
2210 TCGReg t = datalo;
2211 datalo = datahi;
2212 datahi = t;
2213 }
2214 if (h.base == datalo || h.index == datalo) {
2215 tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, datahi,
2216 h.base, h.index, 0, h.ofs);
2217 tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
2218 datalo, datahi, 0);
2219 tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
2220 datahi, datahi, 8);
2221 } else {
2222 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2223 h.base, h.index, 0, h.ofs);
2224 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
2225 h.base, h.index, 0, h.ofs + 8);
2226 }
2227 break;
2228 }
2229
2230 /*
2231 * With 16-byte atomicity, a vector load is required.
2232 * If we already have 16-byte alignment, then VMOVDQA always works.
2233 * Else if VMOVDQU has atomicity with dynamic alignment, use that.
2234 * Else we require a runtime test for alignment for VMOVDQA;
2235 * use VMOVDQU on the unaligned nonatomic path for simplicity.
2236 */
2237 if (h.aa.align >= MO_128) {
2238 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
2239 TCG_TMP_VEC, 0,
2240 h.base, h.index, 0, h.ofs);
2241 } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
2242 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
2243 TCG_TMP_VEC, 0,
2244 h.base, h.index, 0, h.ofs);
2245 } else {
2246 TCGLabel *l1 = gen_new_label();
2247 TCGLabel *l2 = gen_new_label();
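/*
 * Emit, in effect:
 *     test $15, base
 *     jne  1f
 *     vmovdqa disp(base,index), TCG_TMP_VEC
 *     jmp  2f
 * 1:  vmovdqu disp(base,index), TCG_TMP_VEC
 * 2:
 */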
2248
2249 tcg_out_testi(s, h.base, 15);
2250 tcg_out_jxx(s, JCC_JNE, l1, true);
2251
2252 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
2253 TCG_TMP_VEC, 0,
2254 h.base, h.index, 0, h.ofs);
2255 tcg_out_jxx(s, JCC_JMP, l2, true);
2256
2257 tcg_out_label(s, l1);
2258 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
2259 TCG_TMP_VEC, 0,
2260 h.base, h.index, 0, h.ofs);
2261 tcg_out_label(s, l2);
2262 }
2263 tcg_out_vec_to_pair(s, TCG_TYPE_I64, datalo, datahi, TCG_TMP_VEC);
2264 break;
2265
2266 default:
2267 g_assert_not_reached();
2268 }
2269 }
2270
2271 static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi,
2272 TCGReg addrlo, TCGReg addrhi,
2273 MemOpIdx oi, TCGType data_type)
2274 {
2275 TCGLabelQemuLdst *ldst;
2276 HostAddress h;
2277
2278 ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, true);
2279 tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi));
2280
2281 if (ldst) {
2282 ldst->type = data_type;
2283 ldst->datalo_reg = datalo;
2284 ldst->datahi_reg = datahi;
2285 ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2286 }
2287 }
2288
2289 static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2290 HostAddress h, MemOp memop)
2291 {
2292 bool use_movbe = false;
2293 int movop = OPC_MOVL_EvGv;
2294
2295 /*
2296 * Do big-endian stores with movbe or system-mode.
2297 * User-only without movbe will have its swapping done generically.
2298 */
2299 if (memop & MO_BSWAP) {
2300 tcg_debug_assert(have_movbe);
2301 use_movbe = true;
2302 movop = OPC_MOVBE_MyGy;
2303 }
2304
2305 switch (memop & MO_SIZE) {
2306 case MO_8:
2307 /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
2308 tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
2309 tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg,
2310 datalo, h.base, h.index, 0, h.ofs);
2311 break;
2312 case MO_16:
2313 tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo,
2314 h.base, h.index, 0, h.ofs);
2315 break;
2316 case MO_32:
2317 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2318 h.base, h.index, 0, h.ofs);
2319 break;
2320 case MO_64:
2321 if (TCG_TARGET_REG_BITS == 64) {
2322 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2323 h.base, h.index, 0, h.ofs);
2324 } else {
2325 if (use_movbe) {
2326 TCGReg t = datalo;
2327 datalo = datahi;
2328 datahi = t;
2329 }
2330 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2331 h.base, h.index, 0, h.ofs);
2332 tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
2333 h.base, h.index, 0, h.ofs + 4);
2334 }
2335 break;
2336
2337 case MO_128:
2338 tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2339
2340 /*
2341 * Without 16-byte atomicity, use integer regs.
2342 * That is where we have the data, and it allows bswaps.
2343 */
2344 if (h.aa.atom < MO_128) {
2345 if (use_movbe) {
2346 TCGReg t = datalo;
2347 datalo = datahi;
2348 datahi = t;
2349 }
2350 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2351 h.base, h.index, 0, h.ofs);
2352 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
2353 h.base, h.index, 0, h.ofs + 8);
2354 break;
2355 }
2356
2357 /*
2358 * With 16-byte atomicity, a vector store is required.
2359 * If we already have 16-byte alignment, then VMOVDQA always works.
2360 * Else if VMOVDQU has atomicity with dynamic alignment, use that.
2361 * Else we require a runtime test for alignment for VMOVDQA;
2362 * use VMOVDQU on the unaligned nonatomic path for simplicity.
2363 */
2364 tcg_out_pair_to_vec(s, TCG_TYPE_I64, TCG_TMP_VEC, datalo, datahi);
2365 if (h.aa.align >= MO_128) {
2366 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
2367 TCG_TMP_VEC, 0,
2368 h.base, h.index, 0, h.ofs);
2369 } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
2370 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
2371 TCG_TMP_VEC, 0,
2372 h.base, h.index, 0, h.ofs);
2373 } else {
2374 TCGLabel *l1 = gen_new_label();
2375 TCGLabel *l2 = gen_new_label();
2376
2377 tcg_out_testi(s, h.base, 15);
2378 tcg_out_jxx(s, JCC_JNE, l1, true);
2379
2380 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
2381 TCG_TMP_VEC, 0,
2382 h.base, h.index, 0, h.ofs);
2383 tcg_out_jxx(s, JCC_JMP, l2, true);
2384
2385 tcg_out_label(s, l1);
2386 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
2387 TCG_TMP_VEC, 0,
2388 h.base, h.index, 0, h.ofs);
2389 tcg_out_label(s, l2);
2390 }
2391 break;
2392
2393 default:
2394 g_assert_not_reached();
2395 }
2396 }
2397
2398 static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi,
2399 TCGReg addrlo, TCGReg addrhi,
2400 MemOpIdx oi, TCGType data_type)
2401 {
2402 TCGLabelQemuLdst *ldst;
2403 HostAddress h;
2404
2405 ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, false);
2406 tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi));
2407
2408 if (ldst) {
2409 ldst->type = data_type;
2410 ldst->datalo_reg = datalo;
2411 ldst->datahi_reg = datahi;
2412 ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2413 }
2414 }
2415
2416 static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
2417 {
2418 /* Reuse the zeroing that exists for goto_ptr. */
2419 if (a0 == 0) {
2420 tcg_out_jmp(s, tcg_code_gen_epilogue);
2421 } else {
2422 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2423 tcg_out_jmp(s, tb_ret_addr);
2424 }
2425 }
2426
2427 static void tcg_out_goto_tb(TCGContext *s, int which)
2428 {
2429 /*
2430 * Jump displacement must be aligned for atomic patching;
2431 * see if we need to add extra nops before jump
2432 */
2433 int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
2434 if (gap != 1) {
2435 tcg_out_nopn(s, gap - 1);
2436 }
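/*
 * The JMP opcode is a single byte, so aligning s->code_ptr + 1 aligns
 * the 32-bit displacement that tb_target_set_jmp_target patches; e.g.
 * with code_ptr % 4 == 0 the gap is 4 and a 3-byte nop is emitted.
 */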
2437 tcg_out8(s, OPC_JMP_long); /* jmp im */
2438 set_jmp_insn_offset(s, which);
2439 tcg_out32(s, 0);
2440 set_jmp_reset_offset(s, which);
2441 }
2442
2443 void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
2444 uintptr_t jmp_rx, uintptr_t jmp_rw)
2445 {
2446 /* patch the branch destination */
2447 uintptr_t addr = tb->jmp_target_addr[n];
2448 qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4));
2449 /* no need to flush icache explicitly */
2450 }
2451
2452 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2453 const TCGArg args[TCG_MAX_OP_ARGS],
2454 const int const_args[TCG_MAX_OP_ARGS])
2455 {
2456 TCGArg a0, a1, a2;
2457 int c, const_a2, vexop, rexw = 0;
2458
2459 #if TCG_TARGET_REG_BITS == 64
2460 # define OP_32_64(x) \
2461 case glue(glue(INDEX_op_, x), _i64): \
2462 rexw = P_REXW; /* FALLTHRU */ \
2463 case glue(glue(INDEX_op_, x), _i32)
2464 #else
2465 # define OP_32_64(x) \
2466 case glue(glue(INDEX_op_, x), _i32)
2467 #endif
2468
2469 /* Hoist the loads of the most common arguments. */
2470 a0 = args[0];
2471 a1 = args[1];
2472 a2 = args[2];
2473 const_a2 = const_args[2];
2474
2475 switch (opc) {
2476 case INDEX_op_goto_ptr:
2477 /* jmp to the given host address (could be epilogue) */
2478 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2479 break;
2480 case INDEX_op_br:
2481 tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2482 break;
2483 OP_32_64(ld8u):
2484 /* Note that we can ignore REXW for the zero-extend to 64-bit. */
2485 tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2486 break;
2487 OP_32_64(ld8s):
2488 tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2489 break;
2490 OP_32_64(ld16u):
2491 /* Note that we can ignore REXW for the zero-extend to 64-bit. */
2492 tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2493 break;
2494 OP_32_64(ld16s):
2495 tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2496 break;
2497 #if TCG_TARGET_REG_BITS == 64
2498 case INDEX_op_ld32u_i64:
2499 #endif
2500 case INDEX_op_ld_i32:
2501 tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2502 break;
2503
2504 OP_32_64(st8):
2505 if (const_args[0]) {
2506 tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2507 tcg_out8(s, a0);
2508 } else {
2509 tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2510 }
2511 break;
2512 OP_32_64(st16):
2513 if (const_args[0]) {
2514 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2515 tcg_out16(s, a0);
2516 } else {
2517 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2518 }
2519 break;
2520 #if TCG_TARGET_REG_BITS == 64
2521 case INDEX_op_st32_i64:
2522 #endif
2523 case INDEX_op_st_i32:
2524 if (const_args[0]) {
2525 tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2526 tcg_out32(s, a0);
2527 } else {
2528 tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2529 }
2530 break;
2531
2532 OP_32_64(add):
2533 /* For 3-operand addition, use LEA. */
2534 if (a0 != a1) {
2535 TCGArg c3 = 0;
2536 if (const_a2) {
2537 c3 = a2, a2 = -1;
2538 } else if (a0 == a2) {
2539 /* Watch out for dest = src + dest, since we've removed
2540 the matching constraint on the add. */
2541 tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2542 break;
2543 }
2544
2545 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2546 break;
2547 }
2548 c = ARITH_ADD;
2549 goto gen_arith;
2550 OP_32_64(sub):
2551 c = ARITH_SUB;
2552 goto gen_arith;
2553 OP_32_64(and):
2554 c = ARITH_AND;
2555 goto gen_arith;
2556 OP_32_64(or):
2557 c = ARITH_OR;
2558 goto gen_arith;
2559 OP_32_64(xor):
2560 c = ARITH_XOR;
2561 goto gen_arith;
2562 gen_arith:
2563 if (const_a2) {
2564 tgen_arithi(s, c + rexw, a0, a2, 0);
2565 } else {
2566 tgen_arithr(s, c + rexw, a0, a2);
2567 }
2568 break;
2569
2570 OP_32_64(andc):
2571 if (const_a2) {
2572 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2573 tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2574 } else {
2575 tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2576 }
2577 break;
2578
2579 OP_32_64(mul):
2580 if (const_a2) {
2581 int32_t val;
2582 val = a2;
2583 if (val == (int8_t)val) {
2584 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2585 tcg_out8(s, val);
2586 } else {
2587 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2588 tcg_out32(s, val);
2589 }
2590 } else {
2591 tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2592 }
2593 break;
2594
2595 OP_32_64(div2):
2596 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2597 break;
2598 OP_32_64(divu2):
2599 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2600 break;
2601
2602 OP_32_64(shl):
2603 /* For small constant 3-operand shift, use LEA. */
2604 if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2605 if (a2 - 1 == 0) {
2606 /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2607 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2608 } else {
2609 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2610 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2611 }
2612 break;
2613 }
2614 c = SHIFT_SHL;
2615 vexop = OPC_SHLX;
2616 goto gen_shift_maybe_vex;
2617 OP_32_64(shr):
2618 c = SHIFT_SHR;
2619 vexop = OPC_SHRX;
2620 goto gen_shift_maybe_vex;
2621 OP_32_64(sar):
2622 c = SHIFT_SAR;
2623 vexop = OPC_SARX;
2624 goto gen_shift_maybe_vex;
2625 OP_32_64(rotl):
2626 c = SHIFT_ROL;
2627 goto gen_shift;
2628 OP_32_64(rotr):
2629 c = SHIFT_ROR;
2630 goto gen_shift;
2631 gen_shift_maybe_vex:
2632 if (have_bmi2) {
2633 if (!const_a2) {
2634 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2635 break;
2636 }
2637 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2638 }
2639 /* FALLTHRU */
2640 gen_shift:
2641 if (const_a2) {
2642 tcg_out_shifti(s, c + rexw, a0, a2);
2643 } else {
2644 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2645 }
2646 break;
2647
2648 OP_32_64(ctz):
2649 tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2650 break;
2651 OP_32_64(clz):
2652 tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2653 break;
2654 OP_32_64(ctpop):
2655 tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2656 break;
2657
2658 OP_32_64(brcond):
2659 tcg_out_brcond(s, rexw, a2, a0, a1, const_args[1],
2660 arg_label(args[3]), 0);
2661 break;
2662 OP_32_64(setcond):
2663 tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, false);
2664 break;
2665 OP_32_64(negsetcond):
2666 tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, true);
2667 break;
2668 OP_32_64(movcond):
2669 tcg_out_movcond(s, rexw, args[5], a0, a1, a2, const_a2, args[3]);
2670 break;
2671
2672 OP_32_64(bswap16):
2673 if (a2 & TCG_BSWAP_OS) {
2674 /* Output must be sign-extended. */
2675 if (rexw) {
2676 tcg_out_bswap64(s, a0);
2677 tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48);
2678 } else {
2679 tcg_out_bswap32(s, a0);
2680 tcg_out_shifti(s, SHIFT_SAR, a0, 16);
2681 }
2682 } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
2683 /* Output must be zero-extended, but input isn't. */
2684 tcg_out_bswap32(s, a0);
2685 tcg_out_shifti(s, SHIFT_SHR, a0, 16);
2686 } else {
2687 tcg_out_rolw_8(s, a0);
2688 }
2689 break;
2690 OP_32_64(bswap32):
2691 tcg_out_bswap32(s, a0);
2692 if (rexw && (a2 & TCG_BSWAP_OS)) {
2693 tcg_out_ext32s(s, a0, a0);
2694 }
2695 break;
2696
2697 OP_32_64(neg):
2698 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2699 break;
2700 OP_32_64(not):
2701 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2702 break;
2703
2704 case INDEX_op_qemu_ld_a64_i32:
2705 if (TCG_TARGET_REG_BITS == 32) {
2706 tcg_out_qemu_ld(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2707 break;
2708 }
2709 /* fall through */
2710 case INDEX_op_qemu_ld_a32_i32:
2711 tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2712 break;
2713 case INDEX_op_qemu_ld_a32_i64:
2714 if (TCG_TARGET_REG_BITS == 64) {
2715 tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2716 } else {
2717 tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2718 }
2719 break;
2720 case INDEX_op_qemu_ld_a64_i64:
2721 if (TCG_TARGET_REG_BITS == 64) {
2722 tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2723 } else {
2724 tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2725 }
2726 break;
2727 case INDEX_op_qemu_ld_a32_i128:
2728 case INDEX_op_qemu_ld_a64_i128:
2729 tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2730 tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
2731 break;
2732
2733 case INDEX_op_qemu_st_a64_i32:
2734 case INDEX_op_qemu_st8_a64_i32:
2735 if (TCG_TARGET_REG_BITS == 32) {
2736 tcg_out_qemu_st(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2737 break;
2738 }
2739 /* fall through */
2740 case INDEX_op_qemu_st_a32_i32:
2741 case INDEX_op_qemu_st8_a32_i32:
2742 tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2743 break;
2744 case INDEX_op_qemu_st_a32_i64:
2745 if (TCG_TARGET_REG_BITS == 64) {
2746 tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2747 } else {
2748 tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2749 }
2750 break;
2751 case INDEX_op_qemu_st_a64_i64:
2752 if (TCG_TARGET_REG_BITS == 64) {
2753 tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2754 } else {
2755 tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2756 }
2757 break;
2758 case INDEX_op_qemu_st_a32_i128:
2759 case INDEX_op_qemu_st_a64_i128:
2760 tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2761 tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
2762 break;
2763
2764 OP_32_64(mulu2):
2765 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2766 break;
2767 OP_32_64(muls2):
2768 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2769 break;
2770 OP_32_64(add2):
2771 if (const_args[4]) {
2772 tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2773 } else {
2774 tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2775 }
2776 if (const_args[5]) {
2777 tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2778 } else {
2779 tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2780 }
2781 break;
2782 OP_32_64(sub2):
2783 if (const_args[4]) {
2784 tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2785 } else {
2786 tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2787 }
2788 if (const_args[5]) {
2789 tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2790 } else {
2791 tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2792 }
2793 break;
2794
2795 #if TCG_TARGET_REG_BITS == 32
2796 case INDEX_op_brcond2_i32:
2797 tcg_out_brcond2(s, args, const_args, 0);
2798 break;
2799 case INDEX_op_setcond2_i32:
2800 tcg_out_setcond2(s, args, const_args);
2801 break;
2802 #else /* TCG_TARGET_REG_BITS == 64 */
2803 case INDEX_op_ld32s_i64:
2804 tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2805 break;
2806 case INDEX_op_ld_i64:
2807 tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2808 break;
2809 case INDEX_op_st_i64:
2810 if (const_args[0]) {
2811 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2812 tcg_out32(s, a0);
2813 } else {
2814 tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2815 }
2816 break;
2817
2818 case INDEX_op_bswap64_i64:
2819 tcg_out_bswap64(s, a0);
2820 break;
2821 case INDEX_op_extrh_i64_i32:
2822 tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2823 break;
2824 #endif
2825
2826 OP_32_64(deposit):
2827 if (args[3] == 0 && args[4] == 8) {
2828 /* load bits 0..7 */
2829 if (const_a2) {
2830 tcg_out_opc(s, OPC_MOVB_Ib | P_REXB_RM | LOWREGMASK(a0),
2831 0, a0, 0);
2832 tcg_out8(s, a2);
2833 } else {
2834 tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2835 }
2836 } else if (TCG_TARGET_REG_BITS == 32 && args[3] == 8 && args[4] == 8) {
2837 /* load bits 8..15 */
2838 if (const_a2) {
2839 tcg_out8(s, OPC_MOVB_Ib + a0 + 4);
2840 tcg_out8(s, a2);
2841 } else {
2842 tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2843 }
2844 } else if (args[3] == 0 && args[4] == 16) {
2845 /* load bits 0..15 */
2846 if (const_a2) {
2847 tcg_out_opc(s, OPC_MOVL_Iv | P_DATA16 | LOWREGMASK(a0),
2848 0, a0, 0);
2849 tcg_out16(s, a2);
2850 } else {
2851 tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2852 }
2853 } else {
2854 g_assert_not_reached();
2855 }
2856 break;
2857
2858 case INDEX_op_extract_i64:
2859 if (a2 + args[3] == 32) {
2860 /* This is a 32-bit zero-extending right shift. */
2861 tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2862 tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2863 break;
2864 }
2865 /* FALLTHRU */
2866 case INDEX_op_extract_i32:
2867 /* On the off-chance that we can use the high-byte registers, do so.
2868 Otherwise we emit the same ext16 + shift pattern that we
2869 would have gotten from the normal tcg-op.c expansion. */
2870 tcg_debug_assert(a2 == 8 && args[3] == 8);
2871 if (a1 < 4 && a0 < 8) {
2872 tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2873 } else {
2874 tcg_out_ext16u(s, a0, a1);
2875 tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2876 }
2877 break;
2878
2879 case INDEX_op_sextract_i32:
2880 /* We don't implement sextract_i64, as we cannot sign-extend to
2881 64-bits without using the REX prefix that explicitly excludes
2882 access to the high-byte registers. */
2883 tcg_debug_assert(a2 == 8 && args[3] == 8);
2884 if (a1 < 4 && a0 < 8) {
2885 tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2886 } else {
2887 tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1);
2888 tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2889 }
2890 break;
2891
2892 OP_32_64(extract2):
2893 /* Note that SHRD outputs to the r/m operand. */
2894 tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
2895 tcg_out8(s, args[3]);
2896 break;
2897
2898 case INDEX_op_mb:
2899 tcg_out_mb(s, a0);
2900 break;
2901 case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
2902 case INDEX_op_mov_i64:
2903 case INDEX_op_call: /* Always emitted via tcg_out_call. */
2904 case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
2905 case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */
2906 case INDEX_op_ext8s_i32: /* Always emitted via tcg_reg_alloc_op. */
2907 case INDEX_op_ext8s_i64:
2908 case INDEX_op_ext8u_i32:
2909 case INDEX_op_ext8u_i64:
2910 case INDEX_op_ext16s_i32:
2911 case INDEX_op_ext16s_i64:
2912 case INDEX_op_ext16u_i32:
2913 case INDEX_op_ext16u_i64:
2914 case INDEX_op_ext32s_i64:
2915 case INDEX_op_ext32u_i64:
2916 case INDEX_op_ext_i32_i64:
2917 case INDEX_op_extu_i32_i64:
2918 case INDEX_op_extrl_i64_i32:
2919 default:
2920 g_assert_not_reached();
2921 }
2922
2923 #undef OP_32_64
2924 }
2925
2926 static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2927 unsigned vecl, unsigned vece,
2928 const TCGArg args[TCG_MAX_OP_ARGS],
2929 const int const_args[TCG_MAX_OP_ARGS])
2930 {
2931 static int const add_insn[4] = {
2932 OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2933 };
2934 static int const ssadd_insn[4] = {
2935 OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
2936 };
2937 static int const usadd_insn[4] = {
2938 OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
2939 };
2940 static int const sub_insn[4] = {
2941 OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2942 };
2943 static int const sssub_insn[4] = {
2944 OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
2945 };
2946 static int const ussub_insn[4] = {
2947 OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
2948 };
2949 static int const mul_insn[4] = {
2950 OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ
2951 };
2952 static int const shift_imm_insn[4] = {
2953 OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2954 };
2955 static int const cmpeq_insn[4] = {
2956 OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2957 };
2958 static int const cmpgt_insn[4] = {
2959 OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2960 };
2961 static int const punpckl_insn[4] = {
2962 OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2963 };
2964 static int const punpckh_insn[4] = {
2965 OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2966 };
2967 static int const packss_insn[4] = {
2968 OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2969 };
2970 static int const packus_insn[4] = {
2971 OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2972 };
2973 static int const smin_insn[4] = {
2974 OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ
2975 };
2976 static int const smax_insn[4] = {
2977 OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ
2978 };
2979 static int const umin_insn[4] = {
2980 OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ
2981 };
2982 static int const umax_insn[4] = {
2983 OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ
2984 };
2985 static int const rotlv_insn[4] = {
2986 OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ
2987 };
2988 static int const rotrv_insn[4] = {
2989 OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ
2990 };
2991 static int const shlv_insn[4] = {
2992 OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ
2993 };
2994 static int const shrv_insn[4] = {
2995 OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ
2996 };
2997 static int const sarv_insn[4] = {
2998 OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ
2999 };
3000 static int const shls_insn[4] = {
3001 OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
3002 };
3003 static int const shrs_insn[4] = {
3004 OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
3005 };
3006 static int const sars_insn[4] = {
3007 OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ
3008 };
3009 static int const vpshldi_insn[4] = {
3010 OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ
3011 };
3012 static int const vpshldv_insn[4] = {
3013 OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ
3014 };
3015 static int const vpshrdv_insn[4] = {
3016 OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ
3017 };
3018 static int const abs_insn[4] = {
3019 OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ
3020 };
3021
3022 TCGType type = vecl + TCG_TYPE_V64;
3023 int insn, sub;
3024 TCGArg a0, a1, a2, a3;
3025
3026 a0 = args[0];
3027 a1 = args[1];
3028 a2 = args[2];
3029
3030 switch (opc) {
3031 case INDEX_op_add_vec:
3032 insn = add_insn[vece];
3033 goto gen_simd;
3034 case INDEX_op_ssadd_vec:
3035 insn = ssadd_insn[vece];
3036 goto gen_simd;
3037 case INDEX_op_usadd_vec:
3038 insn = usadd_insn[vece];
3039 goto gen_simd;
3040 case INDEX_op_sub_vec:
3041 insn = sub_insn[vece];
3042 goto gen_simd;
3043 case INDEX_op_sssub_vec:
3044 insn = sssub_insn[vece];
3045 goto gen_simd;
3046 case INDEX_op_ussub_vec:
3047 insn = ussub_insn[vece];
3048 goto gen_simd;
3049 case INDEX_op_mul_vec:
3050 insn = mul_insn[vece];
3051 goto gen_simd;
3052 case INDEX_op_and_vec:
3053 insn = OPC_PAND;
3054 goto gen_simd;
3055 case INDEX_op_or_vec:
3056 insn = OPC_POR;
3057 goto gen_simd;
3058 case INDEX_op_xor_vec:
3059 insn = OPC_PXOR;
3060 goto gen_simd;
3061 case INDEX_op_smin_vec:
3062 insn = smin_insn[vece];
3063 goto gen_simd;
3064 case INDEX_op_umin_vec:
3065 insn = umin_insn[vece];
3066 goto gen_simd;
3067 case INDEX_op_smax_vec:
3068 insn = smax_insn[vece];
3069 goto gen_simd;
3070 case INDEX_op_umax_vec:
3071 insn = umax_insn[vece];
3072 goto gen_simd;
3073 case INDEX_op_shlv_vec:
3074 insn = shlv_insn[vece];
3075 goto gen_simd;
3076 case INDEX_op_shrv_vec:
3077 insn = shrv_insn[vece];
3078 goto gen_simd;
3079 case INDEX_op_sarv_vec:
3080 insn = sarv_insn[vece];
3081 goto gen_simd;
3082 case INDEX_op_rotlv_vec:
3083 insn = rotlv_insn[vece];
3084 goto gen_simd;
3085 case INDEX_op_rotrv_vec:
3086 insn = rotrv_insn[vece];
3087 goto gen_simd;
3088 case INDEX_op_shls_vec:
3089 insn = shls_insn[vece];
3090 goto gen_simd;
3091 case INDEX_op_shrs_vec:
3092 insn = shrs_insn[vece];
3093 goto gen_simd;
3094 case INDEX_op_sars_vec:
3095 insn = sars_insn[vece];
3096 goto gen_simd;
3097 case INDEX_op_x86_punpckl_vec:
3098 insn = punpckl_insn[vece];
3099 goto gen_simd;
3100 case INDEX_op_x86_punpckh_vec:
3101 insn = punpckh_insn[vece];
3102 goto gen_simd;
3103 case INDEX_op_x86_packss_vec:
3104 insn = packss_insn[vece];
3105 goto gen_simd;
3106 case INDEX_op_x86_packus_vec:
3107 insn = packus_insn[vece];
3108 goto gen_simd;
3109 case INDEX_op_x86_vpshldv_vec:
3110 insn = vpshldv_insn[vece];
3111 a1 = a2;
3112 a2 = args[3];
3113 goto gen_simd;
3114 case INDEX_op_x86_vpshrdv_vec:
3115 insn = vpshrdv_insn[vece];
3116 a1 = a2;
3117 a2 = args[3];
3118 goto gen_simd;
3119 #if TCG_TARGET_REG_BITS == 32
3120 case INDEX_op_dup2_vec:
3121 /* First merge the two 32-bit inputs to a single 64-bit element. */
3122 tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
3123 /* Then replicate the 64-bit elements across the rest of the vector. */
3124 if (type != TCG_TYPE_V64) {
3125 tcg_out_dup_vec(s, type, MO_64, a0, a0);
3126 }
3127 break;
3128 #endif
3129 case INDEX_op_abs_vec:
3130 insn = abs_insn[vece];
3131 a2 = a1;
3132 a1 = 0;
3133 goto gen_simd;
3134 gen_simd:
3135 tcg_debug_assert(insn != OPC_UD2);
3136 if (type == TCG_TYPE_V256) {
3137 insn |= P_VEXL;
3138 }
3139 tcg_out_vex_modrm(s, insn, a0, a1, a2);
3140 break;
3141
3142 case INDEX_op_cmp_vec:
3143 sub = args[3];
3144 if (sub == TCG_COND_EQ) {
3145 insn = cmpeq_insn[vece];
3146 } else if (sub == TCG_COND_GT) {
3147 insn = cmpgt_insn[vece];
3148 } else {
3149 g_assert_not_reached();
3150 }
3151 goto gen_simd;
3152
3153 case INDEX_op_andc_vec:
3154 insn = OPC_PANDN;
3155 if (type == TCG_TYPE_V256) {
3156 insn |= P_VEXL;
3157 }
3158 tcg_out_vex_modrm(s, insn, a0, a2, a1);
3159 break;
3160
3161 case INDEX_op_shli_vec:
3162 insn = shift_imm_insn[vece];
3163 sub = 6;
3164 goto gen_shift;
3165 case INDEX_op_shri_vec:
3166 insn = shift_imm_insn[vece];
3167 sub = 2;
3168 goto gen_shift;
3169 case INDEX_op_sari_vec:
3170 if (vece == MO_64) {
3171 insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX;
3172 } else {
3173 insn = shift_imm_insn[vece];
3174 }
3175 sub = 4;
3176 goto gen_shift;
3177 case INDEX_op_rotli_vec:
3178 insn = OPC_PSHIFTD_Ib | P_EVEX; /* VPROL[DQ] */
3179 if (vece == MO_64) {
3180 insn |= P_VEXW;
3181 }
3182 sub = 1;
3183 goto gen_shift;
3184 gen_shift:
3185 tcg_debug_assert(vece != MO_8);
3186 if (type == TCG_TYPE_V256) {
3187 insn |= P_VEXL;
3188 }
3189 tcg_out_vex_modrm(s, insn, sub, a0, a1);
3190 tcg_out8(s, a2);
3191 break;
3192
3193 case INDEX_op_ld_vec:
3194 tcg_out_ld(s, type, a0, a1, a2);
3195 break;
3196 case INDEX_op_st_vec:
3197 tcg_out_st(s, type, a0, a1, a2);
3198 break;
3199 case INDEX_op_dupm_vec:
3200 tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
3201 break;
3202
3203 case INDEX_op_x86_shufps_vec:
3204 insn = OPC_SHUFPS;
3205 sub = args[3];
3206 goto gen_simd_imm8;
3207 case INDEX_op_x86_blend_vec:
3208 if (vece == MO_16) {
3209 insn = OPC_PBLENDW;
3210 } else if (vece == MO_32) {
3211 insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
3212 } else {
3213 g_assert_not_reached();
3214 }
3215 sub = args[3];
3216 goto gen_simd_imm8;
3217 case INDEX_op_x86_vperm2i128_vec:
3218 insn = OPC_VPERM2I128;
3219 sub = args[3];
3220 goto gen_simd_imm8;
3221 case INDEX_op_x86_vpshldi_vec:
3222 insn = vpshldi_insn[vece];
3223 sub = args[3];
3224 goto gen_simd_imm8;
3225
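/*
 * The VPTERNLOG immediate is a 3-input truth table applied bitwise:
 * bit (a*4 + b*2 + c) of the immediate is the result for operand bits
 * a (destination), b and c.  E.g. 0x33 = 00110011b is set exactly when
 * b is clear (NOT b), and 0x99 = 10011001b when b == c (XNOR).
 */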
3226 case INDEX_op_not_vec:
3227 insn = OPC_VPTERNLOGQ;
3228 a2 = a1;
3229 sub = 0x33; /* !B */
3230 goto gen_simd_imm8;
3231 case INDEX_op_nor_vec:
3232 insn = OPC_VPTERNLOGQ;
3233 sub = 0x11; /* norCB */
3234 goto gen_simd_imm8;
3235 case INDEX_op_nand_vec:
3236 insn = OPC_VPTERNLOGQ;
3237 sub = 0x77; /* nandCB */
3238 goto gen_simd_imm8;
3239 case INDEX_op_eqv_vec:
3240 insn = OPC_VPTERNLOGQ;
3241 sub = 0x99; /* xnorCB */
3242 goto gen_simd_imm8;
3243 case INDEX_op_orc_vec:
3244 insn = OPC_VPTERNLOGQ;
3245 sub = 0xdd; /* orB!C */
3246 goto gen_simd_imm8;
3247
3248 case INDEX_op_bitsel_vec:
3249 insn = OPC_VPTERNLOGQ;
3250 a3 = args[3];
3251 if (a0 == a1) {
3252 a1 = a2;
3253 a2 = a3;
3254 sub = 0xca; /* A?B:C */
3255 } else if (a0 == a2) {
3256 a2 = a3;
3257 sub = 0xe2; /* B?A:C */
3258 } else {
3259 tcg_out_mov(s, type, a0, a3);
3260 sub = 0xb8; /* B?C:A */
3261 }
3262 goto gen_simd_imm8;
3263
3264 gen_simd_imm8:
3265 tcg_debug_assert(insn != OPC_UD2);
3266 if (type == TCG_TYPE_V256) {
3267 insn |= P_VEXL;
3268 }
3269 tcg_out_vex_modrm(s, insn, a0, a1, a2);
3270 tcg_out8(s, sub);
3271 break;
3272
3273 case INDEX_op_x86_vpblendvb_vec:
3274 insn = OPC_VPBLENDVB;
3275 if (type == TCG_TYPE_V256) {
3276 insn |= P_VEXL;
3277 }
3278 tcg_out_vex_modrm(s, insn, a0, a1, a2);
3279 tcg_out8(s, args[3] << 4);
3280 break;
3281
3282 case INDEX_op_x86_psrldq_vec:
3283 tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
3284 tcg_out8(s, a2);
3285 break;
3286
3287 case INDEX_op_mov_vec: /* Always emitted via tcg_out_mov. */
3288 case INDEX_op_dup_vec: /* Always emitted via tcg_out_dup_vec. */
3289 default:
3290 g_assert_not_reached();
3291 }
3292 }
3293
3294 static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
3295 {
3296 switch (op) {
3297 case INDEX_op_goto_ptr:
3298 return C_O0_I1(r);
3299
3300 case INDEX_op_ld8u_i32:
3301 case INDEX_op_ld8u_i64:
3302 case INDEX_op_ld8s_i32:
3303 case INDEX_op_ld8s_i64:
3304 case INDEX_op_ld16u_i32:
3305 case INDEX_op_ld16u_i64:
3306 case INDEX_op_ld16s_i32:
3307 case INDEX_op_ld16s_i64:
3308 case INDEX_op_ld_i32:
3309 case INDEX_op_ld32u_i64:
3310 case INDEX_op_ld32s_i64:
3311 case INDEX_op_ld_i64:
3312 return C_O1_I1(r, r);
3313
3314 case INDEX_op_st8_i32:
3315 case INDEX_op_st8_i64:
3316 return C_O0_I2(qi, r);
3317
3318 case INDEX_op_st16_i32:
3319 case INDEX_op_st16_i64:
3320 case INDEX_op_st_i32:
3321 case INDEX_op_st32_i64:
3322 return C_O0_I2(ri, r);
3323
3324 case INDEX_op_st_i64:
3325 return C_O0_I2(re, r);
3326
3327 case INDEX_op_add_i32:
3328 case INDEX_op_add_i64:
3329 return C_O1_I2(r, r, re);
3330
3331 case INDEX_op_sub_i32:
3332 case INDEX_op_sub_i64:
3333 case INDEX_op_mul_i32:
3334 case INDEX_op_mul_i64:
3335 case INDEX_op_or_i32:
3336 case INDEX_op_or_i64:
3337 case INDEX_op_xor_i32:
3338 case INDEX_op_xor_i64:
3339 return C_O1_I2(r, 0, re);
3340
3341 case INDEX_op_and_i32:
3342 case INDEX_op_and_i64:
3343 return C_O1_I2(r, 0, reZ);
3344
3345 case INDEX_op_andc_i32:
3346 case INDEX_op_andc_i64:
3347 return C_O1_I2(r, r, rI);
3348
3349 case INDEX_op_shl_i32:
3350 case INDEX_op_shl_i64:
3351 case INDEX_op_shr_i32:
3352 case INDEX_op_shr_i64:
3353 case INDEX_op_sar_i32:
3354 case INDEX_op_sar_i64:
3355 return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);
3356
3357 case INDEX_op_rotl_i32:
3358 case INDEX_op_rotl_i64:
3359 case INDEX_op_rotr_i32:
3360 case INDEX_op_rotr_i64:
3361 return C_O1_I2(r, 0, ci);
3362
3363 case INDEX_op_brcond_i32:
3364 case INDEX_op_brcond_i64:
3365 return C_O0_I2(r, re);
3366
3367 case INDEX_op_bswap16_i32:
3368 case INDEX_op_bswap16_i64:
3369 case INDEX_op_bswap32_i32:
3370 case INDEX_op_bswap32_i64:
3371 case INDEX_op_bswap64_i64:
3372 case INDEX_op_neg_i32:
3373 case INDEX_op_neg_i64:
3374 case INDEX_op_not_i32:
3375 case INDEX_op_not_i64:
3376 case INDEX_op_extrh_i64_i32:
3377 return C_O1_I1(r, 0);
3378
3379 case INDEX_op_ext8s_i32:
3380 case INDEX_op_ext8s_i64:
3381 case INDEX_op_ext8u_i32:
3382 case INDEX_op_ext8u_i64:
3383 return C_O1_I1(r, q);
3384
3385 case INDEX_op_ext16s_i32:
3386 case INDEX_op_ext16s_i64:
3387 case INDEX_op_ext16u_i32:
3388 case INDEX_op_ext16u_i64:
3389 case INDEX_op_ext32s_i64:
3390 case INDEX_op_ext32u_i64:
3391 case INDEX_op_ext_i32_i64:
3392 case INDEX_op_extu_i32_i64:
3393 case INDEX_op_extrl_i64_i32:
3394 case INDEX_op_extract_i32:
3395 case INDEX_op_extract_i64:
3396 case INDEX_op_sextract_i32:
3397 case INDEX_op_ctpop_i32:
3398 case INDEX_op_ctpop_i64:
3399 return C_O1_I1(r, r);
3400
3401 case INDEX_op_extract2_i32:
3402 case INDEX_op_extract2_i64:
3403 return C_O1_I2(r, 0, r);
3404
3405 case INDEX_op_deposit_i32:
3406 case INDEX_op_deposit_i64:
3407 return C_O1_I2(q, 0, qi);
3408
3409 case INDEX_op_setcond_i32:
3410 case INDEX_op_setcond_i64:
3411 case INDEX_op_negsetcond_i32:
3412 case INDEX_op_negsetcond_i64:
3413 return C_O1_I2(q, r, re);
3414
3415 case INDEX_op_movcond_i32:
3416 case INDEX_op_movcond_i64:
3417 return C_O1_I4(r, r, re, r, 0);
3418
3419 case INDEX_op_div2_i32:
3420 case INDEX_op_div2_i64:
3421 case INDEX_op_divu2_i32:
3422 case INDEX_op_divu2_i64:
3423 return C_O2_I3(a, d, 0, 1, r);
3424
3425 case INDEX_op_mulu2_i32:
3426 case INDEX_op_mulu2_i64:
3427 case INDEX_op_muls2_i32:
3428 case INDEX_op_muls2_i64:
3429 return C_O2_I2(a, d, a, r);
3430
3431 case INDEX_op_add2_i32:
3432 case INDEX_op_add2_i64:
3433 case INDEX_op_sub2_i32:
3434 case INDEX_op_sub2_i64:
3435 return C_N1_O1_I4(r, r, 0, 1, re, re);
3436
3437 case INDEX_op_ctz_i32:
3438 case INDEX_op_ctz_i64:
3439 return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3440
3441 case INDEX_op_clz_i32:
3442 case INDEX_op_clz_i64:
3443 return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3444
3445 case INDEX_op_qemu_ld_a32_i32:
3446 return C_O1_I1(r, L);
3447 case INDEX_op_qemu_ld_a64_i32:
3448 return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O1_I2(r, L, L);
3449
3450 case INDEX_op_qemu_st_a32_i32:
3451 return C_O0_I2(L, L);
3452 case INDEX_op_qemu_st_a64_i32:
3453 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3454 case INDEX_op_qemu_st8_a32_i32:
3455 return C_O0_I2(s, L);
3456 case INDEX_op_qemu_st8_a64_i32:
3457 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(s, L) : C_O0_I3(s, L, L);
3458
3459 case INDEX_op_qemu_ld_a32_i64:
3460 return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I1(r, r, L);
3461 case INDEX_op_qemu_ld_a64_i64:
3462 return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I2(r, r, L, L);
3463
3464 case INDEX_op_qemu_st_a32_i64:
3465 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3466 case INDEX_op_qemu_st_a64_i64:
3467 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I4(L, L, L, L);
3468
3469 case INDEX_op_qemu_ld_a32_i128:
3470 case INDEX_op_qemu_ld_a64_i128:
3471 tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
3472 return C_O2_I1(r, r, L);
3473 case INDEX_op_qemu_st_a32_i128:
3474 case INDEX_op_qemu_st_a64_i128:
3475 tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
3476 return C_O0_I3(L, L, L);
3477
3478 case INDEX_op_brcond2_i32:
3479 return C_O0_I4(r, r, ri, ri);
3480
3481 case INDEX_op_setcond2_i32:
3482 return C_O1_I4(r, r, r, ri, ri);
3483
3484 case INDEX_op_ld_vec:
3485 case INDEX_op_dupm_vec:
3486 return C_O1_I1(x, r);
3487
3488 case INDEX_op_st_vec:
3489 return C_O0_I2(x, r);
3490
3491 case INDEX_op_add_vec:
3492 case INDEX_op_sub_vec:
3493 case INDEX_op_mul_vec:
3494 case INDEX_op_and_vec:
3495 case INDEX_op_or_vec:
3496 case INDEX_op_xor_vec:
3497 case INDEX_op_andc_vec:
3498 case INDEX_op_orc_vec:
3499 case INDEX_op_nand_vec:
3500 case INDEX_op_nor_vec:
3501 case INDEX_op_eqv_vec:
3502 case INDEX_op_ssadd_vec:
3503 case INDEX_op_usadd_vec:
3504 case INDEX_op_sssub_vec:
3505 case INDEX_op_ussub_vec:
3506 case INDEX_op_smin_vec:
3507 case INDEX_op_umin_vec:
3508 case INDEX_op_smax_vec:
3509 case INDEX_op_umax_vec:
3510 case INDEX_op_shlv_vec:
3511 case INDEX_op_shrv_vec:
3512 case INDEX_op_sarv_vec:
3513 case INDEX_op_rotlv_vec:
3514 case INDEX_op_rotrv_vec:
3515 case INDEX_op_shls_vec:
3516 case INDEX_op_shrs_vec:
3517 case INDEX_op_sars_vec:
3518 case INDEX_op_cmp_vec:
3519 case INDEX_op_x86_shufps_vec:
3520 case INDEX_op_x86_blend_vec:
3521 case INDEX_op_x86_packss_vec:
3522 case INDEX_op_x86_packus_vec:
3523 case INDEX_op_x86_vperm2i128_vec:
3524 case INDEX_op_x86_punpckl_vec:
3525 case INDEX_op_x86_punpckh_vec:
3526 case INDEX_op_x86_vpshldi_vec:
3527 #if TCG_TARGET_REG_BITS == 32
3528 case INDEX_op_dup2_vec:
3529 #endif
3530 return C_O1_I2(x, x, x);
3531
3532 case INDEX_op_abs_vec:
3533 case INDEX_op_dup_vec:
3534 case INDEX_op_not_vec:
3535 case INDEX_op_shli_vec:
3536 case INDEX_op_shri_vec:
3537 case INDEX_op_sari_vec:
3538 case INDEX_op_rotli_vec:
3539 case INDEX_op_x86_psrldq_vec:
3540 return C_O1_I1(x, x);
3541
3542 case INDEX_op_x86_vpshldv_vec:
3543 case INDEX_op_x86_vpshrdv_vec:
3544 return C_O1_I3(x, 0, x, x);
3545
3546 case INDEX_op_bitsel_vec:
3547 case INDEX_op_x86_vpblendvb_vec:
3548 return C_O1_I3(x, x, x, x);
3549
3550 default:
3551 g_assert_not_reached();
3552 }
3553 }
3554
3555 int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3556 {
3557 switch (opc) {
3558 case INDEX_op_add_vec:
3559 case INDEX_op_sub_vec:
3560 case INDEX_op_and_vec:
3561 case INDEX_op_or_vec:
3562 case INDEX_op_xor_vec:
3563 case INDEX_op_andc_vec:
3564 case INDEX_op_orc_vec:
3565 case INDEX_op_nand_vec:
3566 case INDEX_op_nor_vec:
3567 case INDEX_op_eqv_vec:
3568 case INDEX_op_not_vec:
3569 case INDEX_op_bitsel_vec:
3570 return 1;
3571 case INDEX_op_cmp_vec:
3572 case INDEX_op_cmpsel_vec:
3573 return -1;
3574
3575 case INDEX_op_rotli_vec:
3576 return have_avx512vl && vece >= MO_32 ? 1 : -1;
3577
3578 case INDEX_op_shli_vec:
3579 case INDEX_op_shri_vec:
3580 /* We must expand the operation for MO_8. */
3581 return vece == MO_8 ? -1 : 1;
3582
3583 case INDEX_op_sari_vec:
3584 switch (vece) {
3585 case MO_8:
3586 return -1;
3587 case MO_16:
3588 case MO_32:
3589 return 1;
3590 case MO_64:
3591 if (have_avx512vl) {
3592 return 1;
3593 }
3594 /*
3595 * We can emulate this for MO_64, but it does not pay off
3596 * unless we're producing at least 4 values.
3597 */
3598 return type >= TCG_TYPE_V256 ? -1 : 0;
3599 }
3600 return 0;
3601
3602 case INDEX_op_shls_vec:
3603 case INDEX_op_shrs_vec:
3604 return vece >= MO_16;
3605 case INDEX_op_sars_vec:
3606 switch (vece) {
3607 case MO_16:
3608 case MO_32:
3609 return 1;
3610 case MO_64:
3611 return have_avx512vl;
3612 }
3613 return 0;
3614 case INDEX_op_rotls_vec:
3615 return vece >= MO_16 ? -1 : 0;
3616
3617 case INDEX_op_shlv_vec:
3618 case INDEX_op_shrv_vec:
3619 switch (vece) {
3620 case MO_16:
3621 return have_avx512bw;
3622 case MO_32:
3623 case MO_64:
3624 return have_avx2;
3625 }
3626 return 0;
3627 case INDEX_op_sarv_vec:
3628 switch (vece) {
3629 case MO_16:
3630 return have_avx512bw;
3631 case MO_32:
3632 return have_avx2;
3633 case MO_64:
3634 return have_avx512vl;
3635 }
3636 return 0;
3637 case INDEX_op_rotlv_vec:
3638 case INDEX_op_rotrv_vec:
3639 switch (vece) {
3640 case MO_16:
3641 return have_avx512vbmi2 ? -1 : 0;
3642 case MO_32:
3643 case MO_64:
3644 return have_avx512vl ? 1 : have_avx2 ? -1 : 0;
3645 }
3646 return 0;
3647
3648 case INDEX_op_mul_vec:
3649 switch (vece) {
3650 case MO_8:
3651 return -1;
3652 case MO_64:
3653 return have_avx512dq;
3654 }
3655 return 1;
3656
3657 case INDEX_op_ssadd_vec:
3658 case INDEX_op_usadd_vec:
3659 case INDEX_op_sssub_vec:
3660 case INDEX_op_ussub_vec:
3661 return vece <= MO_16;
3662 case INDEX_op_smin_vec:
3663 case INDEX_op_smax_vec:
3664 case INDEX_op_umin_vec:
3665 case INDEX_op_umax_vec:
3666 case INDEX_op_abs_vec:
3667 return vece <= MO_32 || have_avx512vl;
3668
3669 default:
3670 return 0;
3671 }
3672 }
3673
3674 static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
3675 TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3676 {
3677 TCGv_vec t1, t2;
3678
3679 tcg_debug_assert(vece == MO_8);
3680
3681 t1 = tcg_temp_new_vec(type);
3682 t2 = tcg_temp_new_vec(type);
3683
3684 /*
3685 * Unpack to W, shift, and repack. Tricky bits:
3686 * (1) Use punpck*bw x,x to produce DDCCBBAA,
3687 * i.e. duplicate in other half of the 16-bit lane.
3688 * (2) For right-shift, add 8 so that the high half of the lane
3689 * becomes zero. For left-shift and left-rotate, we must
3690 * shift up and down again.
3691 * (3) Step 2 leaves high half zero such that PACKUSWB
3692 * (pack with unsigned saturation) does not modify
3693 * the quantity.
3694 */
3695 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3696 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3697 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3698 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3699
3700 if (opc != INDEX_op_rotli_vec) {
3701 imm += 8;
3702 }
3703 if (opc == INDEX_op_shri_vec) {
3704 tcg_gen_shri_vec(MO_16, t1, t1, imm);
3705 tcg_gen_shri_vec(MO_16, t2, t2, imm);
3706 } else {
3707 tcg_gen_shli_vec(MO_16, t1, t1, imm);
3708 tcg_gen_shli_vec(MO_16, t2, t2, imm);
3709 tcg_gen_shri_vec(MO_16, t1, t1, 8);
3710 tcg_gen_shri_vec(MO_16, t2, t2, 8);
3711 }
3712
3713 vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3714 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3715 tcg_temp_free_vec(t1);
3716 tcg_temp_free_vec(t2);
3717 }
3718
3719 static void expand_vec_sari(TCGType type, unsigned vece,
3720 TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3721 {
3722 TCGv_vec t1, t2;
3723
3724 switch (vece) {
3725 case MO_8:
3726 /* Unpack to W, shift, and repack, as in expand_vec_shi. */
3727 t1 = tcg_temp_new_vec(type);
3728 t2 = tcg_temp_new_vec(type);
3729 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3730 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3731 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3732 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3733 tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3734 tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3735 vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3736 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3737 tcg_temp_free_vec(t1);
3738 tcg_temp_free_vec(t2);
3739 break;
3740
3741 case MO_64:
3742 t1 = tcg_temp_new_vec(type);
3743 if (imm <= 32) {
3744 /*
3745 * We can emulate a small sign extend by performing an arithmetic
3746 * 32-bit shift and overwriting the high half of a 64-bit logical
3747 * shift. Note that the ISA says shift of 32 is valid, but TCG
3748 * does not, so we have to bound the smaller shift -- we get the
3749 * same result in the high half either way.
3750 */
3751 tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
3752 tcg_gen_shri_vec(MO_64, v0, v1, imm);
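/* The 0xaa mask selects the odd dword elements, i.e. the high half of
 * each 64-bit lane, from the arithmetic shift result, keeping the
 * logical shift result in the low half.
 */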
3753 vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3754 tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3755 tcgv_vec_arg(t1), 0xaa);
3756 } else {
3757 /* Otherwise we need a compare against 0 to produce the sign
3758 * bits, then shift them into place and merge.
3759 */
3760 tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1,
3761 tcg_constant_vec(type, MO_64, 0), v1);
3762 tcg_gen_shri_vec(MO_64, v0, v1, imm);
3763 tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3764 tcg_gen_or_vec(MO_64, v0, v0, t1);
3765 }
3766 tcg_temp_free_vec(t1);
3767 break;
3768
3769 default:
3770 g_assert_not_reached();
3771 }
3772 }
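
/*
 * An illustrative stand-alone sketch of the imm > 32 path above; the
 * helper name model_sari_mo64_big is hypothetical and not part of the
 * backend.  The arithmetic shift is rebuilt from the logical shift plus
 * the sign bits produced by the compare against zero.  (The imm <= 32
 * path instead takes its high 32 bits from a 32-bit arithmetic shift,
 * merged in by the blend.)
 */
#if 0
#include <stdint.h>

static uint64_t model_sari_mo64_big(uint64_t v, unsigned imm) /* 32 < imm < 64 */
{
    /* cmp_vec(GT, 0, v): all-ones where v is negative, else zero */
    uint64_t sign = (int64_t)v < 0 ? ~UINT64_C(0) : 0;

    /* shri + shli + or */
    return (v >> imm) | (sign << (64 - imm));
}
#endif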
3773
3774 static void expand_vec_rotli(TCGType type, unsigned vece,
3775 TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3776 {
3777 TCGv_vec t;
3778
3779 if (vece == MO_8) {
3780 expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
3781 return;
3782 }
3783
3784 if (have_avx512vbmi2) {
3785 vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
3786 tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
3787 return;
3788 }
3789
3790 t = tcg_temp_new_vec(type);
3791 tcg_gen_shli_vec(vece, t, v1, imm);
3792 tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
3793 tcg_gen_or_vec(vece, v0, v0, t);
3794 tcg_temp_free_vec(t);
3795 }
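
/*
 * Note on the AVX512VBMI2 path above: VPSHLDI concatenates the two source
 * elements into a double-width value, shifts it left by imm, and returns
 * the high half; with both sources equal to v1 that is exactly a rotate
 * left of v1 by imm.
 */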
3796
3797 static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
3798 TCGv_vec v1, TCGv_vec sh, bool right)
3799 {
3800 TCGv_vec t;
3801
3802 if (have_avx512vbmi2) {
3803 vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec,
3804 type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1),
3805 tcgv_vec_arg(v1), tcgv_vec_arg(sh));
3806 return;
3807 }
3808
3809 t = tcg_temp_new_vec(type);
3810 tcg_gen_dupi_vec(vece, t, 8 << vece);
3811 tcg_gen_sub_vec(vece, t, t, sh);
3812 if (right) {
3813 tcg_gen_shlv_vec(vece, t, v1, t);
3814 tcg_gen_shrv_vec(vece, v0, v1, sh);
3815 } else {
3816 tcg_gen_shrv_vec(vece, t, v1, t);
3817 tcg_gen_shlv_vec(vece, v0, v1, sh);
3818 }
3819 tcg_gen_or_vec(vece, v0, v0, t);
3820 tcg_temp_free_vec(t);
3821 }
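
/*
 * A stand-alone sketch of one MO_32 lane of the expansion above, for
 * illustration only (model_rotrv_mo32 is hypothetical).  The rotate is
 * built from two variable shifts; the x86 variable-shift instructions
 * zero a lane whose count is >= the element width, so a shift count of
 * zero (t == 32) still produces the right answer.
 */
#if 0
#include <stdint.h>

static uint32_t model_rotrv_mo32(uint32_t x, uint32_t sh) /* 0 <= sh < 32 */
{
    uint32_t t = 32 - sh;                      /* dupi(32) ; sub */
    uint32_t hi = t >= 32 ? 0 : x << t;        /* shlv: count >= 32 gives 0 */

    return (x >> sh) | hi;                     /* shrv ... or */
}
#endif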
3822
3823 static void expand_vec_rotls(TCGType type, unsigned vece,
3824 TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
3825 {
3826 TCGv_vec t = tcg_temp_new_vec(type);
3827
3828 tcg_debug_assert(vece != MO_8);
3829
3830 if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) {
3831 tcg_gen_dup_i32_vec(vece, t, lsh);
3832 if (vece >= MO_32) {
3833 tcg_gen_rotlv_vec(vece, v0, v1, t);
3834 } else {
3835 expand_vec_rotv(type, vece, v0, v1, t, false);
3836 }
3837 } else {
3838 TCGv_i32 rsh = tcg_temp_new_i32();
3839
3840 tcg_gen_neg_i32(rsh, lsh);
3841 tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
3842 tcg_gen_shls_vec(vece, t, v1, lsh);
3843 tcg_gen_shrs_vec(vece, v0, v1, rsh);
3844 tcg_gen_or_vec(vece, v0, v0, t);
3845
3846 tcg_temp_free_i32(rsh);
3847 }
3848
3849 tcg_temp_free_vec(t);
3850 }
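
/*
 * The fallback above uses the usual safe-rotate idiom: the right-shift
 * count is computed as (-lsh) & (bits - 1), i.e. (bits - lsh) mod bits,
 * so a rotate by zero degenerates to (x << 0) | (x >> 0) == x instead of
 * requiring an out-of-range shift by `bits`.
 */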
3851
3852 static void expand_vec_mul(TCGType type, unsigned vece,
3853 TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3854 {
3855 TCGv_vec t1, t2, t3, t4, zero;
3856
3857 tcg_debug_assert(vece == MO_8);
3858
3859 /*
3860 * Unpack v1 bytes to words, 0 | x.
3861 * Unpack v2 bytes to words, y | 0.
3862 * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3863 * Shift logical right by 8 bits to clear the high 8 bits before
3864 * using an unsigned saturated pack.
3865 *
3866 * The difference between the V64, V128 and V256 cases is merely how
3867 * we distribute the expansion between temporaries.
3868 */
3869 switch (type) {
3870 case TCG_TYPE_V64:
3871 t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3872 t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3873 zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3874 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3875 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3876 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3877 tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3878 tcg_gen_mul_vec(MO_16, t1, t1, t2);
3879 tcg_gen_shri_vec(MO_16, t1, t1, 8);
3880 vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3881 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3882 tcg_temp_free_vec(t1);
3883 tcg_temp_free_vec(t2);
3884 break;
3885
3886 case TCG_TYPE_V128:
3887 case TCG_TYPE_V256:
3888 t1 = tcg_temp_new_vec(type);
3889 t2 = tcg_temp_new_vec(type);
3890 t3 = tcg_temp_new_vec(type);
3891 t4 = tcg_temp_new_vec(type);
3892 zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3893 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3894 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3895 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3896 tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3897 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3898 tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3899 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3900 tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3901 tcg_gen_mul_vec(MO_16, t1, t1, t2);
3902 tcg_gen_mul_vec(MO_16, t3, t3, t4);
3903 tcg_gen_shri_vec(MO_16, t1, t1, 8);
3904 tcg_gen_shri_vec(MO_16, t3, t3, 8);
3905 vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3906 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3907 tcg_temp_free_vec(t1);
3908 tcg_temp_free_vec(t2);
3909 tcg_temp_free_vec(t3);
3910 tcg_temp_free_vec(t4);
3911 break;
3912
3913 default:
3914 g_assert_not_reached();
3915 }
3916 }
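
/*
 * A stand-alone sketch of one 16-bit lane of the expansion above, for
 * illustration only (model_mul_mo8 is hypothetical).  With x placed in
 * the low byte and y in the high byte, the 16-bit product is (x * y) << 8
 * modulo 2^16, so a logical right shift by 8 recovers the low 8 bits of
 * x * y with the high byte clear.
 */
#if 0
#include <stdint.h>

static uint8_t model_mul_mo8(uint8_t x, uint8_t y)
{
    uint16_t a = x;                     /* punpck with zero: 0 | x */
    uint16_t b = (uint16_t)y << 8;      /* punpck with zero: y | 0 */
    uint16_t p = a * b;                 /* pmullw, truncated to 16 bits */

    return (uint8_t)(p >> 8);           /* psrlw 8, then packuswb */
}
#endif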
3917
3918 static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
3919 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3920 {
3921 enum {
3922 NEED_INV = 1,
3923 NEED_SWAP = 2,
3924 NEED_BIAS = 4,
3925 NEED_UMIN = 8,
3926 NEED_UMAX = 16,
3927 };
3928 TCGv_vec t1, t2, t3;
3929 uint8_t fixup;
3930
3931 switch (cond) {
3932 case TCG_COND_EQ:
3933 case TCG_COND_GT:
3934 fixup = 0;
3935 break;
3936 case TCG_COND_NE:
3937 case TCG_COND_LE:
3938 fixup = NEED_INV;
3939 break;
3940 case TCG_COND_LT:
3941 fixup = NEED_SWAP;
3942 break;
3943 case TCG_COND_GE:
3944 fixup = NEED_SWAP | NEED_INV;
3945 break;
3946 case TCG_COND_LEU:
3947 if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
3948 fixup = NEED_UMIN;
3949 } else {
3950 fixup = NEED_BIAS | NEED_INV;
3951 }
3952 break;
3953 case TCG_COND_GTU:
3954 if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
3955 fixup = NEED_UMIN | NEED_INV;
3956 } else {
3957 fixup = NEED_BIAS;
3958 }
3959 break;
3960 case TCG_COND_GEU:
3961 if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
3962 fixup = NEED_UMAX;
3963 } else {
3964 fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
3965 }
3966 break;
3967 case TCG_COND_LTU:
3968 if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
3969 fixup = NEED_UMAX | NEED_INV;
3970 } else {
3971 fixup = NEED_BIAS | NEED_SWAP;
3972 }
3973 break;
3974 default:
3975 g_assert_not_reached();
3976 }
3977
3978 if (fixup & NEED_INV) {
3979 cond = tcg_invert_cond(cond);
3980 }
3981 if (fixup & NEED_SWAP) {
3982 t1 = v1, v1 = v2, v2 = t1;
3983 cond = tcg_swap_cond(cond);
3984 }
3985
3986 t1 = t2 = NULL;
3987 if (fixup & (NEED_UMIN | NEED_UMAX)) {
3988 t1 = tcg_temp_new_vec(type);
3989 if (fixup & NEED_UMIN) {
3990 tcg_gen_umin_vec(vece, t1, v1, v2);
3991 } else {
3992 tcg_gen_umax_vec(vece, t1, v1, v2);
3993 }
3994 v2 = t1;
3995 cond = TCG_COND_EQ;
3996 } else if (fixup & NEED_BIAS) {
3997 t1 = tcg_temp_new_vec(type);
3998 t2 = tcg_temp_new_vec(type);
3999 t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
4000 tcg_gen_sub_vec(vece, t1, v1, t3);
4001 tcg_gen_sub_vec(vece, t2, v2, t3);
4002 v1 = t1;
4003 v2 = t2;
4004 cond = tcg_signed_cond(cond);
4005 }
4006
4007 tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
4008 /* Expand directly; do not recurse. */
4009 vec_gen_4(INDEX_op_cmp_vec, type, vece,
4010 tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
4011
4012 if (t1) {
4013 tcg_temp_free_vec(t1);
4014 if (t2) {
4015 tcg_temp_free_vec(t2);
4016 }
4017 }
4018 return fixup & NEED_INV;
4019 }
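
/*
 * A stand-alone sketch of the NEED_BIAS case for one MO_8 lane, for
 * illustration only (model_gtu_mo8 is hypothetical).  SSE/AVX provide
 * only signed greater-than (PCMPGT*) and equality element compares;
 * subtracting the bias 1 << 7 maps 0..255 monotonically onto -128..127,
 * which turns the unsigned comparison into the signed one.
 */
#if 0
#include <stdbool.h>
#include <stdint.h>

static bool model_gtu_mo8(uint8_t a, uint8_t b)
{
    int8_t sa = (int8_t)(a - 0x80);     /* sub_vec with dupi(0x80) */
    int8_t sb = (int8_t)(b - 0x80);

    return sa > sb;                     /* pcmpgtb */
}
#endif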
4020
4021 static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
4022 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
4023 {
4024 if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
4025 tcg_gen_not_vec(vece, v0, v0);
4026 }
4027 }
4028
4029 static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
4030 TCGv_vec c1, TCGv_vec c2,
4031 TCGv_vec v3, TCGv_vec v4, TCGCond cond)
4032 {
4033 TCGv_vec t = tcg_temp_new_vec(type);
4034
4035 if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
4036 /* The compare result is inverted; compensate by swapping the data args. */
4037 TCGv_vec x;
4038 x = v3, v3 = v4, v4 = x;
4039 }
4040 vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
4041 tcgv_vec_arg(v0), tcgv_vec_arg(v4),
4042 tcgv_vec_arg(v3), tcgv_vec_arg(t));
4043 tcg_temp_free_vec(t);
4044 }
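
/*
 * Note: VPBLENDVB selects between its two data operands byte by byte,
 * based on the most significant bit of the corresponding mask byte.
 * Since the compare above produces all-zeros or all-ones per element,
 * this yields the element-wide select, with v3 chosen where the
 * (possibly swapped) condition holds and v4 otherwise.
 */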
4045
4046 void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
4047 TCGArg a0, ...)
4048 {
4049 va_list va;
4050 TCGArg a2;
4051 TCGv_vec v0, v1, v2, v3, v4;
4052
4053 va_start(va, a0);
4054 v0 = temp_tcgv_vec(arg_temp(a0));
4055 v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4056 a2 = va_arg(va, TCGArg);
4057
4058 switch (opc) {
4059 case INDEX_op_shli_vec:
4060 case INDEX_op_shri_vec:
4061 expand_vec_shi(type, vece, opc, v0, v1, a2);
4062 break;
4063
4064 case INDEX_op_sari_vec:
4065 expand_vec_sari(type, vece, v0, v1, a2);
4066 break;
4067
4068 case INDEX_op_rotli_vec:
4069 expand_vec_rotli(type, vece, v0, v1, a2);
4070 break;
4071
4072 case INDEX_op_rotls_vec:
4073 expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
4074 break;
4075
4076 case INDEX_op_rotlv_vec:
4077 v2 = temp_tcgv_vec(arg_temp(a2));
4078 expand_vec_rotv(type, vece, v0, v1, v2, false);
4079 break;
4080 case INDEX_op_rotrv_vec:
4081 v2 = temp_tcgv_vec(arg_temp(a2));
4082 expand_vec_rotv(type, vece, v0, v1, v2, true);
4083 break;
4084
4085 case INDEX_op_mul_vec:
4086 v2 = temp_tcgv_vec(arg_temp(a2));
4087 expand_vec_mul(type, vece, v0, v1, v2);
4088 break;
4089
4090 case INDEX_op_cmp_vec:
4091 v2 = temp_tcgv_vec(arg_temp(a2));
4092 expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
4093 break;
4094
4095 case INDEX_op_cmpsel_vec:
4096 v2 = temp_tcgv_vec(arg_temp(a2));
4097 v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4098 v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4099 expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
4100 break;
4101
4102 default:
4103 break;
4104 }
4105
4106 va_end(va);
4107 }
4108
4109 static const int tcg_target_callee_save_regs[] = {
4110 #if TCG_TARGET_REG_BITS == 64
4111 TCG_REG_RBP,
4112 TCG_REG_RBX,
4113 #if defined(_WIN64)
4114 TCG_REG_RDI,
4115 TCG_REG_RSI,
4116 #endif
4117 TCG_REG_R12,
4118 TCG_REG_R13,
4119 TCG_REG_R14, /* Currently used for the global env. */
4120 TCG_REG_R15,
4121 #else
4122 TCG_REG_EBP, /* Currently used for the global env. */
4123 TCG_REG_EBX,
4124 TCG_REG_ESI,
4125 TCG_REG_EDI,
4126 #endif
4127 };
4128
4129 /* Compute frame size via macros, to share between tcg_target_qemu_prologue
4130 and tcg_register_jit. */
4131
4132 #define PUSH_SIZE \
4133 ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
4134 * (TCG_TARGET_REG_BITS / 8))
4135
4136 #define FRAME_SIZE \
4137 ((PUSH_SIZE \
4138 + TCG_STATIC_CALL_ARGS_SIZE \
4139 + CPU_TEMP_BUF_NLONGS * sizeof(long) \
4140 + TCG_TARGET_STACK_ALIGN - 1) \
4141 & ~(TCG_TARGET_STACK_ALIGN - 1))
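
/*
 * The last two lines of FRAME_SIZE are the usual round-up-to-alignment
 * idiom, (x + align - 1) & ~(align - 1).  For example, a hypothetical
 * unaligned total of 0x17c with a 16-byte TCG_TARGET_STACK_ALIGN rounds
 * up to (0x17c + 15) & ~15 = 0x180.
 */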
4142
4143 /* Generate global QEMU prologue and epilogue code */
4144 static void tcg_target_qemu_prologue(TCGContext *s)
4145 {
4146 int i, stack_addend;
4147
4148 /* TB prologue */
4149
4150 /* Reserve some stack space, also for TCG temps. */
4151 stack_addend = FRAME_SIZE - PUSH_SIZE;
4152 tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
4153 CPU_TEMP_BUF_NLONGS * sizeof(long));
4154
4155 /* Save all callee saved registers. */
4156 for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
4157 tcg_out_push(s, tcg_target_callee_save_regs[i]);
4158 }
4159
4160 if (!tcg_use_softmmu && guest_base) {
4161 int seg = setup_guest_base_seg();
4162 if (seg != 0) {
4163 x86_guest_base.seg = seg;
4164 } else if (guest_base == (int32_t)guest_base) {
4165 x86_guest_base.ofs = guest_base;
4166 } else {
4167 assert(TCG_TARGET_REG_BITS == 64);
4168 /* Choose R12 because, as a base, it requires a SIB byte. */
4169 x86_guest_base.index = TCG_REG_R12;
4170 tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base);
4171 tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index);
4172 }
4173 }
4174
4175 if (TCG_TARGET_REG_BITS == 32) {
4176 tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
4177 (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
4178 tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4179 /* jmp *tb. */
4180 tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
4181 (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
4182 + stack_addend);
4183 } else {
4184 tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
4185 tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4186 /* jmp *tb. */
4187 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
4188 }
4189
4190 /*
4191 * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
4192 * and fall through to the rest of the epilogue.
4193 */
4194 tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
4195 tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
4196
4197 /* TB epilogue */
4198 tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
4199
4200 tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
4201
4202 if (have_avx2) {
4203 tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
4204 }
4205 for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
4206 tcg_out_pop(s, tcg_target_callee_save_regs[i]);
4207 }
4208 tcg_out_opc(s, OPC_RET, 0, 0, 0);
4209 }
4210
4211 static void tcg_out_tb_start(TCGContext *s)
4212 {
4213 /* nothing to do */
4214 }
4215
4216 static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
4217 {
4218 memset(p, 0x90, count);    /* 0x90 is the one-byte NOP */
4219 }
4220
4221 static void tcg_target_init(TCGContext *s)
4222 {
4223 tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
4224 if (TCG_TARGET_REG_BITS == 64) {
4225 tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
4226 }
4227 if (have_avx1) {
4228 tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
4229 tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
4230 }
4231 if (have_avx2) {
4232 tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
4233 }
4234
4235 tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
4236 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
4237 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
4238 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
4239 if (TCG_TARGET_REG_BITS == 64) {
4240 #if !defined(_WIN64)
4241 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
4242 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
4243 #endif
4244 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
4245 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
4246 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
4247 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
4248 }
4249
4250 s->reserved_regs = 0;
4251 tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
4252 tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC);
4253 #ifdef _WIN64
4254 /* These are callee-saved, and we don't save them, so don't use them. */
4255 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
4256 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
4257 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
4258 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
4259 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
4260 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
4261 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
4262 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
4263 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
4264 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
4265 #endif
4266 }
4267
4268 typedef struct {
4269 DebugFrameHeader h;
4270 uint8_t fde_def_cfa[4];
4271 uint8_t fde_reg_ofs[14];
4272 } DebugFrame;
4273
4274 /* We're expecting a 2-byte uleb128 encoded value. */
4275 QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
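
/*
 * fde_def_cfa below emits FRAME_SIZE in exactly that form: the low seven
 * bits with the continuation bit set, then the remaining bits.  For
 * example, a hypothetical FRAME_SIZE of 0x188 encodes as 0x88, 0x03.
 */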
4276
4277 #if !defined(__ELF__)
4278 /* Host machine without ELF. */
4279 #elif TCG_TARGET_REG_BITS == 64
4280 #define ELF_HOST_MACHINE EM_X86_64
4281 static const DebugFrame debug_frame = {
4282 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4283 .h.cie.id = -1,
4284 .h.cie.version = 1,
4285 .h.cie.code_align = 1,
4286 .h.cie.data_align = 0x78, /* sleb128 -8 */
4287 .h.cie.return_column = 16,
4288
4289 /* Total FDE size does not include the "len" member. */
4290 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4291
4292 .fde_def_cfa = {
4293 12, 7, /* DW_CFA_def_cfa %rsp, ... */
4294 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */
4295 (FRAME_SIZE >> 7)
4296 },
4297 .fde_reg_ofs = {
4298 0x90, 1, /* DW_CFA_offset, %rip, -8 */
4299 /* The following ordering must match tcg_target_callee_save_regs. */
4300 0x86, 2, /* DW_CFA_offset, %rbp, -16 */
4301 0x83, 3, /* DW_CFA_offset, %rbx, -24 */
4302 0x8c, 4, /* DW_CFA_offset, %r12, -32 */
4303 0x8d, 5, /* DW_CFA_offset, %r13, -40 */
4304 0x8e, 6, /* DW_CFA_offset, %r14, -48 */
4305 0x8f, 7, /* DW_CFA_offset, %r15, -56 */
4306 }
4307 };
4308 #else
4309 #define ELF_HOST_MACHINE EM_386
4310 static const DebugFrame debug_frame = {
4311 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4312 .h.cie.id = -1,
4313 .h.cie.version = 1,
4314 .h.cie.code_align = 1,
4315 .h.cie.data_align = 0x7c, /* sleb128 -4 */
4316 .h.cie.return_column = 8,
4317
4318 /* Total FDE size does not include the "len" member. */
4319 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4320
4321 .fde_def_cfa = {
4322 12, 4, /* DW_CFA_def_cfa %esp, ... */
4323 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */
4324 (FRAME_SIZE >> 7)
4325 },
4326 .fde_reg_ofs = {
4327 0x88, 1, /* DW_CFA_offset, %eip, -4 */
4328 /* The following ordering must match tcg_target_callee_save_regs. */
4329 0x85, 2, /* DW_CFA_offset, %ebp, -8 */
4330 0x83, 3, /* DW_CFA_offset, %ebx, -12 */
4331 0x86, 4, /* DW_CFA_offset, %esi, -16 */
4332 0x87, 5, /* DW_CFA_offset, %edi, -20 */
4333 }
4334 };
4335 #endif
4336
4337 #if defined(ELF_HOST_MACHINE)
4338 void tcg_register_jit(const void *buf, size_t buf_size)
4339 {
4340 tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
4341 }
4342 #endif