2 * Tiny Code Generator for QEMU
4 * Copyright (c) 2008 Fabrice Bellard
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 #include "../tcg-ldst.c.inc"
26 #include "../tcg-pool.c.inc"
28 #ifdef CONFIG_DEBUG_TCG
29 static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
30 #if TCG_TARGET_REG_BITS == 64
31 "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
33 "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
35 "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
36 "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
37 #if TCG_TARGET_REG_BITS == 64
38 "%xmm8", "%xmm9", "%xmm10", "%xmm11",
39 "%xmm12", "%xmm13", "%xmm14", "%xmm15",
44 static const int tcg_target_reg_alloc_order[] = {
45 #if TCG_TARGET_REG_BITS == 64
77 /* The Win64 ABI has xmm6-xmm15 as callee-saves, and we do not save
78 any of them. Therefore only allow xmm0-xmm5 to be allocated. */
81 #if TCG_TARGET_REG_BITS == 64
94 #define TCG_TMP_VEC TCG_REG_XMM5
96 static const int tcg_target_call_iarg_regs[] = {
97 #if TCG_TARGET_REG_BITS == 64
110 /* 32 bit mode uses stack based calling convention (GCC default). */
114 static TCGReg tcg_target_call_oarg_reg(TCGCallReturnKind kind, int slot)
117 case TCG_CALL_RET_NORMAL:
118 tcg_debug_assert(slot >= 0 && slot <= 1);
119 return slot ? TCG_REG_EDX : TCG_REG_EAX;
121 case TCG_CALL_RET_BY_VEC:
122 tcg_debug_assert(slot == 0);
126 g_assert_not_reached();
130 /* Constants we accept. */
131 #define TCG_CT_CONST_S32 0x100
132 #define TCG_CT_CONST_U32 0x200
133 #define TCG_CT_CONST_I32 0x400
134 #define TCG_CT_CONST_WSZ 0x800
136 /* Registers used with L constraint, which are the first argument
137 registers on x86_64, and two random call clobbered registers on 32-bit. */
139 #if TCG_TARGET_REG_BITS == 64
140 # define TCG_REG_L0 tcg_target_call_iarg_regs[0]
141 # define TCG_REG_L1 tcg_target_call_iarg_regs[1]
143 # define TCG_REG_L0 TCG_REG_EAX
144 # define TCG_REG_L1 TCG_REG_EDX
147 #if TCG_TARGET_REG_BITS == 64
148 # define ALL_GENERAL_REGS 0x0000ffffu
149 # define ALL_VECTOR_REGS 0xffff0000u
150 # define ALL_BYTEL_REGS ALL_GENERAL_REGS
152 # define ALL_GENERAL_REGS 0x000000ffu
153 # define ALL_VECTOR_REGS 0x00ff0000u
154 # define ALL_BYTEL_REGS 0x0000000fu
156 #define SOFTMMU_RESERVE_REGS \
157 (tcg_use_softmmu ? (1 << TCG_REG_L0) | (1 << TCG_REG_L1) : 0)
159 /* For 64-bit, we always know that CMOV is available. */
160 #if TCG_TARGET_REG_BITS == 64
161 # define have_cmov true
163 # define have_cmov (cpuinfo & CPUINFO_CMOV)
165 #define have_bmi2 (cpuinfo & CPUINFO_BMI2)
166 #define have_lzcnt (cpuinfo & CPUINFO_LZCNT)
168 static const tcg_insn_unit *tb_ret_addr;
170 static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
171 intptr_t value, intptr_t addend)
176 value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
177 if (value != (int32_t)value) {
182 tcg_patch32(code_ptr, value);
185 value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
186 if (value != (int8_t)value) {
189 tcg_patch8(code_ptr, value);
192 g_assert_not_reached();
197 /* test if a constant matches the constraint */
198 static bool tcg_target_const_match(int64_t val, TCGType type, int ct, int vece)
200 if (ct & TCG_CT_CONST) {
203 if (type == TCG_TYPE_I32) {
204 if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | TCG_CT_CONST_I32)) {
208 if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
211 if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
214 if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
218 if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
224 # define LOWREGMASK(x) ((x) & 7)
226 #define P_EXT 0x100 /* 0x0f opcode prefix */
227 #define P_EXT38 0x200 /* 0x0f 0x38 opcode prefix */
228 #define P_DATA16 0x400 /* 0x66 opcode prefix */
229 #define P_VEXW 0x1000 /* Set VEX.W = 1 */
230 #if TCG_TARGET_REG_BITS == 64
231 # define P_REXW P_VEXW /* Set REX.W = 1; match VEXW */
232 # define P_REXB_R 0x2000 /* REG field as byte register */
233 # define P_REXB_RM 0x4000 /* R/M field as byte register */
234 # define P_GS 0x8000 /* gs segment override */
241 #define P_EXT3A 0x10000 /* 0x0f 0x3a opcode prefix */
242 #define P_SIMDF3 0x20000 /* 0xf3 opcode prefix */
243 #define P_SIMDF2 0x40000 /* 0xf2 opcode prefix */
244 #define P_VEXL 0x80000 /* Set VEX.L = 1 */
245 #define P_EVEX 0x100000 /* Requires EVEX encoding */
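/*
 * Illustrative note (not part of the original file): the OPC_* constants
 * below keep the final opcode byte in bits [7:0] and OR in the P_* flags
 * above to request prefixes.  For example, OPC_MOVZBL is (0xb6 | P_EXT),
 * i.e. the two-byte opcode 0F B6, and OPC_PADDW is (0xfd | P_EXT | P_DATA16),
 * i.e. 66 0F FD.  tcg_out_opc() and tcg_out_vex_opc() decode these flags
 * when emitting the instruction.
 */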
247 #define OPC_ARITH_EvIz (0x81)
248 #define OPC_ARITH_EvIb (0x83)
249 #define OPC_ARITH_GvEv (0x03) /* ... plus (ARITH_FOO << 3) */
250 #define OPC_ANDN (0xf2 | P_EXT38)
251 #define OPC_ADD_GvEv (OPC_ARITH_GvEv | (ARITH_ADD << 3))
252 #define OPC_AND_GvEv (OPC_ARITH_GvEv | (ARITH_AND << 3))
253 #define OPC_BLENDPS (0x0c | P_EXT3A | P_DATA16)
254 #define OPC_BSF (0xbc | P_EXT)
255 #define OPC_BSR (0xbd | P_EXT)
256 #define OPC_BSWAP (0xc8 | P_EXT)
257 #define OPC_CALL_Jz (0xe8)
258 #define OPC_CMOVCC (0x40 | P_EXT) /* ... plus condition code */
259 #define OPC_CMP_GvEv (OPC_ARITH_GvEv | (ARITH_CMP << 3))
260 #define OPC_DEC_r32 (0x48)
261 #define OPC_IMUL_GvEv (0xaf | P_EXT)
262 #define OPC_IMUL_GvEvIb (0x6b)
263 #define OPC_IMUL_GvEvIz (0x69)
264 #define OPC_INC_r32 (0x40)
265 #define OPC_JCC_long (0x80 | P_EXT) /* ... plus condition code */
266 #define OPC_JCC_short (0x70) /* ... plus condition code */
267 #define OPC_JMP_long (0xe9)
268 #define OPC_JMP_short (0xeb)
269 #define OPC_LEA (0x8d)
270 #define OPC_LZCNT (0xbd | P_EXT | P_SIMDF3)
271 #define OPC_MOVB_EvGv (0x88) /* stores, more or less */
272 #define OPC_MOVL_EvGv (0x89) /* stores, more or less */
273 #define OPC_MOVL_GvEv (0x8b) /* loads, more or less */
274 #define OPC_MOVB_EvIz (0xc6)
275 #define OPC_MOVL_EvIz (0xc7)
276 #define OPC_MOVB_Ib (0xb0)
277 #define OPC_MOVL_Iv (0xb8)
278 #define OPC_MOVBE_GyMy (0xf0 | P_EXT38)
279 #define OPC_MOVBE_MyGy (0xf1 | P_EXT38)
280 #define OPC_MOVD_VyEy (0x6e | P_EXT | P_DATA16)
281 #define OPC_MOVD_EyVy (0x7e | P_EXT | P_DATA16)
282 #define OPC_MOVDDUP (0x12 | P_EXT | P_SIMDF2)
283 #define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
284 #define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
285 #define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
286 #define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
287 #define OPC_MOVQ_VqWq (0x7e | P_EXT | P_SIMDF3)
288 #define OPC_MOVQ_WqVq (0xd6 | P_EXT | P_DATA16)
289 #define OPC_MOVSBL (0xbe | P_EXT)
290 #define OPC_MOVSWL (0xbf | P_EXT)
291 #define OPC_MOVSLQ (0x63 | P_REXW)
292 #define OPC_MOVZBL (0xb6 | P_EXT)
293 #define OPC_MOVZWL (0xb7 | P_EXT)
294 #define OPC_PABSB (0x1c | P_EXT38 | P_DATA16)
295 #define OPC_PABSW (0x1d | P_EXT38 | P_DATA16)
296 #define OPC_PABSD (0x1e | P_EXT38 | P_DATA16)
297 #define OPC_VPABSQ (0x1f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
298 #define OPC_PACKSSDW (0x6b | P_EXT | P_DATA16)
299 #define OPC_PACKSSWB (0x63 | P_EXT | P_DATA16)
300 #define OPC_PACKUSDW (0x2b | P_EXT38 | P_DATA16)
301 #define OPC_PACKUSWB (0x67 | P_EXT | P_DATA16)
302 #define OPC_PADDB (0xfc | P_EXT | P_DATA16)
303 #define OPC_PADDW (0xfd | P_EXT | P_DATA16)
304 #define OPC_PADDD (0xfe | P_EXT | P_DATA16)
305 #define OPC_PADDQ (0xd4 | P_EXT | P_DATA16)
306 #define OPC_PADDSB (0xec | P_EXT | P_DATA16)
307 #define OPC_PADDSW (0xed | P_EXT | P_DATA16)
308 #define OPC_PADDUB (0xdc | P_EXT | P_DATA16)
309 #define OPC_PADDUW (0xdd | P_EXT | P_DATA16)
310 #define OPC_PAND (0xdb | P_EXT | P_DATA16)
311 #define OPC_PANDN (0xdf | P_EXT | P_DATA16)
312 #define OPC_PBLENDW (0x0e | P_EXT3A | P_DATA16)
313 #define OPC_PCMPEQB (0x74 | P_EXT | P_DATA16)
314 #define OPC_PCMPEQW (0x75 | P_EXT | P_DATA16)
315 #define OPC_PCMPEQD (0x76 | P_EXT | P_DATA16)
316 #define OPC_PCMPEQQ (0x29 | P_EXT38 | P_DATA16)
317 #define OPC_PCMPGTB (0x64 | P_EXT | P_DATA16)
318 #define OPC_PCMPGTW (0x65 | P_EXT | P_DATA16)
319 #define OPC_PCMPGTD (0x66 | P_EXT | P_DATA16)
320 #define OPC_PCMPGTQ (0x37 | P_EXT38 | P_DATA16)
321 #define OPC_PEXTRD (0x16 | P_EXT3A | P_DATA16)
322 #define OPC_PINSRD (0x22 | P_EXT3A | P_DATA16)
323 #define OPC_PMAXSB (0x3c | P_EXT38 | P_DATA16)
324 #define OPC_PMAXSW (0xee | P_EXT | P_DATA16)
325 #define OPC_PMAXSD (0x3d | P_EXT38 | P_DATA16)
326 #define OPC_VPMAXSQ (0x3d | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
327 #define OPC_PMAXUB (0xde | P_EXT | P_DATA16)
328 #define OPC_PMAXUW (0x3e | P_EXT38 | P_DATA16)
329 #define OPC_PMAXUD (0x3f | P_EXT38 | P_DATA16)
330 #define OPC_VPMAXUQ (0x3f | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
331 #define OPC_PMINSB (0x38 | P_EXT38 | P_DATA16)
332 #define OPC_PMINSW (0xea | P_EXT | P_DATA16)
333 #define OPC_PMINSD (0x39 | P_EXT38 | P_DATA16)
334 #define OPC_VPMINSQ (0x39 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
335 #define OPC_PMINUB (0xda | P_EXT | P_DATA16)
336 #define OPC_PMINUW (0x3a | P_EXT38 | P_DATA16)
337 #define OPC_PMINUD (0x3b | P_EXT38 | P_DATA16)
338 #define OPC_VPMINUQ (0x3b | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
339 #define OPC_PMOVSXBW (0x20 | P_EXT38 | P_DATA16)
340 #define OPC_PMOVSXWD (0x23 | P_EXT38 | P_DATA16)
341 #define OPC_PMOVSXDQ (0x25 | P_EXT38 | P_DATA16)
342 #define OPC_PMOVZXBW (0x30 | P_EXT38 | P_DATA16)
343 #define OPC_PMOVZXWD (0x33 | P_EXT38 | P_DATA16)
344 #define OPC_PMOVZXDQ (0x35 | P_EXT38 | P_DATA16)
345 #define OPC_PMULLW (0xd5 | P_EXT | P_DATA16)
346 #define OPC_PMULLD (0x40 | P_EXT38 | P_DATA16)
347 #define OPC_VPMULLQ (0x40 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
348 #define OPC_POR (0xeb | P_EXT | P_DATA16)
349 #define OPC_PSHUFB (0x00 | P_EXT38 | P_DATA16)
350 #define OPC_PSHUFD (0x70 | P_EXT | P_DATA16)
351 #define OPC_PSHUFLW (0x70 | P_EXT | P_SIMDF2)
352 #define OPC_PSHUFHW (0x70 | P_EXT | P_SIMDF3)
353 #define OPC_PSHIFTW_Ib (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
354 #define OPC_PSHIFTD_Ib (0x72 | P_EXT | P_DATA16) /* /1 /2 /6 /4 */
355 #define OPC_PSHIFTQ_Ib (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
356 #define OPC_PSLLW (0xf1 | P_EXT | P_DATA16)
357 #define OPC_PSLLD (0xf2 | P_EXT | P_DATA16)
358 #define OPC_PSLLQ (0xf3 | P_EXT | P_DATA16)
359 #define OPC_PSRAW (0xe1 | P_EXT | P_DATA16)
360 #define OPC_PSRAD (0xe2 | P_EXT | P_DATA16)
361 #define OPC_VPSRAQ (0xe2 | P_EXT | P_DATA16 | P_VEXW | P_EVEX)
362 #define OPC_PSRLW (0xd1 | P_EXT | P_DATA16)
363 #define OPC_PSRLD (0xd2 | P_EXT | P_DATA16)
364 #define OPC_PSRLQ (0xd3 | P_EXT | P_DATA16)
365 #define OPC_PSUBB (0xf8 | P_EXT | P_DATA16)
366 #define OPC_PSUBW (0xf9 | P_EXT | P_DATA16)
367 #define OPC_PSUBD (0xfa | P_EXT | P_DATA16)
368 #define OPC_PSUBQ (0xfb | P_EXT | P_DATA16)
369 #define OPC_PSUBSB (0xe8 | P_EXT | P_DATA16)
370 #define OPC_PSUBSW (0xe9 | P_EXT | P_DATA16)
371 #define OPC_PSUBUB (0xd8 | P_EXT | P_DATA16)
372 #define OPC_PSUBUW (0xd9 | P_EXT | P_DATA16)
373 #define OPC_PUNPCKLBW (0x60 | P_EXT | P_DATA16)
374 #define OPC_PUNPCKLWD (0x61 | P_EXT | P_DATA16)
375 #define OPC_PUNPCKLDQ (0x62 | P_EXT | P_DATA16)
376 #define OPC_PUNPCKLQDQ (0x6c | P_EXT | P_DATA16)
377 #define OPC_PUNPCKHBW (0x68 | P_EXT | P_DATA16)
378 #define OPC_PUNPCKHWD (0x69 | P_EXT | P_DATA16)
379 #define OPC_PUNPCKHDQ (0x6a | P_EXT | P_DATA16)
380 #define OPC_PUNPCKHQDQ (0x6d | P_EXT | P_DATA16)
381 #define OPC_PXOR (0xef | P_EXT | P_DATA16)
382 #define OPC_POP_r32 (0x58)
383 #define OPC_POPCNT (0xb8 | P_EXT | P_SIMDF3)
384 #define OPC_PUSH_r32 (0x50)
385 #define OPC_PUSH_Iv (0x68)
386 #define OPC_PUSH_Ib (0x6a)
387 #define OPC_RET (0xc3)
388 #define OPC_SETCC (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
389 #define OPC_SHIFT_1 (0xd1)
390 #define OPC_SHIFT_Ib (0xc1)
391 #define OPC_SHIFT_cl (0xd3)
392 #define OPC_SARX (0xf7 | P_EXT38 | P_SIMDF3)
393 #define OPC_SHUFPS (0xc6 | P_EXT)
394 #define OPC_SHLX (0xf7 | P_EXT38 | P_DATA16)
395 #define OPC_SHRX (0xf7 | P_EXT38 | P_SIMDF2)
396 #define OPC_SHRD_Ib (0xac | P_EXT)
397 #define OPC_TESTL (0x85)
398 #define OPC_TZCNT (0xbc | P_EXT | P_SIMDF3)
399 #define OPC_UD2 (0x0b | P_EXT)
400 #define OPC_VPBLENDD (0x02 | P_EXT3A | P_DATA16)
401 #define OPC_VPBLENDVB (0x4c | P_EXT3A | P_DATA16)
402 #define OPC_VPINSRB (0x20 | P_EXT3A | P_DATA16)
403 #define OPC_VPINSRW (0xc4 | P_EXT | P_DATA16)
404 #define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
405 #define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
406 #define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
407 #define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
408 #define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
409 #define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
410 #define OPC_VPERMQ (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
411 #define OPC_VPERM2I128 (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
412 #define OPC_VPROLVD (0x15 | P_EXT38 | P_DATA16 | P_EVEX)
413 #define OPC_VPROLVQ (0x15 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
414 #define OPC_VPRORVD (0x14 | P_EXT38 | P_DATA16 | P_EVEX)
415 #define OPC_VPRORVQ (0x14 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
416 #define OPC_VPSHLDW (0x70 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
417 #define OPC_VPSHLDD (0x71 | P_EXT3A | P_DATA16 | P_EVEX)
418 #define OPC_VPSHLDQ (0x71 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
419 #define OPC_VPSHLDVW (0x70 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
420 #define OPC_VPSHLDVD (0x71 | P_EXT38 | P_DATA16 | P_EVEX)
421 #define OPC_VPSHLDVQ (0x71 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
422 #define OPC_VPSHRDVW (0x72 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
423 #define OPC_VPSHRDVD (0x73 | P_EXT38 | P_DATA16 | P_EVEX)
424 #define OPC_VPSHRDVQ (0x73 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
425 #define OPC_VPSLLVW (0x12 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
426 #define OPC_VPSLLVD (0x47 | P_EXT38 | P_DATA16)
427 #define OPC_VPSLLVQ (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
428 #define OPC_VPSRAVW (0x11 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
429 #define OPC_VPSRAVD (0x46 | P_EXT38 | P_DATA16)
430 #define OPC_VPSRAVQ (0x46 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
431 #define OPC_VPSRLVW (0x10 | P_EXT38 | P_DATA16 | P_VEXW | P_EVEX)
432 #define OPC_VPSRLVD (0x45 | P_EXT38 | P_DATA16)
433 #define OPC_VPSRLVQ (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
434 #define OPC_VPTERNLOGQ (0x25 | P_EXT3A | P_DATA16 | P_VEXW | P_EVEX)
435 #define OPC_VZEROUPPER (0x77 | P_EXT)
436 #define OPC_XCHG_ax_r32 (0x90)
437 #define OPC_XCHG_EvGv (0x87)
439 #define OPC_GRP3_Eb (0xf6)
440 #define OPC_GRP3_Ev (0xf7)
441 #define OPC_GRP5 (0xff)
442 #define OPC_GRP14 (0x73 | P_EXT | P_DATA16)
444 /* Group 1 opcode extensions for 0x80-0x83.
445 These are also used as modifiers for OPC_ARITH. */
455 /* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3. */
462 /* Group 3 opcode extensions for 0xf6, 0xf7. To be used with OPC_GRP3. */
471 /* Group 5 opcode extensions for 0xff. To be used with OPC_GRP5. */
472 #define EXT5_INC_Ev 0
473 #define EXT5_DEC_Ev 1
474 #define EXT5_CALLN_Ev 2
475 #define EXT5_JMPN_Ev 4
477 /* Condition codes to be added to OPC_JCC_{long,short}. */
496 static const uint8_t tcg_cond_to_jcc[] = {
497 [TCG_COND_EQ] = JCC_JE,
498 [TCG_COND_NE] = JCC_JNE,
499 [TCG_COND_LT] = JCC_JL,
500 [TCG_COND_GE] = JCC_JGE,
501 [TCG_COND_LE] = JCC_JLE,
502 [TCG_COND_GT] = JCC_JG,
503 [TCG_COND_LTU] = JCC_JB,
504 [TCG_COND_GEU] = JCC_JAE,
505 [TCG_COND_LEU] = JCC_JBE,
506 [TCG_COND_GTU] = JCC_JA,
509 #if TCG_TARGET_REG_BITS == 64
510 static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
517 if (opc & P_DATA16) {
518 /* We should never be asking for both 16 and 64-bit operation. */
519 tcg_debug_assert((opc & P_REXW) == 0);
522 if (opc & P_SIMDF3) {
524 } else if (opc & P_SIMDF2) {
529 rex |= (opc & P_REXW) ? 0x8 : 0x0; /* REX.W */
530 rex |= (r & 8) >> 1; /* REX.R */
531 rex |= (x & 8) >> 2; /* REX.X */
532 rex |= (rm & 8) >> 3; /* REX.B */
534 /* P_REXB_{R,RM} indicates that the given register is the low byte.
535 For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
536 as otherwise the encoding indicates %[abcd]h. Note that the values
537 that are ORed in merely indicate that the REX byte must be present;
538 those bits get discarded in output. */
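/*
 * Worked example (illustrative, not part of the original comment):
 * "movzbl %bl, %eax" needs no prefix (0f b6 c3), but "movzbl %sil, %eax"
 * must emit an empty REX byte (40 0f b6 c6); without it, 0f b6 c6 would
 * decode as "movzbl %dh, %eax".
 */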
539 rex |= opc & (r >= 4 ? P_REXB_R : 0);
540 rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
543 tcg_out8(s, (uint8_t)(rex | 0x40));
546 if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
550 } else if (opc & P_EXT3A) {
558 static void tcg_out_opc(TCGContext *s, int opc)
560 if (opc & P_DATA16) {
563 if (opc & P_SIMDF3) {
565 } else if (opc & P_SIMDF2) {
568 if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
572 } else if (opc & P_EXT3A) {
578 /* Discard the register arguments to tcg_out_opc early, so as not to penalize
579 the 32-bit compilation paths. This method works with all versions of gcc,
580 whereas relying on optimization may not be able to exclude them. */
581 #define tcg_out_opc(s, opc, r, rm, x) (tcg_out_opc)(s, opc)
584 static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
586 tcg_out_opc(s, opc, r, rm, 0);
587 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
590 static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
598 /* Use the two byte form if possible, which cannot encode
599 VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT. */
600 if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
601 && ((rm | index) & 8) == 0) {
602 /* Two byte VEX prefix. */
605 tmp = (r & 8 ? 0 : 0x80); /* VEX.R */
607 /* Three byte VEX prefix. */
613 } else if (opc & P_EXT38) {
615 } else if (opc & P_EXT) {
618 g_assert_not_reached();
620 tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */
621 tmp |= (index & 8 ? 0 : 0x40); /* VEX.X */
622 tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */
625 tmp = (opc & P_VEXW ? 0x80 : 0); /* VEX.W */
628 tmp |= (opc & P_VEXL ? 0x04 : 0); /* VEX.L */
630 if (opc & P_DATA16) {
632 } else if (opc & P_SIMDF3) {
634 } else if (opc & P_SIMDF2) {
637 tmp |= (~v & 15) << 3; /* VEX.vvvv */
642 static void tcg_out_evex_opc(TCGContext *s, int opc, int r, int v,
645 /* The entire 4-byte evex prefix; with R' and V' set. */
646 uint32_t p = 0x08041062;
649 tcg_debug_assert(have_avx512vl);
654 } else if (opc & P_EXT38) {
656 } else if (opc & P_EXT) {
659 g_assert_not_reached();
663 if (opc & P_DATA16) {
665 } else if (opc & P_SIMDF3) {
667 } else if (opc & P_SIMDF2) {
673 p = deposit32(p, 8, 2, mm);
674 p = deposit32(p, 13, 1, (rm & 8) == 0); /* EVEX.RXB.B */
675 p = deposit32(p, 14, 1, (index & 8) == 0); /* EVEX.RXB.X */
676 p = deposit32(p, 15, 1, (r & 8) == 0); /* EVEX.RXB.R */
677 p = deposit32(p, 16, 2, pp);
678 p = deposit32(p, 19, 4, ~v);
679 p = deposit32(p, 23, 1, (opc & P_VEXW) != 0);
680 p = deposit32(p, 29, 2, (opc & P_VEXL) != 0);
686 static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
689 tcg_out_evex_opc(s, opc, r, v, rm, 0);
691 tcg_out_vex_opc(s, opc, r, v, rm, 0);
693 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
696 /* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
697 A missing RM or INDEX is indicated by a negative value. In 64-bit
698 mode for absolute addresses, ~RM is the size of the immediate operand
699 that will follow the instruction. */
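/*
 * Worked encoding example (an illustrative assumption, not in the original):
 * with OPC_MOVL_GvEv, r = %ecx, rm = %rax, index = %rbx, shift = 2 and
 * offset = 0x12345678, this emits "mov 0x12345678(%rax,%rbx,4), %ecx",
 * i.e. 8b 8c 98 78 56 34 12 (opcode, ModRM, SIB, disp32).
 */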
701 static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
702 int shift, intptr_t offset)
706 if (index < 0 && rm < 0) {
707 if (TCG_TARGET_REG_BITS == 64) {
708 /* Try for a rip-relative addressing mode. This has replaced
709 the 32-bit-mode absolute addressing encoding. */
710 intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
711 intptr_t disp = offset - pc;
712 if (disp == (int32_t)disp) {
713 tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
718 /* Try for an absolute address encoding. This requires the
719 use of the MODRM+SIB encoding and is therefore larger than
720 rip-relative addressing. */
721 if (offset == (int32_t)offset) {
722 tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
723 tcg_out8(s, (4 << 3) | 5);
724 tcg_out32(s, offset);
728 /* ??? The memory isn't directly addressable. */
729 g_assert_not_reached();
731 /* Absolute address. */
732 tcg_out8(s, (r << 3) | 5);
733 tcg_out32(s, offset);
738 /* Find the length of the immediate addend. Note that the encoding
739 that would be used for (%ebp) indicates absolute addressing. */
741 mod = 0, len = 4, rm = 5;
742 } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
744 } else if (offset == (int8_t)offset) {
750 /* Use a single byte MODRM format if possible. Note that the encoding
751 that would be used for %esp is the escape to the two byte form. */
752 if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
753 /* Single byte MODRM format. */
754 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
756 /* Two byte MODRM+SIB format. */
758 /* Note that the encoding that would place %esp into the index
759 field indicates no index register. In 64-bit mode, the REX.X
760 bit counts, so %r12 can be used as the index. */
764 tcg_debug_assert(index != TCG_REG_ESP);
767 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
768 tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
773 } else if (len == 4) {
774 tcg_out32(s, offset);
778 static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
779 int index, int shift, intptr_t offset)
781 tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
782 tcg_out_sib_offset(s, r, rm, index, shift, offset);
785 static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
786 int rm, int index, int shift,
789 tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
790 tcg_out_sib_offset(s, r, rm, index, shift, offset);
793 /* A simplification of the above with no index or shift. */
794 static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
795 int rm, intptr_t offset)
797 tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
800 static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
801 int v, int rm, intptr_t offset)
803 tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
806 /* Output an opcode with an expected reference to the constant pool. */
807 static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
809 tcg_out_opc(s, opc, r, 0, 0);
810 /* Absolute for 32-bit, pc-relative for 64-bit. */
811 tcg_out8(s, LOWREGMASK(r) << 3 | 5);
815 /* Output an opcode with an expected reference to the constant pool. */
816 static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
818 tcg_out_vex_opc(s, opc, r, 0, 0, 0);
819 /* Absolute for 32-bit, pc-relative for 64-bit. */
820 tcg_out8(s, LOWREGMASK(r) << 3 | 5);
824 /* Generate dest op= src. Uses the same ARITH_* codes as tgen_arithi. */
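/* Illustrative use (assumption): tgen_arithr(s, ARITH_ADD + P_REXW,
   TCG_REG_EAX, TCG_REG_EDX) emits "addq %rdx, %rax" (48 03 c2); the
   P_REXW bit is carried through the ext mask below. */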
825 static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
827 /* Propagate an opcode prefix, such as P_REXW. */
828 int ext = subop & ~0x7;
831 tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
834 static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
848 tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
850 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
854 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
856 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
862 tcg_debug_assert(ret >= 16 && arg >= 16);
863 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
866 tcg_debug_assert(ret >= 16 && arg >= 16);
867 tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
870 tcg_debug_assert(ret >= 16 && arg >= 16);
871 tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
875 g_assert_not_reached();
880 static const int avx2_dup_insn[4] = {
881 OPC_VPBROADCASTB, OPC_VPBROADCASTW,
882 OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
885 static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
889 int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
890 tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
894 /* ??? With zero in a register, use PSHUFB. */
895 tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
899 tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
903 tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
904 /* imm8 operand: all output lanes selected from input lane 0. */
908 tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
911 g_assert_not_reached();
917 static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
918 TCGReg r, TCGReg base, intptr_t offset)
921 int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
922 tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
927 tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
930 tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
933 tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
934 tcg_out8(s, 0); /* imm8 */
935 tcg_out_dup_vec(s, type, vece, r, r);
938 tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
939 tcg_out8(s, 0); /* imm8 */
940 tcg_out_dup_vec(s, type, vece, r, r);
943 g_assert_not_reached();
949 static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
950 TCGReg ret, int64_t arg)
952 int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
955 tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
959 tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
963 if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
965 tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
967 tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
969 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
971 if (type == TCG_TYPE_V64) {
972 tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
973 } else if (have_avx2) {
974 tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
976 tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
978 if (TCG_TARGET_REG_BITS == 64) {
979 new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
981 new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
986 static void tcg_out_movi_vec(TCGContext *s, TCGType type,
987 TCGReg ret, tcg_target_long arg)
990 tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
994 tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
998 int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
999 tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
1000 if (TCG_TARGET_REG_BITS == 64) {
1001 new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
1003 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
1007 static void tcg_out_movi_int(TCGContext *s, TCGType type,
1008 TCGReg ret, tcg_target_long arg)
1010 tcg_target_long diff;
1013 tgen_arithr(s, ARITH_XOR, ret, ret);
1016 if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
1017 tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
1021 if (arg == (int32_t)arg) {
1022 tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
1027 /* Try a 7 byte pc-relative lea before the 10 byte movq. */
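/* Size note (illustrative): "leaq disp32(%rip), %reg" is REX.W + 8D +
   ModRM + disp32 = 7 bytes, while "movabsq $imm64, %reg" is REX.W +
   B8+reg + imm64 = 10 bytes, hence the preference when the target is
   within +/- 2GB of the code. */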
1028 diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
1029 if (diff == (int32_t)diff) {
1030 tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
1031 tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
1036 tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
1040 static void tcg_out_movi(TCGContext *s, TCGType type,
1041 TCGReg ret, tcg_target_long arg)
1045 #if TCG_TARGET_REG_BITS == 64
1049 tcg_out_movi_int(s, type, ret, arg);
1051 tcg_out_movi_vec(s, type, ret, arg);
1055 g_assert_not_reached();
1059 static bool tcg_out_xchg(TCGContext *s, TCGType type, TCGReg r1, TCGReg r2)
1061 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1062 tcg_out_modrm(s, OPC_XCHG_EvGv + rexw, r1, r2);
1066 static void tcg_out_addi_ptr(TCGContext *s, TCGReg rd, TCGReg rs,
1067 tcg_target_long imm)
1069 /* This function is only used for passing structs by reference. */
1070 tcg_debug_assert(imm == (int32_t)imm);
1071 tcg_out_modrm_offset(s, OPC_LEA | P_REXW, rd, rs, imm);
1074 static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
1076 if (val == (int8_t)val) {
1077 tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
1079 } else if (val == (int32_t)val) {
1080 tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
1083 g_assert_not_reached();
1087 static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1089 /* Given the strength of x86 memory ordering, we need only care about
1090 store-load ordering. Experimentally, "lock orl $0,0(%esp)" is
1091 faster than "mfence", so don't bother with the sse insn. */
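/* Illustrative encoding (assumption about the exact bytes): the sequence
   below is "lock orl $0, (%esp)", roughly f0 83 0c 24 00. */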
1092 if (a0 & TCG_MO_ST_LD) {
1094 tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1099 static inline void tcg_out_push(TCGContext *s, int reg)
1101 tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1104 static inline void tcg_out_pop(TCGContext *s, int reg)
1106 tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1109 static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1110 TCGReg arg1, intptr_t arg2)
1115 tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1117 tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1122 tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1127 /* There is no instruction that can validate 8-byte alignment. */
1128 tcg_debug_assert(ret >= 16);
1129 tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1133 * The gvec infrastructure asserts that v128 vector loads
1134 * and stores use a 16-byte aligned offset. Validate that the
1135 * final pointer is aligned by using an insn that will SIGSEGV.
1137 tcg_debug_assert(ret >= 16);
1138 tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
1142 * The gvec infrastructure only requires 16-byte alignment,
1143 * so here we must use an unaligned load.
1145 tcg_debug_assert(ret >= 16);
1146 tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1147 ret, 0, arg1, arg2);
1150 g_assert_not_reached();
1154 static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1155 TCGReg arg1, intptr_t arg2)
1160 tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1162 tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1167 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1172 /* There is no instruction that can validate 8-byte alignment. */
1173 tcg_debug_assert(arg >= 16);
1174 tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1178 * The gvec infrastructure asserts that v128 vector loads
1179 * and stores use a 16-byte aligned offset. Validate that the
1180 * final pointer is aligned by using an insn that will SIGSEGV.
1182 * This specific instance is also used by TCG_CALL_RET_BY_VEC,
1183 * for _WIN64, which must have SSE2 but may not have AVX.
1185 tcg_debug_assert(arg >= 16);
1187 tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
1189 tcg_out_modrm_offset(s, OPC_MOVDQA_WxVx, arg, arg1, arg2);
1194 * The gvec infrastructure only requires 16-byte alignment,
1195 * so here we must use an unaligned store.
1197 tcg_debug_assert(arg >= 16);
1198 tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1199 arg, 0, arg1, arg2);
1202 g_assert_not_reached();
1206 static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1207 TCGReg base, intptr_t ofs)
1210 if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1211 if (val != (int32_t)val) {
1215 } else if (type != TCG_TYPE_I32) {
1218 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1223 static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1225 /* Propagate an opcode prefix, such as P_DATA16. */
1226 int ext = subopc & ~0x7;
1230 tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1232 tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1237 static inline void tcg_out_bswap32(TCGContext *s, int reg)
1239 tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1242 static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1244 tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1247 static void tcg_out_ext8u(TCGContext *s, TCGReg dest, TCGReg src)
1250 tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1251 tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1254 static void tcg_out_ext8s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1256 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1258 tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1259 tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1262 static void tcg_out_ext16u(TCGContext *s, TCGReg dest, TCGReg src)
1265 tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1268 static void tcg_out_ext16s(TCGContext *s, TCGType type, TCGReg dest, TCGReg src)
1270 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1272 tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1275 static void tcg_out_ext32u(TCGContext *s, TCGReg dest, TCGReg src)
1277 /* 32-bit mov zero extends. */
1278 tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1281 static void tcg_out_ext32s(TCGContext *s, TCGReg dest, TCGReg src)
1283 tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
1284 tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1287 static void tcg_out_exts_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1289 tcg_out_ext32s(s, dest, src);
1292 static void tcg_out_extu_i32_i64(TCGContext *s, TCGReg dest, TCGReg src)
1295 tcg_out_ext32u(s, dest, src);
1299 static void tcg_out_extrl_i64_i32(TCGContext *s, TCGReg dest, TCGReg src)
1301 tcg_out_ext32u(s, dest, src);
1304 static inline void tcg_out_bswap64(TCGContext *s, int reg)
1306 tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1309 static void tgen_arithi(TCGContext *s, int c, int r0,
1310 tcg_target_long val, int cf)
1314 if (TCG_TARGET_REG_BITS == 64) {
1324 * ??? While INC is 2 bytes shorter than ADDL $1, they also induce
1325 * partial flags update stalls on Pentium4 and are not recommended
1326 * by current Intel optimization manuals.
1328 if (val == 1 || val == -1) {
1329 int is_inc = (c == ARITH_ADD) ^ (val < 0);
1330 if (TCG_TARGET_REG_BITS == 64) {
1332 * The single-byte increment encodings are re-tasked
1333 * as the REX prefixes. Use the MODRM encoding.
1335 tcg_out_modrm(s, OPC_GRP5 + rexw,
1336 (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1338 tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1344 * Facilitate using an 8-bit immediate. Carry is inverted
1345 * by this transformation, so do it only if cf == 0.
1347 c ^= ARITH_ADD ^ ARITH_SUB;
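/*
 * Illustrative size win (not from the original comment): "addl $128, %reg"
 * needs the imm32 form (81 /0, 6 bytes), while the equivalent
 * "subl $-128, %reg" fits the imm8 form (83 /5, 3 bytes).
 */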
1354 if (TCG_TARGET_REG_BITS == 64) {
1355 if (val == 0xffffffffu) {
1356 tcg_out_ext32u(s, r0, r0);
1359 if (val == (uint32_t)val) {
1360 /* AND with no high bits set can use a 32-bit operation. */
1364 if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1365 tcg_out_ext8u(s, r0, r0);
1368 if (val == 0xffffu) {
1369 tcg_out_ext16u(s, r0, r0);
1375 if (val == (int8_t)val) {
1376 tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1380 if (rexw == 0 || val == (int32_t)val) {
1381 tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1386 g_assert_not_reached();
1389 static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1392 tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1396 /* Set SMALL to force a short forward branch. */
1397 static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, bool small)
1402 val = tcg_pcrel_diff(s, l->u.value_ptr);
1404 if ((int8_t)val1 == val1) {
1406 tcg_out8(s, OPC_JMP_short);
1408 tcg_out8(s, OPC_JCC_short + opc);
1412 tcg_debug_assert(!small);
1414 tcg_out8(s, OPC_JMP_long);
1415 tcg_out32(s, val - 5);
1417 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1418 tcg_out32(s, val - 6);
1423 tcg_out8(s, OPC_JMP_short);
1425 tcg_out8(s, OPC_JCC_short + opc);
1427 tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1431 tcg_out8(s, OPC_JMP_long);
1433 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1435 tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1440 static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
1441 int const_arg2, int rexw)
1446 tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1448 tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1451 tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1455 static void tcg_out_brcond(TCGContext *s, int rexw, TCGCond cond,
1456 TCGArg arg1, TCGArg arg2, int const_arg2,
1457 TCGLabel *label, bool small)
1459 tcg_out_cmp(s, arg1, arg2, const_arg2, rexw);
1460 tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1463 #if TCG_TARGET_REG_BITS == 32
1464 static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
1465 const int *const_args, bool small)
1467 TCGLabel *label_next = gen_new_label();
1468 TCGLabel *label_this = arg_label(args[5]);
1472 tcg_out_brcond(s, 0, TCG_COND_NE, args[0], args[2], const_args[2],
1474 tcg_out_brcond(s, 0, TCG_COND_EQ, args[1], args[3], const_args[3],
1478 tcg_out_brcond(s, 0, TCG_COND_NE, args[0], args[2], const_args[2],
1480 tcg_out_brcond(s, 0, TCG_COND_NE, args[1], args[3], const_args[3],
1484 tcg_out_brcond(s, 0, TCG_COND_LT, args[1], args[3], const_args[3],
1486 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1487 tcg_out_brcond(s, 0, TCG_COND_LTU, args[0], args[2], const_args[2],
1491 tcg_out_brcond(s, 0, TCG_COND_LT, args[1], args[3], const_args[3],
1493 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1494 tcg_out_brcond(s, 0, TCG_COND_LEU, args[0], args[2], const_args[2],
1498 tcg_out_brcond(s, 0, TCG_COND_GT, args[1], args[3], const_args[3],
1500 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1501 tcg_out_brcond(s, 0, TCG_COND_GTU, args[0], args[2], const_args[2],
1505 tcg_out_brcond(s, 0, TCG_COND_GT, args[1], args[3], const_args[3],
1507 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1508 tcg_out_brcond(s, 0, TCG_COND_GEU, args[0], args[2], const_args[2],
1512 tcg_out_brcond(s, 0, TCG_COND_LTU, args[1], args[3], const_args[3],
1514 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1515 tcg_out_brcond(s, 0, TCG_COND_LTU, args[0], args[2], const_args[2],
1519 tcg_out_brcond(s, 0, TCG_COND_LTU, args[1], args[3], const_args[3],
1521 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1522 tcg_out_brcond(s, 0, TCG_COND_LEU, args[0], args[2], const_args[2],
1526 tcg_out_brcond(s, 0, TCG_COND_GTU, args[1], args[3], const_args[3],
1528 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1529 tcg_out_brcond(s, 0, TCG_COND_GTU, args[0], args[2], const_args[2],
1533 tcg_out_brcond(s, 0, TCG_COND_GTU, args[1], args[3], const_args[3],
1535 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1536 tcg_out_brcond(s, 0, TCG_COND_GEU, args[0], args[2], const_args[2],
1540 g_assert_not_reached();
1542 tcg_out_label(s, label_next);
1546 static void tcg_out_setcond(TCGContext *s, int rexw, TCGCond cond,
1547 TCGArg dest, TCGArg arg1, TCGArg arg2,
1548 int const_arg2, bool neg)
1558 /* If arg2 is 0, convert to LTU/GEU vs 1. */
1559 if (const_arg2 && arg2 == 0) {
1569 /* If arg2 is a register, swap for LTU/GEU. */
1584 * Relying on the carry bit, use SBB to produce -1 if LTU, 0 if GEU.
1585 * We can then use NEG or INC to produce the desired result.
1586 * This is always smaller than the SETCC expansion.
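 *
 * Illustrative sequence (an assumption, not from the original comment) for
 * TCG_COND_LTU with neg false: the CMP sets CF when arg1 < arg2 unsigned,
 * SBB dest,dest then leaves -1 or 0 in dest, and NEG turns that into 1 or 0.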
1588 tcg_out_cmp(s, arg1, arg2, const_arg2, rexw);
1590 /* X - X - C = -C = (C ? -1 : 0) */
1591 tgen_arithr(s, ARITH_SBB + (neg ? rexw : 0), dest, dest);
1593 /* ~(C ? -1 : 0) = (C ? 0 : -1) */
1594 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
1596 /* (C ? -1 : 0) + 1 = (C ? 0 : 1) */
1597 tgen_arithi(s, ARITH_ADD, dest, 1, 0);
1599 /* -(C ? -1 : 0) = (C ? 1 : 0) */
1600 tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_NEG, dest);
1608 /* If arg2 is 0, extract the sign bit. */
1609 if (const_arg2 && arg2 == 0) {
1610 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, dest, arg1);
1612 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, dest);
1614 tcg_out_shifti(s, (neg ? SHIFT_SAR : SHIFT_SHR) + rexw,
1615 dest, rexw ? 63 : 31);
1625 * If dest does not overlap the inputs, clearing it first is preferred.
1626 * The XOR breaks any false dependency for the low-byte write to dest,
1627 * and is also one byte smaller than MOVZBL.
1630 if (dest != arg1 && (const_arg2 || dest != arg2)) {
1631 tgen_arithr(s, ARITH_XOR, dest, dest);
1635 tcg_out_cmp(s, arg1, arg2, const_arg2, rexw);
1636 tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1639 tcg_out_ext8u(s, dest, dest);
1642 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, dest);
1646 #if TCG_TARGET_REG_BITS == 32
1647 static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1648 const int *const_args)
1651 TCGLabel *label_true, *label_over;
1653 memcpy(new_args, args+1, 5*sizeof(TCGArg));
1655 if (args[0] == args[1] || args[0] == args[2]
1656 || (!const_args[3] && args[0] == args[3])
1657 || (!const_args[4] && args[0] == args[4])) {
1658 /* When the destination overlaps with one of the argument
1659 registers, don't do anything tricky. */
1660 label_true = gen_new_label();
1661 label_over = gen_new_label();
1663 new_args[5] = label_arg(label_true);
1664 tcg_out_brcond2(s, new_args, const_args+1, 1);
1666 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1667 tcg_out_jxx(s, JCC_JMP, label_over, 1);
1668 tcg_out_label(s, label_true);
1670 tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1671 tcg_out_label(s, label_over);
1673 /* When the destination does not overlap one of the arguments,
1674 clear the destination first, jump if cond false, and emit an
1675 increment in the true case. This results in smaller code. */
1677 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1679 label_over = gen_new_label();
1680 new_args[4] = tcg_invert_cond(new_args[4]);
1681 new_args[5] = label_arg(label_over);
1682 tcg_out_brcond2(s, new_args, const_args+1, 1);
1684 tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1685 tcg_out_label(s, label_over);
1690 static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1691 TCGReg dest, TCGReg v1)
1694 tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1696 TCGLabel *over = gen_new_label();
1697 tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1698 tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1699 tcg_out_label(s, over);
1703 static void tcg_out_movcond(TCGContext *s, int rexw, TCGCond cond,
1704 TCGReg dest, TCGReg c1, TCGArg c2, int const_c2,
1707 tcg_out_cmp(s, c1, c2, const_c2, rexw);
1708 tcg_out_cmov(s, cond, rexw, dest, v1);
1711 static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1712 TCGArg arg2, bool const_a2)
1715 tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1717 tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1719 tcg_debug_assert(dest != arg2);
1720 tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1723 tcg_debug_assert(dest != arg2);
1724 tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1725 tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1729 static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1730 TCGArg arg2, bool const_a2)
1733 tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1735 tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1737 tcg_debug_assert(dest != arg2);
1738 tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1741 tcg_debug_assert(!const_a2);
1742 tcg_debug_assert(dest != arg1);
1743 tcg_debug_assert(dest != arg2);
1745 /* Recall that the output of BSR is the index not the count. */
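/*
 * Example (illustrative): for a 32-bit input whose highest set bit is
 * bit 30, BSR returns 30 and 30 ^ 31 = 1, which is the correct clz.
 */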
1746 tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1747 tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1749 /* Since we have destroyed the flags from BSR, we have to re-test. */
1750 tcg_out_cmp(s, arg1, 0, 1, rexw);
1751 tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1755 static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
1757 intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1759 if (disp == (int32_t)disp) {
1760 tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1763 /* rip-relative addressing into the constant pool.
1764 This is 6 + 8 = 14 bytes, as compared to using an
1765 immediate load 10 + 6 = 16 bytes, plus we may
1766 be able to re-use the pool constant for more calls. */
1767 tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1768 tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1769 new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1774 static void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest,
1775 const TCGHelperInfo *info)
1777 tcg_out_branch(s, 1, dest);
1780 if (TCG_TARGET_REG_BITS == 32 && info->out_kind == TCG_CALL_RET_BY_REF) {
1782 * The sysv i386 abi for struct return places a reference as the
1783 * first argument on the stack, and pops that argument with the
1784 * return statement. Since we want to retain the aligned stack
1785 * pointer for the callee, we do not want to actually push that
1786 * argument before the call but rely on the normal store to the
1787 * stack slot. But we do need to compensate for the pop in order
1788 * to reset our correct stack pointer value.
1789 * Pushing a garbage value back onto the stack is quickest.
1791 tcg_out_push(s, TCG_REG_EAX);
1796 static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
1798 tcg_out_branch(s, 0, dest);
1801 static void tcg_out_nopn(TCGContext *s, int n)
1804 /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1805 * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1806 * duplicate prefix, and all of the interesting recent cores can
1807 * decode and discard the duplicates in a single cycle.
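 *
 * For example (illustrative), n = 2 emits 66 90 ("xchg %ax,%ax") while
 * n = 1 emits the plain 90 nop.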
1809 tcg_debug_assert(n >= 1);
1810 for (i = 1; i < n; ++i) {
1816 /* Test register R vs immediate bits I, setting Z flag for EQ/NE. */
1817 static void __attribute__((unused))
1818 tcg_out_testi(TCGContext *s, TCGReg r, uint32_t i)
1821 * This is used for testing alignment, so we can usually use testb.
1822 * For i686, we have to use testl for %esi/%edi.
1824 if (i <= 0xff && (TCG_TARGET_REG_BITS == 64 || r < 4)) {
1825 tcg_out_modrm(s, OPC_GRP3_Eb | P_REXB_RM, EXT3_TESTi, r);
1828 tcg_out_modrm(s, OPC_GRP3_Ev, EXT3_TESTi, r);
1841 bool tcg_target_has_memory_bswap(MemOp memop)
1848 if ((memop & MO_SIZE) < MO_128) {
1853 * Reject 16-byte memop with 16-byte atomicity, i.e. VMOVDQA,
1854 * but do allow a pair of 64-bit operations, i.e. MOVBEQ.
1856 aa = atom_and_align_for_opc(tcg_ctx, memop, MO_ATOM_IFALIGN, true);
1857 return aa.atom < MO_128;
1861 * Because i686 has no register parameters and because x86_64 has xchg
1862 * to handle addr/data register overlap, we have placed all input arguments
1863 * before we might need a scratch reg.
1865 * Even then, a scratch is only needed for l->raddr. Rather than expose
1866 * a general-purpose scratch when we don't actually know it's available,
1867 * use the ra_gen hook to load into RAX if needed.
1869 #if TCG_TARGET_REG_BITS == 64
1870 static TCGReg ldst_ra_gen(TCGContext *s, const TCGLabelQemuLdst *l, int arg)
1875 tcg_out_movi(s, TCG_TYPE_PTR, arg, (uintptr_t)l->raddr);
1878 static const TCGLdstHelperParam ldst_helper_param = {
1879 .ra_gen = ldst_ra_gen
1882 static const TCGLdstHelperParam ldst_helper_param = { };
1885 static void tcg_out_vec_to_pair(TCGContext *s, TCGType type,
1886 TCGReg l, TCGReg h, TCGReg v)
1888 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1890 /* vpmov{d,q} %v, %l */
1891 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, v, 0, l);
1892 /* vpextr{d,q} $1, %v, %h */
1893 tcg_out_vex_modrm(s, OPC_PEXTRD + rexw, v, 0, h);
1897 static void tcg_out_pair_to_vec(TCGContext *s, TCGType type,
1898 TCGReg v, TCGReg l, TCGReg h)
1900 int rexw = type == TCG_TYPE_I32 ? 0 : P_REXW;
1902 /* vmov{d,q} %l, %v */
1903 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, v, 0, l);
1904 /* vpinsr{d,q} $1, %h, %v, %v */
1905 tcg_out_vex_modrm(s, OPC_PINSRD + rexw, v, v, h);
1910 * Generate code for the slow path for a load at the end of block
1912 static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1914 MemOp opc = get_memop(l->oi);
1915 tcg_insn_unit **label_ptr = &l->label_ptr[0];
1917 /* resolve label address */
1918 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1920 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1923 tcg_out_ld_helper_args(s, l, &ldst_helper_param);
1924 tcg_out_branch(s, 1, qemu_ld_helpers[opc & MO_SIZE]);
1925 tcg_out_ld_helper_ret(s, l, false, &ldst_helper_param);
1927 tcg_out_jmp(s, l->raddr);
1932 * Generate code for the slow path for a store at the end of block
1934 static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1936 MemOp opc = get_memop(l->oi);
1937 tcg_insn_unit **label_ptr = &l->label_ptr[0];
1939 /* resolve label address */
1940 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1942 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1945 tcg_out_st_helper_args(s, l, &ldst_helper_param);
1946 tcg_out_branch(s, 1, qemu_st_helpers[opc & MO_SIZE]);
1948 tcg_out_jmp(s, l->raddr);
1952 #ifdef CONFIG_USER_ONLY
1953 static HostAddress x86_guest_base = {
1957 #if defined(__x86_64__) && defined(__linux__)
1958 # include <asm/prctl.h>
1959 # include <sys/prctl.h>
1960 int arch_prctl(int code, unsigned long addr);
1961 static inline int setup_guest_base_seg(void)
1963 if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
1968 #define setup_guest_base_seg setup_guest_base_seg
1969 #elif defined(__x86_64__) && \
1970 (defined (__FreeBSD__) || defined (__FreeBSD_kernel__))
1971 # include <machine/sysarch.h>
1972 static inline int setup_guest_base_seg(void)
1974 if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
1979 #define setup_guest_base_seg setup_guest_base_seg
1982 # define x86_guest_base (*(HostAddress *)({ qemu_build_not_reached(); NULL; }))
1983 #endif /* CONFIG_USER_ONLY */
1984 #ifndef setup_guest_base_seg
1985 # define setup_guest_base_seg() 0
1988 #define MIN_TLB_MASK_TABLE_OFS INT_MIN
1991 * For softmmu, perform the TLB load and compare.
1992 * For useronly, perform any required alignment tests.
1993 * In both cases, return a TCGLabelQemuLdst structure if the slow path
1994 * is required and fill in @h with the host address for the fast path.
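 *
 * Illustrative note (an assumption drawn from the uses below): the fast
 * path then addresses guest memory as h->seg:h->ofs(h->base,h->index),
 * with h->aa describing the atomicity/alignment the caller must honor.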
1996 static TCGLabelQemuLdst *prepare_host_addr(TCGContext *s, HostAddress *h,
1997 TCGReg addrlo, TCGReg addrhi,
1998 MemOpIdx oi, bool is_ld)
2000 TCGLabelQemuLdst *ldst = NULL;
2001 MemOp opc = get_memop(oi);
2002 MemOp s_bits = opc & MO_SIZE;
2005 if (tcg_use_softmmu) {
2006 h->index = TCG_REG_L0;
2010 *h = x86_guest_base;
2013 h->aa = atom_and_align_for_opc(s, opc, MO_ATOM_IFALIGN, s_bits == MO_128);
2014 a_mask = (1 << h->aa.align) - 1;
2016 if (tcg_use_softmmu) {
2017 int cmp_ofs = is_ld ? offsetof(CPUTLBEntry, addr_read)
2018 : offsetof(CPUTLBEntry, addr_write);
2019 TCGType ttype = TCG_TYPE_I32;
2020 TCGType tlbtype = TCG_TYPE_I32;
2021 int trexw = 0, hrexw = 0, tlbrexw = 0;
2022 unsigned mem_index = get_mmuidx(oi);
2023 unsigned s_mask = (1 << s_bits) - 1;
2024 int fast_ofs = tlb_mask_table_ofs(s, mem_index);
2027 ldst = new_ldst_label(s);
2028 ldst->is_ld = is_ld;
2030 ldst->addrlo_reg = addrlo;
2031 ldst->addrhi_reg = addrhi;
2033 if (TCG_TARGET_REG_BITS == 64) {
2034 ttype = s->addr_type;
2035 trexw = (ttype == TCG_TYPE_I32 ? 0 : P_REXW);
2036 if (TCG_TYPE_PTR == TCG_TYPE_I64) {
2038 if (s->page_bits + s->tlb_dyn_max_bits > 32) {
2039 tlbtype = TCG_TYPE_I64;
2045 tcg_out_mov(s, tlbtype, TCG_REG_L0, addrlo);
2046 tcg_out_shifti(s, SHIFT_SHR + tlbrexw, TCG_REG_L0,
2047 s->page_bits - CPU_TLB_ENTRY_BITS);
2049 tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, TCG_REG_L0, TCG_AREG0,
2050 fast_ofs + offsetof(CPUTLBDescFast, mask));
2052 tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, TCG_REG_L0, TCG_AREG0,
2053 fast_ofs + offsetof(CPUTLBDescFast, table));
2056 * If the required alignment is at least as large as the access,
2057 * simply copy the address and mask. For lesser alignments,
2058 * check that we don't cross pages for the complete access.
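 *
 * Example (illustrative): for a 4-byte access with no alignment
 * requirement (a_mask = 0, s_mask = 3), we compute addr + 3 and mask
 * with the page mask, so an access spanning a page boundary fails the
 * TLB compare and takes the slow path.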
2060 if (a_mask >= s_mask) {
2061 tcg_out_mov(s, ttype, TCG_REG_L1, addrlo);
2063 tcg_out_modrm_offset(s, OPC_LEA + trexw, TCG_REG_L1,
2064 addrlo, s_mask - a_mask);
2066 tlb_mask = s->page_mask | a_mask;
2067 tgen_arithi(s, ARITH_AND + trexw, TCG_REG_L1, tlb_mask, 0);
2069 /* cmp 0(TCG_REG_L0), TCG_REG_L1 */
2070 tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw,
2071 TCG_REG_L1, TCG_REG_L0, cmp_ofs);
2074 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2075 ldst->label_ptr[0] = s->code_ptr;
2078 if (TCG_TARGET_REG_BITS == 32 && s->addr_type == TCG_TYPE_I64) {
2079 /* cmp 4(TCG_REG_L0), addrhi */
2080 tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi,
2081 TCG_REG_L0, cmp_ofs + 4);
2084 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2085 ldst->label_ptr[1] = s->code_ptr;
2090 tcg_out_ld(s, TCG_TYPE_PTR, TCG_REG_L0, TCG_REG_L0,
2091 offsetof(CPUTLBEntry, addend));
2092 } else if (a_mask) {
2093 ldst = new_ldst_label(s);
2095 ldst->is_ld = is_ld;
2097 ldst->addrlo_reg = addrlo;
2098 ldst->addrhi_reg = addrhi;
2100 tcg_out_testi(s, addrlo, a_mask);
2102 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
2103 ldst->label_ptr[0] = s->code_ptr;
2110 static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2111 HostAddress h, TCGType type, MemOp memop)
2113 bool use_movbe = false;
2114 int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
2115 int movop = OPC_MOVL_GvEv;
2117 /* Do big-endian loads with movbe. */
2118 if (memop & MO_BSWAP) {
2119 tcg_debug_assert(have_movbe);
2121 movop = OPC_MOVBE_GyMy;
2124 switch (memop & MO_SSIZE) {
2126 tcg_out_modrm_sib_offset(s, OPC_MOVZBL + h.seg, datalo,
2127 h.base, h.index, 0, h.ofs);
2130 tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + h.seg, datalo,
2131 h.base, h.index, 0, h.ofs);
2135 /* There is no extending movbe; only low 16-bits are modified. */
2136 if (datalo != h.base && datalo != h.index) {
2137 /* XOR breaks dependency chains. */
2138 tgen_arithr(s, ARITH_XOR, datalo, datalo);
2139 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2140 datalo, h.base, h.index, 0, h.ofs);
2142 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2143 datalo, h.base, h.index, 0, h.ofs);
2144 tcg_out_ext16u(s, datalo, datalo);
2147 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + h.seg, datalo,
2148 h.base, h.index, 0, h.ofs);
2153 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + h.seg,
2154 datalo, h.base, h.index, 0, h.ofs);
2155 tcg_out_ext16s(s, type, datalo, datalo);
2157 tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + h.seg,
2158 datalo, h.base, h.index, 0, h.ofs);
2162 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2163 h.base, h.index, 0, h.ofs);
2165 #if TCG_TARGET_REG_BITS == 64
2168 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + h.seg, datalo,
2169 h.base, h.index, 0, h.ofs);
2170 tcg_out_ext32s(s, datalo, datalo);
2172 tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + h.seg, datalo,
2173 h.base, h.index, 0, h.ofs);
2178 if (TCG_TARGET_REG_BITS == 64) {
2179 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2180 h.base, h.index, 0, h.ofs);
2188 if (h.base == datalo || h.index == datalo) {
2189 tcg_out_modrm_sib_offset(s, OPC_LEA, datahi,
2190 h.base, h.index, 0, h.ofs);
2191 tcg_out_modrm_offset(s, movop + h.seg, datalo, datahi, 0);
2192 tcg_out_modrm_offset(s, movop + h.seg, datahi, datahi, 4);
2194 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2195 h.base, h.index, 0, h.ofs);
2196 tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
2197 h.base, h.index, 0, h.ofs + 4);
2202 tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2205 * Without 16-byte atomicity, use integer regs.
2206 * That is where we want the data, and it allows bswaps.
2208 if (h.aa.atom < MO_128) {
2214 if (h.base == datalo || h.index == datalo) {
2215 tcg_out_modrm_sib_offset(s, OPC_LEA + P_REXW, datahi,
2216 h.base, h.index, 0, h.ofs);
2217 tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
2219 tcg_out_modrm_offset(s, movop + P_REXW + h.seg,
2222 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2223 h.base, h.index, 0, h.ofs);
2224 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
2225 h.base, h.index, 0, h.ofs + 8);
2231 * With 16-byte atomicity, a vector load is required.
2232 * If we already have 16-byte alignment, then VMOVDQA always works.
2233 * Else if VMOVDQU has atomicity with dynamic alignment, use that.
2234 * Else we require a runtime test for alignment for VMOVDQA;
2235 * use VMOVDQU on the unaligned nonatomic path for simplicity.
2237 if (h.aa.align >= MO_128) {
2238 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
2240 h.base, h.index, 0, h.ofs);
2241 } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
2242 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
2244 h.base, h.index, 0, h.ofs);
2246 TCGLabel *l1 = gen_new_label();
2247 TCGLabel *l2 = gen_new_label();
2249 tcg_out_testi(s, h.base, 15);
2250 tcg_out_jxx(s, JCC_JNE, l1, true);
2252 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_VxWx + h.seg,
2254 h.base, h.index, 0, h.ofs);
2255 tcg_out_jxx(s, JCC_JMP, l2, true);
2257 tcg_out_label(s, l1);
2258 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_VxWx + h.seg,
2260 h.base, h.index, 0, h.ofs);
2261 tcg_out_label(s, l2);
2263 tcg_out_vec_to_pair(s, TCG_TYPE_I64, datalo, datahi, TCG_TMP_VEC);
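/*
 * Rough shape of the runtime-alignment path above (the address
 * register is only an example; TCG_TMP_VEC receives the load either way):
 *     test    $15, %addr
 *     jne     1f
 *     vmovdqa (%addr), TCG_TMP_VEC    ; aligned path
 *     jmp     2f
 * 1:  vmovdqu (%addr), TCG_TMP_VEC    ; unaligned, nonatomic path
 * 2:  ...                             ; then split into datalo/datahi
 */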
2267 g_assert_not_reached();
2271 static void tcg_out_qemu_ld(TCGContext *s, TCGReg datalo, TCGReg datahi,
2272 TCGReg addrlo, TCGReg addrhi,
2273 MemOpIdx oi, TCGType data_type)
2275 TCGLabelQemuLdst *ldst;
2278 ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, true);
2279 tcg_out_qemu_ld_direct(s, datalo, datahi, h, data_type, get_memop(oi));
2282 ldst->type = data_type;
2283 ldst->datalo_reg = datalo;
2284 ldst->datahi_reg = datahi;
2285 ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2289 static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2290 HostAddress h, MemOp memop)
2292 bool use_movbe = false;
2293 int movop = OPC_MOVL_EvGv;
2296 * Do big-endian stores with movbe or system-mode.
2297 * User-only without movbe will have its swapping done generically.
2299 if (memop & MO_BSWAP) {
2300 tcg_debug_assert(have_movbe);
2302 movop = OPC_MOVBE_MyGy;
2305 switch (memop & MO_SIZE) {
2307 /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
2308 tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
2309 tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + h.seg,
2310 datalo, h.base, h.index, 0, h.ofs);
2313 tcg_out_modrm_sib_offset(s, movop + P_DATA16 + h.seg, datalo,
2314 h.base, h.index, 0, h.ofs);
2317 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2318 h.base, h.index, 0, h.ofs);
2321 if (TCG_TARGET_REG_BITS == 64) {
2322 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2323 h.base, h.index, 0, h.ofs);
2330 tcg_out_modrm_sib_offset(s, movop + h.seg, datalo,
2331 h.base, h.index, 0, h.ofs);
2332 tcg_out_modrm_sib_offset(s, movop + h.seg, datahi,
2333 h.base, h.index, 0, h.ofs + 4);
2338 tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2341 * Without 16-byte atomicity, use integer regs.
2342 * That is where we have the data, and it allows bswaps.
2344 if (h.aa.atom < MO_128) {
2350 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datalo,
2351 h.base, h.index, 0, h.ofs);
2352 tcg_out_modrm_sib_offset(s, movop + P_REXW + h.seg, datahi,
2353 h.base, h.index, 0, h.ofs + 8);
2358 * With 16-byte atomicity, a vector store is required.
2359 * If we already have 16-byte alignment, then VMOVDQA always works.
2360 * Else if VMOVDQU has atomicity with dynamic alignment, use that.
2361 * Else we require a runtime test for alignment for VMOVDQA;
2362 * use VMOVDQU on the unaligned nonatomic path for simplicity.
2364 tcg_out_pair_to_vec(s, TCG_TYPE_I64, TCG_TMP_VEC, datalo, datahi);
2365 if (h.aa.align >= MO_128) {
2366 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
2368 h.base, h.index, 0, h.ofs);
2369 } else if (cpuinfo & CPUINFO_ATOMIC_VMOVDQU) {
2370 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
2372 h.base, h.index, 0, h.ofs);
2374 TCGLabel *l1 = gen_new_label();
2375 TCGLabel *l2 = gen_new_label();
2377 tcg_out_testi(s, h.base, 15);
2378 tcg_out_jxx(s, JCC_JNE, l1, true);
2380 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQA_WxVx + h.seg,
2382 h.base, h.index, 0, h.ofs);
2383 tcg_out_jxx(s, JCC_JMP, l2, true);
2385 tcg_out_label(s, l1);
2386 tcg_out_vex_modrm_sib_offset(s, OPC_MOVDQU_WxVx + h.seg,
2388 h.base, h.index, 0, h.ofs);
2389 tcg_out_label(s, l2);
2394 g_assert_not_reached();
2398 static void tcg_out_qemu_st(TCGContext *s, TCGReg datalo, TCGReg datahi,
2399 TCGReg addrlo, TCGReg addrhi,
2400 MemOpIdx oi, TCGType data_type)
2402 TCGLabelQemuLdst *ldst;
2405 ldst = prepare_host_addr(s, &h, addrlo, addrhi, oi, false);
2406 tcg_out_qemu_st_direct(s, datalo, datahi, h, get_memop(oi));
2409 ldst->type = data_type;
2410 ldst->datalo_reg = datalo;
2411 ldst->datahi_reg = datahi;
2412 ldst->raddr = tcg_splitwx_to_rx(s->code_ptr);
2416 static void tcg_out_exit_tb(TCGContext *s, uintptr_t a0)
2418 /* Reuse the zeroing that exists for goto_ptr. */
2420 tcg_out_jmp(s, tcg_code_gen_epilogue);
2422 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2423 tcg_out_jmp(s, tb_ret_addr);
2427 static void tcg_out_goto_tb(TCGContext *s, int which)
2430 * Jump displacement must be aligned for atomic patching;
2431 * see if we need to add extra nops before the jump.
2433 int gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
2435 tcg_out_nopn(s, gap - 1);
2437 tcg_out8(s, OPC_JMP_long); /* jmp im */
2438 set_jmp_insn_offset(s, which);
2440 set_jmp_reset_offset(s, which);
2443 void tb_target_set_jmp_target(const TranslationBlock *tb, int n,
2444 uintptr_t jmp_rx, uintptr_t jmp_rw)
2446 /* patch the branch destination */
2447 uintptr_t addr = tb->jmp_target_addr[n];
2448 qatomic_set((int32_t *)jmp_rw, addr - (jmp_rx + 4));
2449 /* no need to flush icache explicitly */
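/*
 * For example, if the 32-bit displacement field of the jump lives at
 * host address 0x1000 and the new target is 0x2000, the value stored
 * is 0x2000 - (0x1000 + 4) = 0xffc, since x86 jumps are relative to
 * the end of the instruction.  The 4-byte alignment arranged by
 * tcg_out_goto_tb makes the qatomic_set a single naturally aligned
 * store.  (Addresses here are illustrative only.)
 */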
2452 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2453 const TCGArg args[TCG_MAX_OP_ARGS],
2454 const int const_args[TCG_MAX_OP_ARGS])
2457 int c, const_a2, vexop, rexw = 0;
2459 #if TCG_TARGET_REG_BITS == 64
2460 # define OP_32_64(x) \
2461 case glue(glue(INDEX_op_, x), _i64): \
2462 rexw = P_REXW; /* FALLTHRU */ \
2463 case glue(glue(INDEX_op_, x), _i32)
2465 # define OP_32_64(x) \
2466 case glue(glue(INDEX_op_, x), _i32)
2469 /* Hoist the loads of the most common arguments. */
2473 const_a2 = const_args[2];
2476 case INDEX_op_goto_ptr:
2477 /* jmp to the given host address (could be epilogue) */
2478 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2481 tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2484 /* Note that we can ignore REXW for the zero-extend to 64-bit. */
2485 tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2488 tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2491 /* Note that we can ignore REXW for the zero-extend to 64-bit. */
2492 tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2495 tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2497 #if TCG_TARGET_REG_BITS == 64
2498 case INDEX_op_ld32u_i64:
2500 case INDEX_op_ld_i32:
2501 tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2505 if (const_args[0]) {
2506 tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2509 tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2513 if (const_args[0]) {
2514 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2517 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2520 #if TCG_TARGET_REG_BITS == 64
2521 case INDEX_op_st32_i64:
2523 case INDEX_op_st_i32:
2524 if (const_args[0]) {
2525 tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2528 tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2533 /* For 3-operand addition, use LEA. */
2538 } else if (a0 == a2) {
2539 /* Watch out for dest = src + dest, since we've removed
2540 the matching constraint on the add. */
2541 tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2545 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2564 tgen_arithi(s, c + rexw, a0, a2, 0);
2566 tgen_arithr(s, c + rexw, a0, a2);
2572 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2573 tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2575 tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2583 if (val == (int8_t)val) {
2584 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2587 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2591 tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2596 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2599 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2603 /* For small constant 3-operand shift, use LEA. */
2604 if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2606 /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2607 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2609 /* shl $n,a1,a0 -> lea 0(,a1,1<<n),a0, for n = 2 or 3 */
2610 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2616 goto gen_shift_maybe_vex;
2620 goto gen_shift_maybe_vex;
2624 goto gen_shift_maybe_vex;
2631 gen_shift_maybe_vex:
2634 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2637 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2642 tcg_out_shifti(s, c + rexw, a0, a2);
2644 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2649 tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2652 tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2655 tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2659 tcg_out_brcond(s, rexw, a2, a0, a1, const_args[1],
2660 arg_label(args[3]), 0);
2663 tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, false);
2665 OP_32_64(negsetcond):
2666 tcg_out_setcond(s, rexw, args[3], a0, a1, a2, const_a2, true);
2669 tcg_out_movcond(s, rexw, args[5], a0, a1, a2, const_a2, args[3]);
2673 if (a2 & TCG_BSWAP_OS) {
2674 /* Output must be sign-extended. */
2676 tcg_out_bswap64(s, a0);
2677 tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48);
2679 tcg_out_bswap32(s, a0);
2680 tcg_out_shifti(s, SHIFT_SAR, a0, 16);
2682 } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
2683 /* Output must be zero-extended, but input isn't. */
2684 tcg_out_bswap32(s, a0);
2685 tcg_out_shifti(s, SHIFT_SHR, a0, 16);
2687 tcg_out_rolw_8(s, a0);
2691 tcg_out_bswap32(s, a0);
2692 if (rexw && (a2 & TCG_BSWAP_OS)) {
2693 tcg_out_ext32s(s, a0, a0);
2698 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2701 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2704 case INDEX_op_qemu_ld_a64_i32:
2705 if (TCG_TARGET_REG_BITS == 32) {
2706 tcg_out_qemu_ld(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2710 case INDEX_op_qemu_ld_a32_i32:
2711 tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2713 case INDEX_op_qemu_ld_a32_i64:
2714 if (TCG_TARGET_REG_BITS == 64) {
2715 tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2717 tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2720 case INDEX_op_qemu_ld_a64_i64:
2721 if (TCG_TARGET_REG_BITS == 64) {
2722 tcg_out_qemu_ld(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2724 tcg_out_qemu_ld(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2727 case INDEX_op_qemu_ld_a32_i128:
2728 case INDEX_op_qemu_ld_a64_i128:
2729 tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2730 tcg_out_qemu_ld(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
2733 case INDEX_op_qemu_st_a64_i32:
2734 case INDEX_op_qemu_st8_a64_i32:
2735 if (TCG_TARGET_REG_BITS == 32) {
2736 tcg_out_qemu_st(s, a0, -1, a1, a2, args[3], TCG_TYPE_I32);
2740 case INDEX_op_qemu_st_a32_i32:
2741 case INDEX_op_qemu_st8_a32_i32:
2742 tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I32);
2744 case INDEX_op_qemu_st_a32_i64:
2745 if (TCG_TARGET_REG_BITS == 64) {
2746 tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2748 tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I64);
2751 case INDEX_op_qemu_st_a64_i64:
2752 if (TCG_TARGET_REG_BITS == 64) {
2753 tcg_out_qemu_st(s, a0, -1, a1, -1, a2, TCG_TYPE_I64);
2755 tcg_out_qemu_st(s, a0, a1, a2, args[3], args[4], TCG_TYPE_I64);
2758 case INDEX_op_qemu_st_a32_i128:
2759 case INDEX_op_qemu_st_a64_i128:
2760 tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
2761 tcg_out_qemu_st(s, a0, a1, a2, -1, args[3], TCG_TYPE_I128);
2765 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2768 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2771 if (const_args[4]) {
2772 tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2774 tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2776 if (const_args[5]) {
2777 tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2779 tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2783 if (const_args[4]) {
2784 tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2786 tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2788 if (const_args[5]) {
2789 tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2791 tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2795 #if TCG_TARGET_REG_BITS == 32
2796 case INDEX_op_brcond2_i32:
2797 tcg_out_brcond2(s, args, const_args, 0);
2799 case INDEX_op_setcond2_i32:
2800 tcg_out_setcond2(s, args, const_args);
2802 #else /* TCG_TARGET_REG_BITS == 64 */
2803 case INDEX_op_ld32s_i64:
2804 tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2806 case INDEX_op_ld_i64:
2807 tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2809 case INDEX_op_st_i64:
2810 if (const_args[0]) {
2811 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2814 tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2818 case INDEX_op_bswap64_i64:
2819 tcg_out_bswap64(s, a0);
2821 case INDEX_op_extrh_i64_i32:
2822 tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2827 if (args[3] == 0 && args[4] == 8) {
2828 /* load bits 0..7 */
2830 tcg_out_opc(s, OPC_MOVB_Ib | P_REXB_RM | LOWREGMASK(a0),
2834 tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2836 } else if (TCG_TARGET_REG_BITS == 32 && args[3] == 8 && args[4] == 8) {
2837 /* load bits 8..15 */
2839 tcg_out8(s, OPC_MOVB_Ib + a0 + 4);
2842 tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2844 } else if (args[3] == 0 && args[4] == 16) {
2845 /* load bits 0..15 */
2847 tcg_out_opc(s, OPC_MOVL_Iv | P_DATA16 | LOWREGMASK(a0),
2851 tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2854 g_assert_not_reached();
2858 case INDEX_op_extract_i64:
2859 if (a2 + args[3] == 32) {
2860 /* This is a 32-bit zero-extending right shift. */
2861 tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2862 tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2866 case INDEX_op_extract_i32:
2867 /* On the off-chance that we can use the high-byte registers.
2868 Otherwise we emit the same ext16 + shift pattern that we
2869 would have gotten from the normal tcg-op.c expansion. */
2870 tcg_debug_assert(a2 == 8 && args[3] == 8);
2871 if (a1 < 4 && a0 < 8) {
2872 tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2874 tcg_out_ext16u(s, a0, a1);
2875 tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2879 case INDEX_op_sextract_i32:
2880 /* We don't implement sextract_i64, as we cannot sign-extend to
2881 64-bits without using the REX prefix that explicitly excludes
2882 access to the high-byte registers. */
2883 tcg_debug_assert(a2 == 8 && args[3] == 8);
2884 if (a1 < 4 && a0 < 8) {
2885 tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2887 tcg_out_ext16s(s, TCG_TYPE_I32, a0, a1);
2888 tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2893 /* Note that SHRD outputs to the r/m operand. */
2894 tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
2895 tcg_out8(s, args[3]);
2901 case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
2902 case INDEX_op_mov_i64:
2903 case INDEX_op_call: /* Always emitted via tcg_out_call. */
2904 case INDEX_op_exit_tb: /* Always emitted via tcg_out_exit_tb. */
2905 case INDEX_op_goto_tb: /* Always emitted via tcg_out_goto_tb. */
2906 case INDEX_op_ext8s_i32: /* Always emitted via tcg_reg_alloc_op. */
2907 case INDEX_op_ext8s_i64:
2908 case INDEX_op_ext8u_i32:
2909 case INDEX_op_ext8u_i64:
2910 case INDEX_op_ext16s_i32:
2911 case INDEX_op_ext16s_i64:
2912 case INDEX_op_ext16u_i32:
2913 case INDEX_op_ext16u_i64:
2914 case INDEX_op_ext32s_i64:
2915 case INDEX_op_ext32u_i64:
2916 case INDEX_op_ext_i32_i64:
2917 case INDEX_op_extu_i32_i64:
2918 case INDEX_op_extrl_i64_i32:
2920 g_assert_not_reached();
2926 static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2927 unsigned vecl, unsigned vece,
2928 const TCGArg args[TCG_MAX_OP_ARGS],
2929 const int const_args[TCG_MAX_OP_ARGS])
2931 static int const add_insn[4] = {
2932 OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2934 static int const ssadd_insn[4] = {
2935 OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
2937 static int const usadd_insn[4] = {
2938 OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
2940 static int const sub_insn[4] = {
2941 OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2943 static int const sssub_insn[4] = {
2944 OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
2946 static int const ussub_insn[4] = {
2947 OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
2949 static int const mul_insn[4] = {
2950 OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_VPMULLQ
2952 static int const shift_imm_insn[4] = {
2953 OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2955 static int const cmpeq_insn[4] = {
2956 OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2958 static int const cmpgt_insn[4] = {
2959 OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2961 static int const punpckl_insn[4] = {
2962 OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2964 static int const punpckh_insn[4] = {
2965 OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2967 static int const packss_insn[4] = {
2968 OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2970 static int const packus_insn[4] = {
2971 OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2973 static int const smin_insn[4] = {
2974 OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_VPMINSQ
2976 static int const smax_insn[4] = {
2977 OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_VPMAXSQ
2979 static int const umin_insn[4] = {
2980 OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_VPMINUQ
2982 static int const umax_insn[4] = {
2983 OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_VPMAXUQ
2985 static int const rotlv_insn[4] = {
2986 OPC_UD2, OPC_UD2, OPC_VPROLVD, OPC_VPROLVQ
2988 static int const rotrv_insn[4] = {
2989 OPC_UD2, OPC_UD2, OPC_VPRORVD, OPC_VPRORVQ
2991 static int const shlv_insn[4] = {
2992 OPC_UD2, OPC_VPSLLVW, OPC_VPSLLVD, OPC_VPSLLVQ
2994 static int const shrv_insn[4] = {
2995 OPC_UD2, OPC_VPSRLVW, OPC_VPSRLVD, OPC_VPSRLVQ
2997 static int const sarv_insn[4] = {
2998 OPC_UD2, OPC_VPSRAVW, OPC_VPSRAVD, OPC_VPSRAVQ
3000 static int const shls_insn[4] = {
3001 OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
3003 static int const shrs_insn[4] = {
3004 OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
3006 static int const sars_insn[4] = {
3007 OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_VPSRAQ
3009 static int const vpshldi_insn[4] = {
3010 OPC_UD2, OPC_VPSHLDW, OPC_VPSHLDD, OPC_VPSHLDQ
3012 static int const vpshldv_insn[4] = {
3013 OPC_UD2, OPC_VPSHLDVW, OPC_VPSHLDVD, OPC_VPSHLDVQ
3015 static int const vpshrdv_insn[4] = {
3016 OPC_UD2, OPC_VPSHRDVW, OPC_VPSHRDVD, OPC_VPSHRDVQ
3018 static int const abs_insn[4] = {
3019 OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_VPABSQ
3022 TCGType type = vecl + TCG_TYPE_V64;
3024 TCGArg a0, a1, a2, a3;
3031 case INDEX_op_add_vec:
3032 insn = add_insn[vece];
3034 case INDEX_op_ssadd_vec:
3035 insn = ssadd_insn[vece];
3037 case INDEX_op_usadd_vec:
3038 insn = usadd_insn[vece];
3040 case INDEX_op_sub_vec:
3041 insn = sub_insn[vece];
3043 case INDEX_op_sssub_vec:
3044 insn = sssub_insn[vece];
3046 case INDEX_op_ussub_vec:
3047 insn = ussub_insn[vece];
3049 case INDEX_op_mul_vec:
3050 insn = mul_insn[vece];
3052 case INDEX_op_and_vec:
3055 case INDEX_op_or_vec:
3058 case INDEX_op_xor_vec:
3061 case INDEX_op_smin_vec:
3062 insn = smin_insn[vece];
3064 case INDEX_op_umin_vec:
3065 insn = umin_insn[vece];
3067 case INDEX_op_smax_vec:
3068 insn = smax_insn[vece];
3070 case INDEX_op_umax_vec:
3071 insn = umax_insn[vece];
3073 case INDEX_op_shlv_vec:
3074 insn = shlv_insn[vece];
3076 case INDEX_op_shrv_vec:
3077 insn = shrv_insn[vece];
3079 case INDEX_op_sarv_vec:
3080 insn = sarv_insn[vece];
3082 case INDEX_op_rotlv_vec:
3083 insn = rotlv_insn[vece];
3085 case INDEX_op_rotrv_vec:
3086 insn = rotrv_insn[vece];
3088 case INDEX_op_shls_vec:
3089 insn = shls_insn[vece];
3091 case INDEX_op_shrs_vec:
3092 insn = shrs_insn[vece];
3094 case INDEX_op_sars_vec:
3095 insn = sars_insn[vece];
3097 case INDEX_op_x86_punpckl_vec:
3098 insn = punpckl_insn[vece];
3100 case INDEX_op_x86_punpckh_vec:
3101 insn = punpckh_insn[vece];
3103 case INDEX_op_x86_packss_vec:
3104 insn = packss_insn[vece];
3106 case INDEX_op_x86_packus_vec:
3107 insn = packus_insn[vece];
3109 case INDEX_op_x86_vpshldv_vec:
3110 insn = vpshldv_insn[vece];
3114 case INDEX_op_x86_vpshrdv_vec:
3115 insn = vpshrdv_insn[vece];
3119 #if TCG_TARGET_REG_BITS == 32
3120 case INDEX_op_dup2_vec:
3121 /* First merge the two 32-bit inputs to a single 64-bit element. */
3122 tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
3123 /* Then replicate the 64-bit elements across the rest of the vector. */
3124 if (type != TCG_TYPE_V64) {
3125 tcg_out_dup_vec(s, type, MO_64, a0, a0);
3129 case INDEX_op_abs_vec:
3130 insn = abs_insn[vece];
3135 tcg_debug_assert(insn != OPC_UD2);
3136 if (type == TCG_TYPE_V256) {
3139 tcg_out_vex_modrm(s, insn, a0, a1, a2);
3142 case INDEX_op_cmp_vec:
3144 if (sub == TCG_COND_EQ) {
3145 insn = cmpeq_insn[vece];
3146 } else if (sub == TCG_COND_GT) {
3147 insn = cmpgt_insn[vece];
3149 g_assert_not_reached();
3153 case INDEX_op_andc_vec:
3155 if (type == TCG_TYPE_V256) {
3158 tcg_out_vex_modrm(s, insn, a0, a2, a1);
3161 case INDEX_op_shli_vec:
3162 insn = shift_imm_insn[vece];
3165 case INDEX_op_shri_vec:
3166 insn = shift_imm_insn[vece];
3169 case INDEX_op_sari_vec:
3170 if (vece == MO_64) {
3171 insn = OPC_PSHIFTD_Ib | P_VEXW | P_EVEX;
3173 insn = shift_imm_insn[vece];
3177 case INDEX_op_rotli_vec:
3178 insn = OPC_PSHIFTD_Ib | P_EVEX; /* VPROL[DQ] */
3179 if (vece == MO_64) {
3185 tcg_debug_assert(vece != MO_8);
3186 if (type == TCG_TYPE_V256) {
3189 tcg_out_vex_modrm(s, insn, sub, a0, a1);
3193 case INDEX_op_ld_vec:
3194 tcg_out_ld(s, type, a0, a1, a2);
3196 case INDEX_op_st_vec:
3197 tcg_out_st(s, type, a0, a1, a2);
3199 case INDEX_op_dupm_vec:
3200 tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
3203 case INDEX_op_x86_shufps_vec:
3207 case INDEX_op_x86_blend_vec:
3208 if (vece == MO_16) {
3210 } else if (vece == MO_32) {
3211 insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
3213 g_assert_not_reached();
3217 case INDEX_op_x86_vperm2i128_vec:
3218 insn = OPC_VPERM2I128;
3221 case INDEX_op_x86_vpshldi_vec:
3222 insn = vpshldi_insn[vece];
3226 case INDEX_op_not_vec:
3227 insn = OPC_VPTERNLOGQ;
3229 sub = 0x33; /* !B */
3231 case INDEX_op_nor_vec:
3232 insn = OPC_VPTERNLOGQ;
3233 sub = 0x11; /* norCB */
3235 case INDEX_op_nand_vec:
3236 insn = OPC_VPTERNLOGQ;
3237 sub = 0x77; /* nandCB */
3239 case INDEX_op_eqv_vec:
3240 insn = OPC_VPTERNLOGQ;
3241 sub = 0x99; /* xnorCB */
3243 case INDEX_op_orc_vec:
3244 insn = OPC_VPTERNLOGQ;
3245 sub = 0xdd; /* orB!C */
3248 case INDEX_op_bitsel_vec:
3249 insn = OPC_VPTERNLOGQ;
3254 sub = 0xca; /* A?B:C */
3255 } else if (a0 == a2) {
3257 sub = 0xe2; /* B?A:C */
3259 tcg_out_mov(s, type, a0, a3);
3260 sub = 0xb8; /* B?C:A */
3265 tcg_debug_assert(insn != OPC_UD2);
3266 if (type == TCG_TYPE_V256) {
3269 tcg_out_vex_modrm(s, insn, a0, a1, a2);
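/*
 * The VPTERNLOGQ immediates above follow from evaluating the desired
 * boolean function on the truth-table constants A = 0xf0 (operand 1),
 * B = 0xcc (operand 2) and C = 0xaa (operand 3), matching the short
 * comments on each case.  A few worked examples:
 *     !B        = ~0xcc                 = 0x33
 *     nor(C,B)  = ~(0xaa | 0xcc)        = 0x11
 *     nand(C,B) = ~(0xaa & 0xcc)        = 0x77
 *     xnor(C,B) = ~(0xaa ^ 0xcc)        = 0x99
 *     B | !C    = 0xcc | 0x55           = 0xdd
 *     A ? B : C = (A & B) | (~A & C)    = 0xc0 | 0x0a = 0xca
 */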
3273 case INDEX_op_x86_vpblendvb_vec:
3274 insn = OPC_VPBLENDVB;
3275 if (type == TCG_TYPE_V256) {
3278 tcg_out_vex_modrm(s, insn, a0, a1, a2);
3279 tcg_out8(s, args[3] << 4);
3282 case INDEX_op_x86_psrldq_vec:
3283 tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
3287 case INDEX_op_mov_vec: /* Always emitted via tcg_out_mov. */
3288 case INDEX_op_dup_vec: /* Always emitted via tcg_out_dup_vec. */
3290 g_assert_not_reached();
3294 static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
3297 case INDEX_op_goto_ptr:
3300 case INDEX_op_ld8u_i32:
3301 case INDEX_op_ld8u_i64:
3302 case INDEX_op_ld8s_i32:
3303 case INDEX_op_ld8s_i64:
3304 case INDEX_op_ld16u_i32:
3305 case INDEX_op_ld16u_i64:
3306 case INDEX_op_ld16s_i32:
3307 case INDEX_op_ld16s_i64:
3308 case INDEX_op_ld_i32:
3309 case INDEX_op_ld32u_i64:
3310 case INDEX_op_ld32s_i64:
3311 case INDEX_op_ld_i64:
3312 return C_O1_I1(r, r);
3314 case INDEX_op_st8_i32:
3315 case INDEX_op_st8_i64:
3316 return C_O0_I2(qi, r);
3318 case INDEX_op_st16_i32:
3319 case INDEX_op_st16_i64:
3320 case INDEX_op_st_i32:
3321 case INDEX_op_st32_i64:
3322 return C_O0_I2(ri, r);
3324 case INDEX_op_st_i64:
3325 return C_O0_I2(re, r);
3327 case INDEX_op_add_i32:
3328 case INDEX_op_add_i64:
3329 return C_O1_I2(r, r, re);
3331 case INDEX_op_sub_i32:
3332 case INDEX_op_sub_i64:
3333 case INDEX_op_mul_i32:
3334 case INDEX_op_mul_i64:
3335 case INDEX_op_or_i32:
3336 case INDEX_op_or_i64:
3337 case INDEX_op_xor_i32:
3338 case INDEX_op_xor_i64:
3339 return C_O1_I2(r, 0, re);
3341 case INDEX_op_and_i32:
3342 case INDEX_op_and_i64:
3343 return C_O1_I2(r, 0, reZ);
3345 case INDEX_op_andc_i32:
3346 case INDEX_op_andc_i64:
3347 return C_O1_I2(r, r, rI);
3349 case INDEX_op_shl_i32:
3350 case INDEX_op_shl_i64:
3351 case INDEX_op_shr_i32:
3352 case INDEX_op_shr_i64:
3353 case INDEX_op_sar_i32:
3354 case INDEX_op_sar_i64:
3355 return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);
3357 case INDEX_op_rotl_i32:
3358 case INDEX_op_rotl_i64:
3359 case INDEX_op_rotr_i32:
3360 case INDEX_op_rotr_i64:
3361 return C_O1_I2(r, 0, ci);
3363 case INDEX_op_brcond_i32:
3364 case INDEX_op_brcond_i64:
3365 return C_O0_I2(r, re);
3367 case INDEX_op_bswap16_i32:
3368 case INDEX_op_bswap16_i64:
3369 case INDEX_op_bswap32_i32:
3370 case INDEX_op_bswap32_i64:
3371 case INDEX_op_bswap64_i64:
3372 case INDEX_op_neg_i32:
3373 case INDEX_op_neg_i64:
3374 case INDEX_op_not_i32:
3375 case INDEX_op_not_i64:
3376 case INDEX_op_extrh_i64_i32:
3377 return C_O1_I1(r, 0);
3379 case INDEX_op_ext8s_i32:
3380 case INDEX_op_ext8s_i64:
3381 case INDEX_op_ext8u_i32:
3382 case INDEX_op_ext8u_i64:
3383 return C_O1_I1(r, q);
3385 case INDEX_op_ext16s_i32:
3386 case INDEX_op_ext16s_i64:
3387 case INDEX_op_ext16u_i32:
3388 case INDEX_op_ext16u_i64:
3389 case INDEX_op_ext32s_i64:
3390 case INDEX_op_ext32u_i64:
3391 case INDEX_op_ext_i32_i64:
3392 case INDEX_op_extu_i32_i64:
3393 case INDEX_op_extrl_i64_i32:
3394 case INDEX_op_extract_i32:
3395 case INDEX_op_extract_i64:
3396 case INDEX_op_sextract_i32:
3397 case INDEX_op_ctpop_i32:
3398 case INDEX_op_ctpop_i64:
3399 return C_O1_I1(r, r);
3401 case INDEX_op_extract2_i32:
3402 case INDEX_op_extract2_i64:
3403 return C_O1_I2(r, 0, r);
3405 case INDEX_op_deposit_i32:
3406 case INDEX_op_deposit_i64:
3407 return C_O1_I2(q, 0, qi);
3409 case INDEX_op_setcond_i32:
3410 case INDEX_op_setcond_i64:
3411 case INDEX_op_negsetcond_i32:
3412 case INDEX_op_negsetcond_i64:
3413 return C_O1_I2(q, r, re);
3415 case INDEX_op_movcond_i32:
3416 case INDEX_op_movcond_i64:
3417 return C_O1_I4(r, r, re, r, 0);
3419 case INDEX_op_div2_i32:
3420 case INDEX_op_div2_i64:
3421 case INDEX_op_divu2_i32:
3422 case INDEX_op_divu2_i64:
3423 return C_O2_I3(a, d, 0, 1, r);
3425 case INDEX_op_mulu2_i32:
3426 case INDEX_op_mulu2_i64:
3427 case INDEX_op_muls2_i32:
3428 case INDEX_op_muls2_i64:
3429 return C_O2_I2(a, d, a, r);
3431 case INDEX_op_add2_i32:
3432 case INDEX_op_add2_i64:
3433 case INDEX_op_sub2_i32:
3434 case INDEX_op_sub2_i64:
3435 return C_N1_O1_I4(r, r, 0, 1, re, re);
3437 case INDEX_op_ctz_i32:
3438 case INDEX_op_ctz_i64:
3439 return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3441 case INDEX_op_clz_i32:
3442 case INDEX_op_clz_i64:
3443 return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3445 case INDEX_op_qemu_ld_a32_i32:
3446 return C_O1_I1(r, L);
3447 case INDEX_op_qemu_ld_a64_i32:
3448 return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O1_I2(r, L, L);
3450 case INDEX_op_qemu_st_a32_i32:
3451 return C_O0_I2(L, L);
3452 case INDEX_op_qemu_st_a64_i32:
3453 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3454 case INDEX_op_qemu_st8_a32_i32:
3455 return C_O0_I2(s, L);
3456 case INDEX_op_qemu_st8_a64_i32:
3457 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(s, L) : C_O0_I3(s, L, L);
3459 case INDEX_op_qemu_ld_a32_i64:
3460 return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I1(r, r, L);
3461 case INDEX_op_qemu_ld_a64_i64:
3462 return TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L) : C_O2_I2(r, r, L, L);
3464 case INDEX_op_qemu_st_a32_i64:
3465 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I3(L, L, L);
3466 case INDEX_op_qemu_st_a64_i64:
3467 return TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L) : C_O0_I4(L, L, L, L);
3469 case INDEX_op_qemu_ld_a32_i128:
3470 case INDEX_op_qemu_ld_a64_i128:
3471 tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
3472 return C_O2_I1(r, r, L);
3473 case INDEX_op_qemu_st_a32_i128:
3474 case INDEX_op_qemu_st_a64_i128:
3475 tcg_debug_assert(TCG_TARGET_REG_BITS == 64);
3476 return C_O0_I3(L, L, L);
3478 case INDEX_op_brcond2_i32:
3479 return C_O0_I4(r, r, ri, ri);
3481 case INDEX_op_setcond2_i32:
3482 return C_O1_I4(r, r, r, ri, ri);
3484 case INDEX_op_ld_vec:
3485 case INDEX_op_dupm_vec:
3486 return C_O1_I1(x, r);
3488 case INDEX_op_st_vec:
3489 return C_O0_I2(x, r);
3491 case INDEX_op_add_vec:
3492 case INDEX_op_sub_vec:
3493 case INDEX_op_mul_vec:
3494 case INDEX_op_and_vec:
3495 case INDEX_op_or_vec:
3496 case INDEX_op_xor_vec:
3497 case INDEX_op_andc_vec:
3498 case INDEX_op_orc_vec:
3499 case INDEX_op_nand_vec:
3500 case INDEX_op_nor_vec:
3501 case INDEX_op_eqv_vec:
3502 case INDEX_op_ssadd_vec:
3503 case INDEX_op_usadd_vec:
3504 case INDEX_op_sssub_vec:
3505 case INDEX_op_ussub_vec:
3506 case INDEX_op_smin_vec:
3507 case INDEX_op_umin_vec:
3508 case INDEX_op_smax_vec:
3509 case INDEX_op_umax_vec:
3510 case INDEX_op_shlv_vec:
3511 case INDEX_op_shrv_vec:
3512 case INDEX_op_sarv_vec:
3513 case INDEX_op_rotlv_vec:
3514 case INDEX_op_rotrv_vec:
3515 case INDEX_op_shls_vec:
3516 case INDEX_op_shrs_vec:
3517 case INDEX_op_sars_vec:
3518 case INDEX_op_cmp_vec:
3519 case INDEX_op_x86_shufps_vec:
3520 case INDEX_op_x86_blend_vec:
3521 case INDEX_op_x86_packss_vec:
3522 case INDEX_op_x86_packus_vec:
3523 case INDEX_op_x86_vperm2i128_vec:
3524 case INDEX_op_x86_punpckl_vec:
3525 case INDEX_op_x86_punpckh_vec:
3526 case INDEX_op_x86_vpshldi_vec:
3527 #if TCG_TARGET_REG_BITS == 32
3528 case INDEX_op_dup2_vec:
3530 return C_O1_I2(x, x, x);
3532 case INDEX_op_abs_vec:
3533 case INDEX_op_dup_vec:
3534 case INDEX_op_not_vec:
3535 case INDEX_op_shli_vec:
3536 case INDEX_op_shri_vec:
3537 case INDEX_op_sari_vec:
3538 case INDEX_op_rotli_vec:
3539 case INDEX_op_x86_psrldq_vec:
3540 return C_O1_I1(x, x);
3542 case INDEX_op_x86_vpshldv_vec:
3543 case INDEX_op_x86_vpshrdv_vec:
3544 return C_O1_I3(x, 0, x, x);
3546 case INDEX_op_bitsel_vec:
3547 case INDEX_op_x86_vpblendvb_vec:
3548 return C_O1_I3(x, x, x, x);
3551 g_assert_not_reached();
3555 int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3558 case INDEX_op_add_vec:
3559 case INDEX_op_sub_vec:
3560 case INDEX_op_and_vec:
3561 case INDEX_op_or_vec:
3562 case INDEX_op_xor_vec:
3563 case INDEX_op_andc_vec:
3564 case INDEX_op_orc_vec:
3565 case INDEX_op_nand_vec:
3566 case INDEX_op_nor_vec:
3567 case INDEX_op_eqv_vec:
3568 case INDEX_op_not_vec:
3569 case INDEX_op_bitsel_vec:
3571 case INDEX_op_cmp_vec:
3572 case INDEX_op_cmpsel_vec:
3575 case INDEX_op_rotli_vec:
3576 return have_avx512vl && vece >= MO_32 ? 1 : -1;
3578 case INDEX_op_shli_vec:
3579 case INDEX_op_shri_vec:
3580 /* We must expand the operation for MO_8. */
3581 return vece == MO_8 ? -1 : 1;
3583 case INDEX_op_sari_vec:
3591 if (have_avx512vl) {
3595 * We can emulate this for MO_64, but it does not pay off
3596 * unless we're producing at least 4 values.
3598 return type >= TCG_TYPE_V256 ? -1 : 0;
3602 case INDEX_op_shls_vec:
3603 case INDEX_op_shrs_vec:
3604 return vece >= MO_16;
3605 case INDEX_op_sars_vec:
3611 return have_avx512vl;
3614 case INDEX_op_rotls_vec:
3615 return vece >= MO_16 ? -1 : 0;
3617 case INDEX_op_shlv_vec:
3618 case INDEX_op_shrv_vec:
3621 return have_avx512bw;
3627 case INDEX_op_sarv_vec:
3630 return have_avx512bw;
3634 return have_avx512vl;
3637 case INDEX_op_rotlv_vec:
3638 case INDEX_op_rotrv_vec:
3641 return have_avx512vbmi2 ? -1 : 0;
3644 return have_avx512vl ? 1 : have_avx2 ? -1 : 0;
3648 case INDEX_op_mul_vec:
3653 return have_avx512dq;
3657 case INDEX_op_ssadd_vec:
3658 case INDEX_op_usadd_vec:
3659 case INDEX_op_sssub_vec:
3660 case INDEX_op_ussub_vec:
3661 return vece <= MO_16;
3662 case INDEX_op_smin_vec:
3663 case INDEX_op_smax_vec:
3664 case INDEX_op_umin_vec:
3665 case INDEX_op_umax_vec:
3666 case INDEX_op_abs_vec:
3667 return vece <= MO_32 || have_avx512vl;
3674 static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
3675 TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3679 tcg_debug_assert(vece == MO_8);
3681 t1 = tcg_temp_new_vec(type);
3682 t2 = tcg_temp_new_vec(type);
3685 * Unpack to W, shift, and repack. Tricky bits:
3686 * (1) Use punpck*bw x,x to produce DDCCBBAA,
3687 * i.e. duplicate in other half of the 16-bit lane.
3688 * (2) For right-shift, add 8 so that the high half of the lane
3689 * becomes zero. For left-shift, and left-rotate, we must
3690 * shift up and down again.
3691 * (3) Step 2 leaves high half zero such that PACKUSWB
3692 * (pack with unsigned saturation) does not modify
3695 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3696 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3697 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3698 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3700 if (opc != INDEX_op_rotli_vec) {
3703 if (opc == INDEX_op_shri_vec) {
3704 tcg_gen_shri_vec(MO_16, t1, t1, imm);
3705 tcg_gen_shri_vec(MO_16, t2, t2, imm);
3707 tcg_gen_shli_vec(MO_16, t1, t1, imm);
3708 tcg_gen_shli_vec(MO_16, t2, t2, imm);
3709 tcg_gen_shri_vec(MO_16, t1, t1, 8);
3710 tcg_gen_shri_vec(MO_16, t2, t2, 8);
3713 vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3714 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3715 tcg_temp_free_vec(t1);
3716 tcg_temp_free_vec(t2);
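/*
 * One-lane illustration of the expansion above, with an example value:
 * right-shifting the byte 0x9c by 3 first produces the word 0x9c9c via
 * punpck*bw x,x, the shift by 3 + 8 = 11 yields 0x0013, and PACKUSWB
 * then selects the low byte, 0x13 == 0x9c >> 3.
 */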
3719 static void expand_vec_sari(TCGType type, unsigned vece,
3720 TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3726 /* Unpack to W, shift, and repack, as in expand_vec_shi. */
3727 t1 = tcg_temp_new_vec(type);
3728 t2 = tcg_temp_new_vec(type);
3729 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3730 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3731 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3732 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3733 tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3734 tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3735 vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3736 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3737 tcg_temp_free_vec(t1);
3738 tcg_temp_free_vec(t2);
3742 t1 = tcg_temp_new_vec(type);
3745 * We can emulate a small sign extend by performing an arithmetic
3746 * 32-bit shift and overwriting the high half of a 64-bit logical
3747 * shift. Note that the ISA says shift of 32 is valid, but TCG
3748 * does not, so we have to bound the smaller shift -- we get the
3749 * same result in the high half either way.
3751 tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
3752 tcg_gen_shri_vec(MO_64, v0, v1, imm);
3753 vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3754 tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3755 tcgv_vec_arg(t1), 0xaa);
3757 /* Otherwise we will need to use a compare vs 0 to produce
3758 * the sign-extend, shift and merge.
3760 tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1,
3761 tcg_constant_vec(type, MO_64, 0), v1);
3762 tcg_gen_shri_vec(MO_64, v0, v1, imm);
3763 tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3764 tcg_gen_or_vec(MO_64, v0, v0, t1);
3766 tcg_temp_free_vec(t1);
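/*
 * Example for the blend path above: shifting 0x8000000000000010 right
 * by 8 arithmetically should give 0xff80000000000000.  The 64-bit
 * logical shift produces 0x0080000000000000, the 32-bit arithmetic
 * shift turns the high dword 0x80000000 into 0xff800000, and the 0xaa
 * blend keeps that high dword, giving the expected result.
 */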
3770 g_assert_not_reached();
3774 static void expand_vec_rotli(TCGType type, unsigned vece,
3775 TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3780 expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
3784 if (have_avx512vbmi2) {
3785 vec_gen_4(INDEX_op_x86_vpshldi_vec, type, vece,
3786 tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v1), imm);
3790 t = tcg_temp_new_vec(type);
3791 tcg_gen_shli_vec(vece, t, v1, imm);
3792 tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
3793 tcg_gen_or_vec(vece, v0, v0, t);
3794 tcg_temp_free_vec(t);
3797 static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
3798 TCGv_vec v1, TCGv_vec sh, bool right)
3802 if (have_avx512vbmi2) {
3803 vec_gen_4(right ? INDEX_op_x86_vpshrdv_vec : INDEX_op_x86_vpshldv_vec,
3804 type, vece, tcgv_vec_arg(v0), tcgv_vec_arg(v1),
3805 tcgv_vec_arg(v1), tcgv_vec_arg(sh));
3809 t = tcg_temp_new_vec(type);
3810 tcg_gen_dupi_vec(vece, t, 8 << vece);
3811 tcg_gen_sub_vec(vece, t, t, sh);
3813 tcg_gen_shlv_vec(vece, t, v1, t);
3814 tcg_gen_shrv_vec(vece, v0, v1, sh);
3816 tcg_gen_shrv_vec(vece, t, v1, t);
3817 tcg_gen_shlv_vec(vece, v0, v1, sh);
3819 tcg_gen_or_vec(vece, v0, v0, t);
3820 tcg_temp_free_vec(t);
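/*
 * With element width w = 8 << vece, the expansion above computes
 * rotl(v1, sh) as (v1 << sh) | (v1 >> (w - sh)), and the mirror image
 * for rotr.  The x86 variable vector shifts yield 0 for counts >= w,
 * so the sh == 0 case still produces the correct result.
 */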
3823 static void expand_vec_rotls(TCGType type, unsigned vece,
3824 TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
3826 TCGv_vec t = tcg_temp_new_vec(type);
3828 tcg_debug_assert(vece != MO_8);
3830 if (vece >= MO_32 ? have_avx512vl : have_avx512vbmi2) {
3831 tcg_gen_dup_i32_vec(vece, t, lsh);
3832 if (vece >= MO_32) {
3833 tcg_gen_rotlv_vec(vece, v0, v1, t);
3835 expand_vec_rotv(type, vece, v0, v1, t, false);
3838 TCGv_i32 rsh = tcg_temp_new_i32();
3840 tcg_gen_neg_i32(rsh, lsh);
3841 tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
3842 tcg_gen_shls_vec(vece, t, v1, lsh);
3843 tcg_gen_shrs_vec(vece, v0, v1, rsh);
3844 tcg_gen_or_vec(vece, v0, v0, t);
3846 tcg_temp_free_i32(rsh);
3849 tcg_temp_free_vec(t);
3852 static void expand_vec_mul(TCGType type, unsigned vece,
3853 TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3855 TCGv_vec t1, t2, t3, t4, zero;
3857 tcg_debug_assert(vece == MO_8);
3860 * Unpack v1 bytes to words, 0 | x.
3861 * Unpack v2 bytes to words, y | 0.
3862 * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3863 * Shift logical right by 8 bits to clear the high 8 bits before
3864 * using an unsigned saturated pack.
3866 * The difference between the V64, V128 and V256 cases is merely how
3867 * we distribute the expansion between temporaries.
3871 t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3872 t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3873 zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3874 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3875 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3876 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3877 tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3878 tcg_gen_mul_vec(MO_16, t1, t1, t2);
3879 tcg_gen_shri_vec(MO_16, t1, t1, 8);
3880 vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3881 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3882 tcg_temp_free_vec(t1);
3883 tcg_temp_free_vec(t2);
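/*
 * Single-lane illustration with example values: for x = 3 and y = 5,
 * the unpacks above give the words 0x0003 and 0x0500, the 16-bit
 * multiply yields 0x0f00, the shift right by 8 leaves 0x000f, and the
 * unsigned-saturating pack stores the byte 0x0f == (3 * 5) & 0xff.
 */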
3888 t1 = tcg_temp_new_vec(type);
3889 t2 = tcg_temp_new_vec(type);
3890 t3 = tcg_temp_new_vec(type);
3891 t4 = tcg_temp_new_vec(type);
3892 zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3893 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3894 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3895 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3896 tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3897 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3898 tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3899 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3900 tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3901 tcg_gen_mul_vec(MO_16, t1, t1, t2);
3902 tcg_gen_mul_vec(MO_16, t3, t3, t4);
3903 tcg_gen_shri_vec(MO_16, t1, t1, 8);
3904 tcg_gen_shri_vec(MO_16, t3, t3, 8);
3905 vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3906 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3907 tcg_temp_free_vec(t1);
3908 tcg_temp_free_vec(t2);
3909 tcg_temp_free_vec(t3);
3910 tcg_temp_free_vec(t4);
3914 g_assert_not_reached();
3918 static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
3919 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3928 TCGv_vec t1, t2, t3;
3944 fixup = NEED_SWAP | NEED_INV;
3947 if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
3950 fixup = NEED_BIAS | NEED_INV;
3954 if (tcg_can_emit_vec_op(INDEX_op_umin_vec, type, vece)) {
3955 fixup = NEED_UMIN | NEED_INV;
3961 if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
3964 fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
3968 if (tcg_can_emit_vec_op(INDEX_op_umax_vec, type, vece)) {
3969 fixup = NEED_UMAX | NEED_INV;
3971 fixup = NEED_BIAS | NEED_SWAP;
3975 g_assert_not_reached();
3978 if (fixup & NEED_INV) {
3979 cond = tcg_invert_cond(cond);
3981 if (fixup & NEED_SWAP) {
3982 t1 = v1, v1 = v2, v2 = t1;
3983 cond = tcg_swap_cond(cond);
3987 if (fixup & (NEED_UMIN | NEED_UMAX)) {
3988 t1 = tcg_temp_new_vec(type);
3989 if (fixup & NEED_UMIN) {
3990 tcg_gen_umin_vec(vece, t1, v1, v2);
3992 tcg_gen_umax_vec(vece, t1, v1, v2);
3996 } else if (fixup & NEED_BIAS) {
3997 t1 = tcg_temp_new_vec(type);
3998 t2 = tcg_temp_new_vec(type);
3999 t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
4000 tcg_gen_sub_vec(vece, t1, v1, t3);
4001 tcg_gen_sub_vec(vece, t2, v2, t3);
4004 cond = tcg_signed_cond(cond);
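/*
 * The bias turns an unsigned comparison into a signed one: e.g. for
 * MO_8, x <u y holds exactly when (x - 0x80) <s (y - 0x80), since
 * subtracting the sign bit re-maps 0x00..0xff onto -128..127 while
 * preserving the unsigned order.
 */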
4007 tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
4008 /* Expand directly; do not recurse. */
4009 vec_gen_4(INDEX_op_cmp_vec, type, vece,
4010 tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
4013 tcg_temp_free_vec(t1);
4015 tcg_temp_free_vec(t2);
4018 return fixup & NEED_INV;
4021 static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
4022 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
4024 if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
4025 tcg_gen_not_vec(vece, v0, v0);
4029 static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
4030 TCGv_vec c1, TCGv_vec c2,
4031 TCGv_vec v3, TCGv_vec v4, TCGCond cond)
4033 TCGv_vec t = tcg_temp_new_vec(type);
4035 if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
4036 /* Invert the sense of the compare by swapping arguments. */
4038 x = v3, v3 = v4, v4 = x;
4040 vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
4041 tcgv_vec_arg(v0), tcgv_vec_arg(v4),
4042 tcgv_vec_arg(v3), tcgv_vec_arg(t));
4043 tcg_temp_free_vec(t);
4046 void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
4051 TCGv_vec v0, v1, v2, v3, v4;
4054 v0 = temp_tcgv_vec(arg_temp(a0));
4055 v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4056 a2 = va_arg(va, TCGArg);
4059 case INDEX_op_shli_vec:
4060 case INDEX_op_shri_vec:
4061 expand_vec_shi(type, vece, opc, v0, v1, a2);
4064 case INDEX_op_sari_vec:
4065 expand_vec_sari(type, vece, v0, v1, a2);
4068 case INDEX_op_rotli_vec:
4069 expand_vec_rotli(type, vece, v0, v1, a2);
4072 case INDEX_op_rotls_vec:
4073 expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
4076 case INDEX_op_rotlv_vec:
4077 v2 = temp_tcgv_vec(arg_temp(a2));
4078 expand_vec_rotv(type, vece, v0, v1, v2, false);
4080 case INDEX_op_rotrv_vec:
4081 v2 = temp_tcgv_vec(arg_temp(a2));
4082 expand_vec_rotv(type, vece, v0, v1, v2, true);
4085 case INDEX_op_mul_vec:
4086 v2 = temp_tcgv_vec(arg_temp(a2));
4087 expand_vec_mul(type, vece, v0, v1, v2);
4090 case INDEX_op_cmp_vec:
4091 v2 = temp_tcgv_vec(arg_temp(a2));
4092 expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
4095 case INDEX_op_cmpsel_vec:
4096 v2 = temp_tcgv_vec(arg_temp(a2));
4097 v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4098 v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
4099 expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
4109 static const int tcg_target_callee_save_regs[] = {
4110 #if TCG_TARGET_REG_BITS == 64
4119 TCG_REG_R14, /* Currently used for the global env. */
4122 TCG_REG_EBP, /* Currently used for the global env. */
4129 /* Compute frame size via macros, to share between tcg_target_qemu_prologue
4130 and tcg_register_jit. */
4133 ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
4134 * (TCG_TARGET_REG_BITS / 8))
4136 #define FRAME_SIZE \
4138 + TCG_STATIC_CALL_ARGS_SIZE \
4139 + CPU_TEMP_BUF_NLONGS * sizeof(long) \
4140 + TCG_TARGET_STACK_ALIGN - 1) \
4141 & ~(TCG_TARGET_STACK_ALIGN - 1))
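/*
 * Rough worked example, assuming the usual values of
 * TCG_STATIC_CALL_ARGS_SIZE == 128 and CPU_TEMP_BUF_NLONGS == 128:
 * on non-Win64 x86_64 there are six callee-saved registers, so
 * PUSH_SIZE = (1 + 6) * 8 = 56 (the +1 covers the return address), and
 * FRAME_SIZE = (56 + 128 + 128 * 8 + 15) & ~15 = 1216.
 */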
4143 /* Generate global QEMU prologue and epilogue code */
4144 static void tcg_target_qemu_prologue(TCGContext *s)
4146 int i, stack_addend;
4150 /* Reserve some stack space, also for TCG temps. */
4151 stack_addend = FRAME_SIZE - PUSH_SIZE;
4152 tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
4153 CPU_TEMP_BUF_NLONGS * sizeof(long));
4155 /* Save all callee saved registers. */
4156 for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
4157 tcg_out_push(s, tcg_target_callee_save_regs[i]);
4160 if (!tcg_use_softmmu && guest_base) {
4161 int seg = setup_guest_base_seg();
4163 x86_guest_base.seg = seg;
4164 } else if (guest_base == (int32_t)guest_base) {
4165 x86_guest_base.ofs = guest_base;
4167 assert(TCG_TARGET_REG_BITS == 64);
4168 /* Choose R12 because, as a base, it requires a SIB byte. */
4169 x86_guest_base.index = TCG_REG_R12;
4170 tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base.index, guest_base);
4171 tcg_regset_set_reg(s->reserved_regs, x86_guest_base.index);
4175 if (TCG_TARGET_REG_BITS == 32) {
4176 tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
4177 (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
4178 tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4180 tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
4181 (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
4184 tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
4185 tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
4187 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
4191 * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
4192 * and fall through to the rest of the epilogue.
4194 tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
4195 tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
4198 tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
4200 tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
4203 tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
4205 for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
4206 tcg_out_pop(s, tcg_target_callee_save_regs[i]);
4208 tcg_out_opc(s, OPC_RET, 0, 0, 0);
4211 static void tcg_out_tb_start(TCGContext *s)
4216 static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
4218 memset(p, 0x90, count);
4221 static void tcg_target_init(TCGContext *s)
4223 tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
4224 if (TCG_TARGET_REG_BITS == 64) {
4225 tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
4228 tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
4229 tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
4232 tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
4235 tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
4236 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
4237 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
4238 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
4239 if (TCG_TARGET_REG_BITS == 64) {
4240 #if !defined(_WIN64)
4241 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
4242 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
4244 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
4245 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
4246 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
4247 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
4250 s->reserved_regs = 0;
4251 tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
4252 tcg_regset_set_reg(s->reserved_regs, TCG_TMP_VEC);
4254 /* These are call saved, and we don't save them, so don't use them. */
4255 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM6);
4256 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM7);
4257 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM8);
4258 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM9);
4259 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM10);
4260 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM11);
4261 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM12);
4262 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM13);
4263 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM14);
4264 tcg_regset_set_reg(s->reserved_regs, TCG_REG_XMM15);
4270 uint8_t fde_def_cfa[4];
4271 uint8_t fde_reg_ofs[14];
4274 /* We're expecting a 2 byte uleb128 encoded value. */
4275 QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
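/*
 * For instance, a FRAME_SIZE of 1216 (0x4c0) would encode as the two
 * bytes 0xc0 0x09: the low seven bits (0x40) with the continuation bit
 * set, followed by 1216 >> 7 == 9.  (1216 is only an example; the real
 * size depends on the build configuration.)
 */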
4277 #if !defined(__ELF__)
4278 /* Host machine without ELF. */
4279 #elif TCG_TARGET_REG_BITS == 64
4280 #define ELF_HOST_MACHINE EM_X86_64
4281 static const DebugFrame debug_frame = {
4282 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4285 .h.cie.code_align = 1,
4286 .h.cie.data_align = 0x78, /* sleb128 -8 */
4287 .h.cie.return_column = 16,
4289 /* Total FDE size does not include the "len" member. */
4290 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4293 12, 7, /* DW_CFA_def_cfa %rsp, ... */
4294 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */
4298 0x90, 1, /* DW_CFA_offset, %rip, -8 */
4299 /* The following ordering must match tcg_target_callee_save_regs. */
4300 0x86, 2, /* DW_CFA_offset, %rbp, -16 */
4301 0x83, 3, /* DW_CFA_offset, %rbx, -24 */
4302 0x8c, 4, /* DW_CFA_offset, %r12, -32 */
4303 0x8d, 5, /* DW_CFA_offset, %r13, -40 */
4304 0x8e, 6, /* DW_CFA_offset, %r14, -48 */
4305 0x8f, 7, /* DW_CFA_offset, %r15, -56 */
4309 #define ELF_HOST_MACHINE EM_386
4310 static const DebugFrame debug_frame = {
4311 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
4314 .h.cie.code_align = 1,
4315 .h.cie.data_align = 0x7c, /* sleb128 -4 */
4316 .h.cie.return_column = 8,
4318 /* Total FDE size does not include the "len" member. */
4319 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
4322 12, 4, /* DW_CFA_def_cfa %esp, ... */
4323 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */
4327 0x88, 1, /* DW_CFA_offset, %eip, -4 */
4328 /* The following ordering must match tcg_target_callee_save_regs. */
4329 0x85, 2, /* DW_CFA_offset, %ebp, -8 */
4330 0x83, 3, /* DW_CFA_offset, %ebx, -12 */
4331 0x86, 4, /* DW_CFA_offset, %esi, -16 */
4332 0x87, 5, /* DW_CFA_offset, %edi, -20 */
4337 #if defined(ELF_HOST_MACHINE)
4338 void tcg_register_jit(const void *buf, size_t buf_size)
4340 tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));