tcg/i386/tcg-target.inc.c
1 /*
2 * Tiny Code Generator for QEMU
3 *
4 * Copyright (c) 2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25 #include "tcg-be-ldst.h"
26
27 #ifdef CONFIG_DEBUG_TCG
28 static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
29 #if TCG_TARGET_REG_BITS == 64
30 "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
31 "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
32 #else
33 "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
34 #endif
35 };
36 #endif
37
38 static const int tcg_target_reg_alloc_order[] = {
39 #if TCG_TARGET_REG_BITS == 64
40 TCG_REG_RBP,
41 TCG_REG_RBX,
42 TCG_REG_R12,
43 TCG_REG_R13,
44 TCG_REG_R14,
45 TCG_REG_R15,
46 TCG_REG_R10,
47 TCG_REG_R11,
48 TCG_REG_R9,
49 TCG_REG_R8,
50 TCG_REG_RCX,
51 TCG_REG_RDX,
52 TCG_REG_RSI,
53 TCG_REG_RDI,
54 TCG_REG_RAX,
55 #else
56 TCG_REG_EBX,
57 TCG_REG_ESI,
58 TCG_REG_EDI,
59 TCG_REG_EBP,
60 TCG_REG_ECX,
61 TCG_REG_EDX,
62 TCG_REG_EAX,
63 #endif
64 };
65
66 static const int tcg_target_call_iarg_regs[] = {
67 #if TCG_TARGET_REG_BITS == 64
68 #if defined(_WIN64)
69 TCG_REG_RCX,
70 TCG_REG_RDX,
71 #else
72 TCG_REG_RDI,
73 TCG_REG_RSI,
74 TCG_REG_RDX,
75 TCG_REG_RCX,
76 #endif
77 TCG_REG_R8,
78 TCG_REG_R9,
79 #else
80 /* 32 bit mode uses stack based calling convention (GCC default). */
81 #endif
82 };
83
84 static const int tcg_target_call_oarg_regs[] = {
85 TCG_REG_EAX,
86 #if TCG_TARGET_REG_BITS == 32
87 TCG_REG_EDX
88 #endif
89 };
90
91 /* Constants we accept. */
92 #define TCG_CT_CONST_S32 0x100
93 #define TCG_CT_CONST_U32 0x200
94 #define TCG_CT_CONST_I32 0x400
95 #define TCG_CT_CONST_WSZ 0x800
96
97 /* Registers used with L constraint, which are the first argument
98 registers on x86_64, and two random call clobbered registers on
99 i386. */
100 #if TCG_TARGET_REG_BITS == 64
101 # define TCG_REG_L0 tcg_target_call_iarg_regs[0]
102 # define TCG_REG_L1 tcg_target_call_iarg_regs[1]
103 #else
104 # define TCG_REG_L0 TCG_REG_EAX
105 # define TCG_REG_L1 TCG_REG_EDX
106 #endif
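/* Concretely, per the definitions above: %rdi/%rsi on SysV x86-64,
   %rcx/%rdx on Win64, and %eax/%edx on i386. */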
107
108 /* The host compiler should supply <cpuid.h> to enable runtime feature
109 detection, as we're not going to go so far as our own inline assembly.
110 If not available, default values will be assumed. */
111 #if defined(CONFIG_CPUID_H)
112 #include <cpuid.h>
113 #endif
114
115 /* For 32-bit, we are going to attempt to determine at runtime whether cmov
116 is available. */
117 #if TCG_TARGET_REG_BITS == 64
118 # define have_cmov 1
119 #elif defined(CONFIG_CPUID_H) && defined(bit_CMOV)
120 static bool have_cmov;
121 #else
122 # define have_cmov 0
123 #endif
124
125 /* If bit_MOVBE is defined in cpuid.h (added in GCC version 4.6), we are
126 going to attempt to determine at runtime whether movbe is available. */
127 #if defined(CONFIG_CPUID_H) && defined(bit_MOVBE)
128 static bool have_movbe;
129 #else
130 # define have_movbe 0
131 #endif
132
133 /* We need this symbol in tcg-target.h, and we can't properly conditionalize
134 it there. Therefore we always define the variable. */
135 bool have_bmi1;
136
137 #if defined(CONFIG_CPUID_H) && defined(bit_BMI2)
138 static bool have_bmi2;
139 #else
140 # define have_bmi2 0
141 #endif
142 #if defined(CONFIG_CPUID_H) && defined(bit_LZCNT)
143 static bool have_lzcnt;
144 #else
145 # define have_lzcnt 0
146 #endif
147
148 static tcg_insn_unit *tb_ret_addr;
149
150 static void patch_reloc(tcg_insn_unit *code_ptr, int type,
151 intptr_t value, intptr_t addend)
152 {
153 value += addend;
154 switch(type) {
155 case R_386_PC32:
156 value -= (uintptr_t)code_ptr;
157 if (value != (int32_t)value) {
158 tcg_abort();
159 }
160 tcg_patch32(code_ptr, value);
161 break;
162 case R_386_PC8:
163 value -= (uintptr_t)code_ptr;
164 if (value != (int8_t)value) {
165 tcg_abort();
166 }
167 tcg_patch8(code_ptr, value);
168 break;
169 default:
170 tcg_abort();
171 }
172 }
173
174 /* parse target specific constraints */
175 static const char *target_parse_constraint(TCGArgConstraint *ct,
176 const char *ct_str, TCGType type)
177 {
178 switch(*ct_str++) {
179 case 'a':
180 ct->ct |= TCG_CT_REG;
181 tcg_regset_set_reg(ct->u.regs, TCG_REG_EAX);
182 break;
183 case 'b':
184 ct->ct |= TCG_CT_REG;
185 tcg_regset_set_reg(ct->u.regs, TCG_REG_EBX);
186 break;
187 case 'c':
188 ct->ct |= TCG_CT_REG;
189 tcg_regset_set_reg(ct->u.regs, TCG_REG_ECX);
190 break;
191 case 'd':
192 ct->ct |= TCG_CT_REG;
193 tcg_regset_set_reg(ct->u.regs, TCG_REG_EDX);
194 break;
195 case 'S':
196 ct->ct |= TCG_CT_REG;
197 tcg_regset_set_reg(ct->u.regs, TCG_REG_ESI);
198 break;
199 case 'D':
200 ct->ct |= TCG_CT_REG;
201 tcg_regset_set_reg(ct->u.regs, TCG_REG_EDI);
202 break;
203 case 'q':
204 ct->ct |= TCG_CT_REG;
205 if (TCG_TARGET_REG_BITS == 64) {
206 tcg_regset_set32(ct->u.regs, 0, 0xffff);
207 } else {
208 tcg_regset_set32(ct->u.regs, 0, 0xf);
209 }
210 break;
211 case 'Q':
212 ct->ct |= TCG_CT_REG;
213 tcg_regset_set32(ct->u.regs, 0, 0xf);
214 break;
215 case 'r':
216 ct->ct |= TCG_CT_REG;
217 if (TCG_TARGET_REG_BITS == 64) {
218 tcg_regset_set32(ct->u.regs, 0, 0xffff);
219 } else {
220 tcg_regset_set32(ct->u.regs, 0, 0xff);
221 }
222 break;
223 case 'W':
224 /* With TZCNT/LZCNT, we can have operand-size as an input. */
225 ct->ct |= TCG_CT_CONST_WSZ;
226 break;
227
228 /* qemu_ld/st address constraint */
229 case 'L':
230 ct->ct |= TCG_CT_REG;
231 if (TCG_TARGET_REG_BITS == 64) {
232 tcg_regset_set32(ct->u.regs, 0, 0xffff);
233 } else {
234 tcg_regset_set32(ct->u.regs, 0, 0xff);
235 }
236 tcg_regset_reset_reg(ct->u.regs, TCG_REG_L0);
237 tcg_regset_reset_reg(ct->u.regs, TCG_REG_L1);
238 break;
239
240 case 'e':
241 ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_S32);
242 break;
243 case 'Z':
244 ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_U32);
245 break;
246 case 'I':
247 ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_I32);
248 break;
249
250 default:
251 return NULL;
252 }
253 return ct_str;
254 }
255
256 /* test if a constant matches the constraint */
257 static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
258 const TCGArgConstraint *arg_ct)
259 {
260 int ct = arg_ct->ct;
261 if (ct & TCG_CT_CONST) {
262 return 1;
263 }
264 if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
265 return 1;
266 }
267 if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
268 return 1;
269 }
270 if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
271 return 1;
272 }
273 if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
274 return 1;
275 }
276 return 0;
277 }
278
279 #if TCG_TARGET_REG_BITS == 64
280 # define LOWREGMASK(x) ((x) & 7)
281 #else
282 # define LOWREGMASK(x) (x)
283 #endif
284
285 #define P_EXT 0x100 /* 0x0f opcode prefix */
286 #define P_EXT38 0x200 /* 0x0f 0x38 opcode prefix */
287 #define P_DATA16 0x400 /* 0x66 opcode prefix */
288 #if TCG_TARGET_REG_BITS == 64
289 # define P_ADDR32 0x800 /* 0x67 opcode prefix */
290 # define P_REXW 0x1000 /* Set REX.W = 1 */
291 # define P_REXB_R 0x2000 /* REG field as byte register */
292 # define P_REXB_RM 0x4000 /* R/M field as byte register */
293 # define P_GS 0x8000 /* gs segment override */
294 #else
295 # define P_ADDR32 0
296 # define P_REXW 0
297 # define P_REXB_R 0
298 # define P_REXB_RM 0
299 # define P_GS 0
300 #endif
301 #define P_SIMDF3 0x10000 /* 0xf3 opcode prefix */
302 #define P_SIMDF2 0x20000 /* 0xf2 opcode prefix */
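/* All of the P_* flags sit above the low opcode byte, so they can be OR'd
   directly into the OPC_* values below; tcg_out_opc examines them and emits
   only the low byte as the opcode itself. */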
303
304 #define OPC_ARITH_EvIz (0x81)
305 #define OPC_ARITH_EvIb (0x83)
306 #define OPC_ARITH_GvEv (0x03) /* ... plus (ARITH_FOO << 3) */
307 #define OPC_ANDN (0xf2 | P_EXT38)
308 #define OPC_ADD_GvEv (OPC_ARITH_GvEv | (ARITH_ADD << 3))
309 #define OPC_BSF (0xbc | P_EXT)
310 #define OPC_BSR (0xbd | P_EXT)
311 #define OPC_BSWAP (0xc8 | P_EXT)
312 #define OPC_CALL_Jz (0xe8)
313 #define OPC_CMOVCC (0x40 | P_EXT) /* ... plus condition code */
314 #define OPC_CMP_GvEv (OPC_ARITH_GvEv | (ARITH_CMP << 3))
315 #define OPC_DEC_r32 (0x48)
316 #define OPC_IMUL_GvEv (0xaf | P_EXT)
317 #define OPC_IMUL_GvEvIb (0x6b)
318 #define OPC_IMUL_GvEvIz (0x69)
319 #define OPC_INC_r32 (0x40)
320 #define OPC_JCC_long (0x80 | P_EXT) /* ... plus condition code */
321 #define OPC_JCC_short (0x70) /* ... plus condition code */
322 #define OPC_JMP_long (0xe9)
323 #define OPC_JMP_short (0xeb)
324 #define OPC_LEA (0x8d)
325 #define OPC_LZCNT (0xbd | P_EXT | P_SIMDF3)
326 #define OPC_MOVB_EvGv (0x88) /* stores, more or less */
327 #define OPC_MOVL_EvGv (0x89) /* stores, more or less */
328 #define OPC_MOVL_GvEv (0x8b) /* loads, more or less */
329 #define OPC_MOVB_EvIz (0xc6)
330 #define OPC_MOVL_EvIz (0xc7)
331 #define OPC_MOVL_Iv (0xb8)
332 #define OPC_MOVBE_GyMy (0xf0 | P_EXT38)
333 #define OPC_MOVBE_MyGy (0xf1 | P_EXT38)
334 #define OPC_MOVSBL (0xbe | P_EXT)
335 #define OPC_MOVSWL (0xbf | P_EXT)
336 #define OPC_MOVSLQ (0x63 | P_REXW)
337 #define OPC_MOVZBL (0xb6 | P_EXT)
338 #define OPC_MOVZWL (0xb7 | P_EXT)
339 #define OPC_POP_r32 (0x58)
340 #define OPC_PUSH_r32 (0x50)
341 #define OPC_PUSH_Iv (0x68)
342 #define OPC_PUSH_Ib (0x6a)
343 #define OPC_RET (0xc3)
344 #define OPC_SETCC (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
345 #define OPC_SHIFT_1 (0xd1)
346 #define OPC_SHIFT_Ib (0xc1)
347 #define OPC_SHIFT_cl (0xd3)
348 #define OPC_SARX (0xf7 | P_EXT38 | P_SIMDF3)
349 #define OPC_SHLX (0xf7 | P_EXT38 | P_DATA16)
350 #define OPC_SHRX (0xf7 | P_EXT38 | P_SIMDF2)
351 #define OPC_TESTL (0x85)
352 #define OPC_TZCNT (0xbc | P_EXT | P_SIMDF3)
353 #define OPC_XCHG_ax_r32 (0x90)
354
355 #define OPC_GRP3_Ev (0xf7)
356 #define OPC_GRP5 (0xff)
357
358 /* Group 1 opcode extensions for 0x80-0x83.
359 These are also used as modifiers for OPC_ARITH. */
360 #define ARITH_ADD 0
361 #define ARITH_OR 1
362 #define ARITH_ADC 2
363 #define ARITH_SBB 3
364 #define ARITH_AND 4
365 #define ARITH_SUB 5
366 #define ARITH_XOR 6
367 #define ARITH_CMP 7
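/* This mirrors the x86 ALU opcode map: OPC_ARITH_GvEv | (ARITH_FOO << 3)
   is the register form of FOO, e.g. 0x03 | (ARITH_SUB << 3) == 0x2b,
   the "sub Gv,Ev" opcode. tgen_arithr and the OPC_ADD_GvEv/OPC_CMP_GvEv
   macros above rely on this layout. */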
368
369 /* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3. */
370 #define SHIFT_ROL 0
371 #define SHIFT_ROR 1
372 #define SHIFT_SHL 4
373 #define SHIFT_SHR 5
374 #define SHIFT_SAR 7
375
376 /* Group 3 opcode extensions for 0xf6, 0xf7. To be used with OPC_GRP3. */
377 #define EXT3_NOT 2
378 #define EXT3_NEG 3
379 #define EXT3_MUL 4
380 #define EXT3_IMUL 5
381 #define EXT3_DIV 6
382 #define EXT3_IDIV 7
383
384 /* Group 5 opcode extensions for 0xff. To be used with OPC_GRP5. */
385 #define EXT5_INC_Ev 0
386 #define EXT5_DEC_Ev 1
387 #define EXT5_CALLN_Ev 2
388 #define EXT5_JMPN_Ev 4
389
390 /* Condition codes to be added to OPC_JCC_{long,short}. */
391 #define JCC_JMP (-1)
392 #define JCC_JO 0x0
393 #define JCC_JNO 0x1
394 #define JCC_JB 0x2
395 #define JCC_JAE 0x3
396 #define JCC_JE 0x4
397 #define JCC_JNE 0x5
398 #define JCC_JBE 0x6
399 #define JCC_JA 0x7
400 #define JCC_JS 0x8
401 #define JCC_JNS 0x9
402 #define JCC_JP 0xa
403 #define JCC_JNP 0xb
404 #define JCC_JL 0xc
405 #define JCC_JGE 0xd
406 #define JCC_JLE 0xe
407 #define JCC_JG 0xf
408
409 static const uint8_t tcg_cond_to_jcc[] = {
410 [TCG_COND_EQ] = JCC_JE,
411 [TCG_COND_NE] = JCC_JNE,
412 [TCG_COND_LT] = JCC_JL,
413 [TCG_COND_GE] = JCC_JGE,
414 [TCG_COND_LE] = JCC_JLE,
415 [TCG_COND_GT] = JCC_JG,
416 [TCG_COND_LTU] = JCC_JB,
417 [TCG_COND_GEU] = JCC_JAE,
418 [TCG_COND_LEU] = JCC_JBE,
419 [TCG_COND_GTU] = JCC_JA,
420 };
421
422 #if TCG_TARGET_REG_BITS == 64
423 static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
424 {
425 int rex;
426
427 if (opc & P_GS) {
428 tcg_out8(s, 0x65);
429 }
430 if (opc & P_DATA16) {
431 /* We should never be asking for both 16 and 64-bit operation. */
432 tcg_debug_assert((opc & P_REXW) == 0);
433 tcg_out8(s, 0x66);
434 }
435 if (opc & P_ADDR32) {
436 tcg_out8(s, 0x67);
437 }
438 if (opc & P_SIMDF3) {
439 tcg_out8(s, 0xf3);
440 } else if (opc & P_SIMDF2) {
441 tcg_out8(s, 0xf2);
442 }
443
444 rex = 0;
445 rex |= (opc & P_REXW) ? 0x8 : 0x0; /* REX.W */
446 rex |= (r & 8) >> 1; /* REX.R */
447 rex |= (x & 8) >> 2; /* REX.X */
448 rex |= (rm & 8) >> 3; /* REX.B */
449
450 /* P_REXB_{R,RM} indicates that the given register is the low byte.
451 For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
452 as otherwise the encoding indicates %[abcd]h. Note that the values
453 that are ORed in merely indicate that the REX byte must be present;
454 those bits get discarded in output. */
455 rex |= opc & (r >= 4 ? P_REXB_R : 0);
456 rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
457
458 if (rex) {
459 tcg_out8(s, (uint8_t)(rex | 0x40));
460 }
461
462 if (opc & (P_EXT | P_EXT38)) {
463 tcg_out8(s, 0x0f);
464 if (opc & P_EXT38) {
465 tcg_out8(s, 0x38);
466 }
467 }
468
469 tcg_out8(s, opc);
470 }
471 #else
472 static void tcg_out_opc(TCGContext *s, int opc)
473 {
474 if (opc & P_DATA16) {
475 tcg_out8(s, 0x66);
476 }
477 if (opc & P_SIMDF3) {
478 tcg_out8(s, 0xf3);
479 } else if (opc & P_SIMDF2) {
480 tcg_out8(s, 0xf2);
481 }
482 if (opc & (P_EXT | P_EXT38)) {
483 tcg_out8(s, 0x0f);
484 if (opc & P_EXT38) {
485 tcg_out8(s, 0x38);
486 }
487 }
488 tcg_out8(s, opc);
489 }
490 /* Discard the register arguments to tcg_out_opc early, so as not to penalize
491 the 32-bit compilation paths. This method works with all versions of gcc,
492 whereas relying on optimization may not be able to exclude them. */
493 #define tcg_out_opc(s, opc, r, rm, x) (tcg_out_opc)(s, opc)
494 #endif
495
496 static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
497 {
498 tcg_out_opc(s, opc, r, rm, 0);
499 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
500 }
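/* Examples: tcg_out_modrm(s, OPC_MOVL_GvEv, TCG_REG_EAX, TCG_REG_EBX)
   emits 8b c3, "movl %ebx, %eax"; tcg_out_modrm(s, OPC_MOVL_GvEv | P_REXW,
   TCG_REG_R8, TCG_REG_RAX) emits 4c 8b c0, "movq %rax, %r8", with the
   REX byte carrying the W and R bits. */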
501
502 static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
503 {
504 int tmp;
505
506 if ((opc & (P_REXW | P_EXT | P_EXT38)) || (rm & 8)) {
507 /* Three byte VEX prefix. */
508 tcg_out8(s, 0xc4);
509
510 /* VEX.m-mmmm */
511 if (opc & P_EXT38) {
512 tmp = 2;
513 } else if (opc & P_EXT) {
514 tmp = 1;
515 } else {
516 tcg_abort();
517 }
518 tmp |= 0x40; /* VEX.X */
519 tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */
520 tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */
521 tcg_out8(s, tmp);
522
523 tmp = (opc & P_REXW ? 0x80 : 0); /* VEX.W */
524 } else {
525 /* Two byte VEX prefix. */
526 tcg_out8(s, 0xc5);
527
528 tmp = (r & 8 ? 0 : 0x80); /* VEX.R */
529 }
530 /* VEX.pp */
531 if (opc & P_DATA16) {
532 tmp |= 1; /* 0x66 */
533 } else if (opc & P_SIMDF3) {
534 tmp |= 2; /* 0xf3 */
535 } else if (opc & P_SIMDF2) {
536 tmp |= 3; /* 0xf2 */
537 }
538 tmp |= (~v & 15) << 3; /* VEX.vvvv */
539 tcg_out8(s, tmp);
540 tcg_out8(s, opc);
541 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
542 }
543
544 /* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
545 We handle either RM or INDEX missing with a negative value. In 64-bit
546 mode for absolute addresses, ~RM is the size of the immediate operand
547 that will follow the instruction. */
548
549 static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
550 int index, int shift, intptr_t offset)
551 {
552 int mod, len;
553
554 if (index < 0 && rm < 0) {
555 if (TCG_TARGET_REG_BITS == 64) {
556 /* Try for a rip-relative addressing mode. This has replaced
557 the 32-bit-mode absolute addressing encoding. */
558 intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
559 intptr_t disp = offset - pc;
560 if (disp == (int32_t)disp) {
561 tcg_out_opc(s, opc, r, 0, 0);
562 tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
563 tcg_out32(s, disp);
564 return;
565 }
566
567 /* Try for an absolute address encoding. This requires the
568 use of the MODRM+SIB encoding and is therefore larger than
569 rip-relative addressing. */
570 if (offset == (int32_t)offset) {
571 tcg_out_opc(s, opc, r, 0, 0);
572 tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
573 tcg_out8(s, (4 << 3) | 5);
574 tcg_out32(s, offset);
575 return;
576 }
577
578 /* ??? The memory isn't directly addressable. */
579 tcg_abort();
580 } else {
581 /* Absolute address. */
582 tcg_out_opc(s, opc, r, 0, 0);
583 tcg_out8(s, (r << 3) | 5);
584 tcg_out32(s, offset);
585 return;
586 }
587 }
588
589 /* Find the length of the immediate addend. Note that the encoding
590 that would be used for (%ebp) indicates absolute addressing. */
591 if (rm < 0) {
592 mod = 0, len = 4, rm = 5;
593 } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
594 mod = 0, len = 0;
595 } else if (offset == (int8_t)offset) {
596 mod = 0x40, len = 1;
597 } else {
598 mod = 0x80, len = 4;
599 }
600
601 /* Use a single byte MODRM format if possible. Note that the encoding
602 that would be used for %esp is the escape to the two byte form. */
603 if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
604 /* Single byte MODRM format. */
605 tcg_out_opc(s, opc, r, rm, 0);
606 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
607 } else {
608 /* Two byte MODRM+SIB format. */
609
610 /* Note that the encoding that would place %esp into the index
611 field indicates no index register. In 64-bit mode, the REX.X
612 bit counts, so %r12 can be used as the index. */
613 if (index < 0) {
614 index = 4;
615 } else {
616 tcg_debug_assert(index != TCG_REG_ESP);
617 }
618
619 tcg_out_opc(s, opc, r, rm, index);
620 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
621 tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
622 }
623
624 if (len == 1) {
625 tcg_out8(s, offset);
626 } else if (len == 4) {
627 tcg_out32(s, offset);
628 }
629 }
630
631 /* A simplification of the above with no index or shift. */
632 static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
633 int rm, intptr_t offset)
634 {
635 tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
636 }
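/* E.g. tcg_out_modrm_offset(s, OPC_MOVL_GvEv, TCG_REG_EAX, TCG_REG_EBX, 8)
   emits 8b 43 08, "movl 8(%ebx), %eax", using the mod=01 disp8 form. */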
637
638 /* Generate dest op= src. Uses the same ARITH_* codes as tgen_arithi. */
639 static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
640 {
641 /* Propagate an opcode prefix, such as P_REXW. */
642 int ext = subop & ~0x7;
643 subop &= 0x7;
644
645 tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
646 }
647
648 static inline void tcg_out_mov(TCGContext *s, TCGType type,
649 TCGReg ret, TCGReg arg)
650 {
651 if (arg != ret) {
652 int opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0);
653 tcg_out_modrm(s, opc, ret, arg);
654 }
655 }
656
657 static void tcg_out_movi(TCGContext *s, TCGType type,
658 TCGReg ret, tcg_target_long arg)
659 {
660 tcg_target_long diff;
661
662 if (arg == 0) {
663 tgen_arithr(s, ARITH_XOR, ret, ret);
664 return;
665 }
666 if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
667 tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
668 tcg_out32(s, arg);
669 return;
670 }
671 if (arg == (int32_t)arg) {
672 tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
673 tcg_out32(s, arg);
674 return;
675 }
676
677 /* Try a 7 byte pc-relative lea before the 10 byte movq. */
678 diff = arg - ((uintptr_t)s->code_ptr + 7);
679 if (diff == (int32_t)diff) {
680 tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
681 tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
682 tcg_out32(s, diff);
683 return;
684 }
685
686 tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
687 tcg_out64(s, arg);
688 }
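/* Ignoring the extra REX byte needed for %r8-%r15, the alternatives above
   cost: xor 2 bytes, movl $imm32 5 bytes, movq $simm32 7 bytes, the
   rip-relative lea 7 bytes, and the final movabsq 10 bytes. */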
689
690 static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
691 {
692 if (val == (int8_t)val) {
693 tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
694 tcg_out8(s, val);
695 } else if (val == (int32_t)val) {
696 tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
697 tcg_out32(s, val);
698 } else {
699 tcg_abort();
700 }
701 }
702
703 static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
704 {
705 /* Given the strength of x86 memory ordering, we only need care for
706 store-load ordering. Experimentally, "lock orl $0,0(%esp)" is
707 faster than "mfence", so don't bother with the sse insn. */
708 if (a0 & TCG_MO_ST_LD) {
709 tcg_out8(s, 0xf0);
710 tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
711 tcg_out8(s, 0);
712 }
713 }
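/* The barrier above assembles to f0 83 0c 24 00, i.e. the
   "lock orl $0,0(%esp)" mentioned in the comment. */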
714
715 static inline void tcg_out_push(TCGContext *s, int reg)
716 {
717 tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
718 }
719
720 static inline void tcg_out_pop(TCGContext *s, int reg)
721 {
722 tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
723 }
724
725 static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
726 TCGReg arg1, intptr_t arg2)
727 {
728 int opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0);
729 tcg_out_modrm_offset(s, opc, ret, arg1, arg2);
730 }
731
732 static inline void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
733 TCGReg arg1, intptr_t arg2)
734 {
735 int opc = OPC_MOVL_EvGv + (type == TCG_TYPE_I64 ? P_REXW : 0);
736 tcg_out_modrm_offset(s, opc, arg, arg1, arg2);
737 }
738
739 static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
740 TCGReg base, intptr_t ofs)
741 {
742 int rexw = 0;
743 if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
744 if (val != (int32_t)val) {
745 return false;
746 }
747 rexw = P_REXW;
748 }
749 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
750 tcg_out32(s, val);
751 return true;
752 }
753
754 static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
755 {
756 /* Propagate an opcode prefix, such as P_DATA16. */
757 int ext = subopc & ~0x7;
758 subopc &= 0x7;
759
760 if (count == 1) {
761 tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
762 } else {
763 tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
764 tcg_out8(s, count);
765 }
766 }
767
768 static inline void tcg_out_bswap32(TCGContext *s, int reg)
769 {
770 tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
771 }
772
773 static inline void tcg_out_rolw_8(TCGContext *s, int reg)
774 {
775 tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
776 }
777
778 static inline void tcg_out_ext8u(TCGContext *s, int dest, int src)
779 {
780 /* movzbl */
781 tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
782 tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
783 }
784
785 static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw)
786 {
787 /* movsbl */
788 tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
789 tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
790 }
791
792 static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
793 {
794 /* movzwl */
795 tcg_out_modrm(s, OPC_MOVZWL, dest, src);
796 }
797
798 static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw)
799 {
800 /* movsw[lq] */
801 tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
802 }
803
804 static inline void tcg_out_ext32u(TCGContext *s, int dest, int src)
805 {
806 /* 32-bit mov zero extends. */
807 tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
808 }
809
810 static inline void tcg_out_ext32s(TCGContext *s, int dest, int src)
811 {
812 tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
813 }
814
815 static inline void tcg_out_bswap64(TCGContext *s, int reg)
816 {
817 tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
818 }
819
820 static void tgen_arithi(TCGContext *s, int c, int r0,
821 tcg_target_long val, int cf)
822 {
823 int rexw = 0;
824
825 if (TCG_TARGET_REG_BITS == 64) {
826 rexw = c & -8;
827 c &= 7;
828 }
829
830 /* ??? While INC is 2 bytes shorter than ADDL $1, it also induces
831 partial flags update stalls on Pentium4 and is not recommended
832 by current Intel optimization manuals. */
833 if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
834 int is_inc = (c == ARITH_ADD) ^ (val < 0);
835 if (TCG_TARGET_REG_BITS == 64) {
836 /* The single-byte increment encodings are re-tasked as the
837 REX prefixes. Use the MODRM encoding. */
838 tcg_out_modrm(s, OPC_GRP5 + rexw,
839 (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
840 } else {
841 tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
842 }
843 return;
844 }
845
846 if (c == ARITH_AND) {
847 if (TCG_TARGET_REG_BITS == 64) {
848 if (val == 0xffffffffu) {
849 tcg_out_ext32u(s, r0, r0);
850 return;
851 }
852 if (val == (uint32_t)val) {
853 /* AND with no high bits set can use a 32-bit operation. */
854 rexw = 0;
855 }
856 }
857 if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
858 tcg_out_ext8u(s, r0, r0);
859 return;
860 }
861 if (val == 0xffffu) {
862 tcg_out_ext16u(s, r0, r0);
863 return;
864 }
865 }
866
867 if (val == (int8_t)val) {
868 tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
869 tcg_out8(s, val);
870 return;
871 }
872 if (rexw == 0 || val == (int32_t)val) {
873 tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
874 tcg_out32(s, val);
875 return;
876 }
877
878 tcg_abort();
879 }
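/* E.g. tgen_arithi(s, ARITH_ADD + P_REXW, TCG_REG_RAX, 8, 0) takes the
   sign-extended imm8 path above and emits 48 83 c0 08, "addq $8, %rax". */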
880
881 static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
882 {
883 if (val != 0) {
884 tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
885 }
886 }
887
888 /* Use SMALL != 0 to force a short forward branch. */
889 static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
890 {
891 int32_t val, val1;
892
893 if (l->has_value) {
894 val = tcg_pcrel_diff(s, l->u.value_ptr);
895 val1 = val - 2;
896 if ((int8_t)val1 == val1) {
897 if (opc == -1) {
898 tcg_out8(s, OPC_JMP_short);
899 } else {
900 tcg_out8(s, OPC_JCC_short + opc);
901 }
902 tcg_out8(s, val1);
903 } else {
904 if (small) {
905 tcg_abort();
906 }
907 if (opc == -1) {
908 tcg_out8(s, OPC_JMP_long);
909 tcg_out32(s, val - 5);
910 } else {
911 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
912 tcg_out32(s, val - 6);
913 }
914 }
915 } else if (small) {
916 if (opc == -1) {
917 tcg_out8(s, OPC_JMP_short);
918 } else {
919 tcg_out8(s, OPC_JCC_short + opc);
920 }
921 tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
922 s->code_ptr += 1;
923 } else {
924 if (opc == -1) {
925 tcg_out8(s, OPC_JMP_long);
926 } else {
927 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
928 }
929 tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
930 s->code_ptr += 4;
931 }
932 }
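/* The -2/-5/-6 adjustments above convert VAL, which is measured from the
   start of the branch, into a displacement relative to the end of the
   2-byte short, 5-byte long jmp, or 6-byte long jcc encodings. */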
933
934 static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
935 int const_arg2, int rexw)
936 {
937 if (const_arg2) {
938 if (arg2 == 0) {
939 /* test r, r */
940 tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
941 } else {
942 tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
943 }
944 } else {
945 tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
946 }
947 }
948
949 static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
950 TCGArg arg1, TCGArg arg2, int const_arg2,
951 TCGLabel *label, int small)
952 {
953 tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
954 tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
955 }
956
957 #if TCG_TARGET_REG_BITS == 64
958 static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
959 TCGArg arg1, TCGArg arg2, int const_arg2,
960 TCGLabel *label, int small)
961 {
962 tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
963 tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
964 }
965 #else
966 /* XXX: we implement it at the target level to avoid having to
967 handle temporaries that live across basic blocks */
968 static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
969 const int *const_args, int small)
970 {
971 TCGLabel *label_next = gen_new_label();
972 TCGLabel *label_this = arg_label(args[5]);
973
974 switch(args[4]) {
975 case TCG_COND_EQ:
976 tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
977 label_next, 1);
978 tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
979 label_this, small);
980 break;
981 case TCG_COND_NE:
982 tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
983 label_this, small);
984 tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
985 label_this, small);
986 break;
987 case TCG_COND_LT:
988 tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
989 label_this, small);
990 tcg_out_jxx(s, JCC_JNE, label_next, 1);
991 tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
992 label_this, small);
993 break;
994 case TCG_COND_LE:
995 tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
996 label_this, small);
997 tcg_out_jxx(s, JCC_JNE, label_next, 1);
998 tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
999 label_this, small);
1000 break;
1001 case TCG_COND_GT:
1002 tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1003 label_this, small);
1004 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1005 tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1006 label_this, small);
1007 break;
1008 case TCG_COND_GE:
1009 tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1010 label_this, small);
1011 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1012 tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1013 label_this, small);
1014 break;
1015 case TCG_COND_LTU:
1016 tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1017 label_this, small);
1018 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1019 tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1020 label_this, small);
1021 break;
1022 case TCG_COND_LEU:
1023 tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1024 label_this, small);
1025 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1026 tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1027 label_this, small);
1028 break;
1029 case TCG_COND_GTU:
1030 tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1031 label_this, small);
1032 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1033 tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1034 label_this, small);
1035 break;
1036 case TCG_COND_GEU:
1037 tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1038 label_this, small);
1039 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1040 tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1041 label_this, small);
1042 break;
1043 default:
1044 tcg_abort();
1045 }
1046 tcg_out_label(s, label_next, s->code_ptr);
1047 }
1048 #endif
1049
1050 static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
1051 TCGArg arg1, TCGArg arg2, int const_arg2)
1052 {
1053 tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1054 tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1055 tcg_out_ext8u(s, dest, dest);
1056 }
1057
1058 #if TCG_TARGET_REG_BITS == 64
1059 static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
1060 TCGArg arg1, TCGArg arg2, int const_arg2)
1061 {
1062 tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1063 tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1064 tcg_out_ext8u(s, dest, dest);
1065 }
1066 #else
1067 static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1068 const int *const_args)
1069 {
1070 TCGArg new_args[6];
1071 TCGLabel *label_true, *label_over;
1072
1073 memcpy(new_args, args+1, 5*sizeof(TCGArg));
1074
1075 if (args[0] == args[1] || args[0] == args[2]
1076 || (!const_args[3] && args[0] == args[3])
1077 || (!const_args[4] && args[0] == args[4])) {
1078 /* When the destination overlaps with one of the argument
1079 registers, don't do anything tricky. */
1080 label_true = gen_new_label();
1081 label_over = gen_new_label();
1082
1083 new_args[5] = label_arg(label_true);
1084 tcg_out_brcond2(s, new_args, const_args+1, 1);
1085
1086 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1087 tcg_out_jxx(s, JCC_JMP, label_over, 1);
1088 tcg_out_label(s, label_true, s->code_ptr);
1089
1090 tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1091 tcg_out_label(s, label_over, s->code_ptr);
1092 } else {
1093 /* When the destination does not overlap one of the arguments,
1094 clear the destination first, jump if cond false, and emit an
1095 increment in the true case. This results in smaller code. */
1096
1097 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1098
1099 label_over = gen_new_label();
1100 new_args[4] = tcg_invert_cond(new_args[4]);
1101 new_args[5] = label_arg(label_over);
1102 tcg_out_brcond2(s, new_args, const_args+1, 1);
1103
1104 tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1105 tcg_out_label(s, label_over, s->code_ptr);
1106 }
1107 }
1108 #endif
1109
1110 static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1111 TCGReg dest, TCGReg v1)
1112 {
1113 if (have_cmov) {
1114 tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1115 } else {
1116 TCGLabel *over = gen_new_label();
1117 tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1118 tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1119 tcg_out_label(s, over, s->code_ptr);
1120 }
1121 }
1122
1123 static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
1124 TCGReg c1, TCGArg c2, int const_c2,
1125 TCGReg v1)
1126 {
1127 tcg_out_cmp(s, c1, c2, const_c2, 0);
1128 tcg_out_cmov(s, cond, 0, dest, v1);
1129 }
1130
1131 #if TCG_TARGET_REG_BITS == 64
1132 static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
1133 TCGReg c1, TCGArg c2, int const_c2,
1134 TCGReg v1)
1135 {
1136 tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
1137 tcg_out_cmov(s, cond, P_REXW, dest, v1);
1138 }
1139 #endif
1140
1141 static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1142 TCGArg arg2, bool const_a2)
1143 {
1144 if (const_a2) {
1145 tcg_debug_assert(have_bmi1);
1146 tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1147 tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1148 } else {
1149 /* ??? The manual says that the output is undefined when the
1150 input is zero, but real hardware leaves it unchanged. As
1151 noted in target-i386/translate.c, real programs depend on
1152 this -- now we are one more of those. */
1153 tcg_debug_assert(dest == arg2);
1154 tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1155 }
1156 }
1157
1158 static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1159 TCGArg arg2, bool const_a2)
1160 {
1161 if (have_lzcnt) {
1162 tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1163 if (const_a2) {
1164 tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1165 } else {
1166 tcg_debug_assert(dest != arg2);
1167 /* LZCNT sets C if the input was zero. */
1168 tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1169 }
1170 } else {
1171 TCGType type = rexw ? TCG_TYPE_I64: TCG_TYPE_I32;
1172 TCGArg rev = rexw ? 63 : 31;
1173
1174 /* Recall that the output of BSR is the index not the count.
1175 Therefore we must adjust the result by ^ (SIZE-1). In some
1176 cases below, we prefer an extra XOR to a JMP. */
1177 /* ??? See the comment in tcg_out_ctz re BSF. */
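/* (The XOR works because, for an index in [0, SIZE-1],
   (SIZE-1) - index == index ^ (SIZE-1).) */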
1178 if (const_a2) {
1179 tcg_debug_assert(dest != arg1);
1180 tcg_out_movi(s, type, dest, arg2 ^ rev);
1181 } else {
1182 tcg_debug_assert(dest == arg2);
1183 tgen_arithi(s, ARITH_XOR + rexw, dest, rev, 0);
1184 }
1185 tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1186 tgen_arithi(s, ARITH_XOR + rexw, dest, rev, 0);
1187 }
1188 }
1189
1190 static void tcg_out_branch(TCGContext *s, int call, tcg_insn_unit *dest)
1191 {
1192 intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1193
1194 if (disp == (int32_t)disp) {
1195 tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1196 tcg_out32(s, disp);
1197 } else {
1198 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R10, (uintptr_t)dest);
1199 tcg_out_modrm(s, OPC_GRP5,
1200 call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev, TCG_REG_R10);
1201 }
1202 }
1203
1204 static inline void tcg_out_call(TCGContext *s, tcg_insn_unit *dest)
1205 {
1206 tcg_out_branch(s, 1, dest);
1207 }
1208
1209 static void tcg_out_jmp(TCGContext *s, tcg_insn_unit *dest)
1210 {
1211 tcg_out_branch(s, 0, dest);
1212 }
1213
1214 static void tcg_out_nopn(TCGContext *s, int n)
1215 {
1216 int i;
1217 /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1218 * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1219 * duplicate prefix, and all of the interesting recent cores can
1220 * decode and discard the duplicates in a single cycle.
1221 */
1222 tcg_debug_assert(n >= 1);
1223 for (i = 1; i < n; ++i) {
1224 tcg_out8(s, 0x66);
1225 }
1226 tcg_out8(s, 0x90);
1227 }
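/* E.g. tcg_out_nopn(s, 3) emits 66 66 90: two operand-size prefixes
   in front of the one-byte nop. */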
1228
1229 #if defined(CONFIG_SOFTMMU)
1230 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
1231 * int mmu_idx, uintptr_t ra)
1232 */
1233 static void * const qemu_ld_helpers[16] = {
1234 [MO_UB] = helper_ret_ldub_mmu,
1235 [MO_LEUW] = helper_le_lduw_mmu,
1236 [MO_LEUL] = helper_le_ldul_mmu,
1237 [MO_LEQ] = helper_le_ldq_mmu,
1238 [MO_BEUW] = helper_be_lduw_mmu,
1239 [MO_BEUL] = helper_be_ldul_mmu,
1240 [MO_BEQ] = helper_be_ldq_mmu,
1241 };
1242
1243 /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
1244 * uintxx_t val, int mmu_idx, uintptr_t ra)
1245 */
1246 static void * const qemu_st_helpers[16] = {
1247 [MO_UB] = helper_ret_stb_mmu,
1248 [MO_LEUW] = helper_le_stw_mmu,
1249 [MO_LEUL] = helper_le_stl_mmu,
1250 [MO_LEQ] = helper_le_stq_mmu,
1251 [MO_BEUW] = helper_be_stw_mmu,
1252 [MO_BEUL] = helper_be_stl_mmu,
1253 [MO_BEQ] = helper_be_stq_mmu,
1254 };
1255
1256 /* Perform the TLB load and compare.
1257
1258 Inputs:
1259 ADDRLO and ADDRHI contain the low and high part of the address.
1260
1261 MEM_INDEX and S_BITS are the memory context and log2 size of the load.
1262
1263 WHICH is the offset into the CPUTLBEntry structure of the slot to read.
1264 This should be offsetof addr_read or addr_write.
1265
1266 Outputs:
1267 LABEL_PTRS is filled with 1 (32-bit addresses) or 2 (64-bit addresses)
1268 positions of the displacements of forward jumps to the TLB miss case.
1269
1270 Second argument register is loaded with the low part of the address.
1271 In the TLB hit case, it has been adjusted as indicated by the TLB
1272 and so is a host address. In the TLB miss case, it continues to
1273 hold a guest address.
1274
1275 First argument register is clobbered. */
1276
1277 static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
1278 int mem_index, TCGMemOp opc,
1279 tcg_insn_unit **label_ptr, int which)
1280 {
1281 const TCGReg r0 = TCG_REG_L0;
1282 const TCGReg r1 = TCG_REG_L1;
1283 TCGType ttype = TCG_TYPE_I32;
1284 TCGType tlbtype = TCG_TYPE_I32;
1285 int trexw = 0, hrexw = 0, tlbrexw = 0;
1286 unsigned a_bits = get_alignment_bits(opc);
1287 unsigned s_bits = opc & MO_SIZE;
1288 unsigned a_mask = (1 << a_bits) - 1;
1289 unsigned s_mask = (1 << s_bits) - 1;
1290 target_ulong tlb_mask;
1291
1292 if (TCG_TARGET_REG_BITS == 64) {
1293 if (TARGET_LONG_BITS == 64) {
1294 ttype = TCG_TYPE_I64;
1295 trexw = P_REXW;
1296 }
1297 if (TCG_TYPE_PTR == TCG_TYPE_I64) {
1298 hrexw = P_REXW;
1299 if (TARGET_PAGE_BITS + CPU_TLB_BITS > 32) {
1300 tlbtype = TCG_TYPE_I64;
1301 tlbrexw = P_REXW;
1302 }
1303 }
1304 }
1305
1306 tcg_out_mov(s, tlbtype, r0, addrlo);
1307 /* If the required alignment is at least as large as the access, simply
1308 copy the address and mask. For lesser alignments, check that we don't
1309 cross pages for the complete access. */
1310 if (a_bits >= s_bits) {
1311 tcg_out_mov(s, ttype, r1, addrlo);
1312 } else {
1313 tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
1314 }
1315 tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
1316
1317 tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
1318 TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
1319
1320 tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);
1321 tgen_arithi(s, ARITH_AND + tlbrexw, r0,
1322 (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS, 0);
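/* r0 now holds ((addr >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1))
   << CPU_TLB_ENTRY_BITS, i.e. the byte offset of this page's entry
   within tlb_table[mem_index], ready for the lea below. */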
1323
1324 tcg_out_modrm_sib_offset(s, OPC_LEA + hrexw, r0, TCG_AREG0, r0, 0,
1325 offsetof(CPUArchState, tlb_table[mem_index][0])
1326 + which);
1327
1328 /* cmp 0(r0), r1 */
1329 tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, 0);
1330
1331 /* Prepare for both the fast path add of the tlb addend, and the slow
1332 path function argument setup. There are two cases worth note:
1333 For 32-bit guest and x86_64 host, MOVL zero-extends the guest address
1334 before the fastpath ADDQ below. For 64-bit guest and x32 host, MOVQ
1335 copies the entire guest address for the slow path, while truncation
1336 for the 32-bit host happens with the fastpath ADDL below. */
1337 tcg_out_mov(s, ttype, r1, addrlo);
1338
1339 /* jne slow_path */
1340 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1341 label_ptr[0] = s->code_ptr;
1342 s->code_ptr += 4;
1343
1344 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1345 /* cmp 4(r0), addrhi */
1346 tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, 4);
1347
1348 /* jne slow_path */
1349 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1350 label_ptr[1] = s->code_ptr;
1351 s->code_ptr += 4;
1352 }
1353
1354 /* TLB Hit. */
1355
1356 /* add addend(r0), r1 */
1357 tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
1358 offsetof(CPUTLBEntry, addend) - which);
1359 }
1360
1361 /*
1362 * Record the context of a call to the out-of-line helper code for the slow path
1363 * for a load or store, so that we can later generate the correct helper code
1364 */
1365 static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOpIdx oi,
1366 TCGReg datalo, TCGReg datahi,
1367 TCGReg addrlo, TCGReg addrhi,
1368 tcg_insn_unit *raddr,
1369 tcg_insn_unit **label_ptr)
1370 {
1371 TCGLabelQemuLdst *label = new_ldst_label(s);
1372
1373 label->is_ld = is_ld;
1374 label->oi = oi;
1375 label->datalo_reg = datalo;
1376 label->datahi_reg = datahi;
1377 label->addrlo_reg = addrlo;
1378 label->addrhi_reg = addrhi;
1379 label->raddr = raddr;
1380 label->label_ptr[0] = label_ptr[0];
1381 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1382 label->label_ptr[1] = label_ptr[1];
1383 }
1384 }
1385
1386 /*
1387 * Generate code for the slow path for a load at the end of the block
1388 */
1389 static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1390 {
1391 TCGMemOpIdx oi = l->oi;
1392 TCGMemOp opc = get_memop(oi);
1393 TCGReg data_reg;
1394 tcg_insn_unit **label_ptr = &l->label_ptr[0];
1395
1396 /* resolve label address */
1397 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1398 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1399 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1400 }
1401
1402 if (TCG_TARGET_REG_BITS == 32) {
1403 int ofs = 0;
1404
1405 tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1406 ofs += 4;
1407
1408 tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1409 ofs += 4;
1410
1411 if (TARGET_LONG_BITS == 64) {
1412 tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1413 ofs += 4;
1414 }
1415
1416 tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1417 ofs += 4;
1418
1419 tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
1420 } else {
1421 tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1422 /* The second argument is already loaded with addrlo. */
1423 tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
1424 tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
1425 (uintptr_t)l->raddr);
1426 }
1427
1428 tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1429
1430 data_reg = l->datalo_reg;
1431 switch (opc & MO_SSIZE) {
1432 case MO_SB:
1433 tcg_out_ext8s(s, data_reg, TCG_REG_EAX, P_REXW);
1434 break;
1435 case MO_SW:
1436 tcg_out_ext16s(s, data_reg, TCG_REG_EAX, P_REXW);
1437 break;
1438 #if TCG_TARGET_REG_BITS == 64
1439 case MO_SL:
1440 tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
1441 break;
1442 #endif
1443 case MO_UB:
1444 case MO_UW:
1445 /* Note that the helpers have zero-extended to tcg_target_long. */
1446 case MO_UL:
1447 tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1448 break;
1449 case MO_Q:
1450 if (TCG_TARGET_REG_BITS == 64) {
1451 tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
1452 } else if (data_reg == TCG_REG_EDX) {
1453 /* xchg %edx, %eax */
1454 tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
1455 tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
1456 } else {
1457 tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1458 tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
1459 }
1460 break;
1461 default:
1462 tcg_abort();
1463 }
1464
1465 /* Jump to the code corresponding to the next IR of qemu_ld */
1466 tcg_out_jmp(s, l->raddr);
1467 }
1468
1469 /*
1470 * Generate code for the slow path for a store at the end of block
1471 */
1472 static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1473 {
1474 TCGMemOpIdx oi = l->oi;
1475 TCGMemOp opc = get_memop(oi);
1476 TCGMemOp s_bits = opc & MO_SIZE;
1477 tcg_insn_unit **label_ptr = &l->label_ptr[0];
1478 TCGReg retaddr;
1479
1480 /* resolve label address */
1481 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1482 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1483 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1484 }
1485
1486 if (TCG_TARGET_REG_BITS == 32) {
1487 int ofs = 0;
1488
1489 tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1490 ofs += 4;
1491
1492 tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1493 ofs += 4;
1494
1495 if (TARGET_LONG_BITS == 64) {
1496 tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1497 ofs += 4;
1498 }
1499
1500 tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
1501 ofs += 4;
1502
1503 if (s_bits == MO_64) {
1504 tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
1505 ofs += 4;
1506 }
1507
1508 tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1509 ofs += 4;
1510
1511 retaddr = TCG_REG_EAX;
1512 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1513 tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
1514 } else {
1515 tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1516 /* The second argument is already loaded with addrlo. */
1517 tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
1518 tcg_target_call_iarg_regs[2], l->datalo_reg);
1519 tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
1520
1521 if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
1522 retaddr = tcg_target_call_iarg_regs[4];
1523 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1524 } else {
1525 retaddr = TCG_REG_RAX;
1526 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1527 tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP,
1528 TCG_TARGET_CALL_STACK_OFFSET);
1529 }
1530 }
1531
1532 /* "Tail call" to the helper, with the return address back inline. */
1533 tcg_out_push(s, retaddr);
1534 tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1535 }
1536 #elif defined(__x86_64__) && defined(__linux__)
1537 # include <asm/prctl.h>
1538 # include <sys/prctl.h>
1539
1540 int arch_prctl(int code, unsigned long addr);
1541
1542 static int guest_base_flags;
1543 static inline void setup_guest_base_seg(void)
1544 {
1545 if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
1546 guest_base_flags = P_GS;
1547 }
1548 }
1549 #else
1550 # define guest_base_flags 0
1551 static inline void setup_guest_base_seg(void) { }
1552 #endif /* SOFTMMU */
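/* In the user-only case on x86-64 Linux, the guest base is installed as the
   %gs segment base above, so the fast paths below can fold it into a single
   gs-prefixed access (P_GS, prefix byte 0x65) instead of materializing
   guest_base in a register. */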
1553
1554 static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
1555 TCGReg base, int index, intptr_t ofs,
1556 int seg, TCGMemOp memop)
1557 {
1558 const TCGMemOp real_bswap = memop & MO_BSWAP;
1559 TCGMemOp bswap = real_bswap;
1560 int movop = OPC_MOVL_GvEv;
1561
1562 if (have_movbe && real_bswap) {
1563 bswap = 0;
1564 movop = OPC_MOVBE_GyMy;
1565 }
1566
1567 switch (memop & MO_SSIZE) {
1568 case MO_UB:
1569 tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo,
1570 base, index, 0, ofs);
1571 break;
1572 case MO_SB:
1573 tcg_out_modrm_sib_offset(s, OPC_MOVSBL + P_REXW + seg, datalo,
1574 base, index, 0, ofs);
1575 break;
1576 case MO_UW:
1577 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
1578 base, index, 0, ofs);
1579 if (real_bswap) {
1580 tcg_out_rolw_8(s, datalo);
1581 }
1582 break;
1583 case MO_SW:
1584 if (real_bswap) {
1585 if (have_movbe) {
1586 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
1587 datalo, base, index, 0, ofs);
1588 } else {
1589 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
1590 base, index, 0, ofs);
1591 tcg_out_rolw_8(s, datalo);
1592 }
1593 tcg_out_modrm(s, OPC_MOVSWL + P_REXW, datalo, datalo);
1594 } else {
1595 tcg_out_modrm_sib_offset(s, OPC_MOVSWL + P_REXW + seg,
1596 datalo, base, index, 0, ofs);
1597 }
1598 break;
1599 case MO_UL:
1600 tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
1601 if (bswap) {
1602 tcg_out_bswap32(s, datalo);
1603 }
1604 break;
1605 #if TCG_TARGET_REG_BITS == 64
1606 case MO_SL:
1607 if (real_bswap) {
1608 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
1609 base, index, 0, ofs);
1610 if (bswap) {
1611 tcg_out_bswap32(s, datalo);
1612 }
1613 tcg_out_ext32s(s, datalo, datalo);
1614 } else {
1615 tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo,
1616 base, index, 0, ofs);
1617 }
1618 break;
1619 #endif
1620 case MO_Q:
1621 if (TCG_TARGET_REG_BITS == 64) {
1622 tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
1623 base, index, 0, ofs);
1624 if (bswap) {
1625 tcg_out_bswap64(s, datalo);
1626 }
1627 } else {
1628 if (real_bswap) {
1629 int t = datalo;
1630 datalo = datahi;
1631 datahi = t;
1632 }
1633 if (base != datalo) {
1634 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
1635 base, index, 0, ofs);
1636 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
1637 base, index, 0, ofs + 4);
1638 } else {
1639 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
1640 base, index, 0, ofs + 4);
1641 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
1642 base, index, 0, ofs);
1643 }
1644 if (bswap) {
1645 tcg_out_bswap32(s, datalo);
1646 tcg_out_bswap32(s, datahi);
1647 }
1648 }
1649 break;
1650 default:
1651 tcg_abort();
1652 }
1653 }
1654
1655 /* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and
1656 EAX. It will be useful once fixed-register globals are less
1657 common. */
1658 static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
1659 {
1660 TCGReg datalo, datahi, addrlo;
1661 TCGReg addrhi __attribute__((unused));
1662 TCGMemOpIdx oi;
1663 TCGMemOp opc;
1664 #if defined(CONFIG_SOFTMMU)
1665 int mem_index;
1666 tcg_insn_unit *label_ptr[2];
1667 #endif
1668
1669 datalo = *args++;
1670 datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
1671 addrlo = *args++;
1672 addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
1673 oi = *args++;
1674 opc = get_memop(oi);
1675
1676 #if defined(CONFIG_SOFTMMU)
1677 mem_index = get_mmuidx(oi);
1678
1679 tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
1680 label_ptr, offsetof(CPUTLBEntry, addr_read));
1681
1682 /* TLB Hit. */
1683 tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);
1684
1685 /* Record the current context of a load into ldst label */
1686 add_qemu_ldst_label(s, true, oi, datalo, datahi, addrlo, addrhi,
1687 s->code_ptr, label_ptr);
1688 #else
1689 {
1690 int32_t offset = guest_base;
1691 TCGReg base = addrlo;
1692 int index = -1;
1693 int seg = 0;
1694
1695 /* For a 32-bit guest, the high 32 bits may contain garbage.
1696 We can do this with the ADDR32 prefix if we're not using
1697 a guest base, or when using segmentation. Otherwise we
1698 need to zero-extend manually. */
1699 if (guest_base == 0 || guest_base_flags) {
1700 seg = guest_base_flags;
1701 offset = 0;
1702 if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
1703 seg |= P_ADDR32;
1704 }
1705 } else if (TCG_TARGET_REG_BITS == 64) {
1706 if (TARGET_LONG_BITS == 32) {
1707 tcg_out_ext32u(s, TCG_REG_L0, base);
1708 base = TCG_REG_L0;
1709 }
1710 if (offset != guest_base) {
1711 tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, guest_base);
1712 index = TCG_REG_L1;
1713 offset = 0;
1714 }
1715 }
1716
1717 tcg_out_qemu_ld_direct(s, datalo, datahi,
1718 base, index, offset, seg, opc);
1719 }
1720 #endif
1721 }
1722
1723 static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
1724 TCGReg base, intptr_t ofs, int seg,
1725 TCGMemOp memop)
1726 {
1727 /* ??? Ideally we wouldn't need a scratch register. For user-only,
1728 we could perform the bswap twice to restore the original value
1729 instead of moving to the scratch. But as it is, the L constraint
1730 means that TCG_REG_L0 is definitely free here. */
1731 const TCGReg scratch = TCG_REG_L0;
1732 const TCGMemOp real_bswap = memop & MO_BSWAP;
1733 TCGMemOp bswap = real_bswap;
1734 int movop = OPC_MOVL_EvGv;
1735
1736 if (have_movbe && real_bswap) {
1737 bswap = 0;
1738 movop = OPC_MOVBE_MyGy;
1739 }
1740
1741 switch (memop & MO_SIZE) {
1742 case MO_8:
1743 /* In 32-bit mode, 8-bit stores can only happen from [abcd]x.
1744 Use the scratch register if necessary. */
1745 if (TCG_TARGET_REG_BITS == 32 && datalo >= 4) {
1746 tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
1747 datalo = scratch;
1748 }
1749 tcg_out_modrm_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
1750 datalo, base, ofs);
1751 break;
1752 case MO_16:
1753 if (bswap) {
1754 tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
1755 tcg_out_rolw_8(s, scratch);
1756 datalo = scratch;
1757 }
1758 tcg_out_modrm_offset(s, movop + P_DATA16 + seg, datalo, base, ofs);
1759 break;
1760 case MO_32:
1761 if (bswap) {
1762 tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
1763 tcg_out_bswap32(s, scratch);
1764 datalo = scratch;
1765 }
1766 tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
1767 break;
1768 case MO_64:
1769 if (TCG_TARGET_REG_BITS == 64) {
1770 if (bswap) {
1771 tcg_out_mov(s, TCG_TYPE_I64, scratch, datalo);
1772 tcg_out_bswap64(s, scratch);
1773 datalo = scratch;
1774 }
1775 tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs);
1776 } else if (bswap) {
1777 tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi);
1778 tcg_out_bswap32(s, scratch);
1779 tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base, ofs);
1780 tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
1781 tcg_out_bswap32(s, scratch);
1782 tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base, ofs+4);
1783 } else {
1784 if (real_bswap) {
1785 int t = datalo;
1786 datalo = datahi;
1787 datahi = t;
1788 }
1789 tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
1790 tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs+4);
1791 }
1792 break;
1793 default:
1794 tcg_abort();
1795 }
1796 }
1797
1798 static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
1799 {
1800 TCGReg datalo, datahi, addrlo;
1801 TCGReg addrhi __attribute__((unused));
1802 TCGMemOpIdx oi;
1803 TCGMemOp opc;
1804 #if defined(CONFIG_SOFTMMU)
1805 int mem_index;
1806 tcg_insn_unit *label_ptr[2];
1807 #endif
1808
1809 datalo = *args++;
1810 datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
1811 addrlo = *args++;
1812 addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
1813 oi = *args++;
1814 opc = get_memop(oi);
1815
1816 #if defined(CONFIG_SOFTMMU)
1817 mem_index = get_mmuidx(oi);
1818
1819 tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
1820 label_ptr, offsetof(CPUTLBEntry, addr_write));
1821
1822 /* TLB Hit. */
1823 tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, 0, 0, opc);
1824
1825 /* Record the current context of a store into ldst label */
1826 add_qemu_ldst_label(s, false, oi, datalo, datahi, addrlo, addrhi,
1827 s->code_ptr, label_ptr);
1828 #else
1829 {
1830 int32_t offset = guest_base;
1831 TCGReg base = addrlo;
1832 int seg = 0;
1833
1834 /* See comment in tcg_out_qemu_ld re zero-extension of addrlo. */
1835 if (guest_base == 0 || guest_base_flags) {
1836 seg = guest_base_flags;
1837 offset = 0;
1838 if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
1839 seg |= P_ADDR32;
1840 }
1841 } else if (TCG_TARGET_REG_BITS == 64) {
1842 /* ??? Note that we can't use the same SIB addressing scheme
1843 as for loads, since we require L0 free for bswap. */
1844 if (offset != guest_base) {
1845 if (TARGET_LONG_BITS == 32) {
1846 tcg_out_ext32u(s, TCG_REG_L0, base);
1847 base = TCG_REG_L0;
1848 }
1849 tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, guest_base);
1850 tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_L1, base);
1851 base = TCG_REG_L1;
1852 offset = 0;
1853 } else if (TARGET_LONG_BITS == 32) {
1854 tcg_out_ext32u(s, TCG_REG_L1, base);
1855 base = TCG_REG_L1;
1856 }
1857 }
1858
1859 tcg_out_qemu_st_direct(s, datalo, datahi, base, offset, seg, opc);
1860 }
1861 #endif
1862 }
1863
1864 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
1865 const TCGArg *args, const int *const_args)
1866 {
1867 TCGArg a0, a1, a2;
1868 int c, const_a2, vexop, rexw = 0;
1869
1870 #if TCG_TARGET_REG_BITS == 64
1871 # define OP_32_64(x) \
1872 case glue(glue(INDEX_op_, x), _i64): \
1873 rexw = P_REXW; /* FALLTHRU */ \
1874 case glue(glue(INDEX_op_, x), _i32)
1875 #else
1876 # define OP_32_64(x) \
1877 case glue(glue(INDEX_op_, x), _i32)
1878 #endif
1879
1880 /* Hoist the loads of the most common arguments. */
1881 a0 = args[0];
1882 a1 = args[1];
1883 a2 = args[2];
1884 const_a2 = const_args[2];
1885
1886 switch (opc) {
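/* Return from translated code: load the exit value into EAX (the call
   return register) and jump to the common epilogue at tb_ret_addr.  */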
1887 case INDEX_op_exit_tb:
1888 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
1889 tcg_out_jmp(s, tb_ret_addr);
1890 break;
1891 case INDEX_op_goto_tb:
1892 if (s->tb_jmp_insn_offset) {
1893 /* direct jump method */
1894 int gap;
1895 /* jump displacement must be aligned for atomic patching;
1896 * see if we need to add extra nops before jump
1897 */
1898 gap = tcg_pcrel_diff(s, QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4));
1899 if (gap != 1) {
1900 tcg_out_nopn(s, gap - 1);
1901 }
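/* The 32-bit displacement following the JMP opcode is now 4-byte aligned,
   so the TB linking code can patch it with a single atomic store.  */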
1902 tcg_out8(s, OPC_JMP_long); /* jmp im */
1903 s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
1904 tcg_out32(s, 0);
1905 } else {
1906 /* indirect jump method */
1907 tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
1908 (intptr_t)(s->tb_jmp_target_addr + a0));
1909 }
1910 s->tb_jmp_reset_offset[a0] = tcg_current_code_size(s);
1911 break;
1912 case INDEX_op_br:
1913 tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
1914 break;
1915 OP_32_64(ld8u):
1916 /* Note that we can ignore REXW for the zero-extend to 64-bit. */
1917 tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
1918 break;
1919 OP_32_64(ld8s):
1920 tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
1921 break;
1922 OP_32_64(ld16u):
1923 /* Note that we can ignore REXW for the zero-extend to 64-bit. */
1924 tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
1925 break;
1926 OP_32_64(ld16s):
1927 tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
1928 break;
1929 #if TCG_TARGET_REG_BITS == 64
1930 case INDEX_op_ld32u_i64:
1931 #endif
1932 case INDEX_op_ld_i32:
1933 tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
1934 break;
1935
1936 OP_32_64(st8):
1937 if (const_args[0]) {
1938 tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
1939 tcg_out8(s, a0);
1940 } else {
1941 tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
1942 }
1943 break;
1944 OP_32_64(st16):
1945 if (const_args[0]) {
1946 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
1947 tcg_out16(s, a0);
1948 } else {
1949 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
1950 }
1951 break;
1952 #if TCG_TARGET_REG_BITS == 64
1953 case INDEX_op_st32_i64:
1954 #endif
1955 case INDEX_op_st_i32:
1956 if (const_args[0]) {
1957 tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
1958 tcg_out32(s, a0);
1959 } else {
1960 tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
1961 }
1962 break;
1963
1964 OP_32_64(add):
1965 /* For 3-operand addition, use LEA. */
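/* e.g. add a0,a1,a2 becomes lea (%a1,%a2), %a0, and with a constant
   operand add a0,a1,$c becomes lea $c(%a1), %a0.  */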
1966 if (a0 != a1) {
1967 TCGArg c3 = 0;
1968 if (const_a2) {
1969 c3 = a2, a2 = -1;
1970 } else if (a0 == a2) {
1971 /* Watch out for dest = src + dest, since we've removed
1972 the matching constraint on the add. */
1973 tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
1974 break;
1975 }
1976
1977 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
1978 break;
1979 }
1980 c = ARITH_ADD;
1981 goto gen_arith;
1982 OP_32_64(sub):
1983 c = ARITH_SUB;
1984 goto gen_arith;
1985 OP_32_64(and):
1986 c = ARITH_AND;
1987 goto gen_arith;
1988 OP_32_64(or):
1989 c = ARITH_OR;
1990 goto gen_arith;
1991 OP_32_64(xor):
1992 c = ARITH_XOR;
1993 goto gen_arith;
1994 gen_arith:
1995 if (const_a2) {
1996 tgen_arithi(s, c + rexw, a0, a2, 0);
1997 } else {
1998 tgen_arithr(s, c + rexw, a0, a2);
1999 }
2000 break;
2001
2002 OP_32_64(andc):
2003 if (const_a2) {
2004 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2005 tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2006 } else {
2007 tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2008 }
2009 break;
2010
2011 OP_32_64(mul):
2012 if (const_a2) {
2013 int32_t val;
2014 val = a2;
2015 if (val == (int8_t)val) {
2016 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2017 tcg_out8(s, val);
2018 } else {
2019 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2020 tcg_out32(s, val);
2021 }
2022 } else {
2023 tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2024 }
2025 break;
2026
2027 OP_32_64(div2):
2028 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2029 break;
2030 OP_32_64(divu2):
2031 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2032 break;
2033
2034 OP_32_64(shl):
2035 /* For small constant 3-operand shift, use LEA. */
2036 if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2037 if (a2 - 1 == 0) {
2038 /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2039 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2040 } else {
2041 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2042 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2043 }
2044 break;
2045 }
2046 c = SHIFT_SHL;
2047 vexop = OPC_SHLX;
2048 goto gen_shift_maybe_vex;
2049 OP_32_64(shr):
2050 c = SHIFT_SHR;
2051 vexop = OPC_SHRX;
2052 goto gen_shift_maybe_vex;
2053 OP_32_64(sar):
2054 c = SHIFT_SAR;
2055 vexop = OPC_SARX;
2056 goto gen_shift_maybe_vex;
2057 OP_32_64(rotl):
2058 c = SHIFT_ROL;
2059 goto gen_shift;
2060 OP_32_64(rotr):
2061 c = SHIFT_ROR;
2062 goto gen_shift;
2063 gen_shift_maybe_vex:
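/* BMI2 SHLX/SHRX/SARX take the shift count from an arbitrary register,
   giving a true three-operand form; constant counts fall through to the
   classic immediate-shift encoding below.  */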
2064 if (have_bmi2) {
2065 if (!const_a2) {
2066 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2067 break;
2068 }
2069 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2070 }
2071 /* FALLTHRU */
2072 gen_shift:
2073 if (const_a2) {
2074 tcg_out_shifti(s, c + rexw, a0, a2);
2075 } else {
2076 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2077 }
2078 break;
2079
2080 OP_32_64(ctz):
2081 tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2082 break;
2083 OP_32_64(clz):
2084 tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2085 break;
2086
2087 case INDEX_op_brcond_i32:
2088 tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2089 break;
2090 case INDEX_op_setcond_i32:
2091 tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
2092 break;
2093 case INDEX_op_movcond_i32:
2094 tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
2095 break;
2096
2097 OP_32_64(bswap16):
2098 tcg_out_rolw_8(s, a0);
2099 break;
2100 OP_32_64(bswap32):
2101 tcg_out_bswap32(s, a0);
2102 break;
2103
2104 OP_32_64(neg):
2105 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2106 break;
2107 OP_32_64(not):
2108 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2109 break;
2110
2111 OP_32_64(ext8s):
2112 tcg_out_ext8s(s, a0, a1, rexw);
2113 break;
2114 OP_32_64(ext16s):
2115 tcg_out_ext16s(s, a0, a1, rexw);
2116 break;
2117 OP_32_64(ext8u):
2118 tcg_out_ext8u(s, a0, a1);
2119 break;
2120 OP_32_64(ext16u):
2121 tcg_out_ext16u(s, a0, a1);
2122 break;
2123
2124 case INDEX_op_qemu_ld_i32:
2125 tcg_out_qemu_ld(s, args, 0);
2126 break;
2127 case INDEX_op_qemu_ld_i64:
2128 tcg_out_qemu_ld(s, args, 1);
2129 break;
2130 case INDEX_op_qemu_st_i32:
2131 tcg_out_qemu_st(s, args, 0);
2132 break;
2133 case INDEX_op_qemu_st_i64:
2134 tcg_out_qemu_st(s, args, 1);
2135 break;
2136
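/* Widening multiplies use the one-operand MUL/IMUL forms: they multiply
   (E/R)AX by the given operand and leave the double-width product in
   (E/R)DX:(E/R)AX, which is what the "a"/"d" operand constraints arrange.  */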
2137 OP_32_64(mulu2):
2138 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2139 break;
2140 OP_32_64(muls2):
2141 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2142 break;
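/* Double-word add/sub: the low halves use ADD/SUB and the high halves
   ADC/SBB so that the carry/borrow propagates.  */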
2143 OP_32_64(add2):
2144 if (const_args[4]) {
2145 tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2146 } else {
2147 tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2148 }
2149 if (const_args[5]) {
2150 tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2151 } else {
2152 tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2153 }
2154 break;
2155 OP_32_64(sub2):
2156 if (const_args[4]) {
2157 tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2158 } else {
2159 tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2160 }
2161 if (const_args[5]) {
2162 tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2163 } else {
2164 tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2165 }
2166 break;
2167
2168 #if TCG_TARGET_REG_BITS == 32
2169 case INDEX_op_brcond2_i32:
2170 tcg_out_brcond2(s, args, const_args, 0);
2171 break;
2172 case INDEX_op_setcond2_i32:
2173 tcg_out_setcond2(s, args, const_args);
2174 break;
2175 #else /* TCG_TARGET_REG_BITS == 64 */
2176 case INDEX_op_ld32s_i64:
2177 tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2178 break;
2179 case INDEX_op_ld_i64:
2180 tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2181 break;
2182 case INDEX_op_st_i64:
2183 if (const_args[0]) {
2184 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2185 tcg_out32(s, a0);
2186 } else {
2187 tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2188 }
2189 break;
2190
2191 case INDEX_op_brcond_i64:
2192 tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2193 break;
2194 case INDEX_op_setcond_i64:
2195 tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
2196 break;
2197 case INDEX_op_movcond_i64:
2198 tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
2199 break;
2200
2201 case INDEX_op_bswap64_i64:
2202 tcg_out_bswap64(s, a0);
2203 break;
2204 case INDEX_op_extu_i32_i64:
2205 case INDEX_op_ext32u_i64:
2206 tcg_out_ext32u(s, a0, a1);
2207 break;
2208 case INDEX_op_ext_i32_i64:
2209 case INDEX_op_ext32s_i64:
2210 tcg_out_ext32s(s, a0, a1);
2211 break;
2212 #endif
2213
2214 OP_32_64(deposit):
2215 if (args[3] == 0 && args[4] == 8) {
2216 /* load bits 0..7 */
2217 tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2218 } else if (args[3] == 8 && args[4] == 8) {
2219 /* load bits 8..15 */
2220 tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2221 } else if (args[3] == 0 && args[4] == 16) {
2222 /* load bits 0..15 */
2223 tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2224 } else {
2225 tcg_abort();
2226 }
2227 break;
2228
2229 case INDEX_op_extract_i64:
2230 if (a2 + args[3] == 32) {
2231 /* This is a 32-bit zero-extending right shift. */
2232 tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2233 tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2234 break;
2235 }
2236 /* FALLTHRU */
2237 case INDEX_op_extract_i32:
2238 /* Use the high-byte registers when we can; otherwise emit the same
2239 ext16 + shift pattern that we would have gotten from the normal
2240 tcg-op.c expansion. */
2241 tcg_debug_assert(a2 == 8 && args[3] == 8);
2242 if (a1 < 4 && a0 < 8) {
2243 tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2244 } else {
2245 tcg_out_ext16u(s, a0, a1);
2246 tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2247 }
2248 break;
2249
2250 case INDEX_op_sextract_i32:
2251 /* We don't implement sextract_i64, as we cannot sign-extend to
2252 64-bits without using the REX prefix that explicitly excludes
2253 access to the high-byte registers. */
2254 tcg_debug_assert(a2 == 8 && args[3] == 8);
2255 if (a1 < 4 && a0 < 8) {
2256 tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2257 } else {
2258 tcg_out_ext16s(s, a0, a1, 0);
2259 tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2260 }
2261 break;
2262
2263 case INDEX_op_mb:
2264 tcg_out_mb(s, a0);
2265 break;
2266 case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
2267 case INDEX_op_mov_i64:
2268 case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi. */
2269 case INDEX_op_movi_i64:
2270 case INDEX_op_call: /* Always emitted via tcg_out_call. */
2271 default:
2272 tcg_abort();
2273 }
2274
2275 #undef OP_32_64
2276 }
2277
2278 static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
2279 {
2280 static const TCGTargetOpDef ri_r = { .args_ct_str = { "ri", "r" } };
2281 static const TCGTargetOpDef re_r = { .args_ct_str = { "re", "r" } };
2282 static const TCGTargetOpDef qi_r = { .args_ct_str = { "qi", "r" } };
2283 static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
2284 static const TCGTargetOpDef r_q = { .args_ct_str = { "r", "q" } };
2285 static const TCGTargetOpDef r_re = { .args_ct_str = { "r", "re" } };
2286 static const TCGTargetOpDef r_0 = { .args_ct_str = { "r", "0" } };
2287 static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } };
2288 static const TCGTargetOpDef r_r_re = { .args_ct_str = { "r", "r", "re" } };
2289 static const TCGTargetOpDef r_0_re = { .args_ct_str = { "r", "0", "re" } };
2290 static const TCGTargetOpDef r_0_ci = { .args_ct_str = { "r", "0", "ci" } };
2291 static const TCGTargetOpDef r_L = { .args_ct_str = { "r", "L" } };
2292 static const TCGTargetOpDef L_L = { .args_ct_str = { "L", "L" } };
2293 static const TCGTargetOpDef r_L_L = { .args_ct_str = { "r", "L", "L" } };
2294 static const TCGTargetOpDef r_r_L = { .args_ct_str = { "r", "r", "L" } };
2295 static const TCGTargetOpDef L_L_L = { .args_ct_str = { "L", "L", "L" } };
2296 static const TCGTargetOpDef r_r_L_L
2297 = { .args_ct_str = { "r", "r", "L", "L" } };
2298 static const TCGTargetOpDef L_L_L_L
2299 = { .args_ct_str = { "L", "L", "L", "L" } };
2300
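/* The constraint letters are decoded by target_parse_constraint earlier in
   this file: e.g. "r" is any register, "q" a byte-addressable register,
   "L" a register usable for qemu_ld/st arguments, "e" a sign-extended
   32-bit immediate, and "0"/"1" tie an input to the matching output.  */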
2301 switch (op) {
2302 case INDEX_op_ld8u_i32:
2303 case INDEX_op_ld8u_i64:
2304 case INDEX_op_ld8s_i32:
2305 case INDEX_op_ld8s_i64:
2306 case INDEX_op_ld16u_i32:
2307 case INDEX_op_ld16u_i64:
2308 case INDEX_op_ld16s_i32:
2309 case INDEX_op_ld16s_i64:
2310 case INDEX_op_ld_i32:
2311 case INDEX_op_ld32u_i64:
2312 case INDEX_op_ld32s_i64:
2313 case INDEX_op_ld_i64:
2314 return &r_r;
2315
2316 case INDEX_op_st8_i32:
2317 case INDEX_op_st8_i64:
2318 return &qi_r;
2319 case INDEX_op_st16_i32:
2320 case INDEX_op_st16_i64:
2321 case INDEX_op_st_i32:
2322 case INDEX_op_st32_i64:
2323 return &ri_r;
2324 case INDEX_op_st_i64:
2325 return &re_r;
2326
2327 case INDEX_op_add_i32:
2328 case INDEX_op_add_i64:
2329 return &r_r_re;
2330 case INDEX_op_sub_i32:
2331 case INDEX_op_sub_i64:
2332 case INDEX_op_mul_i32:
2333 case INDEX_op_mul_i64:
2334 case INDEX_op_or_i32:
2335 case INDEX_op_or_i64:
2336 case INDEX_op_xor_i32:
2337 case INDEX_op_xor_i64:
2338 return &r_0_re;
2339
2340 case INDEX_op_and_i32:
2341 case INDEX_op_and_i64:
2342 {
2343 static const TCGTargetOpDef and
2344 = { .args_ct_str = { "r", "0", "reZ" } };
2345 return &and;
2346 }
2347 break;
2348 case INDEX_op_andc_i32:
2349 case INDEX_op_andc_i64:
2350 {
2351 static const TCGTargetOpDef andc
2352 = { .args_ct_str = { "r", "r", "rI" } };
2353 return &andc;
2354 }
2355 break;
2356
2357 case INDEX_op_shl_i32:
2358 case INDEX_op_shl_i64:
2359 case INDEX_op_shr_i32:
2360 case INDEX_op_shr_i64:
2361 case INDEX_op_sar_i32:
2362 case INDEX_op_sar_i64:
2363 return have_bmi2 ? &r_r_ri : &r_0_ci;
2364 case INDEX_op_rotl_i32:
2365 case INDEX_op_rotl_i64:
2366 case INDEX_op_rotr_i32:
2367 case INDEX_op_rotr_i64:
2368 return &r_0_ci;
2369
2370 case INDEX_op_brcond_i32:
2371 case INDEX_op_brcond_i64:
2372 return &r_re;
2373
2374 case INDEX_op_bswap16_i32:
2375 case INDEX_op_bswap16_i64:
2376 case INDEX_op_bswap32_i32:
2377 case INDEX_op_bswap32_i64:
2378 case INDEX_op_bswap64_i64:
2379 case INDEX_op_neg_i32:
2380 case INDEX_op_neg_i64:
2381 case INDEX_op_not_i32:
2382 case INDEX_op_not_i64:
2383 return &r_0;
2384
2385 case INDEX_op_ext8s_i32:
2386 case INDEX_op_ext8s_i64:
2387 case INDEX_op_ext8u_i32:
2388 case INDEX_op_ext8u_i64:
2389 return &r_q;
2390 case INDEX_op_ext16s_i32:
2391 case INDEX_op_ext16s_i64:
2392 case INDEX_op_ext16u_i32:
2393 case INDEX_op_ext16u_i64:
2394 case INDEX_op_ext32s_i64:
2395 case INDEX_op_ext32u_i64:
2396 case INDEX_op_ext_i32_i64:
2397 case INDEX_op_extu_i32_i64:
2398 case INDEX_op_extract_i32:
2399 case INDEX_op_extract_i64:
2400 case INDEX_op_sextract_i32:
2401 return &r_r;
2402
2403 case INDEX_op_deposit_i32:
2404 case INDEX_op_deposit_i64:
2405 {
2406 static const TCGTargetOpDef dep
2407 = { .args_ct_str = { "Q", "0", "Q" } };
2408 return &dep;
2409 }
2410 case INDEX_op_setcond_i32:
2411 case INDEX_op_setcond_i64:
2412 {
2413 static const TCGTargetOpDef setc
2414 = { .args_ct_str = { "q", "r", "re" } };
2415 return &setc;
2416 }
2417 case INDEX_op_movcond_i32:
2418 case INDEX_op_movcond_i64:
2419 {
2420 static const TCGTargetOpDef movc
2421 = { .args_ct_str = { "r", "r", "re", "r", "0" } };
2422 return &movc;
2423 }
2424 case INDEX_op_div2_i32:
2425 case INDEX_op_div2_i64:
2426 case INDEX_op_divu2_i32:
2427 case INDEX_op_divu2_i64:
2428 {
2429 static const TCGTargetOpDef div2
2430 = { .args_ct_str = { "a", "d", "0", "1", "r" } };
2431 return &div2;
2432 }
2433 case INDEX_op_mulu2_i32:
2434 case INDEX_op_mulu2_i64:
2435 case INDEX_op_muls2_i32:
2436 case INDEX_op_muls2_i64:
2437 {
2438 static const TCGTargetOpDef mul2
2439 = { .args_ct_str = { "a", "d", "a", "r" } };
2440 return &mul2;
2441 }
2442 case INDEX_op_add2_i32:
2443 case INDEX_op_add2_i64:
2444 case INDEX_op_sub2_i32:
2445 case INDEX_op_sub2_i64:
2446 {
2447 static const TCGTargetOpDef arith2
2448 = { .args_ct_str = { "r", "r", "0", "1", "re", "re" } };
2449 return &arith2;
2450 }
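/* For ctz/clz without TZCNT/LZCNT we rely on BSF/BSR leaving the
   destination unmodified for a zero input, which the constraints on the
   zero-input fallback reflect; the "W" constraint additionally accepts a
   constant equal to the operand width, which is exactly what TZCNT/LZCNT
   return for zero.  */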
2451 case INDEX_op_ctz_i32:
2452 case INDEX_op_ctz_i64:
2453 {
2454 static const TCGTargetOpDef ctz[2] = {
2455 { .args_ct_str = { "r", "r", "0" } },
2456 { .args_ct_str = { "&r", "r", "rW" } },
2457 };
2458 return &ctz[have_bmi1];
2459 }
2460 case INDEX_op_clz_i32:
2461 case INDEX_op_clz_i64:
2462 {
2463 static const TCGTargetOpDef clz[2] = {
2464 { .args_ct_str = { "&r", "r", "0i" } },
2465 { .args_ct_str = { "&r", "r", "rW" } },
2466 };
2467 return &clz[have_lzcnt];
2468 }
2469
2470 case INDEX_op_qemu_ld_i32:
2471 return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_L : &r_L_L;
2472 case INDEX_op_qemu_st_i32:
2473 return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L : &L_L_L;
2474 case INDEX_op_qemu_ld_i64:
2475 return (TCG_TARGET_REG_BITS == 64 ? &r_L
2476 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_r_L
2477 : &r_r_L_L);
2478 case INDEX_op_qemu_st_i64:
2479 return (TCG_TARGET_REG_BITS == 64 ? &L_L
2480 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L_L
2481 : &L_L_L_L);
2482
2483 case INDEX_op_brcond2_i32:
2484 {
2485 static const TCGTargetOpDef b2
2486 = { .args_ct_str = { "r", "r", "ri", "ri" } };
2487 return &b2;
2488 }
2489 case INDEX_op_setcond2_i32:
2490 {
2491 static const TCGTargetOpDef s2
2492 = { .args_ct_str = { "r", "r", "r", "ri", "ri" } };
2493 return &s2;
2494 }
2495
2496 default:
2497 break;
2498 }
2499 return NULL;
2500 }
2501
2502 static int tcg_target_callee_save_regs[] = {
2503 #if TCG_TARGET_REG_BITS == 64
2504 TCG_REG_RBP,
2505 TCG_REG_RBX,
2506 #if defined(_WIN64)
2507 TCG_REG_RDI,
2508 TCG_REG_RSI,
2509 #endif
2510 TCG_REG_R12,
2511 TCG_REG_R13,
2512 TCG_REG_R14, /* Currently used for the global env. */
2513 TCG_REG_R15,
2514 #else
2515 TCG_REG_EBP, /* Currently used for the global env. */
2516 TCG_REG_EBX,
2517 TCG_REG_ESI,
2518 TCG_REG_EDI,
2519 #endif
2520 };
2521
2522 /* Compute frame size via macros, to share between tcg_target_qemu_prologue
2523 and tcg_register_jit. */
2524
2525 #define PUSH_SIZE \
2526 ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
2527 * (TCG_TARGET_REG_BITS / 8))
2528
2529 #define FRAME_SIZE \
2530 ((PUSH_SIZE \
2531 + TCG_STATIC_CALL_ARGS_SIZE \
2532 + CPU_TEMP_BUF_NLONGS * sizeof(long) \
2533 + TCG_TARGET_STACK_ALIGN - 1) \
2534 & ~(TCG_TARGET_STACK_ALIGN - 1))
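/* Worked example, assuming the usual values (an x86_64 SysV host with 6
   callee-saved registers, TCG_STATIC_CALL_ARGS_SIZE = 128,
   CPU_TEMP_BUF_NLONGS = 128, 16-byte stack alignment): PUSH_SIZE counts
   the return address plus the pushes, (1 + 6) * 8 = 56, and FRAME_SIZE is
   (56 + 128 + 128 * 8 + 15) & ~15 = 1216.  */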
2535
2536 /* Generate global QEMU prologue and epilogue code */
2537 static void tcg_target_qemu_prologue(TCGContext *s)
2538 {
2539 int i, stack_addend;
2540
2541 /* TB prologue */
2542
2543 /* Reserve some stack space, also for TCG temps. */
2544 stack_addend = FRAME_SIZE - PUSH_SIZE;
2545 tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
2546 CPU_TEMP_BUF_NLONGS * sizeof(long));
2547
2548 /* Save all callee saved registers. */
2549 for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
2550 tcg_out_push(s, tcg_target_callee_save_regs[i]);
2551 }
2552
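/* The prologue is entered as tcg_qemu_tb_exec(env, tb): on i386 both
   arguments live on the stack above the return address and the registers
   just pushed, while on x86_64 they arrive in the first two argument
   registers.  */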
2553 #if TCG_TARGET_REG_BITS == 32
2554 tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
2555 (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
2556 tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
2557 /* jmp *tb. */
2558 tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
2559 (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
2560 + stack_addend);
2561 #else
2562 tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
2563 tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
2564 /* jmp *tb. */
2565 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
2566 #endif
2567
2568 /* TB epilogue */
2569 tb_ret_addr = s->code_ptr;
2570
2571 tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
2572
2573 for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
2574 tcg_out_pop(s, tcg_target_callee_save_regs[i]);
2575 }
2576 tcg_out_opc(s, OPC_RET, 0, 0, 0);
2577
2578 #if !defined(CONFIG_SOFTMMU)
2579 /* Try to set up a segment register to point to guest_base. */
2580 if (guest_base) {
2581 setup_guest_base_seg();
2582 }
2583 #endif
2584 }
2585
2586 static void tcg_target_init(TCGContext *s)
2587 {
2588 #ifdef CONFIG_CPUID_H
2589 unsigned a, b, c, d;
2590 int max = __get_cpuid_max(0, 0);
2591
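/* Several of the have_* flags may already be compile-time macros (e.g.
   have_cmov is constant 1 on 64-bit hosts); the #ifndef guards below skip
   the runtime probe in that case.  */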
2592 if (max >= 1) {
2593 __cpuid(1, a, b, c, d);
2594 #ifndef have_cmov
2595 /* For 32-bit, 99% certainty that we're running on hardware that
2596 supports cmov, but we still need to check. In case cmov is not
2597 available, we'll use a small forward branch. */
2598 have_cmov = (d & bit_CMOV) != 0;
2599 #endif
2600 #ifndef have_movbe
2601 /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
2602 need to probe for it. */
2603 have_movbe = (c & bit_MOVBE) != 0;
2604 #endif
2605 }
2606
2607 if (max >= 7) {
2608 /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs. */
2609 __cpuid_count(7, 0, a, b, c, d);
2610 #ifdef bit_BMI
2611 have_bmi1 = (b & bit_BMI) != 0;
2612 #endif
2613 #ifndef have_bmi2
2614 have_bmi2 = (b & bit_BMI2) != 0;
2615 #endif
2616 }
2617 #endif
2618
2619 #ifndef have_lzcnt
2620 max = __get_cpuid_max(0x80000000, 0);
2621 if (max >= 0x80000001) {
2622 __cpuid(0x80000001, a, b, c, d);
2623 /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs. */
2624 have_lzcnt = (c & bit_LZCNT) != 0;
2625 }
2626 #endif
2627
2628 if (TCG_TARGET_REG_BITS == 64) {
2629 tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xffff);
2630 tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I64], 0, 0xffff);
2631 } else {
2632 tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xff);
2633 }
2634
2635 tcg_regset_clear(tcg_target_call_clobber_regs);
2636 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
2637 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
2638 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
2639 if (TCG_TARGET_REG_BITS == 64) {
2640 #if !defined(_WIN64)
2641 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
2642 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
2643 #endif
2644 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
2645 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
2646 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
2647 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
2648 }
2649
2650 tcg_regset_clear(s->reserved_regs);
2651 tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
2652 }
2653
2654 typedef struct {
2655 DebugFrameHeader h;
2656 uint8_t fde_def_cfa[4];
2657 uint8_t fde_reg_ofs[14];
2658 } DebugFrame;
2659
2660 /* We're expecting a 2 byte uleb128 encoded value. */
2661 QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
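/* Worked example: if FRAME_SIZE were 1216, the two uleb128 data bytes in
   fde_def_cfa below would be 0xc0 0x09, since (1216 & 0x7f) | 0x80 = 0xc0
   and 0x40 + (9 << 7) = 1216.  */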
2662
2663 #if !defined(__ELF__)
2664 /* Host machine without ELF. */
2665 #elif TCG_TARGET_REG_BITS == 64
2666 #define ELF_HOST_MACHINE EM_X86_64
2667 static const DebugFrame debug_frame = {
2668 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
2669 .h.cie.id = -1,
2670 .h.cie.version = 1,
2671 .h.cie.code_align = 1,
2672 .h.cie.data_align = 0x78, /* sleb128 -8 */
2673 .h.cie.return_column = 16,
2674
2675 /* Total FDE size does not include the "len" member. */
2676 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
2677
2678 .fde_def_cfa = {
2679 12, 7, /* DW_CFA_def_cfa %rsp, ... */
2680 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */
2681 (FRAME_SIZE >> 7)
2682 },
2683 .fde_reg_ofs = {
2684 0x90, 1, /* DW_CFA_offset, %rip, -8 */
2685 /* The following ordering must match tcg_target_callee_save_regs. */
2686 0x86, 2, /* DW_CFA_offset, %rbp, -16 */
2687 0x83, 3, /* DW_CFA_offset, %rbx, -24 */
2688 0x8c, 4, /* DW_CFA_offset, %r12, -32 */
2689 0x8d, 5, /* DW_CFA_offset, %r13, -40 */
2690 0x8e, 6, /* DW_CFA_offset, %r14, -48 */
2691 0x8f, 7, /* DW_CFA_offset, %r15, -56 */
2692 }
2693 };
2694 #else
2695 #define ELF_HOST_MACHINE EM_386
2696 static const DebugFrame debug_frame = {
2697 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
2698 .h.cie.id = -1,
2699 .h.cie.version = 1,
2700 .h.cie.code_align = 1,
2701 .h.cie.data_align = 0x7c, /* sleb128 -4 */
2702 .h.cie.return_column = 8,
2703
2704 /* Total FDE size does not include the "len" member. */
2705 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
2706
2707 .fde_def_cfa = {
2708 12, 4, /* DW_CFA_def_cfa %esp, ... */
2709 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */
2710 (FRAME_SIZE >> 7)
2711 },
2712 .fde_reg_ofs = {
2713 0x88, 1, /* DW_CFA_offset, %eip, -4 */
2714 /* The following ordering must match tcg_target_callee_save_regs. */
2715 0x85, 2, /* DW_CFA_offset, %ebp, -8 */
2716 0x83, 3, /* DW_CFA_offset, %ebx, -12 */
2717 0x86, 4, /* DW_CFA_offset, %esi, -16 */
2718 0x87, 5, /* DW_CFA_offset, %edi, -20 */
2719 }
2720 };
2721 #endif
2722
2723 #if defined(ELF_HOST_MACHINE)
2724 void tcg_register_jit(void *buf, size_t buf_size)
2725 {
2726 tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
2727 }
2728 #endif