1 /*
2 * Tiny Code Generator for QEMU
3 *
4 * Copyright (c) 2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25 #include "tcg-be-ldst.h"
26
27 #ifdef CONFIG_DEBUG_TCG
28 static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
29 #if TCG_TARGET_REG_BITS == 64
30 "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
31 "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
32 #else
33 "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
34 #endif
35 };
36 #endif
37
38 static const int tcg_target_reg_alloc_order[] = {
39 #if TCG_TARGET_REG_BITS == 64
40 TCG_REG_RBP,
41 TCG_REG_RBX,
42 TCG_REG_R12,
43 TCG_REG_R13,
44 TCG_REG_R14,
45 TCG_REG_R15,
46 TCG_REG_R10,
47 TCG_REG_R11,
48 TCG_REG_R9,
49 TCG_REG_R8,
50 TCG_REG_RCX,
51 TCG_REG_RDX,
52 TCG_REG_RSI,
53 TCG_REG_RDI,
54 TCG_REG_RAX,
55 #else
56 TCG_REG_EBX,
57 TCG_REG_ESI,
58 TCG_REG_EDI,
59 TCG_REG_EBP,
60 TCG_REG_ECX,
61 TCG_REG_EDX,
62 TCG_REG_EAX,
63 #endif
64 };
65
66 static const int tcg_target_call_iarg_regs[] = {
67 #if TCG_TARGET_REG_BITS == 64
68 #if defined(_WIN64)
69 TCG_REG_RCX,
70 TCG_REG_RDX,
71 #else
72 TCG_REG_RDI,
73 TCG_REG_RSI,
74 TCG_REG_RDX,
75 TCG_REG_RCX,
76 #endif
77 TCG_REG_R8,
78 TCG_REG_R9,
79 #else
80 /* 32-bit mode uses a stack-based calling convention (the GCC default). */
81 #endif
82 };
83
84 static const int tcg_target_call_oarg_regs[] = {
85 TCG_REG_EAX,
86 #if TCG_TARGET_REG_BITS == 32
87 TCG_REG_EDX
88 #endif
89 };
90
91 /* Constants we accept. */
92 #define TCG_CT_CONST_S32 0x100
93 #define TCG_CT_CONST_U32 0x200
94 #define TCG_CT_CONST_I32 0x400
95 #define TCG_CT_CONST_WSZ 0x800
96
97 /* Registers used with the L constraint, which are the first two argument
98 registers on x86_64, and two arbitrary call-clobbered registers on
99 i386. */
100 #if TCG_TARGET_REG_BITS == 64
101 # define TCG_REG_L0 tcg_target_call_iarg_regs[0]
102 # define TCG_REG_L1 tcg_target_call_iarg_regs[1]
103 #else
104 # define TCG_REG_L0 TCG_REG_EAX
105 # define TCG_REG_L1 TCG_REG_EDX
106 #endif
107
108 /* The host compiler should supply <cpuid.h> to enable runtime feature
109 detection, as we are not going to go as far as our own inline assembly.
110 If it is not available, default values will be assumed. */
111 #if defined(CONFIG_CPUID_H)
112 #include <cpuid.h>
113 #endif
114
115 /* For 32-bit, we are going to attempt to determine at runtime whether cmov
116 is available. */
117 #if TCG_TARGET_REG_BITS == 64
118 # define have_cmov 1
119 #elif defined(CONFIG_CPUID_H) && defined(bit_CMOV)
120 static bool have_cmov;
121 #else
122 # define have_cmov 0
123 #endif
124
125 /* If bit_MOVBE is defined in cpuid.h (added in GCC version 4.6), we are
126 going to attempt to determine at runtime whether movbe is available. */
127 #if defined(CONFIG_CPUID_H) && defined(bit_MOVBE)
128 static bool have_movbe;
129 #else
130 # define have_movbe 0
131 #endif
132
133 /* We need these symbols in tcg-target.h, and we can't properly conditionalize
134 them there. Therefore we always define the variables. */
135 bool have_bmi1;
136 bool have_popcnt;
137
138 #if defined(CONFIG_CPUID_H) && defined(bit_BMI2)
139 static bool have_bmi2;
140 #else
141 # define have_bmi2 0
142 #endif
143 #if defined(CONFIG_CPUID_H) && defined(bit_LZCNT)
144 static bool have_lzcnt;
145 #else
146 # define have_lzcnt 0
147 #endif
148
149 static tcg_insn_unit *tb_ret_addr;
150
151 static void patch_reloc(tcg_insn_unit *code_ptr, int type,
152 intptr_t value, intptr_t addend)
153 {
154 value += addend;
155 switch(type) {
156 case R_386_PC32:
157 value -= (uintptr_t)code_ptr;
158 if (value != (int32_t)value) {
159 tcg_abort();
160 }
161 tcg_patch32(code_ptr, value);
162 break;
163 case R_386_PC8:
164 value -= (uintptr_t)code_ptr;
165 if (value != (int8_t)value) {
166 tcg_abort();
167 }
168 tcg_patch8(code_ptr, value);
169 break;
170 default:
171 tcg_abort();
172 }
173 }
174
175 /* parse target specific constraints */
176 static const char *target_parse_constraint(TCGArgConstraint *ct,
177 const char *ct_str, TCGType type)
178 {
179 switch(*ct_str++) {
180 case 'a':
181 ct->ct |= TCG_CT_REG;
182 tcg_regset_set_reg(ct->u.regs, TCG_REG_EAX);
183 break;
184 case 'b':
185 ct->ct |= TCG_CT_REG;
186 tcg_regset_set_reg(ct->u.regs, TCG_REG_EBX);
187 break;
188 case 'c':
189 ct->ct |= TCG_CT_REG;
190 tcg_regset_set_reg(ct->u.regs, TCG_REG_ECX);
191 break;
192 case 'd':
193 ct->ct |= TCG_CT_REG;
194 tcg_regset_set_reg(ct->u.regs, TCG_REG_EDX);
195 break;
196 case 'S':
197 ct->ct |= TCG_CT_REG;
198 tcg_regset_set_reg(ct->u.regs, TCG_REG_ESI);
199 break;
200 case 'D':
201 ct->ct |= TCG_CT_REG;
202 tcg_regset_set_reg(ct->u.regs, TCG_REG_EDI);
203 break;
204 case 'q':
205 ct->ct |= TCG_CT_REG;
206 if (TCG_TARGET_REG_BITS == 64) {
207 tcg_regset_set32(ct->u.regs, 0, 0xffff);
208 } else {
209 tcg_regset_set32(ct->u.regs, 0, 0xf);
210 }
211 break;
212 case 'Q':
213 ct->ct |= TCG_CT_REG;
214 tcg_regset_set32(ct->u.regs, 0, 0xf);
215 break;
216 case 'r':
217 ct->ct |= TCG_CT_REG;
218 if (TCG_TARGET_REG_BITS == 64) {
219 tcg_regset_set32(ct->u.regs, 0, 0xffff);
220 } else {
221 tcg_regset_set32(ct->u.regs, 0, 0xff);
222 }
223 break;
224 case 'W':
225 /* With TZCNT/LZCNT, we can have operand-size as an input. */
226 ct->ct |= TCG_CT_CONST_WSZ;
227 break;
228
229 /* qemu_ld/st address constraint */
230 case 'L':
231 ct->ct |= TCG_CT_REG;
232 if (TCG_TARGET_REG_BITS == 64) {
233 tcg_regset_set32(ct->u.regs, 0, 0xffff);
234 } else {
235 tcg_regset_set32(ct->u.regs, 0, 0xff);
236 }
237 tcg_regset_reset_reg(ct->u.regs, TCG_REG_L0);
238 tcg_regset_reset_reg(ct->u.regs, TCG_REG_L1);
239 break;
240
241 case 'e':
242 ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_S32);
243 break;
244 case 'Z':
245 ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_U32);
246 break;
247 case 'I':
248 ct->ct |= (type == TCG_TYPE_I32 ? TCG_CT_CONST : TCG_CT_CONST_I32);
249 break;
250
251 default:
252 return NULL;
253 }
254 return ct_str;
255 }
256
257 /* test if a constant matches the constraint */
258 static inline int tcg_target_const_match(tcg_target_long val, TCGType type,
259 const TCGArgConstraint *arg_ct)
260 {
261 int ct = arg_ct->ct;
262 if (ct & TCG_CT_CONST) {
263 return 1;
264 }
265 if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
266 return 1;
267 }
268 if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
269 return 1;
270 }
271 if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
272 return 1;
273 }
274 if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
275 return 1;
276 }
277 return 0;
278 }
279
280 #if TCG_TARGET_REG_BITS == 64
281 # define LOWREGMASK(x) ((x) & 7)
282 #else
283 # define LOWREGMASK(x) (x)
284 #endif
285
286 #define P_EXT 0x100 /* 0x0f opcode prefix */
287 #define P_EXT38 0x200 /* 0x0f 0x38 opcode prefix */
288 #define P_DATA16 0x400 /* 0x66 opcode prefix */
289 #if TCG_TARGET_REG_BITS == 64
290 # define P_ADDR32 0x800 /* 0x67 opcode prefix */
291 # define P_REXW 0x1000 /* Set REX.W = 1 */
292 # define P_REXB_R 0x2000 /* REG field as byte register */
293 # define P_REXB_RM 0x4000 /* R/M field as byte register */
294 # define P_GS 0x8000 /* gs segment override */
295 #else
296 # define P_ADDR32 0
297 # define P_REXW 0
298 # define P_REXB_R 0
299 # define P_REXB_RM 0
300 # define P_GS 0
301 #endif
302 #define P_SIMDF3 0x10000 /* 0xf3 opcode prefix */
303 #define P_SIMDF2 0x20000 /* 0xf2 opcode prefix */
304
305 #define OPC_ARITH_EvIz (0x81)
306 #define OPC_ARITH_EvIb (0x83)
307 #define OPC_ARITH_GvEv (0x03) /* ... plus (ARITH_FOO << 3) */
308 #define OPC_ANDN (0xf2 | P_EXT38)
309 #define OPC_ADD_GvEv (OPC_ARITH_GvEv | (ARITH_ADD << 3))
310 #define OPC_BSF (0xbc | P_EXT)
311 #define OPC_BSR (0xbd | P_EXT)
312 #define OPC_BSWAP (0xc8 | P_EXT)
313 #define OPC_CALL_Jz (0xe8)
314 #define OPC_CMOVCC (0x40 | P_EXT) /* ... plus condition code */
315 #define OPC_CMP_GvEv (OPC_ARITH_GvEv | (ARITH_CMP << 3))
316 #define OPC_DEC_r32 (0x48)
317 #define OPC_IMUL_GvEv (0xaf | P_EXT)
318 #define OPC_IMUL_GvEvIb (0x6b)
319 #define OPC_IMUL_GvEvIz (0x69)
320 #define OPC_INC_r32 (0x40)
321 #define OPC_JCC_long (0x80 | P_EXT) /* ... plus condition code */
322 #define OPC_JCC_short (0x70) /* ... plus condition code */
323 #define OPC_JMP_long (0xe9)
324 #define OPC_JMP_short (0xeb)
325 #define OPC_LEA (0x8d)
326 #define OPC_LZCNT (0xbd | P_EXT | P_SIMDF3)
327 #define OPC_MOVB_EvGv (0x88) /* stores, more or less */
328 #define OPC_MOVL_EvGv (0x89) /* stores, more or less */
329 #define OPC_MOVL_GvEv (0x8b) /* loads, more or less */
330 #define OPC_MOVB_EvIz (0xc6)
331 #define OPC_MOVL_EvIz (0xc7)
332 #define OPC_MOVL_Iv (0xb8)
333 #define OPC_MOVBE_GyMy (0xf0 | P_EXT38)
334 #define OPC_MOVBE_MyGy (0xf1 | P_EXT38)
335 #define OPC_MOVSBL (0xbe | P_EXT)
336 #define OPC_MOVSWL (0xbf | P_EXT)
337 #define OPC_MOVSLQ (0x63 | P_REXW)
338 #define OPC_MOVZBL (0xb6 | P_EXT)
339 #define OPC_MOVZWL (0xb7 | P_EXT)
340 #define OPC_POP_r32 (0x58)
341 #define OPC_POPCNT (0xb8 | P_EXT | P_SIMDF3)
342 #define OPC_PUSH_r32 (0x50)
343 #define OPC_PUSH_Iv (0x68)
344 #define OPC_PUSH_Ib (0x6a)
345 #define OPC_RET (0xc3)
346 #define OPC_SETCC (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
347 #define OPC_SHIFT_1 (0xd1)
348 #define OPC_SHIFT_Ib (0xc1)
349 #define OPC_SHIFT_cl (0xd3)
350 #define OPC_SARX (0xf7 | P_EXT38 | P_SIMDF3)
351 #define OPC_SHLX (0xf7 | P_EXT38 | P_DATA16)
352 #define OPC_SHRX (0xf7 | P_EXT38 | P_SIMDF2)
353 #define OPC_TESTL (0x85)
354 #define OPC_TZCNT (0xbc | P_EXT | P_SIMDF3)
355 #define OPC_XCHG_ax_r32 (0x90)
356
357 #define OPC_GRP3_Ev (0xf7)
358 #define OPC_GRP5 (0xff)
359
360 /* Group 1 opcode extensions for 0x80-0x83.
361 These are also used as modifiers for OPC_ARITH. */
362 #define ARITH_ADD 0
363 #define ARITH_OR 1
364 #define ARITH_ADC 2
365 #define ARITH_SBB 3
366 #define ARITH_AND 4
367 #define ARITH_SUB 5
368 #define ARITH_XOR 6
369 #define ARITH_CMP 7
370
371 /* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3. */
372 #define SHIFT_ROL 0
373 #define SHIFT_ROR 1
374 #define SHIFT_SHL 4
375 #define SHIFT_SHR 5
376 #define SHIFT_SAR 7
377
378 /* Group 3 opcode extensions for 0xf6, 0xf7. To be used with OPC_GRP3. */
379 #define EXT3_NOT 2
380 #define EXT3_NEG 3
381 #define EXT3_MUL 4
382 #define EXT3_IMUL 5
383 #define EXT3_DIV 6
384 #define EXT3_IDIV 7
385
386 /* Group 5 opcode extensions for 0xff. To be used with OPC_GRP5. */
387 #define EXT5_INC_Ev 0
388 #define EXT5_DEC_Ev 1
389 #define EXT5_CALLN_Ev 2
390 #define EXT5_JMPN_Ev 4
391
392 /* Condition codes to be added to OPC_JCC_{long,short}. */
393 #define JCC_JMP (-1)
394 #define JCC_JO 0x0
395 #define JCC_JNO 0x1
396 #define JCC_JB 0x2
397 #define JCC_JAE 0x3
398 #define JCC_JE 0x4
399 #define JCC_JNE 0x5
400 #define JCC_JBE 0x6
401 #define JCC_JA 0x7
402 #define JCC_JS 0x8
403 #define JCC_JNS 0x9
404 #define JCC_JP 0xa
405 #define JCC_JNP 0xb
406 #define JCC_JL 0xc
407 #define JCC_JGE 0xd
408 #define JCC_JLE 0xe
409 #define JCC_JG 0xf
410
411 static const uint8_t tcg_cond_to_jcc[] = {
412 [TCG_COND_EQ] = JCC_JE,
413 [TCG_COND_NE] = JCC_JNE,
414 [TCG_COND_LT] = JCC_JL,
415 [TCG_COND_GE] = JCC_JGE,
416 [TCG_COND_LE] = JCC_JLE,
417 [TCG_COND_GT] = JCC_JG,
418 [TCG_COND_LTU] = JCC_JB,
419 [TCG_COND_GEU] = JCC_JAE,
420 [TCG_COND_LEU] = JCC_JBE,
421 [TCG_COND_GTU] = JCC_JA,
422 };
423
424 #if TCG_TARGET_REG_BITS == 64
425 static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
426 {
427 int rex;
428
429 if (opc & P_GS) {
430 tcg_out8(s, 0x65);
431 }
432 if (opc & P_DATA16) {
433 /* We should never be asking for both 16 and 64-bit operation. */
434 tcg_debug_assert((opc & P_REXW) == 0);
435 tcg_out8(s, 0x66);
436 }
437 if (opc & P_ADDR32) {
438 tcg_out8(s, 0x67);
439 }
440 if (opc & P_SIMDF3) {
441 tcg_out8(s, 0xf3);
442 } else if (opc & P_SIMDF2) {
443 tcg_out8(s, 0xf2);
444 }
445
446 rex = 0;
447 rex |= (opc & P_REXW) ? 0x8 : 0x0; /* REX.W */
448 rex |= (r & 8) >> 1; /* REX.R */
449 rex |= (x & 8) >> 2; /* REX.X */
450 rex |= (rm & 8) >> 3; /* REX.B */
451
452 /* P_REXB_{R,RM} indicates that the given register is the low byte.
453 For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
454 as otherwise the encoding indicates %[abcd]h. Note that the values
455 that are ORed in merely indicate that the REX byte must be present;
456 those bits get discarded in output. */
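/* For example (an illustration, not emitted verbatim here): "setne %sil"
   needs the empty REX prefix and encodes as 40 0f 95 c6, whereas
   "setne %bl" needs no prefix and encodes as 0f 95 c3. */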
457 rex |= opc & (r >= 4 ? P_REXB_R : 0);
458 rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
459
460 if (rex) {
461 tcg_out8(s, (uint8_t)(rex | 0x40));
462 }
463
464 if (opc & (P_EXT | P_EXT38)) {
465 tcg_out8(s, 0x0f);
466 if (opc & P_EXT38) {
467 tcg_out8(s, 0x38);
468 }
469 }
470
471 tcg_out8(s, opc);
472 }
473 #else
474 static void tcg_out_opc(TCGContext *s, int opc)
475 {
476 if (opc & P_DATA16) {
477 tcg_out8(s, 0x66);
478 }
479 if (opc & P_SIMDF3) {
480 tcg_out8(s, 0xf3);
481 } else if (opc & P_SIMDF2) {
482 tcg_out8(s, 0xf2);
483 }
484 if (opc & (P_EXT | P_EXT38)) {
485 tcg_out8(s, 0x0f);
486 if (opc & P_EXT38) {
487 tcg_out8(s, 0x38);
488 }
489 }
490 tcg_out8(s, opc);
491 }
492 /* Discard the register arguments to tcg_out_opc early, so as not to penalize
493 the 32-bit compilation paths. This method works with all versions of gcc,
494 whereas relying on compiler optimization to eliminate them might not. */
495 #define tcg_out_opc(s, opc, r, rm, x) (tcg_out_opc)(s, opc)
496 #endif
497
498 static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
499 {
500 tcg_out_opc(s, opc, r, rm, 0);
501 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
502 }
503
504 static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
505 {
506 int tmp;
507
508 if ((opc & (P_REXW | P_EXT | P_EXT38)) || (rm & 8)) {
509 /* Three byte VEX prefix. */
510 tcg_out8(s, 0xc4);
511
512 /* VEX.m-mmmm */
513 if (opc & P_EXT38) {
514 tmp = 2;
515 } else if (opc & P_EXT) {
516 tmp = 1;
517 } else {
518 tcg_abort();
519 }
520 tmp |= 0x40; /* VEX.X */
521 tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */
522 tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */
523 tcg_out8(s, tmp);
524
525 tmp = (opc & P_REXW ? 0x80 : 0); /* VEX.W */
526 } else {
527 /* Two byte VEX prefix. */
528 tcg_out8(s, 0xc5);
529
530 tmp = (r & 8 ? 0 : 0x80); /* VEX.R */
531 }
532 /* VEX.pp */
533 if (opc & P_DATA16) {
534 tmp |= 1; /* 0x66 */
535 } else if (opc & P_SIMDF3) {
536 tmp |= 2; /* 0xf3 */
537 } else if (opc & P_SIMDF2) {
538 tmp |= 3; /* 0xf2 */
539 }
540 tmp |= (~v & 15) << 3; /* VEX.vvvv */
541 tcg_out8(s, tmp);
542 tcg_out8(s, opc);
543 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
544 }
545
546 /* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
547 We handle either RM or INDEX missing with a negative value. In 64-bit
548 mode for absolute addresses, ~RM is the size of the immediate operand
549 that will follow the instruction. */
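/* A worked illustration (assumed values, not from the original source):
   with opc = OPC_MOVL_GvEv, r = %eax, rm = %rbp, index = %rsi, shift = 2
   and offset = 0x10, this emits "movl 0x10(%rbp,%rsi,4), %eax" as
   8b 44 b5 10 (ModRM 0x44, SIB 0xb5, disp8 0x10). */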
550
551 static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
552 int index, int shift, intptr_t offset)
553 {
554 int mod, len;
555
556 if (index < 0 && rm < 0) {
557 if (TCG_TARGET_REG_BITS == 64) {
558 /* Try for a rip-relative addressing mode. This has replaced
559 the 32-bit-mode absolute addressing encoding. */
560 intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
561 intptr_t disp = offset - pc;
562 if (disp == (int32_t)disp) {
563 tcg_out_opc(s, opc, r, 0, 0);
564 tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
565 tcg_out32(s, disp);
566 return;
567 }
568
569 /* Try for an absolute address encoding. This requires the
570 use of the MODRM+SIB encoding and is therefore larger than
571 rip-relative addressing. */
572 if (offset == (int32_t)offset) {
573 tcg_out_opc(s, opc, r, 0, 0);
574 tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
575 tcg_out8(s, (4 << 3) | 5);
576 tcg_out32(s, offset);
577 return;
578 }
579
580 /* ??? The memory isn't directly addressable. */
581 tcg_abort();
582 } else {
583 /* Absolute address. */
584 tcg_out_opc(s, opc, r, 0, 0);
585 tcg_out8(s, (r << 3) | 5);
586 tcg_out32(s, offset);
587 return;
588 }
589 }
590
591 /* Find the length of the immediate addend. Note that the encoding
592 that would be used for (%ebp) indicates absolute addressing. */
593 if (rm < 0) {
594 mod = 0, len = 4, rm = 5;
595 } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
596 mod = 0, len = 0;
597 } else if (offset == (int8_t)offset) {
598 mod = 0x40, len = 1;
599 } else {
600 mod = 0x80, len = 4;
601 }
602
603 /* Use a single byte MODRM format if possible. Note that the encoding
604 that would be used for %esp is the escape to the two byte form. */
605 if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
606 /* Single byte MODRM format. */
607 tcg_out_opc(s, opc, r, rm, 0);
608 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
609 } else {
610 /* Two byte MODRM+SIB format. */
611
612 /* Note that the encoding that would place %esp into the index
613 field indicates no index register. In 64-bit mode, the REX.X
614 bit counts, so %r12 can be used as the index. */
615 if (index < 0) {
616 index = 4;
617 } else {
618 tcg_debug_assert(index != TCG_REG_ESP);
619 }
620
621 tcg_out_opc(s, opc, r, rm, index);
622 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
623 tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
624 }
625
626 if (len == 1) {
627 tcg_out8(s, offset);
628 } else if (len == 4) {
629 tcg_out32(s, offset);
630 }
631 }
632
633 /* A simplification of the above with no index or shift. */
634 static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
635 int rm, intptr_t offset)
636 {
637 tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
638 }
639
640 /* Generate dest op= src. Uses the same ARITH_* codes as tgen_arithi. */
641 static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
642 {
643 /* Propagate an opcode prefix, such as P_REXW. */
644 int ext = subop & ~0x7;
645 subop &= 0x7;
646
647 tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
648 }
649
650 static inline void tcg_out_mov(TCGContext *s, TCGType type,
651 TCGReg ret, TCGReg arg)
652 {
653 if (arg != ret) {
654 int opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0);
655 tcg_out_modrm(s, opc, ret, arg);
656 }
657 }
658
659 static void tcg_out_movi(TCGContext *s, TCGType type,
660 TCGReg ret, tcg_target_long arg)
661 {
662 tcg_target_long diff;
663
664 if (arg == 0) {
665 tgen_arithr(s, ARITH_XOR, ret, ret);
666 return;
667 }
668 if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
669 tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
670 tcg_out32(s, arg);
671 return;
672 }
673 if (arg == (int32_t)arg) {
674 tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
675 tcg_out32(s, arg);
676 return;
677 }
678
679 /* Try a 7 byte pc-relative lea before the 10 byte movq. */
680 diff = arg - ((uintptr_t)s->code_ptr + 7);
681 if (diff == (int32_t)diff) {
682 tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
683 tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
684 tcg_out32(s, diff);
685 return;
686 }
687
688 tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
689 tcg_out64(s, arg);
690 }
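/* Size summary of the cases above, for reference: the xor is 2-3 bytes,
   "movl $imm32, r" is 5 bytes, the sign-extended "movq $imm32, r" is
   7 bytes, the rip-relative lea is 7 bytes, and the full movabs is
   10 bytes. */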
691
692 static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
693 {
694 if (val == (int8_t)val) {
695 tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
696 tcg_out8(s, val);
697 } else if (val == (int32_t)val) {
698 tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
699 tcg_out32(s, val);
700 } else {
701 tcg_abort();
702 }
703 }
704
705 static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
706 {
707 /* Given the strength of x86 memory ordering, we only need to care about
708 store-load ordering. Experimentally, "lock orl $0,0(%esp)" is
709 faster than "mfence", so don't bother with the sse insn. */
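/* For reference, that sequence assembles to f0 83 0c 24 00,
   i.e. "lock orl $0, (%esp)". */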
710 if (a0 & TCG_MO_ST_LD) {
711 tcg_out8(s, 0xf0);
712 tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
713 tcg_out8(s, 0);
714 }
715 }
716
717 static inline void tcg_out_push(TCGContext *s, int reg)
718 {
719 tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
720 }
721
722 static inline void tcg_out_pop(TCGContext *s, int reg)
723 {
724 tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
725 }
726
727 static inline void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
728 TCGReg arg1, intptr_t arg2)
729 {
730 int opc = OPC_MOVL_GvEv + (type == TCG_TYPE_I64 ? P_REXW : 0);
731 tcg_out_modrm_offset(s, opc, ret, arg1, arg2);
732 }
733
734 static inline void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
735 TCGReg arg1, intptr_t arg2)
736 {
737 int opc = OPC_MOVL_EvGv + (type == TCG_TYPE_I64 ? P_REXW : 0);
738 tcg_out_modrm_offset(s, opc, arg, arg1, arg2);
739 }
740
741 static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
742 TCGReg base, intptr_t ofs)
743 {
744 int rexw = 0;
745 if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
746 if (val != (int32_t)val) {
747 return false;
748 }
749 rexw = P_REXW;
750 }
751 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
752 tcg_out32(s, val);
753 return true;
754 }
755
756 static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
757 {
758 /* Propagate an opcode prefix, such as P_DATA16. */
759 int ext = subopc & ~0x7;
760 subopc &= 0x7;
761
762 if (count == 1) {
763 tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
764 } else {
765 tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
766 tcg_out8(s, count);
767 }
768 }
769
770 static inline void tcg_out_bswap32(TCGContext *s, int reg)
771 {
772 tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
773 }
774
775 static inline void tcg_out_rolw_8(TCGContext *s, int reg)
776 {
777 tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
778 }
779
780 static inline void tcg_out_ext8u(TCGContext *s, int dest, int src)
781 {
782 /* movzbl */
783 tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
784 tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
785 }
786
787 static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw)
788 {
789 /* movsbl */
790 tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
791 tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
792 }
793
794 static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
795 {
796 /* movzwl */
797 tcg_out_modrm(s, OPC_MOVZWL, dest, src);
798 }
799
800 static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw)
801 {
802 /* movsw[lq] */
803 tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
804 }
805
806 static inline void tcg_out_ext32u(TCGContext *s, int dest, int src)
807 {
808 /* 32-bit mov zero extends. */
809 tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
810 }
811
812 static inline void tcg_out_ext32s(TCGContext *s, int dest, int src)
813 {
814 tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
815 }
816
817 static inline void tcg_out_bswap64(TCGContext *s, int reg)
818 {
819 tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
820 }
821
822 static void tgen_arithi(TCGContext *s, int c, int r0,
823 tcg_target_long val, int cf)
824 {
825 int rexw = 0;
826
827 if (TCG_TARGET_REG_BITS == 64) {
828 rexw = c & -8;
829 c &= 7;
830 }
831
832 /* ??? While INC and DEC are 2 bytes shorter than ADDL/SUBL $1, they also
833 induce partial-flags-update stalls on the Pentium 4 and are not
834 recommended by current Intel optimization manuals. */
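/* E.g. in 32-bit mode "incl %eax" is 40 (one byte) while
   "addl $1, %eax" is 83 c0 01 (three bytes). */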
835 if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
836 int is_inc = (c == ARITH_ADD) ^ (val < 0);
837 if (TCG_TARGET_REG_BITS == 64) {
838 /* The single-byte increment encodings are re-tasked as the
839 REX prefixes. Use the MODRM encoding. */
840 tcg_out_modrm(s, OPC_GRP5 + rexw,
841 (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
842 } else {
843 tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
844 }
845 return;
846 }
847
848 if (c == ARITH_AND) {
849 if (TCG_TARGET_REG_BITS == 64) {
850 if (val == 0xffffffffu) {
851 tcg_out_ext32u(s, r0, r0);
852 return;
853 }
854 if (val == (uint32_t)val) {
855 /* AND with no high bits set can use a 32-bit operation. */
856 rexw = 0;
857 }
858 }
859 if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
860 tcg_out_ext8u(s, r0, r0);
861 return;
862 }
863 if (val == 0xffffu) {
864 tcg_out_ext16u(s, r0, r0);
865 return;
866 }
867 }
868
869 if (val == (int8_t)val) {
870 tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
871 tcg_out8(s, val);
872 return;
873 }
874 if (rexw == 0 || val == (int32_t)val) {
875 tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
876 tcg_out32(s, val);
877 return;
878 }
879
880 tcg_abort();
881 }
882
883 static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
884 {
885 if (val != 0) {
886 tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
887 }
888 }
889
890 /* Use SMALL != 0 to force a short forward branch. */
891 static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
892 {
893 int32_t val, val1;
894
895 if (l->has_value) {
896 val = tcg_pcrel_diff(s, l->u.value_ptr);
897 val1 = val - 2;
898 if ((int8_t)val1 == val1) {
899 if (opc == -1) {
900 tcg_out8(s, OPC_JMP_short);
901 } else {
902 tcg_out8(s, OPC_JCC_short + opc);
903 }
904 tcg_out8(s, val1);
905 } else {
906 if (small) {
907 tcg_abort();
908 }
909 if (opc == -1) {
910 tcg_out8(s, OPC_JMP_long);
911 tcg_out32(s, val - 5);
912 } else {
913 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
914 tcg_out32(s, val - 6);
915 }
916 }
917 } else if (small) {
918 if (opc == -1) {
919 tcg_out8(s, OPC_JMP_short);
920 } else {
921 tcg_out8(s, OPC_JCC_short + opc);
922 }
923 tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
924 s->code_ptr += 1;
925 } else {
926 if (opc == -1) {
927 tcg_out8(s, OPC_JMP_long);
928 } else {
929 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
930 }
931 tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
932 s->code_ptr += 4;
933 }
934 }
935
936 static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
937 int const_arg2, int rexw)
938 {
939 if (const_arg2) {
940 if (arg2 == 0) {
941 /* test r, r */
942 tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
943 } else {
944 tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
945 }
946 } else {
947 tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
948 }
949 }
950
951 static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
952 TCGArg arg1, TCGArg arg2, int const_arg2,
953 TCGLabel *label, int small)
954 {
955 tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
956 tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
957 }
958
959 #if TCG_TARGET_REG_BITS == 64
960 static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
961 TCGArg arg1, TCGArg arg2, int const_arg2,
962 TCGLabel *label, int small)
963 {
964 tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
965 tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
966 }
967 #else
968 /* XXX: we implement it at the target level to avoid having to
969 handle temporaries that live across basic blocks. */
970 static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
971 const int *const_args, int small)
972 {
973 TCGLabel *label_next = gen_new_label();
974 TCGLabel *label_this = arg_label(args[5]);
975
976 switch(args[4]) {
977 case TCG_COND_EQ:
978 tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
979 label_next, 1);
980 tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
981 label_this, small);
982 break;
983 case TCG_COND_NE:
984 tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
985 label_this, small);
986 tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
987 label_this, small);
988 break;
989 case TCG_COND_LT:
990 tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
991 label_this, small);
992 tcg_out_jxx(s, JCC_JNE, label_next, 1);
993 tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
994 label_this, small);
995 break;
996 case TCG_COND_LE:
997 tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
998 label_this, small);
999 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1000 tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1001 label_this, small);
1002 break;
1003 case TCG_COND_GT:
1004 tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1005 label_this, small);
1006 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1007 tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1008 label_this, small);
1009 break;
1010 case TCG_COND_GE:
1011 tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1012 label_this, small);
1013 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1014 tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1015 label_this, small);
1016 break;
1017 case TCG_COND_LTU:
1018 tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1019 label_this, small);
1020 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1021 tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1022 label_this, small);
1023 break;
1024 case TCG_COND_LEU:
1025 tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1026 label_this, small);
1027 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1028 tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1029 label_this, small);
1030 break;
1031 case TCG_COND_GTU:
1032 tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1033 label_this, small);
1034 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1035 tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1036 label_this, small);
1037 break;
1038 case TCG_COND_GEU:
1039 tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1040 label_this, small);
1041 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1042 tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1043 label_this, small);
1044 break;
1045 default:
1046 tcg_abort();
1047 }
1048 tcg_out_label(s, label_next, s->code_ptr);
1049 }
1050 #endif
1051
1052 static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
1053 TCGArg arg1, TCGArg arg2, int const_arg2)
1054 {
1055 tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1056 tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1057 tcg_out_ext8u(s, dest, dest);
1058 }
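/* E.g. setcond_i32 dest=%eax, arg1=%ebx, arg2=%ecx, cond=EQ becomes
   "cmpl %ecx, %ebx; sete %al; movzbl %al, %eax" (an illustration,
   assuming a register arg2). */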
1059
1060 #if TCG_TARGET_REG_BITS == 64
1061 static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
1062 TCGArg arg1, TCGArg arg2, int const_arg2)
1063 {
1064 tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1065 tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1066 tcg_out_ext8u(s, dest, dest);
1067 }
1068 #else
1069 static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1070 const int *const_args)
1071 {
1072 TCGArg new_args[6];
1073 TCGLabel *label_true, *label_over;
1074
1075 memcpy(new_args, args+1, 5*sizeof(TCGArg));
1076
1077 if (args[0] == args[1] || args[0] == args[2]
1078 || (!const_args[3] && args[0] == args[3])
1079 || (!const_args[4] && args[0] == args[4])) {
1080 /* When the destination overlaps with one of the argument
1081 registers, don't do anything tricky. */
1082 label_true = gen_new_label();
1083 label_over = gen_new_label();
1084
1085 new_args[5] = label_arg(label_true);
1086 tcg_out_brcond2(s, new_args, const_args+1, 1);
1087
1088 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1089 tcg_out_jxx(s, JCC_JMP, label_over, 1);
1090 tcg_out_label(s, label_true, s->code_ptr);
1091
1092 tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1093 tcg_out_label(s, label_over, s->code_ptr);
1094 } else {
1095 /* When the destination does not overlap one of the arguments,
1096 clear the destination first, jump if cond false, and emit an
1097 increment in the true case. This results in smaller code. */
1098
1099 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1100
1101 label_over = gen_new_label();
1102 new_args[4] = tcg_invert_cond(new_args[4]);
1103 new_args[5] = label_arg(label_over);
1104 tcg_out_brcond2(s, new_args, const_args+1, 1);
1105
1106 tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1107 tcg_out_label(s, label_over, s->code_ptr);
1108 }
1109 }
1110 #endif
1111
1112 static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1113 TCGReg dest, TCGReg v1)
1114 {
1115 if (have_cmov) {
1116 tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1117 } else {
1118 TCGLabel *over = gen_new_label();
1119 tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1120 tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1121 tcg_out_label(s, over, s->code_ptr);
1122 }
1123 }
1124
1125 static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
1126 TCGReg c1, TCGArg c2, int const_c2,
1127 TCGReg v1)
1128 {
1129 tcg_out_cmp(s, c1, c2, const_c2, 0);
1130 tcg_out_cmov(s, cond, 0, dest, v1);
1131 }
1132
1133 #if TCG_TARGET_REG_BITS == 64
1134 static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
1135 TCGReg c1, TCGArg c2, int const_c2,
1136 TCGReg v1)
1137 {
1138 tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
1139 tcg_out_cmov(s, cond, P_REXW, dest, v1);
1140 }
1141 #endif
1142
1143 static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1144 TCGArg arg2, bool const_a2)
1145 {
1146 if (const_a2) {
1147 tcg_debug_assert(have_bmi1);
1148 tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1149 tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1150 } else {
1151 /* ??? The manual says that the output is undefined when the
1152 input is zero, but real hardware leaves it unchanged. As
1153 noted in target-i386/translate.c, real programs depend on
1154 this -- now we are one more of those. */
1155 tcg_debug_assert(dest == arg2);
1156 tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1157 }
1158 }
1159
1160 static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1161 TCGArg arg2, bool const_a2)
1162 {
1163 if (have_lzcnt) {
1164 tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1165 if (const_a2) {
1166 tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1167 } else {
1168 tcg_debug_assert(dest != arg2);
1169 /* LZCNT sets C if the input was zero. */
1170 tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1171 }
1172 } else {
1173 TCGType type = rexw ? TCG_TYPE_I64: TCG_TYPE_I32;
1174 TCGArg rev = rexw ? 63 : 31;
1175
1176 /* Recall that the output of BSR is the index, not the count.
1177 Therefore we must adjust the result by ^ (SIZE-1). In some
1178 cases below, we prefer an extra XOR to a JMP. */
1179 /* ??? See the comment in tcg_out_ctz re BSF. */
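/* In outline, the register fallback below (dest == arg2, rexw == 0) does:
   dest ^= 31; if (arg1 != 0) dest = bsr(arg1); dest ^= 31;
   which yields 31 - bsr(arg1) == clz(arg1) for non-zero input, and the
   original arg2 otherwise (relying on BSR leaving dest unchanged). */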
1180 if (const_a2) {
1181 tcg_debug_assert(dest != arg1);
1182 tcg_out_movi(s, type, dest, arg2 ^ rev);
1183 } else {
1184 tcg_debug_assert(dest == arg2);
1185 tgen_arithi(s, ARITH_XOR + rexw, dest, rev, 0);
1186 }
1187 tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1188 tgen_arithi(s, ARITH_XOR + rexw, dest, rev, 0);
1189 }
1190 }
1191
1192 static void tcg_out_branch(TCGContext *s, int call, tcg_insn_unit *dest)
1193 {
1194 intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1195
1196 if (disp == (int32_t)disp) {
1197 tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1198 tcg_out32(s, disp);
1199 } else {
1200 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_R10, (uintptr_t)dest);
1201 tcg_out_modrm(s, OPC_GRP5,
1202 call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev, TCG_REG_R10);
1203 }
1204 }
1205
1206 static inline void tcg_out_call(TCGContext *s, tcg_insn_unit *dest)
1207 {
1208 tcg_out_branch(s, 1, dest);
1209 }
1210
1211 static void tcg_out_jmp(TCGContext *s, tcg_insn_unit *dest)
1212 {
1213 tcg_out_branch(s, 0, dest);
1214 }
1215
1216 static void tcg_out_nopn(TCGContext *s, int n)
1217 {
1218 int i;
1219 /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1220 * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1221 * duplicate prefix, and all of the interesting recent cores can
1222 * decode and discard the duplicates in a single cycle.
1223 */
1224 tcg_debug_assert(n >= 1);
1225 for (i = 1; i < n; ++i) {
1226 tcg_out8(s, 0x66);
1227 }
1228 tcg_out8(s, 0x90);
1229 }
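/* E.g. tcg_out_nopn(s, 3) emits 66 66 90, i.e. "xchg %ax, %ax" with one
   redundant prefix. */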
1230
1231 #if defined(CONFIG_SOFTMMU)
1232 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
1233 * TCGMemOpIdx oi, uintptr_t ra)
1234 */
1235 static void * const qemu_ld_helpers[16] = {
1236 [MO_UB] = helper_ret_ldub_mmu,
1237 [MO_LEUW] = helper_le_lduw_mmu,
1238 [MO_LEUL] = helper_le_ldul_mmu,
1239 [MO_LEQ] = helper_le_ldq_mmu,
1240 [MO_BEUW] = helper_be_lduw_mmu,
1241 [MO_BEUL] = helper_be_ldul_mmu,
1242 [MO_BEQ] = helper_be_ldq_mmu,
1243 };
1244
1245 /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
1246 * uintxx_t val, TCGMemOpIdx oi, uintptr_t ra)
1247 */
1248 static void * const qemu_st_helpers[16] = {
1249 [MO_UB] = helper_ret_stb_mmu,
1250 [MO_LEUW] = helper_le_stw_mmu,
1251 [MO_LEUL] = helper_le_stl_mmu,
1252 [MO_LEQ] = helper_le_stq_mmu,
1253 [MO_BEUW] = helper_be_stw_mmu,
1254 [MO_BEUL] = helper_be_stl_mmu,
1255 [MO_BEQ] = helper_be_stq_mmu,
1256 };
1257
1258 /* Perform the TLB load and compare.
1259
1260 Inputs:
1261 ADDRLO and ADDRHI contain the low and high part of the address.
1262
1263 MEM_INDEX and S_BITS are the memory context and log2 size of the load.
1264
1265 WHICH is the offset into the CPUTLBEntry structure of the slot to read.
1266 This should be offsetof addr_read or addr_write.
1267
1268 Outputs:
1269 LABEL_PTRS is filled with the positions of the displacements of the forward
1270 jumps to the TLB miss case: one for 32-bit guest addresses, two for 64-bit.
1271
1272 Second argument register is loaded with the low part of the address.
1273 In the TLB hit case, it has been adjusted as indicated by the TLB
1274 and so is a host address. In the TLB miss case, it continues to
1275 hold a guest address.
1276
1277 First argument register is clobbered. */
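/* Roughly, the fast path below computes (a sketch, not the exact insns):
   index = (addrlo >> TARGET_PAGE_BITS) & (CPU_TLB_SIZE - 1);
   entry = &env->tlb_table[mem_index][index];
   cmp   = (addrlo + s_mask - a_mask) & (TARGET_PAGE_MASK | a_mask);
   if (cmp != entry->addr_{read,write}) goto slow_path;
   r1 = addrlo + entry->addend, which is the host address on a TLB hit. */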
1278
1279 static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
1280 int mem_index, TCGMemOp opc,
1281 tcg_insn_unit **label_ptr, int which)
1282 {
1283 const TCGReg r0 = TCG_REG_L0;
1284 const TCGReg r1 = TCG_REG_L1;
1285 TCGType ttype = TCG_TYPE_I32;
1286 TCGType tlbtype = TCG_TYPE_I32;
1287 int trexw = 0, hrexw = 0, tlbrexw = 0;
1288 unsigned a_bits = get_alignment_bits(opc);
1289 unsigned s_bits = opc & MO_SIZE;
1290 unsigned a_mask = (1 << a_bits) - 1;
1291 unsigned s_mask = (1 << s_bits) - 1;
1292 target_ulong tlb_mask;
1293
1294 if (TCG_TARGET_REG_BITS == 64) {
1295 if (TARGET_LONG_BITS == 64) {
1296 ttype = TCG_TYPE_I64;
1297 trexw = P_REXW;
1298 }
1299 if (TCG_TYPE_PTR == TCG_TYPE_I64) {
1300 hrexw = P_REXW;
1301 if (TARGET_PAGE_BITS + CPU_TLB_BITS > 32) {
1302 tlbtype = TCG_TYPE_I64;
1303 tlbrexw = P_REXW;
1304 }
1305 }
1306 }
1307
1308 tcg_out_mov(s, tlbtype, r0, addrlo);
1309 /* If the required alignment is at least as large as the access, simply
1310 copy the address and mask. For lesser alignments, check that we don't
1311 cross pages for the complete access. */
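/* E.g. for a byte-aligned 4-byte load (a_bits = 0, s_bits = 2), r1 holds
   addrlo + 3, so an access that straddles a page boundary fails the
   compare below and takes the slow path.  The a_mask bits kept in
   tlb_mask likewise send misaligned accesses down the slow path when
   alignment is required. */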
1312 if (a_bits >= s_bits) {
1313 tcg_out_mov(s, ttype, r1, addrlo);
1314 } else {
1315 tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
1316 }
1317 tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
1318
1319 tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
1320 TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
1321
1322 tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);
1323 tgen_arithi(s, ARITH_AND + tlbrexw, r0,
1324 (CPU_TLB_SIZE - 1) << CPU_TLB_ENTRY_BITS, 0);
1325
1326 tcg_out_modrm_sib_offset(s, OPC_LEA + hrexw, r0, TCG_AREG0, r0, 0,
1327 offsetof(CPUArchState, tlb_table[mem_index][0])
1328 + which);
1329
1330 /* cmp 0(r0), r1 */
1331 tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, 0);
1332
1333 /* Prepare for both the fast path add of the tlb addend, and the slow
1334 path function argument setup. There are two cases worth noting:
1335 For 32-bit guest and x86_64 host, MOVL zero-extends the guest address
1336 before the fastpath ADDQ below. For 64-bit guest and x32 host, MOVQ
1337 copies the entire guest address for the slow path, while truncation
1338 for the 32-bit host happens with the fastpath ADDL below. */
1339 tcg_out_mov(s, ttype, r1, addrlo);
1340
1341 /* jne slow_path */
1342 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1343 label_ptr[0] = s->code_ptr;
1344 s->code_ptr += 4;
1345
1346 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1347 /* cmp 4(r0), addrhi */
1348 tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, 4);
1349
1350 /* jne slow_path */
1351 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1352 label_ptr[1] = s->code_ptr;
1353 s->code_ptr += 4;
1354 }
1355
1356 /* TLB Hit. */
1357
1358 /* add addend(r0), r1 */
1359 tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
1360 offsetof(CPUTLBEntry, addend) - which);
1361 }
1362
1363 /*
1364 * Record the context of a call to the out-of-line helper code for the slow
1365 * path of a load or store, so that we can later generate the correct helper code.
1366 */
1367 static void add_qemu_ldst_label(TCGContext *s, bool is_ld, TCGMemOpIdx oi,
1368 TCGReg datalo, TCGReg datahi,
1369 TCGReg addrlo, TCGReg addrhi,
1370 tcg_insn_unit *raddr,
1371 tcg_insn_unit **label_ptr)
1372 {
1373 TCGLabelQemuLdst *label = new_ldst_label(s);
1374
1375 label->is_ld = is_ld;
1376 label->oi = oi;
1377 label->datalo_reg = datalo;
1378 label->datahi_reg = datahi;
1379 label->addrlo_reg = addrlo;
1380 label->addrhi_reg = addrhi;
1381 label->raddr = raddr;
1382 label->label_ptr[0] = label_ptr[0];
1383 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1384 label->label_ptr[1] = label_ptr[1];
1385 }
1386 }
1387
1388 /*
1389 * Generate code for the slow path for a load at the end of the block.
1390 */
1391 static void tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1392 {
1393 TCGMemOpIdx oi = l->oi;
1394 TCGMemOp opc = get_memop(oi);
1395 TCGReg data_reg;
1396 tcg_insn_unit **label_ptr = &l->label_ptr[0];
1397
1398 /* resolve label address */
1399 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1400 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1401 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1402 }
1403
1404 if (TCG_TARGET_REG_BITS == 32) {
1405 int ofs = 0;
1406
1407 tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1408 ofs += 4;
1409
1410 tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1411 ofs += 4;
1412
1413 if (TARGET_LONG_BITS == 64) {
1414 tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1415 ofs += 4;
1416 }
1417
1418 tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1419 ofs += 4;
1420
1421 tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
1422 } else {
1423 tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1424 /* The second argument is already loaded with addrlo. */
1425 tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
1426 tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
1427 (uintptr_t)l->raddr);
1428 }
1429
1430 tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1431
1432 data_reg = l->datalo_reg;
1433 switch (opc & MO_SSIZE) {
1434 case MO_SB:
1435 tcg_out_ext8s(s, data_reg, TCG_REG_EAX, P_REXW);
1436 break;
1437 case MO_SW:
1438 tcg_out_ext16s(s, data_reg, TCG_REG_EAX, P_REXW);
1439 break;
1440 #if TCG_TARGET_REG_BITS == 64
1441 case MO_SL:
1442 tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
1443 break;
1444 #endif
1445 case MO_UB:
1446 case MO_UW:
1447 /* Note that the helpers have zero-extended to tcg_target_long. */
1448 case MO_UL:
1449 tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1450 break;
1451 case MO_Q:
1452 if (TCG_TARGET_REG_BITS == 64) {
1453 tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
1454 } else if (data_reg == TCG_REG_EDX) {
1455 /* xchg %edx, %eax */
1456 tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
1457 tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
1458 } else {
1459 tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1460 tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
1461 }
1462 break;
1463 default:
1464 tcg_abort();
1465 }
1466
1467 /* Jump to the code following the qemu_ld. */
1468 tcg_out_jmp(s, l->raddr);
1469 }
1470
1471 /*
1472 * Generate code for the slow path for a store at the end of the block.
1473 */
1474 static void tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1475 {
1476 TCGMemOpIdx oi = l->oi;
1477 TCGMemOp opc = get_memop(oi);
1478 TCGMemOp s_bits = opc & MO_SIZE;
1479 tcg_insn_unit **label_ptr = &l->label_ptr[0];
1480 TCGReg retaddr;
1481
1482 /* resolve label address */
1483 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1484 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1485 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1486 }
1487
1488 if (TCG_TARGET_REG_BITS == 32) {
1489 int ofs = 0;
1490
1491 tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1492 ofs += 4;
1493
1494 tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1495 ofs += 4;
1496
1497 if (TARGET_LONG_BITS == 64) {
1498 tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1499 ofs += 4;
1500 }
1501
1502 tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
1503 ofs += 4;
1504
1505 if (s_bits == MO_64) {
1506 tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
1507 ofs += 4;
1508 }
1509
1510 tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1511 ofs += 4;
1512
1513 retaddr = TCG_REG_EAX;
1514 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1515 tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
1516 } else {
1517 tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1518 /* The second argument is already loaded with addrlo. */
1519 tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
1520 tcg_target_call_iarg_regs[2], l->datalo_reg);
1521 tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
1522
1523 if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
1524 retaddr = tcg_target_call_iarg_regs[4];
1525 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1526 } else {
1527 retaddr = TCG_REG_RAX;
1528 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1529 tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP,
1530 TCG_TARGET_CALL_STACK_OFFSET);
1531 }
1532 }
1533
1534 /* "Tail call" to the helper, with the return address back inline. */
1535 tcg_out_push(s, retaddr);
1536 tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1537 }
1538 #elif defined(__x86_64__) && defined(__linux__)
1539 # include <asm/prctl.h>
1540 # include <sys/prctl.h>
1541
1542 int arch_prctl(int code, unsigned long addr);
1543
1544 static int guest_base_flags;
1545 static inline void setup_guest_base_seg(void)
1546 {
1547 if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
1548 guest_base_flags = P_GS;
1549 }
1550 }
1551 #else
1552 # define guest_base_flags 0
1553 static inline void setup_guest_base_seg(void) { }
1554 #endif /* SOFTMMU */
1555
1556 static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
1557 TCGReg base, int index, intptr_t ofs,
1558 int seg, TCGMemOp memop)
1559 {
1560 const TCGMemOp real_bswap = memop & MO_BSWAP;
1561 TCGMemOp bswap = real_bswap;
1562 int movop = OPC_MOVL_GvEv;
1563
1564 if (have_movbe && real_bswap) {
1565 bswap = 0;
1566 movop = OPC_MOVBE_GyMy;
1567 }
1568
1569 switch (memop & MO_SSIZE) {
1570 case MO_UB:
1571 tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo,
1572 base, index, 0, ofs);
1573 break;
1574 case MO_SB:
1575 tcg_out_modrm_sib_offset(s, OPC_MOVSBL + P_REXW + seg, datalo,
1576 base, index, 0, ofs);
1577 break;
1578 case MO_UW:
1579 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
1580 base, index, 0, ofs);
1581 if (real_bswap) {
1582 tcg_out_rolw_8(s, datalo);
1583 }
1584 break;
1585 case MO_SW:
1586 if (real_bswap) {
1587 if (have_movbe) {
1588 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
1589 datalo, base, index, 0, ofs);
1590 } else {
1591 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
1592 base, index, 0, ofs);
1593 tcg_out_rolw_8(s, datalo);
1594 }
1595 tcg_out_modrm(s, OPC_MOVSWL + P_REXW, datalo, datalo);
1596 } else {
1597 tcg_out_modrm_sib_offset(s, OPC_MOVSWL + P_REXW + seg,
1598 datalo, base, index, 0, ofs);
1599 }
1600 break;
1601 case MO_UL:
1602 tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
1603 if (bswap) {
1604 tcg_out_bswap32(s, datalo);
1605 }
1606 break;
1607 #if TCG_TARGET_REG_BITS == 64
1608 case MO_SL:
1609 if (real_bswap) {
1610 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
1611 base, index, 0, ofs);
1612 if (bswap) {
1613 tcg_out_bswap32(s, datalo);
1614 }
1615 tcg_out_ext32s(s, datalo, datalo);
1616 } else {
1617 tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo,
1618 base, index, 0, ofs);
1619 }
1620 break;
1621 #endif
1622 case MO_Q:
1623 if (TCG_TARGET_REG_BITS == 64) {
1624 tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
1625 base, index, 0, ofs);
1626 if (bswap) {
1627 tcg_out_bswap64(s, datalo);
1628 }
1629 } else {
1630 if (real_bswap) {
1631 int t = datalo;
1632 datalo = datahi;
1633 datahi = t;
1634 }
1635 if (base != datalo) {
1636 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
1637 base, index, 0, ofs);
1638 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
1639 base, index, 0, ofs + 4);
1640 } else {
1641 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
1642 base, index, 0, ofs + 4);
1643 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
1644 base, index, 0, ofs);
1645 }
1646 if (bswap) {
1647 tcg_out_bswap32(s, datalo);
1648 tcg_out_bswap32(s, datahi);
1649 }
1650 }
1651 break;
1652 default:
1653 tcg_abort();
1654 }
1655 }
1656
1657 /* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and
1658 EAX. It will be useful once fixed-register globals are less
1659 common. */
1660 static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
1661 {
1662 TCGReg datalo, datahi, addrlo;
1663 TCGReg addrhi __attribute__((unused));
1664 TCGMemOpIdx oi;
1665 TCGMemOp opc;
1666 #if defined(CONFIG_SOFTMMU)
1667 int mem_index;
1668 tcg_insn_unit *label_ptr[2];
1669 #endif
1670
1671 datalo = *args++;
1672 datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
1673 addrlo = *args++;
1674 addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
1675 oi = *args++;
1676 opc = get_memop(oi);
1677
1678 #if defined(CONFIG_SOFTMMU)
1679 mem_index = get_mmuidx(oi);
1680
1681 tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
1682 label_ptr, offsetof(CPUTLBEntry, addr_read));
1683
1684 /* TLB Hit. */
1685 tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);
1686
1687 /* Record the current context of a load into ldst label */
1688 add_qemu_ldst_label(s, true, oi, datalo, datahi, addrlo, addrhi,
1689 s->code_ptr, label_ptr);
1690 #else
1691 {
1692 int32_t offset = guest_base;
1693 TCGReg base = addrlo;
1694 int index = -1;
1695 int seg = 0;
1696
1697 /* For a 32-bit guest, the high 32 bits of the address may contain
1698 garbage; they can be ignored with the ADDR32 prefix if we're not
1699 using a guest base, or when using segmentation. Otherwise we
1700 need to zero-extend manually. */
1701 if (guest_base == 0 || guest_base_flags) {
1702 seg = guest_base_flags;
1703 offset = 0;
1704 if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
1705 seg |= P_ADDR32;
1706 }
1707 } else if (TCG_TARGET_REG_BITS == 64) {
1708 if (TARGET_LONG_BITS == 32) {
1709 tcg_out_ext32u(s, TCG_REG_L0, base);
1710 base = TCG_REG_L0;
1711 }
1712 if (offset != guest_base) {
1713 tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, guest_base);
1714 index = TCG_REG_L1;
1715 offset = 0;
1716 }
1717 }
1718
1719 tcg_out_qemu_ld_direct(s, datalo, datahi,
1720 base, index, offset, seg, opc);
1721 }
1722 #endif
1723 }
1724
1725 static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
1726 TCGReg base, intptr_t ofs, int seg,
1727 TCGMemOp memop)
1728 {
1729 /* ??? Ideally we wouldn't need a scratch register. For user-only,
1730 we could perform the bswap twice to restore the original value
1731 instead of moving to the scratch. But as it is, the L constraint
1732 means that TCG_REG_L0 is definitely free here. */
1733 const TCGReg scratch = TCG_REG_L0;
1734 const TCGMemOp real_bswap = memop & MO_BSWAP;
1735 TCGMemOp bswap = real_bswap;
1736 int movop = OPC_MOVL_EvGv;
1737
1738 if (have_movbe && real_bswap) {
1739 bswap = 0;
1740 movop = OPC_MOVBE_MyGy;
1741 }
1742
1743 switch (memop & MO_SIZE) {
1744 case MO_8:
1745 /* In 32-bit mode, 8-bit stores can only happen from [abcd]x.
1746 Use the scratch register if necessary. */
1747 if (TCG_TARGET_REG_BITS == 32 && datalo >= 4) {
1748 tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
1749 datalo = scratch;
1750 }
1751 tcg_out_modrm_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
1752 datalo, base, ofs);
1753 break;
1754 case MO_16:
1755 if (bswap) {
1756 tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
1757 tcg_out_rolw_8(s, scratch);
1758 datalo = scratch;
1759 }
1760 tcg_out_modrm_offset(s, movop + P_DATA16 + seg, datalo, base, ofs);
1761 break;
1762 case MO_32:
1763 if (bswap) {
1764 tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
1765 tcg_out_bswap32(s, scratch);
1766 datalo = scratch;
1767 }
1768 tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
1769 break;
1770 case MO_64:
1771 if (TCG_TARGET_REG_BITS == 64) {
1772 if (bswap) {
1773 tcg_out_mov(s, TCG_TYPE_I64, scratch, datalo);
1774 tcg_out_bswap64(s, scratch);
1775 datalo = scratch;
1776 }
1777 tcg_out_modrm_offset(s, movop + P_REXW + seg, datalo, base, ofs);
1778 } else if (bswap) {
1779 tcg_out_mov(s, TCG_TYPE_I32, scratch, datahi);
1780 tcg_out_bswap32(s, scratch);
1781 tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base, ofs);
1782 tcg_out_mov(s, TCG_TYPE_I32, scratch, datalo);
1783 tcg_out_bswap32(s, scratch);
1784 tcg_out_modrm_offset(s, OPC_MOVL_EvGv + seg, scratch, base, ofs+4);
1785 } else {
1786 if (real_bswap) {
1787 int t = datalo;
1788 datalo = datahi;
1789 datahi = t;
1790 }
1791 tcg_out_modrm_offset(s, movop + seg, datalo, base, ofs);
1792 tcg_out_modrm_offset(s, movop + seg, datahi, base, ofs+4);
1793 }
1794 break;
1795 default:
1796 tcg_abort();
1797 }
1798 }
1799
1800 static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
1801 {
1802 TCGReg datalo, datahi, addrlo;
1803 TCGReg addrhi __attribute__((unused));
1804 TCGMemOpIdx oi;
1805 TCGMemOp opc;
1806 #if defined(CONFIG_SOFTMMU)
1807 int mem_index;
1808 tcg_insn_unit *label_ptr[2];
1809 #endif
1810
1811 datalo = *args++;
1812 datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
1813 addrlo = *args++;
1814 addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
1815 oi = *args++;
1816 opc = get_memop(oi);
1817
1818 #if defined(CONFIG_SOFTMMU)
1819 mem_index = get_mmuidx(oi);
1820
1821 tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
1822 label_ptr, offsetof(CPUTLBEntry, addr_write));
1823
1824 /* TLB Hit. */
1825 tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, 0, 0, opc);
1826
1827 /* Record the current context of a store into ldst label */
1828 add_qemu_ldst_label(s, false, oi, datalo, datahi, addrlo, addrhi,
1829 s->code_ptr, label_ptr);
1830 #else
1831 {
1832 int32_t offset = guest_base;
1833 TCGReg base = addrlo;
1834 int seg = 0;
1835
1836 /* See comment in tcg_out_qemu_ld re zero-extension of addrlo. */
1837 if (guest_base == 0 || guest_base_flags) {
1838 seg = guest_base_flags;
1839 offset = 0;
1840 if (TCG_TARGET_REG_BITS > TARGET_LONG_BITS) {
1841 seg |= P_ADDR32;
1842 }
1843 } else if (TCG_TARGET_REG_BITS == 64) {
1844 /* ??? Note that we can't use the same SIB addressing scheme
1845 as for loads, since we require L0 free for bswap. */
1846 if (offset != guest_base) {
1847 if (TARGET_LONG_BITS == 32) {
1848 tcg_out_ext32u(s, TCG_REG_L0, base);
1849 base = TCG_REG_L0;
1850 }
1851 tcg_out_movi(s, TCG_TYPE_I64, TCG_REG_L1, guest_base);
1852 tgen_arithr(s, ARITH_ADD + P_REXW, TCG_REG_L1, base);
1853 base = TCG_REG_L1;
1854 offset = 0;
1855 } else if (TARGET_LONG_BITS == 32) {
1856 tcg_out_ext32u(s, TCG_REG_L1, base);
1857 base = TCG_REG_L1;
1858 }
1859 }
1860
1861 tcg_out_qemu_st_direct(s, datalo, datahi, base, offset, seg, opc);
1862 }
1863 #endif
1864 }
1865
1866 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
1867 const TCGArg *args, const int *const_args)
1868 {
1869 TCGArg a0, a1, a2;
1870 int c, const_a2, vexop, rexw = 0;
1871
1872 #if TCG_TARGET_REG_BITS == 64
1873 # define OP_32_64(x) \
1874 case glue(glue(INDEX_op_, x), _i64): \
1875 rexw = P_REXW; /* FALLTHRU */ \
1876 case glue(glue(INDEX_op_, x), _i32)
1877 #else
1878 # define OP_32_64(x) \
1879 case glue(glue(INDEX_op_, x), _i32)
1880 #endif
1881
1882 /* Hoist the loads of the most common arguments. */
1883 a0 = args[0];
1884 a1 = args[1];
1885 a2 = args[2];
1886 const_a2 = const_args[2];
1887
1888 switch (opc) {
1889 case INDEX_op_exit_tb:
1890 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
1891 tcg_out_jmp(s, tb_ret_addr);
1892 break;
1893 case INDEX_op_goto_tb:
1894 if (s->tb_jmp_insn_offset) {
1895 /* direct jump method */
1896 int gap;
1897 /* jump displacement must be aligned for atomic patching;
1898 * see if we need to add extra nops before jump
1899 */
1900 gap = tcg_pcrel_diff(s, QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4));
1901 if (gap != 1) {
1902 tcg_out_nopn(s, gap - 1);
1903 }
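/* For example, with s->code_ptr 4-byte aligned, gap is 4 and three bytes
   of nop padding are emitted; the E9 opcode then occupies the byte just
   before the next 4-byte boundary, so the 32-bit displacement written
   below starts aligned and can be patched with a single aligned store. */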
1904 tcg_out8(s, OPC_JMP_long); /* jmp im */
1905 s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
1906 tcg_out32(s, 0);
1907 } else {
1908 /* indirect jump method */
1909 tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
1910 (intptr_t)(s->tb_jmp_target_addr + a0));
1911 }
1912 s->tb_jmp_reset_offset[a0] = tcg_current_code_size(s);
1913 break;
1914 case INDEX_op_br:
1915 tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
1916 break;
1917 OP_32_64(ld8u):
1918 /* Note that we can ignore REXW for the zero-extend to 64-bit. */
1919 tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
1920 break;
1921 OP_32_64(ld8s):
1922 tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
1923 break;
1924 OP_32_64(ld16u):
1925 /* Note that we can ignore REXW for the zero-extend to 64-bit. */
1926 tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
1927 break;
1928 OP_32_64(ld16s):
1929 tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
1930 break;
1931 #if TCG_TARGET_REG_BITS == 64
1932 case INDEX_op_ld32u_i64:
1933 #endif
1934 case INDEX_op_ld_i32:
1935 tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
1936 break;
1937
1938 OP_32_64(st8):
1939 if (const_args[0]) {
1940 tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
1941 tcg_out8(s, a0);
1942 } else {
1943 tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
1944 }
1945 break;
1946 OP_32_64(st16):
1947 if (const_args[0]) {
1948 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
1949 tcg_out16(s, a0);
1950 } else {
1951 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
1952 }
1953 break;
1954 #if TCG_TARGET_REG_BITS == 64
1955 case INDEX_op_st32_i64:
1956 #endif
1957 case INDEX_op_st_i32:
1958 if (const_args[0]) {
1959 tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
1960 tcg_out32(s, a0);
1961 } else {
1962 tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
1963 }
1964 break;
1965
1966 OP_32_64(add):
1967 /* For 3-operand addition, use LEA. */
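/* e.g. add_i32 a0, a1, $imm becomes lea imm(%a1), %a0, and the
   register-register form becomes lea (%a1,%a2), %a0; either way
   a1 is left untouched. */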
1968 if (a0 != a1) {
1969 TCGArg c3 = 0;
1970 if (const_a2) {
1971 c3 = a2, a2 = -1;
1972 } else if (a0 == a2) {
1973 /* Watch out for dest = src + dest, since we've removed
1974 the matching constraint on the add. */
1975 tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
1976 break;
1977 }
1978
1979 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
1980 break;
1981 }
1982 c = ARITH_ADD;
1983 goto gen_arith;
1984 OP_32_64(sub):
1985 c = ARITH_SUB;
1986 goto gen_arith;
1987 OP_32_64(and):
1988 c = ARITH_AND;
1989 goto gen_arith;
1990 OP_32_64(or):
1991 c = ARITH_OR;
1992 goto gen_arith;
1993 OP_32_64(xor):
1994 c = ARITH_XOR;
1995 goto gen_arith;
1996 gen_arith:
1997 if (const_a2) {
1998 tgen_arithi(s, c + rexw, a0, a2, 0);
1999 } else {
2000 tgen_arithr(s, c + rexw, a0, a2);
2001 }
2002 break;
2003
2004 OP_32_64(andc):
2005 if (const_a2) {
2006 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2007 tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2008 } else {
2009 tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2010 }
2011 break;
2012
2013 OP_32_64(mul):
2014 if (const_a2) {
2015 int32_t val;
2016 val = a2;
2017 if (val == (int8_t)val) {
2018 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2019 tcg_out8(s, val);
2020 } else {
2021 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2022 tcg_out32(s, val);
2023 }
2024 } else {
2025 tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2026 }
2027 break;
2028
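/* div2/divu2 follow the x86 convention: the dividend is EDX:EAX, the
   quotient lands in EAX and the remainder in EDX.  The "a", "d", "0", "1"
   constraints in tcg_target_op_def below pin the outputs and the incoming
   double-word to those registers, leaving only the divisor (args[4]) in a
   free register. */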
2029 OP_32_64(div2):
2030 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2031 break;
2032 OP_32_64(divu2):
2033 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2034 break;
2035
2036 OP_32_64(shl):
2037 /* For small constant 3-operand shift, use LEA. */
2038 if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2039 if (a2 - 1 == 0) {
2040 /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2041 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2042 } else {
2043 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2044 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2045 }
2046 break;
2047 }
2048 c = SHIFT_SHL;
2049 vexop = OPC_SHLX;
2050 goto gen_shift_maybe_vex;
2051 OP_32_64(shr):
2052 c = SHIFT_SHR;
2053 vexop = OPC_SHRX;
2054 goto gen_shift_maybe_vex;
2055 OP_32_64(sar):
2056 c = SHIFT_SAR;
2057 vexop = OPC_SARX;
2058 goto gen_shift_maybe_vex;
2059 OP_32_64(rotl):
2060 c = SHIFT_ROL;
2061 goto gen_shift;
2062 OP_32_64(rotr):
2063 c = SHIFT_ROR;
2064 goto gen_shift;
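/* With BMI2, SHLX/SHRX/SARX take the count in an arbitrary register and
   write a separate destination, so a register-count shift needs no move
   through CL; without BMI2, or for rotates and immediate counts, we fall
   through to the classic one-operand shift forms. */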
2065 gen_shift_maybe_vex:
2066 if (have_bmi2) {
2067 if (!const_a2) {
2068 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2069 break;
2070 }
2071 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2072 }
2073 /* FALLTHRU */
2074 gen_shift:
2075 if (const_a2) {
2076 tcg_out_shifti(s, c + rexw, a0, a2);
2077 } else {
2078 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2079 }
2080 break;
2081
2082 OP_32_64(ctz):
2083 tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2084 break;
2085 OP_32_64(clz):
2086 tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2087 break;
2088 OP_32_64(ctpop):
2089 tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2090 break;
2091
2092 case INDEX_op_brcond_i32:
2093 tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2094 break;
2095 case INDEX_op_setcond_i32:
2096 tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
2097 break;
2098 case INDEX_op_movcond_i32:
2099 tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
2100 break;
2101
2102 OP_32_64(bswap16):
2103 tcg_out_rolw_8(s, a0);
2104 break;
2105 OP_32_64(bswap32):
2106 tcg_out_bswap32(s, a0);
2107 break;
2108
2109 OP_32_64(neg):
2110 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2111 break;
2112 OP_32_64(not):
2113 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2114 break;
2115
2116 OP_32_64(ext8s):
2117 tcg_out_ext8s(s, a0, a1, rexw);
2118 break;
2119 OP_32_64(ext16s):
2120 tcg_out_ext16s(s, a0, a1, rexw);
2121 break;
2122 OP_32_64(ext8u):
2123 tcg_out_ext8u(s, a0, a1);
2124 break;
2125 OP_32_64(ext16u):
2126 tcg_out_ext16u(s, a0, a1);
2127 break;
2128
2129 case INDEX_op_qemu_ld_i32:
2130 tcg_out_qemu_ld(s, args, 0);
2131 break;
2132 case INDEX_op_qemu_ld_i64:
2133 tcg_out_qemu_ld(s, args, 1);
2134 break;
2135 case INDEX_op_qemu_st_i32:
2136 tcg_out_qemu_st(s, args, 0);
2137 break;
2138 case INDEX_op_qemu_st_i64:
2139 tcg_out_qemu_st(s, args, 1);
2140 break;
2141
2142 OP_32_64(mulu2):
2143 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2144 break;
2145 OP_32_64(muls2):
2146 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2147 break;
2148 OP_32_64(add2):
2149 if (const_args[4]) {
2150 tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2151 } else {
2152 tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2153 }
2154 if (const_args[5]) {
2155 tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2156 } else {
2157 tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2158 }
2159 break;
2160 OP_32_64(sub2):
2161 if (const_args[4]) {
2162 tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2163 } else {
2164 tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2165 }
2166 if (const_args[5]) {
2167 tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2168 } else {
2169 tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2170 }
2171 break;
2172
2173 #if TCG_TARGET_REG_BITS == 32
2174 case INDEX_op_brcond2_i32:
2175 tcg_out_brcond2(s, args, const_args, 0);
2176 break;
2177 case INDEX_op_setcond2_i32:
2178 tcg_out_setcond2(s, args, const_args);
2179 break;
2180 #else /* TCG_TARGET_REG_BITS == 64 */
2181 case INDEX_op_ld32s_i64:
2182 tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2183 break;
2184 case INDEX_op_ld_i64:
2185 tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2186 break;
2187 case INDEX_op_st_i64:
2188 if (const_args[0]) {
2189 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2190 tcg_out32(s, a0);
2191 } else {
2192 tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2193 }
2194 break;
2195
2196 case INDEX_op_brcond_i64:
2197 tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2198 break;
2199 case INDEX_op_setcond_i64:
2200 tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
2201 break;
2202 case INDEX_op_movcond_i64:
2203 tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
2204 break;
2205
2206 case INDEX_op_bswap64_i64:
2207 tcg_out_bswap64(s, a0);
2208 break;
2209 case INDEX_op_extu_i32_i64:
2210 case INDEX_op_ext32u_i64:
2211 tcg_out_ext32u(s, a0, a1);
2212 break;
2213 case INDEX_op_ext_i32_i64:
2214 case INDEX_op_ext32s_i64:
2215 tcg_out_ext32s(s, a0, a1);
2216 break;
2217 #endif
2218
2219 OP_32_64(deposit):
2220 if (args[3] == 0 && args[4] == 8) {
2221 /* load bits 0..7 */
2222 tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2223 } else if (args[3] == 8 && args[4] == 8) {
2224 /* load bits 8..15 */
2225 tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
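/* Encoding a0 + 4 without a REX prefix selects the matching high-byte
   register (%ah, %ch, %dh or %bh), which is why deposit is constrained
   to the "Q" registers below. */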
2226 } else if (args[3] == 0 && args[4] == 16) {
2227 /* load bits 0..15 */
2228 tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2229 } else {
2230 tcg_abort();
2231 }
2232 break;
2233
2234 case INDEX_op_extract_i64:
2235 if (a2 + args[3] == 32) {
2236 /* This is a 32-bit zero-extending right shift. */
2237 tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2238 tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2239 break;
2240 }
2241 /* FALLTHRU */
2242 case INDEX_op_extract_i32:
2243 /* On the off-chance that we can use the high-byte registers.
2244 Otherwise we emit the same ext16 + shift pattern that we
2245 would have gotten from the normal tcg-op.c expansion. */
2246 tcg_debug_assert(a2 == 8 && args[3] == 8);
2247 if (a1 < 4 && a0 < 8) {
2248 tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2249 } else {
2250 tcg_out_ext16u(s, a0, a1);
2251 tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2252 }
2253 break;
2254
2255 case INDEX_op_sextract_i32:
2256 /* We don't implement sextract_i64, as we cannot sign-extend to
2257 64 bits without using the REX prefix that explicitly excludes
2258 access to the high-byte registers. */
2259 tcg_debug_assert(a2 == 8 && args[3] == 8);
2260 if (a1 < 4 && a0 < 8) {
2261 tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2262 } else {
2263 tcg_out_ext16s(s, a0, a1, 0);
2264 tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2265 }
2266 break;
2267
2268 case INDEX_op_mb:
2269 tcg_out_mb(s, a0);
2270 break;
2271 case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
2272 case INDEX_op_mov_i64:
2273 case INDEX_op_movi_i32: /* Always emitted via tcg_out_movi. */
2274 case INDEX_op_movi_i64:
2275 case INDEX_op_call: /* Always emitted via tcg_out_call. */
2276 default:
2277 tcg_abort();
2278 }
2279
2280 #undef OP_32_64
2281 }
2282
2283 static const TCGTargetOpDef *tcg_target_op_def(TCGOpcode op)
2284 {
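/* The constraint letters used below are decoded by target_parse_constraint()
   earlier in this file.  As a rough key: "r" is any register; "q" a register
   usable as a byte operand; "Q" a register whose high byte (%ah..%bh) is
   addressable; "L" a register usable for qemu_ld/st operands (the registers
   reserved for the helper call are excluded); "a"/"d" pin EAX/EDX; "e" and
   "Z" accept sign- and zero-extended 32-bit immediates; "W" the operand-size
   constant used with LZCNT/TZCNT; "0"/"1" alias the like-numbered output;
   "&" marks an early-clobber output; "i" any immediate. */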
2285 static const TCGTargetOpDef ri_r = { .args_ct_str = { "ri", "r" } };
2286 static const TCGTargetOpDef re_r = { .args_ct_str = { "re", "r" } };
2287 static const TCGTargetOpDef qi_r = { .args_ct_str = { "qi", "r" } };
2288 static const TCGTargetOpDef r_r = { .args_ct_str = { "r", "r" } };
2289 static const TCGTargetOpDef r_q = { .args_ct_str = { "r", "q" } };
2290 static const TCGTargetOpDef r_re = { .args_ct_str = { "r", "re" } };
2291 static const TCGTargetOpDef r_0 = { .args_ct_str = { "r", "0" } };
2292 static const TCGTargetOpDef r_r_ri = { .args_ct_str = { "r", "r", "ri" } };
2293 static const TCGTargetOpDef r_r_re = { .args_ct_str = { "r", "r", "re" } };
2294 static const TCGTargetOpDef r_0_re = { .args_ct_str = { "r", "0", "re" } };
2295 static const TCGTargetOpDef r_0_ci = { .args_ct_str = { "r", "0", "ci" } };
2296 static const TCGTargetOpDef r_L = { .args_ct_str = { "r", "L" } };
2297 static const TCGTargetOpDef L_L = { .args_ct_str = { "L", "L" } };
2298 static const TCGTargetOpDef r_L_L = { .args_ct_str = { "r", "L", "L" } };
2299 static const TCGTargetOpDef r_r_L = { .args_ct_str = { "r", "r", "L" } };
2300 static const TCGTargetOpDef L_L_L = { .args_ct_str = { "L", "L", "L" } };
2301 static const TCGTargetOpDef r_r_L_L
2302 = { .args_ct_str = { "r", "r", "L", "L" } };
2303 static const TCGTargetOpDef L_L_L_L
2304 = { .args_ct_str = { "L", "L", "L", "L" } };
2305
2306 switch (op) {
2307 case INDEX_op_ld8u_i32:
2308 case INDEX_op_ld8u_i64:
2309 case INDEX_op_ld8s_i32:
2310 case INDEX_op_ld8s_i64:
2311 case INDEX_op_ld16u_i32:
2312 case INDEX_op_ld16u_i64:
2313 case INDEX_op_ld16s_i32:
2314 case INDEX_op_ld16s_i64:
2315 case INDEX_op_ld_i32:
2316 case INDEX_op_ld32u_i64:
2317 case INDEX_op_ld32s_i64:
2318 case INDEX_op_ld_i64:
2319 return &r_r;
2320
2321 case INDEX_op_st8_i32:
2322 case INDEX_op_st8_i64:
2323 return &qi_r;
2324 case INDEX_op_st16_i32:
2325 case INDEX_op_st16_i64:
2326 case INDEX_op_st_i32:
2327 case INDEX_op_st32_i64:
2328 return &ri_r;
2329 case INDEX_op_st_i64:
2330 return &re_r;
2331
2332 case INDEX_op_add_i32:
2333 case INDEX_op_add_i64:
2334 return &r_r_re;
2335 case INDEX_op_sub_i32:
2336 case INDEX_op_sub_i64:
2337 case INDEX_op_mul_i32:
2338 case INDEX_op_mul_i64:
2339 case INDEX_op_or_i32:
2340 case INDEX_op_or_i64:
2341 case INDEX_op_xor_i32:
2342 case INDEX_op_xor_i64:
2343 return &r_0_re;
2344
2345 case INDEX_op_and_i32:
2346 case INDEX_op_and_i64:
2347 {
2348 static const TCGTargetOpDef and
2349 = { .args_ct_str = { "r", "0", "reZ" } };
2350 return &and;
2351 }
2352 break;
2353 case INDEX_op_andc_i32:
2354 case INDEX_op_andc_i64:
2355 {
2356 static const TCGTargetOpDef andc
2357 = { .args_ct_str = { "r", "r", "rI" } };
2358 return &andc;
2359 }
2360 break;
2361
2362 case INDEX_op_shl_i32:
2363 case INDEX_op_shl_i64:
2364 case INDEX_op_shr_i32:
2365 case INDEX_op_shr_i64:
2366 case INDEX_op_sar_i32:
2367 case INDEX_op_sar_i64:
2368 return have_bmi2 ? &r_r_ri : &r_0_ci;
2369 case INDEX_op_rotl_i32:
2370 case INDEX_op_rotl_i64:
2371 case INDEX_op_rotr_i32:
2372 case INDEX_op_rotr_i64:
2373 return &r_0_ci;
2374
2375 case INDEX_op_brcond_i32:
2376 case INDEX_op_brcond_i64:
2377 return &r_re;
2378
2379 case INDEX_op_bswap16_i32:
2380 case INDEX_op_bswap16_i64:
2381 case INDEX_op_bswap32_i32:
2382 case INDEX_op_bswap32_i64:
2383 case INDEX_op_bswap64_i64:
2384 case INDEX_op_neg_i32:
2385 case INDEX_op_neg_i64:
2386 case INDEX_op_not_i32:
2387 case INDEX_op_not_i64:
2388 return &r_0;
2389
2390 case INDEX_op_ext8s_i32:
2391 case INDEX_op_ext8s_i64:
2392 case INDEX_op_ext8u_i32:
2393 case INDEX_op_ext8u_i64:
2394 return &r_q;
2395 case INDEX_op_ext16s_i32:
2396 case INDEX_op_ext16s_i64:
2397 case INDEX_op_ext16u_i32:
2398 case INDEX_op_ext16u_i64:
2399 case INDEX_op_ext32s_i64:
2400 case INDEX_op_ext32u_i64:
2401 case INDEX_op_ext_i32_i64:
2402 case INDEX_op_extu_i32_i64:
2403 case INDEX_op_extract_i32:
2404 case INDEX_op_extract_i64:
2405 case INDEX_op_sextract_i32:
2406 case INDEX_op_ctpop_i32:
2407 case INDEX_op_ctpop_i64:
2408 return &r_r;
2409
2410 case INDEX_op_deposit_i32:
2411 case INDEX_op_deposit_i64:
2412 {
2413 static const TCGTargetOpDef dep
2414 = { .args_ct_str = { "Q", "0", "Q" } };
2415 return &dep;
2416 }
2417 case INDEX_op_setcond_i32:
2418 case INDEX_op_setcond_i64:
2419 {
2420 static const TCGTargetOpDef setc
2421 = { .args_ct_str = { "q", "r", "re" } };
2422 return &setc;
2423 }
2424 case INDEX_op_movcond_i32:
2425 case INDEX_op_movcond_i64:
2426 {
2427 static const TCGTargetOpDef movc
2428 = { .args_ct_str = { "r", "r", "re", "r", "0" } };
2429 return &movc;
2430 }
2431 case INDEX_op_div2_i32:
2432 case INDEX_op_div2_i64:
2433 case INDEX_op_divu2_i32:
2434 case INDEX_op_divu2_i64:
2435 {
2436 static const TCGTargetOpDef div2
2437 = { .args_ct_str = { "a", "d", "0", "1", "r" } };
2438 return &div2;
2439 }
2440 case INDEX_op_mulu2_i32:
2441 case INDEX_op_mulu2_i64:
2442 case INDEX_op_muls2_i32:
2443 case INDEX_op_muls2_i64:
2444 {
2445 static const TCGTargetOpDef mul2
2446 = { .args_ct_str = { "a", "d", "a", "r" } };
2447 return &mul2;
2448 }
2449 case INDEX_op_add2_i32:
2450 case INDEX_op_add2_i64:
2451 case INDEX_op_sub2_i32:
2452 case INDEX_op_sub2_i64:
2453 {
2454 static const TCGTargetOpDef arith2
2455 = { .args_ct_str = { "r", "r", "0", "1", "re", "re" } };
2456 return &arith2;
2457 }
2458 case INDEX_op_ctz_i32:
2459 case INDEX_op_ctz_i64:
2460 {
2461 static const TCGTargetOpDef ctz[2] = {
2462 { .args_ct_str = { "r", "r", "0" } },
2463 { .args_ct_str = { "&r", "r", "rW" } },
2464 };
2465 return &ctz[have_bmi1];
2466 }
2467 case INDEX_op_clz_i32:
2468 case INDEX_op_clz_i64:
2469 {
2470 static const TCGTargetOpDef clz[2] = {
2471 { .args_ct_str = { "&r", "r", "0i" } },
2472 { .args_ct_str = { "&r", "r", "rW" } },
2473 };
2474 return &clz[have_lzcnt];
2475 }
2476
2477 case INDEX_op_qemu_ld_i32:
2478 return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_L : &r_L_L;
2479 case INDEX_op_qemu_st_i32:
2480 return TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L : &L_L_L;
2481 case INDEX_op_qemu_ld_i64:
2482 return (TCG_TARGET_REG_BITS == 64 ? &r_L
2483 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &r_r_L
2484 : &r_r_L_L);
2485 case INDEX_op_qemu_st_i64:
2486 return (TCG_TARGET_REG_BITS == 64 ? &L_L
2487 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? &L_L_L
2488 : &L_L_L_L);
2489
2490 case INDEX_op_brcond2_i32:
2491 {
2492 static const TCGTargetOpDef b2
2493 = { .args_ct_str = { "r", "r", "ri", "ri" } };
2494 return &b2;
2495 }
2496 case INDEX_op_setcond2_i32:
2497 {
2498 static const TCGTargetOpDef s2
2499 = { .args_ct_str = { "r", "r", "r", "ri", "ri" } };
2500 return &s2;
2501 }
2502
2503 default:
2504 break;
2505 }
2506 return NULL;
2507 }
2508
2509 static const int tcg_target_callee_save_regs[] = {
2510 #if TCG_TARGET_REG_BITS == 64
2511 TCG_REG_RBP,
2512 TCG_REG_RBX,
2513 #if defined(_WIN64)
2514 TCG_REG_RDI,
2515 TCG_REG_RSI,
2516 #endif
2517 TCG_REG_R12,
2518 TCG_REG_R13,
2519 TCG_REG_R14, /* Currently used for the global env. */
2520 TCG_REG_R15,
2521 #else
2522 TCG_REG_EBP, /* Currently used for the global env. */
2523 TCG_REG_EBX,
2524 TCG_REG_ESI,
2525 TCG_REG_EDI,
2526 #endif
2527 };
2528
2529 /* Compute frame size via macros, to share between tcg_target_qemu_prologue
2530 and tcg_register_jit. */
2531
2532 #define PUSH_SIZE \
2533 ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
2534 * (TCG_TARGET_REG_BITS / 8))
2535
2536 #define FRAME_SIZE \
2537 ((PUSH_SIZE \
2538 + TCG_STATIC_CALL_ARGS_SIZE \
2539 + CPU_TEMP_BUF_NLONGS * sizeof(long) \
2540 + TCG_TARGET_STACK_ALIGN - 1) \
2541 & ~(TCG_TARGET_STACK_ALIGN - 1))
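/* Illustrative numbers only (assuming TCG_STATIC_CALL_ARGS_SIZE == 128,
   CPU_TEMP_BUF_NLONGS == 128 and 16-byte stack alignment): a 64-bit
   non-Windows host pushes 6 callee-saved registers, so
   PUSH_SIZE = (1 + 6) * 8 = 56 and
   FRAME_SIZE = align16(56 + 128 + 128 * 8) = 1216, comfortably below the
   1 << 14 limit asserted further down. */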
2542
2543 /* Generate global QEMU prologue and epilogue code */
2544 static void tcg_target_qemu_prologue(TCGContext *s)
2545 {
2546 int i, stack_addend;
2547
2548 /* TB prologue */
2549
2550 /* Reserve some stack space, also for TCG temps. */
2551 stack_addend = FRAME_SIZE - PUSH_SIZE;
2552 tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
2553 CPU_TEMP_BUF_NLONGS * sizeof(long));
2554
2555 /* Save all callee saved registers. */
2556 for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
2557 tcg_out_push(s, tcg_target_callee_save_regs[i]);
2558 }
2559
2560 #if TCG_TARGET_REG_BITS == 32
2561 tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
2562 (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
2563 tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
2564 /* jmp *tb. */
2565 tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
2566 (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
2567 + stack_addend);
2568 #else
2569 tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
2570 tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
2571 /* jmp *tb. */
2572 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
2573 #endif
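/* At this point the prologue has saved the callee-saved registers, reserved
   the rest of the FRAME_SIZE stack frame, loaded env (the first argument)
   into TCG_AREG0 and jumped to the TB code pointer passed as the second
   argument.  exit_tb (see tcg_out_op above) later loads its return value
   into EAX and jumps back to tb_ret_addr, where the epilogue below unwinds
   the frame and returns that value to the caller. */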
2574
2575 /* TB epilogue */
2576 tb_ret_addr = s->code_ptr;
2577
2578 tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
2579
2580 for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
2581 tcg_out_pop(s, tcg_target_callee_save_regs[i]);
2582 }
2583 tcg_out_opc(s, OPC_RET, 0, 0, 0);
2584
2585 #if !defined(CONFIG_SOFTMMU)
2586 /* Try to set up a segment register to point to guest_base. */
2587 if (guest_base) {
2588 setup_guest_base_seg();
2589 }
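/* setup_guest_base_seg() (defined earlier in this file) tries to install
   guest_base as a segment base so that user-only guest accesses can use a
   segment-override prefix instead of adding guest_base explicitly; when it
   succeeds, guest_base_flags carries that prefix and tcg_out_qemu_ld/st
   drop the explicit offset (see the seg/offset handling above). */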
2590 #endif
2591 }
2592
2593 static void tcg_target_init(TCGContext *s)
2594 {
2595 #ifdef CONFIG_CPUID_H
2596 unsigned a, b, c, d;
2597 unsigned max = __get_cpuid_max(0, 0);
2598
2599 if (max >= 1) {
2600 __cpuid(1, a, b, c, d);
2601 #ifndef have_cmov
2602 /* For 32-bit, 99% certainty that we're running on hardware that
2603 supports cmov, but we still need to check. In case cmov is not
2604 available, we'll use a small forward branch. */
2605 have_cmov = (d & bit_CMOV) != 0;
2606 #endif
2607 #ifndef have_movbe
2608 /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
2609 need to probe for it. */
2610 have_movbe = (c & bit_MOVBE) != 0;
2611 #endif
2612 #ifdef bit_POPCNT
2613 have_popcnt = (c & bit_POPCNT) != 0;
2614 #endif
2615 }
2616
2617 if (max >= 7) {
2618 /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs. */
2619 __cpuid_count(7, 0, a, b, c, d);
2620 #ifdef bit_BMI
2621 have_bmi1 = (b & bit_BMI) != 0;
2622 #endif
2623 #ifndef have_bmi2
2624 have_bmi2 = (b & bit_BMI2) != 0;
2625 #endif
2626 }
2627 #endif
2628
2629 #ifndef have_lzcnt
2630 max = __get_cpuid_max(0x80000000, 0);
2631 if (max >= 0x80000001) {
2632 __cpuid(0x80000001, a, b, c, d);
2633 /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs. */
2634 have_lzcnt = (c & bit_LZCNT) != 0;
2635 }
2636 #endif
2637
2638 if (TCG_TARGET_REG_BITS == 64) {
2639 tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xffff);
2640 tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I64], 0, 0xffff);
2641 } else {
2642 tcg_regset_set32(tcg_target_available_regs[TCG_TYPE_I32], 0, 0xff);
2643 }
2644
2645 tcg_regset_clear(tcg_target_call_clobber_regs);
2646 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
2647 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
2648 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
2649 if (TCG_TARGET_REG_BITS == 64) {
2650 #if !defined(_WIN64)
2651 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
2652 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
2653 #endif
2654 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
2655 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
2656 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
2657 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
2658 }
2659
2660 tcg_regset_clear(s->reserved_regs);
2661 tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
2662 }
2663
2664 typedef struct {
2665 DebugFrameHeader h;
2666 uint8_t fde_def_cfa[4];
2667 uint8_t fde_reg_ofs[14];
2668 } DebugFrame;
2669
2670 /* We're expecting a 2-byte uleb128-encoded value. */
2671 QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
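/* The two fde_def_cfa bytes below encode FRAME_SIZE as uleb128:
   (FRAME_SIZE & 0x7f) | 0x80 carries the low seven bits plus a
   continuation flag, and FRAME_SIZE >> 7 the remainder, hence the
   14-bit limit asserted here. */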
2672
2673 #if !defined(__ELF__)
2674 /* Host machine without ELF. */
2675 #elif TCG_TARGET_REG_BITS == 64
2676 #define ELF_HOST_MACHINE EM_X86_64
2677 static const DebugFrame debug_frame = {
2678 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
2679 .h.cie.id = -1,
2680 .h.cie.version = 1,
2681 .h.cie.code_align = 1,
2682 .h.cie.data_align = 0x78, /* sleb128 -8 */
2683 .h.cie.return_column = 16,
2684
2685 /* Total FDE size does not include the "len" member. */
2686 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
2687
2688 .fde_def_cfa = {
2689 12, 7, /* DW_CFA_def_cfa %rsp, ... */
2690 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */
2691 (FRAME_SIZE >> 7)
2692 },
2693 .fde_reg_ofs = {
2694 0x90, 1, /* DW_CFA_offset, %rip, -8 */
2695 /* The following ordering must match tcg_target_callee_save_regs. */
2696 0x86, 2, /* DW_CFA_offset, %rbp, -16 */
2697 0x83, 3, /* DW_CFA_offset, %rbx, -24 */
2698 0x8c, 4, /* DW_CFA_offset, %r12, -32 */
2699 0x8d, 5, /* DW_CFA_offset, %r13, -40 */
2700 0x8e, 6, /* DW_CFA_offset, %r14, -48 */
2701 0x8f, 7, /* DW_CFA_offset, %r15, -56 */
2702 }
2703 };
2704 #else
2705 #define ELF_HOST_MACHINE EM_386
2706 static const DebugFrame debug_frame = {
2707 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
2708 .h.cie.id = -1,
2709 .h.cie.version = 1,
2710 .h.cie.code_align = 1,
2711 .h.cie.data_align = 0x7c, /* sleb128 -4 */
2712 .h.cie.return_column = 8,
2713
2714 /* Total FDE size does not include the "len" member. */
2715 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
2716
2717 .fde_def_cfa = {
2718 12, 4, /* DW_CFA_def_cfa %esp, ... */
2719 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */
2720 (FRAME_SIZE >> 7)
2721 },
2722 .fde_reg_ofs = {
2723 0x88, 1, /* DW_CFA_offset, %eip, -4 */
2724 /* The following ordering must match tcg_target_callee_save_regs. */
2725 0x85, 2, /* DW_CFA_offset, %ebp, -8 */
2726 0x83, 3, /* DW_CFA_offset, %ebx, -12 */
2727 0x86, 4, /* DW_CFA_offset, %esi, -16 */
2728 0x87, 5, /* DW_CFA_offset, %edi, -20 */
2729 }
2730 };
2731 #endif
2732
2733 #if defined(ELF_HOST_MACHINE)
2734 void tcg_register_jit(void *buf, size_t buf_size)
2735 {
2736 tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
2737 }
2738 #endif