1 /*
2 * Tiny Code Generator for QEMU
3 *
4 * Copyright (c) 2008 Fabrice Bellard
5 *
6 * Permission is hereby granted, free of charge, to any person obtaining a copy
7 * of this software and associated documentation files (the "Software"), to deal
8 * in the Software without restriction, including without limitation the rights
9 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10 * copies of the Software, and to permit persons to whom the Software is
11 * furnished to do so, subject to the following conditions:
12 *
13 * The above copyright notice and this permission notice shall be included in
14 * all copies or substantial portions of the Software.
15 *
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
19 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
22 * THE SOFTWARE.
23 */
24
25 #include "../tcg-pool.c.inc"
26
27 #ifdef CONFIG_DEBUG_TCG
28 static const char * const tcg_target_reg_names[TCG_TARGET_NB_REGS] = {
29 #if TCG_TARGET_REG_BITS == 64
30 "%rax", "%rcx", "%rdx", "%rbx", "%rsp", "%rbp", "%rsi", "%rdi",
31 #else
32 "%eax", "%ecx", "%edx", "%ebx", "%esp", "%ebp", "%esi", "%edi",
33 #endif
34 "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15",
35 "%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm4", "%xmm5", "%xmm6", "%xmm7",
36 #if TCG_TARGET_REG_BITS == 64
37 "%xmm8", "%xmm9", "%xmm10", "%xmm11",
38 "%xmm12", "%xmm13", "%xmm14", "%xmm15",
39 #endif
40 };
41 #endif
42
43 static const int tcg_target_reg_alloc_order[] = {
44 #if TCG_TARGET_REG_BITS == 64
45 TCG_REG_RBP,
46 TCG_REG_RBX,
47 TCG_REG_R12,
48 TCG_REG_R13,
49 TCG_REG_R14,
50 TCG_REG_R15,
51 TCG_REG_R10,
52 TCG_REG_R11,
53 TCG_REG_R9,
54 TCG_REG_R8,
55 TCG_REG_RCX,
56 TCG_REG_RDX,
57 TCG_REG_RSI,
58 TCG_REG_RDI,
59 TCG_REG_RAX,
60 #else
61 TCG_REG_EBX,
62 TCG_REG_ESI,
63 TCG_REG_EDI,
64 TCG_REG_EBP,
65 TCG_REG_ECX,
66 TCG_REG_EDX,
67 TCG_REG_EAX,
68 #endif
69 TCG_REG_XMM0,
70 TCG_REG_XMM1,
71 TCG_REG_XMM2,
72 TCG_REG_XMM3,
73 TCG_REG_XMM4,
74 TCG_REG_XMM5,
75 #ifndef _WIN64
76     /* The Win64 ABI has xmm6-xmm15 as callee-saves, and we do not save
77 any of them. Therefore only allow xmm0-xmm5 to be allocated. */
78 TCG_REG_XMM6,
79 TCG_REG_XMM7,
80 #if TCG_TARGET_REG_BITS == 64
81 TCG_REG_XMM8,
82 TCG_REG_XMM9,
83 TCG_REG_XMM10,
84 TCG_REG_XMM11,
85 TCG_REG_XMM12,
86 TCG_REG_XMM13,
87 TCG_REG_XMM14,
88 TCG_REG_XMM15,
89 #endif
90 #endif
91 };
92
93 static const int tcg_target_call_iarg_regs[] = {
94 #if TCG_TARGET_REG_BITS == 64
95 #if defined(_WIN64)
96 TCG_REG_RCX,
97 TCG_REG_RDX,
98 #else
99 TCG_REG_RDI,
100 TCG_REG_RSI,
101 TCG_REG_RDX,
102 TCG_REG_RCX,
103 #endif
104 TCG_REG_R8,
105 TCG_REG_R9,
106 #else
107     /* 32-bit mode uses a stack-based calling convention (GCC default). */
108 #endif
109 };
110
111 static const int tcg_target_call_oarg_regs[] = {
112 TCG_REG_EAX,
113 #if TCG_TARGET_REG_BITS == 32
114 TCG_REG_EDX
115 #endif
116 };
117
118 /* Constants we accept. */
119 #define TCG_CT_CONST_S32 0x100
120 #define TCG_CT_CONST_U32 0x200
121 #define TCG_CT_CONST_I32 0x400
122 #define TCG_CT_CONST_WSZ 0x800
123
124 /* Registers used with L constraint, which are the first argument
125    registers on x86_64, and two random call-clobbered registers on
126 i386. */
127 #if TCG_TARGET_REG_BITS == 64
128 # define TCG_REG_L0 tcg_target_call_iarg_regs[0]
129 # define TCG_REG_L1 tcg_target_call_iarg_regs[1]
130 #else
131 # define TCG_REG_L0 TCG_REG_EAX
132 # define TCG_REG_L1 TCG_REG_EDX
133 #endif
134
135 #define ALL_BYTEH_REGS 0x0000000fu
136 #if TCG_TARGET_REG_BITS == 64
137 # define ALL_GENERAL_REGS 0x0000ffffu
138 # define ALL_VECTOR_REGS 0xffff0000u
139 # define ALL_BYTEL_REGS ALL_GENERAL_REGS
140 #else
141 # define ALL_GENERAL_REGS 0x000000ffu
142 # define ALL_VECTOR_REGS 0x00ff0000u
143 # define ALL_BYTEL_REGS ALL_BYTEH_REGS
144 #endif
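/*
 * These constraint masks are bitmaps indexed by TCG register number:
 * the general registers occupy the low bits and the xmm registers
 * start at bit 16, matching tcg_target_reg_names above.  ALL_BYTEH_REGS
 * is restricted to %eax/%ecx/%edx/%ebx, the only registers with an
 * encodable high-byte form (%ah etc.); with a REX prefix the low byte
 * of any register is reachable, which is why ALL_BYTEL_REGS covers all
 * general registers on x86_64 but only those four on i386.
 */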
145 #ifdef CONFIG_SOFTMMU
146 # define SOFTMMU_RESERVE_REGS ((1 << TCG_REG_L0) | (1 << TCG_REG_L1))
147 #else
148 # define SOFTMMU_RESERVE_REGS 0
149 #endif
150
151 /* The host compiler should supply <cpuid.h> to enable runtime feature
152 detection, as we're not going to go so far as our own inline assembly.
153 If not available, default values will be assumed. */
154 #if defined(CONFIG_CPUID_H)
155 #include "qemu/cpuid.h"
156 #endif
157
158 /* For 64-bit, we always know that CMOV is available. */
159 #if TCG_TARGET_REG_BITS == 64
160 # define have_cmov 1
161 #elif defined(CONFIG_CPUID_H)
162 static bool have_cmov;
163 #else
164 # define have_cmov 0
165 #endif
166
167 /* We need these symbols in tcg-target.h, and we can't properly conditionalize
168    it there.  Therefore we always define the variables.  */
169 bool have_bmi1;
170 bool have_popcnt;
171 bool have_avx1;
172 bool have_avx2;
173 bool have_movbe;
174
175 #ifdef CONFIG_CPUID_H
176 static bool have_bmi2;
177 static bool have_lzcnt;
178 #else
179 # define have_bmi2 0
180 # define have_lzcnt 0
181 #endif
182
183 static const tcg_insn_unit *tb_ret_addr;
184
185 static bool patch_reloc(tcg_insn_unit *code_ptr, int type,
186 intptr_t value, intptr_t addend)
187 {
188 value += addend;
189 switch(type) {
190 case R_386_PC32:
191 value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
192 if (value != (int32_t)value) {
193 return false;
194 }
195 /* FALLTHRU */
196 case R_386_32:
197 tcg_patch32(code_ptr, value);
198 break;
199 case R_386_PC8:
200 value -= (uintptr_t)tcg_splitwx_to_rx(code_ptr);
201 if (value != (int8_t)value) {
202 return false;
203 }
204 tcg_patch8(code_ptr, value);
205 break;
206 default:
207 tcg_abort();
208 }
209 return true;
210 }
211
212 /* test if a constant matches the constraint */
213 static bool tcg_target_const_match(int64_t val, TCGType type, int ct)
214 {
215 if (ct & TCG_CT_CONST) {
216 return 1;
217 }
218 if (type == TCG_TYPE_I32) {
219 if (ct & (TCG_CT_CONST_S32 | TCG_CT_CONST_U32 | TCG_CT_CONST_I32)) {
220 return 1;
221 }
222 } else {
223 if ((ct & TCG_CT_CONST_S32) && val == (int32_t)val) {
224 return 1;
225 }
226 if ((ct & TCG_CT_CONST_U32) && val == (uint32_t)val) {
227 return 1;
228 }
229 if ((ct & TCG_CT_CONST_I32) && ~val == (int32_t)~val) {
230 return 1;
231 }
232 }
233 if ((ct & TCG_CT_CONST_WSZ) && val == (type == TCG_TYPE_I32 ? 32 : 64)) {
234 return 1;
235 }
236 return 0;
237 }
238
239 # define LOWREGMASK(x) ((x) & 7)
240
241 #define P_EXT 0x100 /* 0x0f opcode prefix */
242 #define P_EXT38 0x200 /* 0x0f 0x38 opcode prefix */
243 #define P_DATA16 0x400 /* 0x66 opcode prefix */
244 #define P_VEXW 0x1000 /* Set VEX.W = 1 */
245 #if TCG_TARGET_REG_BITS == 64
246 # define P_REXW P_VEXW /* Set REX.W = 1; match VEXW */
247 # define P_REXB_R 0x2000 /* REG field as byte register */
248 # define P_REXB_RM 0x4000 /* R/M field as byte register */
249 # define P_GS 0x8000 /* gs segment override */
250 #else
251 # define P_REXW 0
252 # define P_REXB_R 0
253 # define P_REXB_RM 0
254 # define P_GS 0
255 #endif
256 #define P_EXT3A 0x10000 /* 0x0f 0x3a opcode prefix */
257 #define P_SIMDF3 0x20000 /* 0xf3 opcode prefix */
258 #define P_SIMDF2 0x40000 /* 0xf2 opcode prefix */
259 #define P_VEXL 0x80000 /* Set VEX.L = 1 */
260
261 #define OPC_ARITH_EvIz (0x81)
262 #define OPC_ARITH_EvIb (0x83)
263 #define OPC_ARITH_GvEv (0x03) /* ... plus (ARITH_FOO << 3) */
264 #define OPC_ANDN (0xf2 | P_EXT38)
265 #define OPC_ADD_GvEv (OPC_ARITH_GvEv | (ARITH_ADD << 3))
266 #define OPC_AND_GvEv (OPC_ARITH_GvEv | (ARITH_AND << 3))
267 #define OPC_BLENDPS (0x0c | P_EXT3A | P_DATA16)
268 #define OPC_BSF (0xbc | P_EXT)
269 #define OPC_BSR (0xbd | P_EXT)
270 #define OPC_BSWAP (0xc8 | P_EXT)
271 #define OPC_CALL_Jz (0xe8)
272 #define OPC_CMOVCC (0x40 | P_EXT) /* ... plus condition code */
273 #define OPC_CMP_GvEv (OPC_ARITH_GvEv | (ARITH_CMP << 3))
274 #define OPC_DEC_r32 (0x48)
275 #define OPC_IMUL_GvEv (0xaf | P_EXT)
276 #define OPC_IMUL_GvEvIb (0x6b)
277 #define OPC_IMUL_GvEvIz (0x69)
278 #define OPC_INC_r32 (0x40)
279 #define OPC_JCC_long (0x80 | P_EXT) /* ... plus condition code */
280 #define OPC_JCC_short (0x70) /* ... plus condition code */
281 #define OPC_JMP_long (0xe9)
282 #define OPC_JMP_short (0xeb)
283 #define OPC_LEA (0x8d)
284 #define OPC_LZCNT (0xbd | P_EXT | P_SIMDF3)
285 #define OPC_MOVB_EvGv (0x88) /* stores, more or less */
286 #define OPC_MOVL_EvGv (0x89) /* stores, more or less */
287 #define OPC_MOVL_GvEv (0x8b) /* loads, more or less */
288 #define OPC_MOVB_EvIz (0xc6)
289 #define OPC_MOVL_EvIz (0xc7)
290 #define OPC_MOVL_Iv (0xb8)
291 #define OPC_MOVBE_GyMy (0xf0 | P_EXT38)
292 #define OPC_MOVBE_MyGy (0xf1 | P_EXT38)
293 #define OPC_MOVD_VyEy (0x6e | P_EXT | P_DATA16)
294 #define OPC_MOVD_EyVy (0x7e | P_EXT | P_DATA16)
295 #define OPC_MOVDDUP (0x12 | P_EXT | P_SIMDF2)
296 #define OPC_MOVDQA_VxWx (0x6f | P_EXT | P_DATA16)
297 #define OPC_MOVDQA_WxVx (0x7f | P_EXT | P_DATA16)
298 #define OPC_MOVDQU_VxWx (0x6f | P_EXT | P_SIMDF3)
299 #define OPC_MOVDQU_WxVx (0x7f | P_EXT | P_SIMDF3)
300 #define OPC_MOVQ_VqWq (0x7e | P_EXT | P_SIMDF3)
301 #define OPC_MOVQ_WqVq (0xd6 | P_EXT | P_DATA16)
302 #define OPC_MOVSBL (0xbe | P_EXT)
303 #define OPC_MOVSWL (0xbf | P_EXT)
304 #define OPC_MOVSLQ (0x63 | P_REXW)
305 #define OPC_MOVZBL (0xb6 | P_EXT)
306 #define OPC_MOVZWL (0xb7 | P_EXT)
307 #define OPC_PABSB (0x1c | P_EXT38 | P_DATA16)
308 #define OPC_PABSW (0x1d | P_EXT38 | P_DATA16)
309 #define OPC_PABSD (0x1e | P_EXT38 | P_DATA16)
310 #define OPC_PACKSSDW (0x6b | P_EXT | P_DATA16)
311 #define OPC_PACKSSWB (0x63 | P_EXT | P_DATA16)
312 #define OPC_PACKUSDW (0x2b | P_EXT38 | P_DATA16)
313 #define OPC_PACKUSWB (0x67 | P_EXT | P_DATA16)
314 #define OPC_PADDB (0xfc | P_EXT | P_DATA16)
315 #define OPC_PADDW (0xfd | P_EXT | P_DATA16)
316 #define OPC_PADDD (0xfe | P_EXT | P_DATA16)
317 #define OPC_PADDQ (0xd4 | P_EXT | P_DATA16)
318 #define OPC_PADDSB (0xec | P_EXT | P_DATA16)
319 #define OPC_PADDSW (0xed | P_EXT | P_DATA16)
320 #define OPC_PADDUB (0xdc | P_EXT | P_DATA16)
321 #define OPC_PADDUW (0xdd | P_EXT | P_DATA16)
322 #define OPC_PAND (0xdb | P_EXT | P_DATA16)
323 #define OPC_PANDN (0xdf | P_EXT | P_DATA16)
324 #define OPC_PBLENDW (0x0e | P_EXT3A | P_DATA16)
325 #define OPC_PCMPEQB (0x74 | P_EXT | P_DATA16)
326 #define OPC_PCMPEQW (0x75 | P_EXT | P_DATA16)
327 #define OPC_PCMPEQD (0x76 | P_EXT | P_DATA16)
328 #define OPC_PCMPEQQ (0x29 | P_EXT38 | P_DATA16)
329 #define OPC_PCMPGTB (0x64 | P_EXT | P_DATA16)
330 #define OPC_PCMPGTW (0x65 | P_EXT | P_DATA16)
331 #define OPC_PCMPGTD (0x66 | P_EXT | P_DATA16)
332 #define OPC_PCMPGTQ (0x37 | P_EXT38 | P_DATA16)
333 #define OPC_PMAXSB (0x3c | P_EXT38 | P_DATA16)
334 #define OPC_PMAXSW (0xee | P_EXT | P_DATA16)
335 #define OPC_PMAXSD (0x3d | P_EXT38 | P_DATA16)
336 #define OPC_PMAXUB (0xde | P_EXT | P_DATA16)
337 #define OPC_PMAXUW (0x3e | P_EXT38 | P_DATA16)
338 #define OPC_PMAXUD (0x3f | P_EXT38 | P_DATA16)
339 #define OPC_PMINSB (0x38 | P_EXT38 | P_DATA16)
340 #define OPC_PMINSW (0xea | P_EXT | P_DATA16)
341 #define OPC_PMINSD (0x39 | P_EXT38 | P_DATA16)
342 #define OPC_PMINUB (0xda | P_EXT | P_DATA16)
343 #define OPC_PMINUW (0x3a | P_EXT38 | P_DATA16)
344 #define OPC_PMINUD (0x3b | P_EXT38 | P_DATA16)
345 #define OPC_PMOVSXBW (0x20 | P_EXT38 | P_DATA16)
346 #define OPC_PMOVSXWD (0x23 | P_EXT38 | P_DATA16)
347 #define OPC_PMOVSXDQ (0x25 | P_EXT38 | P_DATA16)
348 #define OPC_PMOVZXBW (0x30 | P_EXT38 | P_DATA16)
349 #define OPC_PMOVZXWD (0x33 | P_EXT38 | P_DATA16)
350 #define OPC_PMOVZXDQ (0x35 | P_EXT38 | P_DATA16)
351 #define OPC_PMULLW (0xd5 | P_EXT | P_DATA16)
352 #define OPC_PMULLD (0x40 | P_EXT38 | P_DATA16)
353 #define OPC_POR (0xeb | P_EXT | P_DATA16)
354 #define OPC_PSHUFB (0x00 | P_EXT38 | P_DATA16)
355 #define OPC_PSHUFD (0x70 | P_EXT | P_DATA16)
356 #define OPC_PSHUFLW (0x70 | P_EXT | P_SIMDF2)
357 #define OPC_PSHUFHW (0x70 | P_EXT | P_SIMDF3)
358 #define OPC_PSHIFTW_Ib (0x71 | P_EXT | P_DATA16) /* /2 /6 /4 */
359 #define OPC_PSHIFTD_Ib (0x72 | P_EXT | P_DATA16) /* /2 /6 /4 */
360 #define OPC_PSHIFTQ_Ib (0x73 | P_EXT | P_DATA16) /* /2 /6 /4 */
361 #define OPC_PSLLW (0xf1 | P_EXT | P_DATA16)
362 #define OPC_PSLLD (0xf2 | P_EXT | P_DATA16)
363 #define OPC_PSLLQ (0xf3 | P_EXT | P_DATA16)
364 #define OPC_PSRAW (0xe1 | P_EXT | P_DATA16)
365 #define OPC_PSRAD (0xe2 | P_EXT | P_DATA16)
366 #define OPC_PSRLW (0xd1 | P_EXT | P_DATA16)
367 #define OPC_PSRLD (0xd2 | P_EXT | P_DATA16)
368 #define OPC_PSRLQ (0xd3 | P_EXT | P_DATA16)
369 #define OPC_PSUBB (0xf8 | P_EXT | P_DATA16)
370 #define OPC_PSUBW (0xf9 | P_EXT | P_DATA16)
371 #define OPC_PSUBD (0xfa | P_EXT | P_DATA16)
372 #define OPC_PSUBQ (0xfb | P_EXT | P_DATA16)
373 #define OPC_PSUBSB (0xe8 | P_EXT | P_DATA16)
374 #define OPC_PSUBSW (0xe9 | P_EXT | P_DATA16)
375 #define OPC_PSUBUB (0xd8 | P_EXT | P_DATA16)
376 #define OPC_PSUBUW (0xd9 | P_EXT | P_DATA16)
377 #define OPC_PUNPCKLBW (0x60 | P_EXT | P_DATA16)
378 #define OPC_PUNPCKLWD (0x61 | P_EXT | P_DATA16)
379 #define OPC_PUNPCKLDQ (0x62 | P_EXT | P_DATA16)
380 #define OPC_PUNPCKLQDQ (0x6c | P_EXT | P_DATA16)
381 #define OPC_PUNPCKHBW (0x68 | P_EXT | P_DATA16)
382 #define OPC_PUNPCKHWD (0x69 | P_EXT | P_DATA16)
383 #define OPC_PUNPCKHDQ (0x6a | P_EXT | P_DATA16)
384 #define OPC_PUNPCKHQDQ (0x6d | P_EXT | P_DATA16)
385 #define OPC_PXOR (0xef | P_EXT | P_DATA16)
386 #define OPC_POP_r32 (0x58)
387 #define OPC_POPCNT (0xb8 | P_EXT | P_SIMDF3)
388 #define OPC_PUSH_r32 (0x50)
389 #define OPC_PUSH_Iv (0x68)
390 #define OPC_PUSH_Ib (0x6a)
391 #define OPC_RET (0xc3)
392 #define OPC_SETCC (0x90 | P_EXT | P_REXB_RM) /* ... plus cc */
393 #define OPC_SHIFT_1 (0xd1)
394 #define OPC_SHIFT_Ib (0xc1)
395 #define OPC_SHIFT_cl (0xd3)
396 #define OPC_SARX (0xf7 | P_EXT38 | P_SIMDF3)
397 #define OPC_SHUFPS (0xc6 | P_EXT)
398 #define OPC_SHLX (0xf7 | P_EXT38 | P_DATA16)
399 #define OPC_SHRX (0xf7 | P_EXT38 | P_SIMDF2)
400 #define OPC_SHRD_Ib (0xac | P_EXT)
401 #define OPC_TESTL (0x85)
402 #define OPC_TZCNT (0xbc | P_EXT | P_SIMDF3)
403 #define OPC_UD2 (0x0b | P_EXT)
404 #define OPC_VPBLENDD (0x02 | P_EXT3A | P_DATA16)
405 #define OPC_VPBLENDVB (0x4c | P_EXT3A | P_DATA16)
406 #define OPC_VPINSRB (0x20 | P_EXT3A | P_DATA16)
407 #define OPC_VPINSRW (0xc4 | P_EXT | P_DATA16)
408 #define OPC_VBROADCASTSS (0x18 | P_EXT38 | P_DATA16)
409 #define OPC_VBROADCASTSD (0x19 | P_EXT38 | P_DATA16)
410 #define OPC_VPBROADCASTB (0x78 | P_EXT38 | P_DATA16)
411 #define OPC_VPBROADCASTW (0x79 | P_EXT38 | P_DATA16)
412 #define OPC_VPBROADCASTD (0x58 | P_EXT38 | P_DATA16)
413 #define OPC_VPBROADCASTQ (0x59 | P_EXT38 | P_DATA16)
414 #define OPC_VPERMQ (0x00 | P_EXT3A | P_DATA16 | P_VEXW)
415 #define OPC_VPERM2I128 (0x46 | P_EXT3A | P_DATA16 | P_VEXL)
416 #define OPC_VPSLLVD (0x47 | P_EXT38 | P_DATA16)
417 #define OPC_VPSLLVQ (0x47 | P_EXT38 | P_DATA16 | P_VEXW)
418 #define OPC_VPSRAVD (0x46 | P_EXT38 | P_DATA16)
419 #define OPC_VPSRLVD (0x45 | P_EXT38 | P_DATA16)
420 #define OPC_VPSRLVQ (0x45 | P_EXT38 | P_DATA16 | P_VEXW)
421 #define OPC_VZEROUPPER (0x77 | P_EXT)
422 #define OPC_XCHG_ax_r32 (0x90)
423
424 #define OPC_GRP3_Ev (0xf7)
425 #define OPC_GRP5 (0xff)
426 #define OPC_GRP14 (0x73 | P_EXT | P_DATA16)
427
428 /* Group 1 opcode extensions for 0x80-0x83.
429 These are also used as modifiers for OPC_ARITH. */
430 #define ARITH_ADD 0
431 #define ARITH_OR 1
432 #define ARITH_ADC 2
433 #define ARITH_SBB 3
434 #define ARITH_AND 4
435 #define ARITH_SUB 5
436 #define ARITH_XOR 6
437 #define ARITH_CMP 7
438
439 /* Group 2 opcode extensions for 0xc0, 0xc1, 0xd0-0xd3. */
440 #define SHIFT_ROL 0
441 #define SHIFT_ROR 1
442 #define SHIFT_SHL 4
443 #define SHIFT_SHR 5
444 #define SHIFT_SAR 7
445
446 /* Group 3 opcode extensions for 0xf6, 0xf7. To be used with OPC_GRP3. */
447 #define EXT3_NOT 2
448 #define EXT3_NEG 3
449 #define EXT3_MUL 4
450 #define EXT3_IMUL 5
451 #define EXT3_DIV 6
452 #define EXT3_IDIV 7
453
454 /* Group 5 opcode extensions for 0xff. To be used with OPC_GRP5. */
455 #define EXT5_INC_Ev 0
456 #define EXT5_DEC_Ev 1
457 #define EXT5_CALLN_Ev 2
458 #define EXT5_JMPN_Ev 4
459
460 /* Condition codes to be added to OPC_JCC_{long,short}. */
461 #define JCC_JMP (-1)
462 #define JCC_JO 0x0
463 #define JCC_JNO 0x1
464 #define JCC_JB 0x2
465 #define JCC_JAE 0x3
466 #define JCC_JE 0x4
467 #define JCC_JNE 0x5
468 #define JCC_JBE 0x6
469 #define JCC_JA 0x7
470 #define JCC_JS 0x8
471 #define JCC_JNS 0x9
472 #define JCC_JP 0xa
473 #define JCC_JNP 0xb
474 #define JCC_JL 0xc
475 #define JCC_JGE 0xd
476 #define JCC_JLE 0xe
477 #define JCC_JG 0xf
478
479 static const uint8_t tcg_cond_to_jcc[] = {
480 [TCG_COND_EQ] = JCC_JE,
481 [TCG_COND_NE] = JCC_JNE,
482 [TCG_COND_LT] = JCC_JL,
483 [TCG_COND_GE] = JCC_JGE,
484 [TCG_COND_LE] = JCC_JLE,
485 [TCG_COND_GT] = JCC_JG,
486 [TCG_COND_LTU] = JCC_JB,
487 [TCG_COND_GEU] = JCC_JAE,
488 [TCG_COND_LEU] = JCC_JBE,
489 [TCG_COND_GTU] = JCC_JA,
490 };
491
492 #if TCG_TARGET_REG_BITS == 64
493 static void tcg_out_opc(TCGContext *s, int opc, int r, int rm, int x)
494 {
495 int rex;
496
497 if (opc & P_GS) {
498 tcg_out8(s, 0x65);
499 }
500 if (opc & P_DATA16) {
501 /* We should never be asking for both 16 and 64-bit operation. */
502 tcg_debug_assert((opc & P_REXW) == 0);
503 tcg_out8(s, 0x66);
504 }
505 if (opc & P_SIMDF3) {
506 tcg_out8(s, 0xf3);
507 } else if (opc & P_SIMDF2) {
508 tcg_out8(s, 0xf2);
509 }
510
511 rex = 0;
512 rex |= (opc & P_REXW) ? 0x8 : 0x0; /* REX.W */
513 rex |= (r & 8) >> 1; /* REX.R */
514 rex |= (x & 8) >> 2; /* REX.X */
515 rex |= (rm & 8) >> 3; /* REX.B */
516
517 /* P_REXB_{R,RM} indicates that the given register is the low byte.
518 For %[abcd]l we need no REX prefix, but for %{si,di,bp,sp}l we do,
519 as otherwise the encoding indicates %[abcd]h. Note that the values
520 that are ORed in merely indicate that the REX byte must be present;
521 those bits get discarded in output. */
522 rex |= opc & (r >= 4 ? P_REXB_R : 0);
523 rex |= opc & (rm >= 4 ? P_REXB_RM : 0);
524
525 if (rex) {
526 tcg_out8(s, (uint8_t)(rex | 0x40));
527 }
528
529 if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
530 tcg_out8(s, 0x0f);
531 if (opc & P_EXT38) {
532 tcg_out8(s, 0x38);
533 } else if (opc & P_EXT3A) {
534 tcg_out8(s, 0x3a);
535 }
536 }
537
538 tcg_out8(s, opc);
539 }
540 #else
541 static void tcg_out_opc(TCGContext *s, int opc)
542 {
543 if (opc & P_DATA16) {
544 tcg_out8(s, 0x66);
545 }
546 if (opc & P_SIMDF3) {
547 tcg_out8(s, 0xf3);
548 } else if (opc & P_SIMDF2) {
549 tcg_out8(s, 0xf2);
550 }
551 if (opc & (P_EXT | P_EXT38 | P_EXT3A)) {
552 tcg_out8(s, 0x0f);
553 if (opc & P_EXT38) {
554 tcg_out8(s, 0x38);
555 } else if (opc & P_EXT3A) {
556 tcg_out8(s, 0x3a);
557 }
558 }
559 tcg_out8(s, opc);
560 }
561 /* Discard the register arguments to tcg_out_opc early, so as not to penalize
562 the 32-bit compilation paths. This method works with all versions of gcc,
563 whereas relying on optimization may not be able to exclude them. */
564 #define tcg_out_opc(s, opc, r, rm, x) (tcg_out_opc)(s, opc)
565 #endif
566
567 static void tcg_out_modrm(TCGContext *s, int opc, int r, int rm)
568 {
569 tcg_out_opc(s, opc, r, rm, 0);
570 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
571 }
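/*
 * Worked example (illustrative, x86_64 host): tcg_out_modrm(s,
 * OPC_ADD_GvEv + P_REXW, TCG_REG_RAX, TCG_REG_R8) emits a REX.W+B
 * prefix (0x49), the 0x03 opcode and a ModRM byte of 0xc0, i.e.
 * the three bytes 49 03 c0 == "add %r8, %rax".
 */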
572
573 static void tcg_out_vex_opc(TCGContext *s, int opc, int r, int v,
574 int rm, int index)
575 {
576 int tmp;
577
578 /* Use the two byte form if possible, which cannot encode
579 VEX.W, VEX.B, VEX.X, or an m-mmmm field other than P_EXT. */
580 if ((opc & (P_EXT | P_EXT38 | P_EXT3A | P_VEXW)) == P_EXT
581 && ((rm | index) & 8) == 0) {
582 /* Two byte VEX prefix. */
583 tcg_out8(s, 0xc5);
584
585 tmp = (r & 8 ? 0 : 0x80); /* VEX.R */
586 } else {
587 /* Three byte VEX prefix. */
588 tcg_out8(s, 0xc4);
589
590 /* VEX.m-mmmm */
591 if (opc & P_EXT3A) {
592 tmp = 3;
593 } else if (opc & P_EXT38) {
594 tmp = 2;
595 } else if (opc & P_EXT) {
596 tmp = 1;
597 } else {
598 g_assert_not_reached();
599 }
600 tmp |= (r & 8 ? 0 : 0x80); /* VEX.R */
601 tmp |= (index & 8 ? 0 : 0x40); /* VEX.X */
602 tmp |= (rm & 8 ? 0 : 0x20); /* VEX.B */
603 tcg_out8(s, tmp);
604
605 tmp = (opc & P_VEXW ? 0x80 : 0); /* VEX.W */
606 }
607
608 tmp |= (opc & P_VEXL ? 0x04 : 0); /* VEX.L */
609 /* VEX.pp */
610 if (opc & P_DATA16) {
611 tmp |= 1; /* 0x66 */
612 } else if (opc & P_SIMDF3) {
613 tmp |= 2; /* 0xf3 */
614 } else if (opc & P_SIMDF2) {
615 tmp |= 3; /* 0xf2 */
616 }
617 tmp |= (~v & 15) << 3; /* VEX.vvvv */
618 tcg_out8(s, tmp);
619 tcg_out8(s, opc);
620 }
621
622 static void tcg_out_vex_modrm(TCGContext *s, int opc, int r, int v, int rm)
623 {
624 tcg_out_vex_opc(s, opc, r, v, rm, 0);
625 tcg_out8(s, 0xc0 | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
626 }
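/*
 * Worked example (illustrative): tcg_out_vex_modrm(s, OPC_PADDD,
 * TCG_REG_XMM0, TCG_REG_XMM1, TCG_REG_XMM2) qualifies for the two-byte
 * VEX form and emits c5 f1 fe c2, i.e. "vpaddd %xmm2, %xmm1, %xmm0";
 * the 0xf1 byte packs ~VEX.R, ~vvvv (= ~1), VEX.L = 0 and pp = 01
 * (the 0x66 prefix implied by P_DATA16).
 */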
627
628 /* Output an opcode with a full "rm + (index<<shift) + offset" address mode.
629    A missing RM or INDEX is indicated by a negative value.  In 64-bit
630 mode for absolute addresses, ~RM is the size of the immediate operand
631 that will follow the instruction. */
632
633 static void tcg_out_sib_offset(TCGContext *s, int r, int rm, int index,
634 int shift, intptr_t offset)
635 {
636 int mod, len;
637
638 if (index < 0 && rm < 0) {
639 if (TCG_TARGET_REG_BITS == 64) {
640 /* Try for a rip-relative addressing mode. This has replaced
641 the 32-bit-mode absolute addressing encoding. */
642 intptr_t pc = (intptr_t)s->code_ptr + 5 + ~rm;
643 intptr_t disp = offset - pc;
644 if (disp == (int32_t)disp) {
645 tcg_out8(s, (LOWREGMASK(r) << 3) | 5);
646 tcg_out32(s, disp);
647 return;
648 }
649
650 /* Try for an absolute address encoding. This requires the
651 use of the MODRM+SIB encoding and is therefore larger than
652 rip-relative addressing. */
653 if (offset == (int32_t)offset) {
654 tcg_out8(s, (LOWREGMASK(r) << 3) | 4);
655 tcg_out8(s, (4 << 3) | 5);
656 tcg_out32(s, offset);
657 return;
658 }
659
660 /* ??? The memory isn't directly addressable. */
661 g_assert_not_reached();
662 } else {
663 /* Absolute address. */
664 tcg_out8(s, (r << 3) | 5);
665 tcg_out32(s, offset);
666 return;
667 }
668 }
669
670 /* Find the length of the immediate addend. Note that the encoding
671 that would be used for (%ebp) indicates absolute addressing. */
672 if (rm < 0) {
673 mod = 0, len = 4, rm = 5;
674 } else if (offset == 0 && LOWREGMASK(rm) != TCG_REG_EBP) {
675 mod = 0, len = 0;
676 } else if (offset == (int8_t)offset) {
677 mod = 0x40, len = 1;
678 } else {
679 mod = 0x80, len = 4;
680 }
681
682 /* Use a single byte MODRM format if possible. Note that the encoding
683 that would be used for %esp is the escape to the two byte form. */
684 if (index < 0 && LOWREGMASK(rm) != TCG_REG_ESP) {
685 /* Single byte MODRM format. */
686 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | LOWREGMASK(rm));
687 } else {
688 /* Two byte MODRM+SIB format. */
689
690 /* Note that the encoding that would place %esp into the index
691 field indicates no index register. In 64-bit mode, the REX.X
692 bit counts, so %r12 can be used as the index. */
693 if (index < 0) {
694 index = 4;
695 } else {
696 tcg_debug_assert(index != TCG_REG_ESP);
697 }
698
699 tcg_out8(s, mod | (LOWREGMASK(r) << 3) | 4);
700 tcg_out8(s, (shift << 6) | (LOWREGMASK(index) << 3) | LOWREGMASK(rm));
701 }
702
703 if (len == 1) {
704 tcg_out8(s, offset);
705 } else if (len == 4) {
706 tcg_out32(s, offset);
707 }
708 }
709
710 static void tcg_out_modrm_sib_offset(TCGContext *s, int opc, int r, int rm,
711 int index, int shift, intptr_t offset)
712 {
713 tcg_out_opc(s, opc, r, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
714 tcg_out_sib_offset(s, r, rm, index, shift, offset);
715 }
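/*
 * Worked example (illustrative): tcg_out_modrm_sib_offset(s,
 * OPC_MOVL_GvEv, TCG_REG_EAX, TCG_REG_EBX, TCG_REG_ECX, 2, 0x10)
 * selects the two byte MODRM+SIB form with a disp8 and emits
 * 8b 44 8b 10, i.e. "movl 0x10(%ebx,%ecx,4), %eax" (the same bytes
 * address 0x10(%rbx,%rcx,4) on an x86_64 host).
 */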
716
717 static void tcg_out_vex_modrm_sib_offset(TCGContext *s, int opc, int r, int v,
718 int rm, int index, int shift,
719 intptr_t offset)
720 {
721 tcg_out_vex_opc(s, opc, r, v, rm < 0 ? 0 : rm, index < 0 ? 0 : index);
722 tcg_out_sib_offset(s, r, rm, index, shift, offset);
723 }
724
725 /* A simplification of the above with no index or shift. */
726 static inline void tcg_out_modrm_offset(TCGContext *s, int opc, int r,
727 int rm, intptr_t offset)
728 {
729 tcg_out_modrm_sib_offset(s, opc, r, rm, -1, 0, offset);
730 }
731
732 static inline void tcg_out_vex_modrm_offset(TCGContext *s, int opc, int r,
733 int v, int rm, intptr_t offset)
734 {
735 tcg_out_vex_modrm_sib_offset(s, opc, r, v, rm, -1, 0, offset);
736 }
737
738 /* Output an opcode with an expected reference to the constant pool. */
739 static inline void tcg_out_modrm_pool(TCGContext *s, int opc, int r)
740 {
741 tcg_out_opc(s, opc, r, 0, 0);
742 /* Absolute for 32-bit, pc-relative for 64-bit. */
743 tcg_out8(s, LOWREGMASK(r) << 3 | 5);
744 tcg_out32(s, 0);
745 }
746
747 /* Output an opcode with an expected reference to the constant pool. */
748 static inline void tcg_out_vex_modrm_pool(TCGContext *s, int opc, int r)
749 {
750 tcg_out_vex_opc(s, opc, r, 0, 0, 0);
751 /* Absolute for 32-bit, pc-relative for 64-bit. */
752 tcg_out8(s, LOWREGMASK(r) << 3 | 5);
753 tcg_out32(s, 0);
754 }
755
756 /* Generate dest op= src. Uses the same ARITH_* codes as tgen_arithi. */
757 static inline void tgen_arithr(TCGContext *s, int subop, int dest, int src)
758 {
759 /* Propagate an opcode prefix, such as P_REXW. */
760 int ext = subop & ~0x7;
761 subop &= 0x7;
762
763 tcg_out_modrm(s, OPC_ARITH_GvEv + (subop << 3) + ext, dest, src);
764 }
765
766 static bool tcg_out_mov(TCGContext *s, TCGType type, TCGReg ret, TCGReg arg)
767 {
768 int rexw = 0;
769
770 if (arg == ret) {
771 return true;
772 }
773 switch (type) {
774 case TCG_TYPE_I64:
775 rexw = P_REXW;
776 /* fallthru */
777 case TCG_TYPE_I32:
778 if (ret < 16) {
779 if (arg < 16) {
780 tcg_out_modrm(s, OPC_MOVL_GvEv + rexw, ret, arg);
781 } else {
782 tcg_out_vex_modrm(s, OPC_MOVD_EyVy + rexw, arg, 0, ret);
783 }
784 } else {
785 if (arg < 16) {
786 tcg_out_vex_modrm(s, OPC_MOVD_VyEy + rexw, ret, 0, arg);
787 } else {
788 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
789 }
790 }
791 break;
792
793 case TCG_TYPE_V64:
794 tcg_debug_assert(ret >= 16 && arg >= 16);
795 tcg_out_vex_modrm(s, OPC_MOVQ_VqWq, ret, 0, arg);
796 break;
797 case TCG_TYPE_V128:
798 tcg_debug_assert(ret >= 16 && arg >= 16);
799 tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx, ret, 0, arg);
800 break;
801 case TCG_TYPE_V256:
802 tcg_debug_assert(ret >= 16 && arg >= 16);
803 tcg_out_vex_modrm(s, OPC_MOVDQA_VxWx | P_VEXL, ret, 0, arg);
804 break;
805
806 default:
807 g_assert_not_reached();
808 }
809 return true;
810 }
811
812 static const int avx2_dup_insn[4] = {
813 OPC_VPBROADCASTB, OPC_VPBROADCASTW,
814 OPC_VPBROADCASTD, OPC_VPBROADCASTQ,
815 };
816
817 static bool tcg_out_dup_vec(TCGContext *s, TCGType type, unsigned vece,
818 TCGReg r, TCGReg a)
819 {
820 if (have_avx2) {
821 int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
822 tcg_out_vex_modrm(s, avx2_dup_insn[vece] + vex_l, r, 0, a);
823 } else {
824 switch (vece) {
825 case MO_8:
826 /* ??? With zero in a register, use PSHUFB. */
827 tcg_out_vex_modrm(s, OPC_PUNPCKLBW, r, a, a);
828 a = r;
829 /* FALLTHRU */
830 case MO_16:
831 tcg_out_vex_modrm(s, OPC_PUNPCKLWD, r, a, a);
832 a = r;
833 /* FALLTHRU */
834 case MO_32:
835 tcg_out_vex_modrm(s, OPC_PSHUFD, r, 0, a);
836 /* imm8 operand: all output lanes selected from input lane 0. */
837 tcg_out8(s, 0);
838 break;
839 case MO_64:
840 tcg_out_vex_modrm(s, OPC_PUNPCKLQDQ, r, a, a);
841 break;
842 default:
843 g_assert_not_reached();
844 }
845 }
846 return true;
847 }
848
849 static bool tcg_out_dupm_vec(TCGContext *s, TCGType type, unsigned vece,
850 TCGReg r, TCGReg base, intptr_t offset)
851 {
852 if (have_avx2) {
853 int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
854 tcg_out_vex_modrm_offset(s, avx2_dup_insn[vece] + vex_l,
855 r, 0, base, offset);
856 } else {
857 switch (vece) {
858 case MO_64:
859 tcg_out_vex_modrm_offset(s, OPC_MOVDDUP, r, 0, base, offset);
860 break;
861 case MO_32:
862 tcg_out_vex_modrm_offset(s, OPC_VBROADCASTSS, r, 0, base, offset);
863 break;
864 case MO_16:
865 tcg_out_vex_modrm_offset(s, OPC_VPINSRW, r, r, base, offset);
866 tcg_out8(s, 0); /* imm8 */
867 tcg_out_dup_vec(s, type, vece, r, r);
868 break;
869 case MO_8:
870 tcg_out_vex_modrm_offset(s, OPC_VPINSRB, r, r, base, offset);
871 tcg_out8(s, 0); /* imm8 */
872 tcg_out_dup_vec(s, type, vece, r, r);
873 break;
874 default:
875 g_assert_not_reached();
876 }
877 }
878 return true;
879 }
880
881 static void tcg_out_dupi_vec(TCGContext *s, TCGType type, unsigned vece,
882 TCGReg ret, int64_t arg)
883 {
884 int vex_l = (type == TCG_TYPE_V256 ? P_VEXL : 0);
885
886 if (arg == 0) {
887 tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
888 return;
889 }
890 if (arg == -1) {
891 tcg_out_vex_modrm(s, OPC_PCMPEQB + vex_l, ret, ret, ret);
892 return;
893 }
894
895 if (TCG_TARGET_REG_BITS == 32 && vece < MO_64) {
896 if (have_avx2) {
897 tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTD + vex_l, ret);
898 } else {
899 tcg_out_vex_modrm_pool(s, OPC_VBROADCASTSS, ret);
900 }
901 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
902 } else {
903 if (type == TCG_TYPE_V64) {
904 tcg_out_vex_modrm_pool(s, OPC_MOVQ_VqWq, ret);
905 } else if (have_avx2) {
906 tcg_out_vex_modrm_pool(s, OPC_VPBROADCASTQ + vex_l, ret);
907 } else {
908 tcg_out_vex_modrm_pool(s, OPC_MOVDDUP, ret);
909 }
910 if (TCG_TARGET_REG_BITS == 64) {
911 new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
912 } else {
913 new_pool_l2(s, R_386_32, s->code_ptr - 4, 0, arg, arg >> 32);
914 }
915 }
916 }
917
918 static void tcg_out_movi_vec(TCGContext *s, TCGType type,
919 TCGReg ret, tcg_target_long arg)
920 {
921 if (arg == 0) {
922 tcg_out_vex_modrm(s, OPC_PXOR, ret, ret, ret);
923 return;
924 }
925 if (arg == -1) {
926 tcg_out_vex_modrm(s, OPC_PCMPEQB, ret, ret, ret);
927 return;
928 }
929
930 int rexw = (type == TCG_TYPE_I32 ? 0 : P_REXW);
931 tcg_out_vex_modrm_pool(s, OPC_MOVD_VyEy + rexw, ret);
932 if (TCG_TARGET_REG_BITS == 64) {
933 new_pool_label(s, arg, R_386_PC32, s->code_ptr - 4, -4);
934 } else {
935 new_pool_label(s, arg, R_386_32, s->code_ptr - 4, 0);
936 }
937 }
938
939 static void tcg_out_movi_int(TCGContext *s, TCGType type,
940 TCGReg ret, tcg_target_long arg)
941 {
942 tcg_target_long diff;
943
944 if (arg == 0) {
945 tgen_arithr(s, ARITH_XOR, ret, ret);
946 return;
947 }
948 if (arg == (uint32_t)arg || type == TCG_TYPE_I32) {
949 tcg_out_opc(s, OPC_MOVL_Iv + LOWREGMASK(ret), 0, ret, 0);
950 tcg_out32(s, arg);
951 return;
952 }
953 if (arg == (int32_t)arg) {
954 tcg_out_modrm(s, OPC_MOVL_EvIz + P_REXW, 0, ret);
955 tcg_out32(s, arg);
956 return;
957 }
958
959 /* Try a 7 byte pc-relative lea before the 10 byte movq. */
960 diff = tcg_pcrel_diff(s, (const void *)arg) - 7;
961 if (diff == (int32_t)diff) {
962 tcg_out_opc(s, OPC_LEA | P_REXW, ret, 0, 0);
963 tcg_out8(s, (LOWREGMASK(ret) << 3) | 5);
964 tcg_out32(s, diff);
965 return;
966 }
967
968 tcg_out_opc(s, OPC_MOVL_Iv + P_REXW + LOWREGMASK(ret), 0, ret, 0);
969 tcg_out64(s, arg);
970 }
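/*
 * In short, tcg_out_movi_int picks the smallest encoding available:
 * a 2 byte "xor r,r" for zero, a 5 byte "movl $imm32" for values that
 * zero-extend, a 7 byte "movq $simm32" for values that sign-extend,
 * the 7 byte rip-relative lea when the value is within +/- 2GB of the
 * code buffer, and only then the full 10 byte movabs.
 */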
971
972 static void tcg_out_movi(TCGContext *s, TCGType type,
973 TCGReg ret, tcg_target_long arg)
974 {
975 switch (type) {
976 case TCG_TYPE_I32:
977 #if TCG_TARGET_REG_BITS == 64
978 case TCG_TYPE_I64:
979 #endif
980 if (ret < 16) {
981 tcg_out_movi_int(s, type, ret, arg);
982 } else {
983 tcg_out_movi_vec(s, type, ret, arg);
984 }
985 break;
986 default:
987 g_assert_not_reached();
988 }
989 }
990
991 static inline void tcg_out_pushi(TCGContext *s, tcg_target_long val)
992 {
993 if (val == (int8_t)val) {
994 tcg_out_opc(s, OPC_PUSH_Ib, 0, 0, 0);
995 tcg_out8(s, val);
996 } else if (val == (int32_t)val) {
997 tcg_out_opc(s, OPC_PUSH_Iv, 0, 0, 0);
998 tcg_out32(s, val);
999 } else {
1000 tcg_abort();
1001 }
1002 }
1003
1004 static inline void tcg_out_mb(TCGContext *s, TCGArg a0)
1005 {
1006     /* Given the strength of x86 memory ordering, we need only care about
1007 store-load ordering. Experimentally, "lock orl $0,0(%esp)" is
1008 faster than "mfence", so don't bother with the sse insn. */
1009 if (a0 & TCG_MO_ST_LD) {
1010 tcg_out8(s, 0xf0);
1011 tcg_out_modrm_offset(s, OPC_ARITH_EvIb, ARITH_OR, TCG_REG_ESP, 0);
1012 tcg_out8(s, 0);
1013 }
1014 }
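/*
 * The barrier above assembles to f0 83 0c 24 00, i.e.
 * "lock orl $0, (%esp)": a locked read-modify-write of the top of
 * the stack, which is sufficient to order earlier stores before
 * later loads.
 */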
1015
1016 static inline void tcg_out_push(TCGContext *s, int reg)
1017 {
1018 tcg_out_opc(s, OPC_PUSH_r32 + LOWREGMASK(reg), 0, reg, 0);
1019 }
1020
1021 static inline void tcg_out_pop(TCGContext *s, int reg)
1022 {
1023 tcg_out_opc(s, OPC_POP_r32 + LOWREGMASK(reg), 0, reg, 0);
1024 }
1025
1026 static void tcg_out_ld(TCGContext *s, TCGType type, TCGReg ret,
1027 TCGReg arg1, intptr_t arg2)
1028 {
1029 switch (type) {
1030 case TCG_TYPE_I32:
1031 if (ret < 16) {
1032 tcg_out_modrm_offset(s, OPC_MOVL_GvEv, ret, arg1, arg2);
1033 } else {
1034 tcg_out_vex_modrm_offset(s, OPC_MOVD_VyEy, ret, 0, arg1, arg2);
1035 }
1036 break;
1037 case TCG_TYPE_I64:
1038 if (ret < 16) {
1039 tcg_out_modrm_offset(s, OPC_MOVL_GvEv | P_REXW, ret, arg1, arg2);
1040 break;
1041 }
1042 /* FALLTHRU */
1043 case TCG_TYPE_V64:
1044 /* There is no instruction that can validate 8-byte alignment. */
1045 tcg_debug_assert(ret >= 16);
1046 tcg_out_vex_modrm_offset(s, OPC_MOVQ_VqWq, ret, 0, arg1, arg2);
1047 break;
1048 case TCG_TYPE_V128:
1049 /*
1050          * The gvec infrastructure asserts that v128 vector loads
1051 * and stores use a 16-byte aligned offset. Validate that the
1052 * final pointer is aligned by using an insn that will SIGSEGV.
1053 */
1054 tcg_debug_assert(ret >= 16);
1055 tcg_out_vex_modrm_offset(s, OPC_MOVDQA_VxWx, ret, 0, arg1, arg2);
1056 break;
1057 case TCG_TYPE_V256:
1058 /*
1059 * The gvec infrastructure only requires 16-byte alignment,
1060 * so here we must use an unaligned load.
1061 */
1062 tcg_debug_assert(ret >= 16);
1063 tcg_out_vex_modrm_offset(s, OPC_MOVDQU_VxWx | P_VEXL,
1064 ret, 0, arg1, arg2);
1065 break;
1066 default:
1067 g_assert_not_reached();
1068 }
1069 }
1070
1071 static void tcg_out_st(TCGContext *s, TCGType type, TCGReg arg,
1072 TCGReg arg1, intptr_t arg2)
1073 {
1074 switch (type) {
1075 case TCG_TYPE_I32:
1076 if (arg < 16) {
1077 tcg_out_modrm_offset(s, OPC_MOVL_EvGv, arg, arg1, arg2);
1078 } else {
1079 tcg_out_vex_modrm_offset(s, OPC_MOVD_EyVy, arg, 0, arg1, arg2);
1080 }
1081 break;
1082 case TCG_TYPE_I64:
1083 if (arg < 16) {
1084 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_REXW, arg, arg1, arg2);
1085 break;
1086 }
1087 /* FALLTHRU */
1088 case TCG_TYPE_V64:
1089 /* There is no instruction that can validate 8-byte alignment. */
1090 tcg_debug_assert(arg >= 16);
1091 tcg_out_vex_modrm_offset(s, OPC_MOVQ_WqVq, arg, 0, arg1, arg2);
1092 break;
1093 case TCG_TYPE_V128:
1094 /*
1095          * The gvec infrastructure asserts that v128 vector loads
1096 * and stores use a 16-byte aligned offset. Validate that the
1097 * final pointer is aligned by using an insn that will SIGSEGV.
1098 */
1099 tcg_debug_assert(arg >= 16);
1100 tcg_out_vex_modrm_offset(s, OPC_MOVDQA_WxVx, arg, 0, arg1, arg2);
1101 break;
1102 case TCG_TYPE_V256:
1103 /*
1104 * The gvec infrastructure only requires 16-byte alignment,
1105 * so here we must use an unaligned store.
1106 */
1107 tcg_debug_assert(arg >= 16);
1108 tcg_out_vex_modrm_offset(s, OPC_MOVDQU_WxVx | P_VEXL,
1109 arg, 0, arg1, arg2);
1110 break;
1111 default:
1112 g_assert_not_reached();
1113 }
1114 }
1115
1116 static bool tcg_out_sti(TCGContext *s, TCGType type, TCGArg val,
1117 TCGReg base, intptr_t ofs)
1118 {
1119 int rexw = 0;
1120 if (TCG_TARGET_REG_BITS == 64 && type == TCG_TYPE_I64) {
1121 if (val != (int32_t)val) {
1122 return false;
1123 }
1124 rexw = P_REXW;
1125 } else if (type != TCG_TYPE_I32) {
1126 return false;
1127 }
1128 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | rexw, 0, base, ofs);
1129 tcg_out32(s, val);
1130 return true;
1131 }
1132
1133 static void tcg_out_shifti(TCGContext *s, int subopc, int reg, int count)
1134 {
1135 /* Propagate an opcode prefix, such as P_DATA16. */
1136 int ext = subopc & ~0x7;
1137 subopc &= 0x7;
1138
1139 if (count == 1) {
1140 tcg_out_modrm(s, OPC_SHIFT_1 + ext, subopc, reg);
1141 } else {
1142 tcg_out_modrm(s, OPC_SHIFT_Ib + ext, subopc, reg);
1143 tcg_out8(s, count);
1144 }
1145 }
1146
1147 static inline void tcg_out_bswap32(TCGContext *s, int reg)
1148 {
1149 tcg_out_opc(s, OPC_BSWAP + LOWREGMASK(reg), 0, reg, 0);
1150 }
1151
1152 static inline void tcg_out_rolw_8(TCGContext *s, int reg)
1153 {
1154 tcg_out_shifti(s, SHIFT_ROL + P_DATA16, reg, 8);
1155 }
1156
1157 static inline void tcg_out_ext8u(TCGContext *s, int dest, int src)
1158 {
1159 /* movzbl */
1160 tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1161 tcg_out_modrm(s, OPC_MOVZBL + P_REXB_RM, dest, src);
1162 }
1163
1164 static void tcg_out_ext8s(TCGContext *s, int dest, int src, int rexw)
1165 {
1166 /* movsbl */
1167 tcg_debug_assert(src < 4 || TCG_TARGET_REG_BITS == 64);
1168 tcg_out_modrm(s, OPC_MOVSBL + P_REXB_RM + rexw, dest, src);
1169 }
1170
1171 static inline void tcg_out_ext16u(TCGContext *s, int dest, int src)
1172 {
1173 /* movzwl */
1174 tcg_out_modrm(s, OPC_MOVZWL, dest, src);
1175 }
1176
1177 static inline void tcg_out_ext16s(TCGContext *s, int dest, int src, int rexw)
1178 {
1179 /* movsw[lq] */
1180 tcg_out_modrm(s, OPC_MOVSWL + rexw, dest, src);
1181 }
1182
1183 static inline void tcg_out_ext32u(TCGContext *s, int dest, int src)
1184 {
1185 /* 32-bit mov zero extends. */
1186 tcg_out_modrm(s, OPC_MOVL_GvEv, dest, src);
1187 }
1188
1189 static inline void tcg_out_ext32s(TCGContext *s, int dest, int src)
1190 {
1191 tcg_out_modrm(s, OPC_MOVSLQ, dest, src);
1192 }
1193
1194 static inline void tcg_out_bswap64(TCGContext *s, int reg)
1195 {
1196 tcg_out_opc(s, OPC_BSWAP + P_REXW + LOWREGMASK(reg), 0, reg, 0);
1197 }
1198
1199 static void tgen_arithi(TCGContext *s, int c, int r0,
1200 tcg_target_long val, int cf)
1201 {
1202 int rexw = 0;
1203
1204 if (TCG_TARGET_REG_BITS == 64) {
1205 rexw = c & -8;
1206 c &= 7;
1207 }
1208
1209     /* ??? While INC is 2 bytes shorter than ADDL $1, it also induces
1210        partial flags update stalls on Pentium4 and is not recommended
1211 by current Intel optimization manuals. */
1212 if (!cf && (c == ARITH_ADD || c == ARITH_SUB) && (val == 1 || val == -1)) {
1213 int is_inc = (c == ARITH_ADD) ^ (val < 0);
1214 if (TCG_TARGET_REG_BITS == 64) {
1215 /* The single-byte increment encodings are re-tasked as the
1216 REX prefixes. Use the MODRM encoding. */
1217 tcg_out_modrm(s, OPC_GRP5 + rexw,
1218 (is_inc ? EXT5_INC_Ev : EXT5_DEC_Ev), r0);
1219 } else {
1220 tcg_out8(s, (is_inc ? OPC_INC_r32 : OPC_DEC_r32) + r0);
1221 }
1222 return;
1223 }
1224
1225 if (c == ARITH_AND) {
1226 if (TCG_TARGET_REG_BITS == 64) {
1227 if (val == 0xffffffffu) {
1228 tcg_out_ext32u(s, r0, r0);
1229 return;
1230 }
1231 if (val == (uint32_t)val) {
1232 /* AND with no high bits set can use a 32-bit operation. */
1233 rexw = 0;
1234 }
1235 }
1236 if (val == 0xffu && (r0 < 4 || TCG_TARGET_REG_BITS == 64)) {
1237 tcg_out_ext8u(s, r0, r0);
1238 return;
1239 }
1240 if (val == 0xffffu) {
1241 tcg_out_ext16u(s, r0, r0);
1242 return;
1243 }
1244 }
1245
1246 if (val == (int8_t)val) {
1247 tcg_out_modrm(s, OPC_ARITH_EvIb + rexw, c, r0);
1248 tcg_out8(s, val);
1249 return;
1250 }
1251 if (rexw == 0 || val == (int32_t)val) {
1252 tcg_out_modrm(s, OPC_ARITH_EvIz + rexw, c, r0);
1253 tcg_out32(s, val);
1254 return;
1255 }
1256
1257 tcg_abort();
1258 }
1259
1260 static void tcg_out_addi(TCGContext *s, int reg, tcg_target_long val)
1261 {
1262 if (val != 0) {
1263 tgen_arithi(s, ARITH_ADD + P_REXW, reg, val, 0);
1264 }
1265 }
1266
1267 /* Use SMALL != 0 to force a short forward branch. */
1268 static void tcg_out_jxx(TCGContext *s, int opc, TCGLabel *l, int small)
1269 {
1270 int32_t val, val1;
1271
1272 if (l->has_value) {
1273 val = tcg_pcrel_diff(s, l->u.value_ptr);
1274 val1 = val - 2;
1275 if ((int8_t)val1 == val1) {
1276 if (opc == -1) {
1277 tcg_out8(s, OPC_JMP_short);
1278 } else {
1279 tcg_out8(s, OPC_JCC_short + opc);
1280 }
1281 tcg_out8(s, val1);
1282 } else {
1283 if (small) {
1284 tcg_abort();
1285 }
1286 if (opc == -1) {
1287 tcg_out8(s, OPC_JMP_long);
1288 tcg_out32(s, val - 5);
1289 } else {
1290 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1291 tcg_out32(s, val - 6);
1292 }
1293 }
1294 } else if (small) {
1295 if (opc == -1) {
1296 tcg_out8(s, OPC_JMP_short);
1297 } else {
1298 tcg_out8(s, OPC_JCC_short + opc);
1299 }
1300 tcg_out_reloc(s, s->code_ptr, R_386_PC8, l, -1);
1301 s->code_ptr += 1;
1302 } else {
1303 if (opc == -1) {
1304 tcg_out8(s, OPC_JMP_long);
1305 } else {
1306 tcg_out_opc(s, OPC_JCC_long + opc, 0, 0, 0);
1307 }
1308 tcg_out_reloc(s, s->code_ptr, R_386_PC32, l, -4);
1309 s->code_ptr += 4;
1310 }
1311 }
1312
1313 static void tcg_out_cmp(TCGContext *s, TCGArg arg1, TCGArg arg2,
1314 int const_arg2, int rexw)
1315 {
1316 if (const_arg2) {
1317 if (arg2 == 0) {
1318 /* test r, r */
1319 tcg_out_modrm(s, OPC_TESTL + rexw, arg1, arg1);
1320 } else {
1321 tgen_arithi(s, ARITH_CMP + rexw, arg1, arg2, 0);
1322 }
1323 } else {
1324 tgen_arithr(s, ARITH_CMP + rexw, arg1, arg2);
1325 }
1326 }
1327
1328 static void tcg_out_brcond32(TCGContext *s, TCGCond cond,
1329 TCGArg arg1, TCGArg arg2, int const_arg2,
1330 TCGLabel *label, int small)
1331 {
1332 tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1333 tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1334 }
1335
1336 #if TCG_TARGET_REG_BITS == 64
1337 static void tcg_out_brcond64(TCGContext *s, TCGCond cond,
1338 TCGArg arg1, TCGArg arg2, int const_arg2,
1339 TCGLabel *label, int small)
1340 {
1341 tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1342 tcg_out_jxx(s, tcg_cond_to_jcc[cond], label, small);
1343 }
1344 #else
1345 /* XXX: we implement it at the target level to avoid having to
1346    handle cross-basic-block temporaries.  */
1347 static void tcg_out_brcond2(TCGContext *s, const TCGArg *args,
1348 const int *const_args, int small)
1349 {
1350 TCGLabel *label_next = gen_new_label();
1351 TCGLabel *label_this = arg_label(args[5]);
1352
1353 switch(args[4]) {
1354 case TCG_COND_EQ:
1355 tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1356 label_next, 1);
1357 tcg_out_brcond32(s, TCG_COND_EQ, args[1], args[3], const_args[3],
1358 label_this, small);
1359 break;
1360 case TCG_COND_NE:
1361 tcg_out_brcond32(s, TCG_COND_NE, args[0], args[2], const_args[2],
1362 label_this, small);
1363 tcg_out_brcond32(s, TCG_COND_NE, args[1], args[3], const_args[3],
1364 label_this, small);
1365 break;
1366 case TCG_COND_LT:
1367 tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1368 label_this, small);
1369 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1370 tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1371 label_this, small);
1372 break;
1373 case TCG_COND_LE:
1374 tcg_out_brcond32(s, TCG_COND_LT, args[1], args[3], const_args[3],
1375 label_this, small);
1376 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1377 tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1378 label_this, small);
1379 break;
1380 case TCG_COND_GT:
1381 tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1382 label_this, small);
1383 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1384 tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1385 label_this, small);
1386 break;
1387 case TCG_COND_GE:
1388 tcg_out_brcond32(s, TCG_COND_GT, args[1], args[3], const_args[3],
1389 label_this, small);
1390 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1391 tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1392 label_this, small);
1393 break;
1394 case TCG_COND_LTU:
1395 tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1396 label_this, small);
1397 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1398 tcg_out_brcond32(s, TCG_COND_LTU, args[0], args[2], const_args[2],
1399 label_this, small);
1400 break;
1401 case TCG_COND_LEU:
1402 tcg_out_brcond32(s, TCG_COND_LTU, args[1], args[3], const_args[3],
1403 label_this, small);
1404 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1405 tcg_out_brcond32(s, TCG_COND_LEU, args[0], args[2], const_args[2],
1406 label_this, small);
1407 break;
1408 case TCG_COND_GTU:
1409 tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1410 label_this, small);
1411 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1412 tcg_out_brcond32(s, TCG_COND_GTU, args[0], args[2], const_args[2],
1413 label_this, small);
1414 break;
1415 case TCG_COND_GEU:
1416 tcg_out_brcond32(s, TCG_COND_GTU, args[1], args[3], const_args[3],
1417 label_this, small);
1418 tcg_out_jxx(s, JCC_JNE, label_next, 1);
1419 tcg_out_brcond32(s, TCG_COND_GEU, args[0], args[2], const_args[2],
1420 label_this, small);
1421 break;
1422 default:
1423 tcg_abort();
1424 }
1425 tcg_out_label(s, label_next);
1426 }
1427 #endif
1428
1429 static void tcg_out_setcond32(TCGContext *s, TCGCond cond, TCGArg dest,
1430 TCGArg arg1, TCGArg arg2, int const_arg2)
1431 {
1432 tcg_out_cmp(s, arg1, arg2, const_arg2, 0);
1433 tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1434 tcg_out_ext8u(s, dest, dest);
1435 }
1436
1437 #if TCG_TARGET_REG_BITS == 64
1438 static void tcg_out_setcond64(TCGContext *s, TCGCond cond, TCGArg dest,
1439 TCGArg arg1, TCGArg arg2, int const_arg2)
1440 {
1441 tcg_out_cmp(s, arg1, arg2, const_arg2, P_REXW);
1442 tcg_out_modrm(s, OPC_SETCC | tcg_cond_to_jcc[cond], 0, dest);
1443 tcg_out_ext8u(s, dest, dest);
1444 }
1445 #else
1446 static void tcg_out_setcond2(TCGContext *s, const TCGArg *args,
1447 const int *const_args)
1448 {
1449 TCGArg new_args[6];
1450 TCGLabel *label_true, *label_over;
1451
1452 memcpy(new_args, args+1, 5*sizeof(TCGArg));
1453
1454 if (args[0] == args[1] || args[0] == args[2]
1455 || (!const_args[3] && args[0] == args[3])
1456 || (!const_args[4] && args[0] == args[4])) {
1457 /* When the destination overlaps with one of the argument
1458 registers, don't do anything tricky. */
1459 label_true = gen_new_label();
1460 label_over = gen_new_label();
1461
1462 new_args[5] = label_arg(label_true);
1463 tcg_out_brcond2(s, new_args, const_args+1, 1);
1464
1465 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1466 tcg_out_jxx(s, JCC_JMP, label_over, 1);
1467 tcg_out_label(s, label_true);
1468
1469 tcg_out_movi(s, TCG_TYPE_I32, args[0], 1);
1470 tcg_out_label(s, label_over);
1471 } else {
1472 /* When the destination does not overlap one of the arguments,
1473 clear the destination first, jump if cond false, and emit an
1474 increment in the true case. This results in smaller code. */
1475
1476 tcg_out_movi(s, TCG_TYPE_I32, args[0], 0);
1477
1478 label_over = gen_new_label();
1479 new_args[4] = tcg_invert_cond(new_args[4]);
1480 new_args[5] = label_arg(label_over);
1481 tcg_out_brcond2(s, new_args, const_args+1, 1);
1482
1483 tgen_arithi(s, ARITH_ADD, args[0], 1, 0);
1484 tcg_out_label(s, label_over);
1485 }
1486 }
1487 #endif
1488
1489 static void tcg_out_cmov(TCGContext *s, TCGCond cond, int rexw,
1490 TCGReg dest, TCGReg v1)
1491 {
1492 if (have_cmov) {
1493 tcg_out_modrm(s, OPC_CMOVCC | tcg_cond_to_jcc[cond] | rexw, dest, v1);
1494 } else {
1495 TCGLabel *over = gen_new_label();
1496 tcg_out_jxx(s, tcg_cond_to_jcc[tcg_invert_cond(cond)], over, 1);
1497 tcg_out_mov(s, TCG_TYPE_I32, dest, v1);
1498 tcg_out_label(s, over);
1499 }
1500 }
1501
1502 static void tcg_out_movcond32(TCGContext *s, TCGCond cond, TCGReg dest,
1503 TCGReg c1, TCGArg c2, int const_c2,
1504 TCGReg v1)
1505 {
1506 tcg_out_cmp(s, c1, c2, const_c2, 0);
1507 tcg_out_cmov(s, cond, 0, dest, v1);
1508 }
1509
1510 #if TCG_TARGET_REG_BITS == 64
1511 static void tcg_out_movcond64(TCGContext *s, TCGCond cond, TCGReg dest,
1512 TCGReg c1, TCGArg c2, int const_c2,
1513 TCGReg v1)
1514 {
1515 tcg_out_cmp(s, c1, c2, const_c2, P_REXW);
1516 tcg_out_cmov(s, cond, P_REXW, dest, v1);
1517 }
1518 #endif
1519
1520 static void tcg_out_ctz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1521 TCGArg arg2, bool const_a2)
1522 {
1523 if (have_bmi1) {
1524 tcg_out_modrm(s, OPC_TZCNT + rexw, dest, arg1);
1525 if (const_a2) {
1526 tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1527 } else {
1528 tcg_debug_assert(dest != arg2);
1529 tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1530 }
1531 } else {
1532 tcg_debug_assert(dest != arg2);
1533 tcg_out_modrm(s, OPC_BSF + rexw, dest, arg1);
1534 tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1535 }
1536 }
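/*
 * Note on tcg_out_ctz above: TZCNT returns the operand size and sets CF
 * for a zero input, so the constant fallback needs no extra code and a
 * register fallback is a single CMOVB (TCG_COND_LTU).  BSF instead sets
 * ZF and leaves the destination undefined for a zero input, so a CMOVE
 * (TCG_COND_EQ) from arg2 is always required.
 */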
1537
1538 static void tcg_out_clz(TCGContext *s, int rexw, TCGReg dest, TCGReg arg1,
1539 TCGArg arg2, bool const_a2)
1540 {
1541 if (have_lzcnt) {
1542 tcg_out_modrm(s, OPC_LZCNT + rexw, dest, arg1);
1543 if (const_a2) {
1544 tcg_debug_assert(arg2 == (rexw ? 64 : 32));
1545 } else {
1546 tcg_debug_assert(dest != arg2);
1547 tcg_out_cmov(s, TCG_COND_LTU, rexw, dest, arg2);
1548 }
1549 } else {
1550 tcg_debug_assert(!const_a2);
1551 tcg_debug_assert(dest != arg1);
1552 tcg_debug_assert(dest != arg2);
1553
1554 /* Recall that the output of BSR is the index not the count. */
1555 tcg_out_modrm(s, OPC_BSR + rexw, dest, arg1);
1556 tgen_arithi(s, ARITH_XOR + rexw, dest, rexw ? 63 : 31, 0);
1557
1558 /* Since we have destroyed the flags from BSR, we have to re-test. */
1559 tcg_out_cmp(s, arg1, 0, 1, rexw);
1560 tcg_out_cmov(s, TCG_COND_EQ, rexw, dest, arg2);
1561 }
1562 }
1563
1564 static void tcg_out_branch(TCGContext *s, int call, const tcg_insn_unit *dest)
1565 {
1566 intptr_t disp = tcg_pcrel_diff(s, dest) - 5;
1567
1568 if (disp == (int32_t)disp) {
1569 tcg_out_opc(s, call ? OPC_CALL_Jz : OPC_JMP_long, 0, 0, 0);
1570 tcg_out32(s, disp);
1571 } else {
1572 /* rip-relative addressing into the constant pool.
1573 This is 6 + 8 = 14 bytes, as compared to using an
1574            immediate load, 10 + 6 = 16 bytes, plus we may
1575 be able to re-use the pool constant for more calls. */
1576 tcg_out_opc(s, OPC_GRP5, 0, 0, 0);
1577 tcg_out8(s, (call ? EXT5_CALLN_Ev : EXT5_JMPN_Ev) << 3 | 5);
1578 new_pool_label(s, (uintptr_t)dest, R_386_PC32, s->code_ptr, -4);
1579 tcg_out32(s, 0);
1580 }
1581 }
1582
1583 static inline void tcg_out_call(TCGContext *s, const tcg_insn_unit *dest)
1584 {
1585 tcg_out_branch(s, 1, dest);
1586 }
1587
1588 static void tcg_out_jmp(TCGContext *s, const tcg_insn_unit *dest)
1589 {
1590 tcg_out_branch(s, 0, dest);
1591 }
1592
1593 static void tcg_out_nopn(TCGContext *s, int n)
1594 {
1595 int i;
1596 /* Emit 1 or 2 operand size prefixes for the standard one byte nop,
1597 * "xchg %eax,%eax", forming "xchg %ax,%ax". All cores accept the
1598 * duplicate prefix, and all of the interesting recent cores can
1599 * decode and discard the duplicates in a single cycle.
1600 */
1601 tcg_debug_assert(n >= 1);
1602 for (i = 1; i < n; ++i) {
1603 tcg_out8(s, 0x66);
1604 }
1605 tcg_out8(s, 0x90);
1606 }
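/* For example, tcg_out_nopn(s, 3) emits 66 66 90, a single 3-byte nop. */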
1607
1608 #if defined(CONFIG_SOFTMMU)
1609 #include "../tcg-ldst.c.inc"
1610
1611 /* helper signature: helper_ret_ld_mmu(CPUState *env, target_ulong addr,
1612 * int mmu_idx, uintptr_t ra)
1613 */
1614 static void * const qemu_ld_helpers[(MO_SIZE | MO_BSWAP) + 1] = {
1615 [MO_UB] = helper_ret_ldub_mmu,
1616 [MO_LEUW] = helper_le_lduw_mmu,
1617 [MO_LEUL] = helper_le_ldul_mmu,
1618 [MO_LEQ] = helper_le_ldq_mmu,
1619 [MO_BEUW] = helper_be_lduw_mmu,
1620 [MO_BEUL] = helper_be_ldul_mmu,
1621 [MO_BEQ] = helper_be_ldq_mmu,
1622 };
1623
1624 /* helper signature: helper_ret_st_mmu(CPUState *env, target_ulong addr,
1625 * uintxx_t val, int mmu_idx, uintptr_t ra)
1626 */
1627 static void * const qemu_st_helpers[(MO_SIZE | MO_BSWAP) + 1] = {
1628 [MO_UB] = helper_ret_stb_mmu,
1629 [MO_LEUW] = helper_le_stw_mmu,
1630 [MO_LEUL] = helper_le_stl_mmu,
1631 [MO_LEQ] = helper_le_stq_mmu,
1632 [MO_BEUW] = helper_be_stw_mmu,
1633 [MO_BEUL] = helper_be_stl_mmu,
1634 [MO_BEQ] = helper_be_stq_mmu,
1635 };
1636
1637 /* Perform the TLB load and compare.
1638
1639 Inputs:
1640 ADDRLO and ADDRHI contain the low and high part of the address.
1641
1642 MEM_INDEX and S_BITS are the memory context and log2 size of the load.
1643
1644 WHICH is the offset into the CPUTLBEntry structure of the slot to read.
1645 This should be offsetof addr_read or addr_write.
1646
1647 Outputs:
1648 LABEL_PTRS is filled with 1 (32-bit addresses) or 2 (64-bit addresses)
1649 positions of the displacements of forward jumps to the TLB miss case.
1650
1651 Second argument register is loaded with the low part of the address.
1652 In the TLB hit case, it has been adjusted as indicated by the TLB
1653 and so is a host address. In the TLB miss case, it continues to
1654 hold a guest address.
1655
1656 First argument register is clobbered. */
1657
1658 static inline void tcg_out_tlb_load(TCGContext *s, TCGReg addrlo, TCGReg addrhi,
1659 int mem_index, MemOp opc,
1660 tcg_insn_unit **label_ptr, int which)
1661 {
1662 const TCGReg r0 = TCG_REG_L0;
1663 const TCGReg r1 = TCG_REG_L1;
1664 TCGType ttype = TCG_TYPE_I32;
1665 TCGType tlbtype = TCG_TYPE_I32;
1666 int trexw = 0, hrexw = 0, tlbrexw = 0;
1667 unsigned a_bits = get_alignment_bits(opc);
1668 unsigned s_bits = opc & MO_SIZE;
1669 unsigned a_mask = (1 << a_bits) - 1;
1670 unsigned s_mask = (1 << s_bits) - 1;
1671 target_ulong tlb_mask;
1672
1673 if (TCG_TARGET_REG_BITS == 64) {
1674 if (TARGET_LONG_BITS == 64) {
1675 ttype = TCG_TYPE_I64;
1676 trexw = P_REXW;
1677 }
1678 if (TCG_TYPE_PTR == TCG_TYPE_I64) {
1679 hrexw = P_REXW;
1680 if (TARGET_PAGE_BITS + CPU_TLB_DYN_MAX_BITS > 32) {
1681 tlbtype = TCG_TYPE_I64;
1682 tlbrexw = P_REXW;
1683 }
1684 }
1685 }
1686
1687 tcg_out_mov(s, tlbtype, r0, addrlo);
1688 tcg_out_shifti(s, SHIFT_SHR + tlbrexw, r0,
1689 TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS);
1690
1691 tcg_out_modrm_offset(s, OPC_AND_GvEv + trexw, r0, TCG_AREG0,
1692 TLB_MASK_TABLE_OFS(mem_index) +
1693 offsetof(CPUTLBDescFast, mask));
1694
1695 tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r0, TCG_AREG0,
1696 TLB_MASK_TABLE_OFS(mem_index) +
1697 offsetof(CPUTLBDescFast, table));
1698
1699 /* If the required alignment is at least as large as the access, simply
1700 copy the address and mask. For lesser alignments, check that we don't
1701 cross pages for the complete access. */
1702 if (a_bits >= s_bits) {
1703 tcg_out_mov(s, ttype, r1, addrlo);
1704 } else {
1705 tcg_out_modrm_offset(s, OPC_LEA + trexw, r1, addrlo, s_mask - a_mask);
1706 }
1707 tlb_mask = (target_ulong)TARGET_PAGE_MASK | a_mask;
1708 tgen_arithi(s, ARITH_AND + trexw, r1, tlb_mask, 0);
1709
1710 /* cmp 0(r0), r1 */
1711 tcg_out_modrm_offset(s, OPC_CMP_GvEv + trexw, r1, r0, which);
1712
1713 /* Prepare for both the fast path add of the tlb addend, and the slow
1714 path function argument setup. */
1715 tcg_out_mov(s, ttype, r1, addrlo);
1716
1717 /* jne slow_path */
1718 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1719 label_ptr[0] = s->code_ptr;
1720 s->code_ptr += 4;
1721
1722 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1723 /* cmp 4(r0), addrhi */
1724 tcg_out_modrm_offset(s, OPC_CMP_GvEv, addrhi, r0, which + 4);
1725
1726 /* jne slow_path */
1727 tcg_out_opc(s, OPC_JCC_long + JCC_JNE, 0, 0, 0);
1728 label_ptr[1] = s->code_ptr;
1729 s->code_ptr += 4;
1730 }
1731
1732 /* TLB Hit. */
1733
1734 /* add addend(r0), r1 */
1735 tcg_out_modrm_offset(s, OPC_ADD_GvEv + hrexw, r1, r0,
1736 offsetof(CPUTLBEntry, addend));
1737 }
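/*
 * Illustrative sketch of the fast path emitted above, for a 64-bit
 * guest on a 64-bit host (L0/L1 are TCG_REG_L0/L1, AREG0 the env
 * register):
 *
 *     mov   addrlo, L0
 *     shr   $(TARGET_PAGE_BITS - CPU_TLB_ENTRY_BITS), L0
 *     and   fast[mem_index].mask(AREG0), L0
 *     add   fast[mem_index].table(AREG0), L0
 *     lea   s_mask-a_mask(addrlo), L1      # plain mov if a_bits >= s_bits
 *     and   $(TARGET_PAGE_MASK | a_mask), L1
 *     cmp   which(L0), L1
 *     mov   addrlo, L1
 *     jne   slow_path
 *     add   addend(L0), L1                 # L1 is now the host address
 */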
1738
1739 /*
1740 * Record the context of a call to the out of line helper code for the slow path
1741 * for a load or store, so that we can later generate the correct helper code
1742 */
1743 static void add_qemu_ldst_label(TCGContext *s, bool is_ld, bool is_64,
1744 MemOpIdx oi,
1745 TCGReg datalo, TCGReg datahi,
1746 TCGReg addrlo, TCGReg addrhi,
1747 tcg_insn_unit *raddr,
1748 tcg_insn_unit **label_ptr)
1749 {
1750 TCGLabelQemuLdst *label = new_ldst_label(s);
1751
1752 label->is_ld = is_ld;
1753 label->oi = oi;
1754 label->type = is_64 ? TCG_TYPE_I64 : TCG_TYPE_I32;
1755 label->datalo_reg = datalo;
1756 label->datahi_reg = datahi;
1757 label->addrlo_reg = addrlo;
1758 label->addrhi_reg = addrhi;
1759 label->raddr = tcg_splitwx_to_rx(raddr);
1760 label->label_ptr[0] = label_ptr[0];
1761 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1762 label->label_ptr[1] = label_ptr[1];
1763 }
1764 }
1765
1766 /*
1767 * Generate code for the slow path for a load at the end of block
1768 */
1769 static bool tcg_out_qemu_ld_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1770 {
1771 MemOpIdx oi = l->oi;
1772 MemOp opc = get_memop(oi);
1773 TCGReg data_reg;
1774 tcg_insn_unit **label_ptr = &l->label_ptr[0];
1775 int rexw = (l->type == TCG_TYPE_I64 ? P_REXW : 0);
1776
1777 /* resolve label address */
1778 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1779 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1780 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1781 }
1782
1783 if (TCG_TARGET_REG_BITS == 32) {
1784 int ofs = 0;
1785
1786 tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1787 ofs += 4;
1788
1789 tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1790 ofs += 4;
1791
1792 if (TARGET_LONG_BITS == 64) {
1793 tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1794 ofs += 4;
1795 }
1796
1797 tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1798 ofs += 4;
1799
1800 tcg_out_sti(s, TCG_TYPE_PTR, (uintptr_t)l->raddr, TCG_REG_ESP, ofs);
1801 } else {
1802 tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1803 /* The second argument is already loaded with addrlo. */
1804 tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[2], oi);
1805 tcg_out_movi(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[3],
1806 (uintptr_t)l->raddr);
1807 }
1808
1809 tcg_out_call(s, qemu_ld_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1810
1811 data_reg = l->datalo_reg;
1812 switch (opc & MO_SSIZE) {
1813 case MO_SB:
1814 tcg_out_ext8s(s, data_reg, TCG_REG_EAX, rexw);
1815 break;
1816 case MO_SW:
1817 tcg_out_ext16s(s, data_reg, TCG_REG_EAX, rexw);
1818 break;
1819 #if TCG_TARGET_REG_BITS == 64
1820 case MO_SL:
1821 tcg_out_ext32s(s, data_reg, TCG_REG_EAX);
1822 break;
1823 #endif
1824 case MO_UB:
1825 case MO_UW:
1826 /* Note that the helpers have zero-extended to tcg_target_long. */
1827 case MO_UL:
1828 tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1829 break;
1830 case MO_Q:
1831 if (TCG_TARGET_REG_BITS == 64) {
1832 tcg_out_mov(s, TCG_TYPE_I64, data_reg, TCG_REG_RAX);
1833 } else if (data_reg == TCG_REG_EDX) {
1834 /* xchg %edx, %eax */
1835 tcg_out_opc(s, OPC_XCHG_ax_r32 + TCG_REG_EDX, 0, 0, 0);
1836 tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EAX);
1837 } else {
1838 tcg_out_mov(s, TCG_TYPE_I32, data_reg, TCG_REG_EAX);
1839 tcg_out_mov(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_EDX);
1840 }
1841 break;
1842 default:
1843 tcg_abort();
1844 }
1845
1846 /* Jump to the code corresponding to the next IR of qemu_ld */
1847 tcg_out_jmp(s, l->raddr);
1848 return true;
1849 }
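/*
 * For reference, the qemu_ld_helpers entries called above have roughly the
 * signature below (declared elsewhere in the TCG core); on 64-bit hosts the
 * arguments map directly onto tcg_target_call_iarg_regs, while the 32-bit
 * path stores the same values to the stack in order:
 *
 *     tcg_target_ulong helper_ret_ldub_mmu(CPUArchState *env,
 *                                          target_ulong addr,
 *                                          MemOpIdx oi, uintptr_t retaddr);
 *
 * This is why only argument slots 0, 2 and 3 are filled here: slot 1, the
 * guest address, is already live from tcg_out_tlb_load.
 */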
1850
1851 /*
1852 * Generate code for the slow path for a store at the end of block
1853 */
1854 static bool tcg_out_qemu_st_slow_path(TCGContext *s, TCGLabelQemuLdst *l)
1855 {
1856 MemOpIdx oi = l->oi;
1857 MemOp opc = get_memop(oi);
1858 MemOp s_bits = opc & MO_SIZE;
1859 tcg_insn_unit **label_ptr = &l->label_ptr[0];
1860 TCGReg retaddr;
1861
1862 /* resolve label address */
1863 tcg_patch32(label_ptr[0], s->code_ptr - label_ptr[0] - 4);
1864 if (TARGET_LONG_BITS > TCG_TARGET_REG_BITS) {
1865 tcg_patch32(label_ptr[1], s->code_ptr - label_ptr[1] - 4);
1866 }
1867
1868 if (TCG_TARGET_REG_BITS == 32) {
1869 int ofs = 0;
1870
1871 tcg_out_st(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP, ofs);
1872 ofs += 4;
1873
1874 tcg_out_st(s, TCG_TYPE_I32, l->addrlo_reg, TCG_REG_ESP, ofs);
1875 ofs += 4;
1876
1877 if (TARGET_LONG_BITS == 64) {
1878 tcg_out_st(s, TCG_TYPE_I32, l->addrhi_reg, TCG_REG_ESP, ofs);
1879 ofs += 4;
1880 }
1881
1882 tcg_out_st(s, TCG_TYPE_I32, l->datalo_reg, TCG_REG_ESP, ofs);
1883 ofs += 4;
1884
1885 if (s_bits == MO_64) {
1886 tcg_out_st(s, TCG_TYPE_I32, l->datahi_reg, TCG_REG_ESP, ofs);
1887 ofs += 4;
1888 }
1889
1890 tcg_out_sti(s, TCG_TYPE_I32, oi, TCG_REG_ESP, ofs);
1891 ofs += 4;
1892
1893 retaddr = TCG_REG_EAX;
1894 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1895 tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP, ofs);
1896 } else {
1897 tcg_out_mov(s, TCG_TYPE_PTR, tcg_target_call_iarg_regs[0], TCG_AREG0);
1898 /* The second argument is already loaded with addrlo. */
1899 tcg_out_mov(s, (s_bits == MO_64 ? TCG_TYPE_I64 : TCG_TYPE_I32),
1900 tcg_target_call_iarg_regs[2], l->datalo_reg);
1901 tcg_out_movi(s, TCG_TYPE_I32, tcg_target_call_iarg_regs[3], oi);
1902
1903 if (ARRAY_SIZE(tcg_target_call_iarg_regs) > 4) {
1904 retaddr = tcg_target_call_iarg_regs[4];
1905 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1906 } else {
1907 retaddr = TCG_REG_RAX;
1908 tcg_out_movi(s, TCG_TYPE_PTR, retaddr, (uintptr_t)l->raddr);
1909 tcg_out_st(s, TCG_TYPE_PTR, retaddr, TCG_REG_ESP,
1910 TCG_TARGET_CALL_STACK_OFFSET);
1911 }
1912 }
1913
1914 /* "Tail call" to the helper, with the return address back inline. */
1915 tcg_out_push(s, retaddr);
1916 tcg_out_jmp(s, qemu_st_helpers[opc & (MO_BSWAP | MO_SIZE)]);
1917 return true;
1918 }
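/*
 * The push + jmp pair above is the "tail call": the value pushed is l->raddr,
 * so when the store helper eventually executes ret it pops that address and
 * resumes directly at the instruction following the original store, without
 * passing back through this slow-path stub.
 */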
1919 #elif TCG_TARGET_REG_BITS == 32
1920 # define x86_guest_base_seg 0
1921 # define x86_guest_base_index -1
1922 # define x86_guest_base_offset guest_base
1923 #else
1924 static int x86_guest_base_seg;
1925 static int x86_guest_base_index = -1;
1926 static int32_t x86_guest_base_offset;
1927 # if defined(__x86_64__) && defined(__linux__)
1928 # include <asm/prctl.h>
1929 # include <sys/prctl.h>
1930 int arch_prctl(int code, unsigned long addr);
1931 static inline int setup_guest_base_seg(void)
1932 {
1933 if (arch_prctl(ARCH_SET_GS, guest_base) == 0) {
1934 return P_GS;
1935 }
1936 return 0;
1937 }
1938 # elif defined (__FreeBSD__) || defined (__FreeBSD_kernel__)
1939 # include <machine/sysarch.h>
1940 static inline int setup_guest_base_seg(void)
1941 {
1942 if (sysarch(AMD64_SET_GSBASE, &guest_base) == 0) {
1943 return P_GS;
1944 }
1945 return 0;
1946 }
1947 # else
1948 static inline int setup_guest_base_seg(void)
1949 {
1950 return 0;
1951 }
1952 # endif
1953 #endif /* SOFTMMU */
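/*
 * In the user-only 64-bit case above, setup_guest_base_seg() tries to park
 * guest_base in the %gs segment base (ARCH_SET_GS on Linux, AMD64_SET_GSBASE
 * on FreeBSD).  When that succeeds, guest accesses can be emitted as e.g.
 * "movl %gs:(addr_reg), data_reg" with a zero displacement, instead of
 * spending a reserved register or a 32-bit displacement on adding guest_base
 * explicitly.
 */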
1954
1955 static void tcg_out_qemu_ld_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
1956 TCGReg base, int index, intptr_t ofs,
1957 int seg, bool is64, MemOp memop)
1958 {
1959 bool use_movbe = false;
1960 int rexw = is64 * P_REXW;
1961 int movop = OPC_MOVL_GvEv;
1962
1963 /* Do big-endian loads with movbe. */
1964 if (memop & MO_BSWAP) {
1965 tcg_debug_assert(have_movbe);
1966 use_movbe = true;
1967 movop = OPC_MOVBE_GyMy;
1968 }
1969
1970 switch (memop & MO_SSIZE) {
1971 case MO_UB:
1972 tcg_out_modrm_sib_offset(s, OPC_MOVZBL + seg, datalo,
1973 base, index, 0, ofs);
1974 break;
1975 case MO_SB:
1976 tcg_out_modrm_sib_offset(s, OPC_MOVSBL + rexw + seg, datalo,
1977 base, index, 0, ofs);
1978 break;
1979 case MO_UW:
1980 if (use_movbe) {
1981 /* There is no extending movbe; only the low 16 bits are modified. */
1982 if (datalo != base && datalo != index) {
1983 /* XOR breaks dependency chains. */
1984 tgen_arithr(s, ARITH_XOR, datalo, datalo);
1985 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
1986 datalo, base, index, 0, ofs);
1987 } else {
1988 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
1989 datalo, base, index, 0, ofs);
1990 tcg_out_ext16u(s, datalo, datalo);
1991 }
1992 } else {
1993 tcg_out_modrm_sib_offset(s, OPC_MOVZWL + seg, datalo,
1994 base, index, 0, ofs);
1995 }
1996 break;
1997 case MO_SW:
1998 if (use_movbe) {
1999 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + P_DATA16 + seg,
2000 datalo, base, index, 0, ofs);
2001 tcg_out_ext16s(s, datalo, datalo, rexw);
2002 } else {
2003 tcg_out_modrm_sib_offset(s, OPC_MOVSWL + rexw + seg,
2004 datalo, base, index, 0, ofs);
2005 }
2006 break;
2007 case MO_UL:
2008 tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
2009 break;
2010 #if TCG_TARGET_REG_BITS == 64
2011 case MO_SL:
2012 if (use_movbe) {
2013 tcg_out_modrm_sib_offset(s, OPC_MOVBE_GyMy + seg, datalo,
2014 base, index, 0, ofs);
2015 tcg_out_ext32s(s, datalo, datalo);
2016 } else {
2017 tcg_out_modrm_sib_offset(s, OPC_MOVSLQ + seg, datalo,
2018 base, index, 0, ofs);
2019 }
2020 break;
2021 #endif
2022 case MO_Q:
2023 if (TCG_TARGET_REG_BITS == 64) {
2024 tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2025 base, index, 0, ofs);
2026 } else {
2027 if (use_movbe) {
2028 TCGReg t = datalo;
2029 datalo = datahi;
2030 datahi = t;
2031 }
2032 if (base != datalo) {
2033 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2034 base, index, 0, ofs);
2035 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2036 base, index, 0, ofs + 4);
2037 } else {
2038 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2039 base, index, 0, ofs + 4);
2040 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2041 base, index, 0, ofs);
2042 }
2043 }
2044 break;
2045 default:
2046 g_assert_not_reached();
2047 }
2048 }
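/*
 * A note on the 32-bit MO_Q pair above: the base != datalo test orders the
 * two 32-bit loads so that the register still needed for addressing is never
 * clobbered by the first load, and with movbe the halves are swapped up
 * front so that each byte-reversed half lands in the correct destination
 * register.
 */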
2049
2050 /* XXX: qemu_ld and qemu_st could be modified to clobber only EDX and
2051 EAX. This will be useful once fixed-register globals are less
2052 common. */
2053 static void tcg_out_qemu_ld(TCGContext *s, const TCGArg *args, bool is64)
2054 {
2055 TCGReg datalo, datahi, addrlo;
2056 TCGReg addrhi __attribute__((unused));
2057 MemOpIdx oi;
2058 MemOp opc;
2059 #if defined(CONFIG_SOFTMMU)
2060 int mem_index;
2061 tcg_insn_unit *label_ptr[2];
2062 #endif
2063
2064 datalo = *args++;
2065 datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2066 addrlo = *args++;
2067 addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2068 oi = *args++;
2069 opc = get_memop(oi);
2070
2071 #if defined(CONFIG_SOFTMMU)
2072 mem_index = get_mmuidx(oi);
2073
2074 tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2075 label_ptr, offsetof(CPUTLBEntry, addr_read));
2076
2077 /* TLB Hit. */
2078 tcg_out_qemu_ld_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, is64, opc);
2079
2080 /* Record the current context of a load into ldst label */
2081 add_qemu_ldst_label(s, true, is64, oi, datalo, datahi, addrlo, addrhi,
2082 s->code_ptr, label_ptr);
2083 #else
2084 tcg_out_qemu_ld_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2085 x86_guest_base_offset, x86_guest_base_seg,
2086 is64, opc);
2087 #endif
2088 }
2089
2090 static void tcg_out_qemu_st_direct(TCGContext *s, TCGReg datalo, TCGReg datahi,
2091 TCGReg base, int index, intptr_t ofs,
2092 int seg, MemOp memop)
2093 {
2094 bool use_movbe = false;
2095 int movop = OPC_MOVL_EvGv;
2096
2097 /*
2098 * Do big-endian stores with movbe or softmmu.
2099 * User-only without movbe will have its swapping done generically.
2100 */
2101 if (memop & MO_BSWAP) {
2102 tcg_debug_assert(have_movbe);
2103 use_movbe = true;
2104 movop = OPC_MOVBE_MyGy;
2105 }
2106
2107 switch (memop & MO_SIZE) {
2108 case MO_8:
2109 /* This is handled with constraints on INDEX_op_qemu_st8_i32. */
2110 tcg_debug_assert(TCG_TARGET_REG_BITS == 64 || datalo < 4);
2111 tcg_out_modrm_sib_offset(s, OPC_MOVB_EvGv + P_REXB_R + seg,
2112 datalo, base, index, 0, ofs);
2113 break;
2114 case MO_16:
2115 tcg_out_modrm_sib_offset(s, movop + P_DATA16 + seg, datalo,
2116 base, index, 0, ofs);
2117 break;
2118 case MO_32:
2119 tcg_out_modrm_sib_offset(s, movop + seg, datalo, base, index, 0, ofs);
2120 break;
2121 case MO_64:
2122 if (TCG_TARGET_REG_BITS == 64) {
2123 tcg_out_modrm_sib_offset(s, movop + P_REXW + seg, datalo,
2124 base, index, 0, ofs);
2125 } else {
2126 if (use_movbe) {
2127 TCGReg t = datalo;
2128 datalo = datahi;
2129 datahi = t;
2130 }
2131 tcg_out_modrm_sib_offset(s, movop + seg, datalo,
2132 base, index, 0, ofs);
2133 tcg_out_modrm_sib_offset(s, movop + seg, datahi,
2134 base, index, 0, ofs + 4);
2135 }
2136 break;
2137 default:
2138 g_assert_not_reached();
2139 }
2140 }
2141
2142 static void tcg_out_qemu_st(TCGContext *s, const TCGArg *args, bool is64)
2143 {
2144 TCGReg datalo, datahi, addrlo;
2145 TCGReg addrhi __attribute__((unused));
2146 MemOpIdx oi;
2147 MemOp opc;
2148 #if defined(CONFIG_SOFTMMU)
2149 int mem_index;
2150 tcg_insn_unit *label_ptr[2];
2151 #endif
2152
2153 datalo = *args++;
2154 datahi = (TCG_TARGET_REG_BITS == 32 && is64 ? *args++ : 0);
2155 addrlo = *args++;
2156 addrhi = (TARGET_LONG_BITS > TCG_TARGET_REG_BITS ? *args++ : 0);
2157 oi = *args++;
2158 opc = get_memop(oi);
2159
2160 #if defined(CONFIG_SOFTMMU)
2161 mem_index = get_mmuidx(oi);
2162
2163 tcg_out_tlb_load(s, addrlo, addrhi, mem_index, opc,
2164 label_ptr, offsetof(CPUTLBEntry, addr_write));
2165
2166 /* TLB Hit. */
2167 tcg_out_qemu_st_direct(s, datalo, datahi, TCG_REG_L1, -1, 0, 0, opc);
2168
2169 /* Record the current context of a store into ldst label */
2170 add_qemu_ldst_label(s, false, is64, oi, datalo, datahi, addrlo, addrhi,
2171 s->code_ptr, label_ptr);
2172 #else
2173 tcg_out_qemu_st_direct(s, datalo, datahi, addrlo, x86_guest_base_index,
2174 x86_guest_base_offset, x86_guest_base_seg, opc);
2175 #endif
2176 }
2177
2178 static inline void tcg_out_op(TCGContext *s, TCGOpcode opc,
2179 const TCGArg args[TCG_MAX_OP_ARGS],
2180 const int const_args[TCG_MAX_OP_ARGS])
2181 {
2182 TCGArg a0, a1, a2;
2183 int c, const_a2, vexop, rexw = 0;
2184
2185 #if TCG_TARGET_REG_BITS == 64
2186 # define OP_32_64(x) \
2187 case glue(glue(INDEX_op_, x), _i64): \
2188 rexw = P_REXW; /* FALLTHRU */ \
2189 case glue(glue(INDEX_op_, x), _i32)
2190 #else
2191 # define OP_32_64(x) \
2192 case glue(glue(INDEX_op_, x), _i32)
2193 #endif
2194
2195 /* Hoist the loads of the most common arguments. */
2196 a0 = args[0];
2197 a1 = args[1];
2198 a2 = args[2];
2199 const_a2 = const_args[2];
2200
2201 switch (opc) {
2202 case INDEX_op_exit_tb:
2203 /* Reuse the zeroing that exists for goto_ptr. */
2204 if (a0 == 0) {
2205 tcg_out_jmp(s, tcg_code_gen_epilogue);
2206 } else {
2207 tcg_out_movi(s, TCG_TYPE_PTR, TCG_REG_EAX, a0);
2208 tcg_out_jmp(s, tb_ret_addr);
2209 }
2210 break;
2211 case INDEX_op_goto_tb:
2212 if (s->tb_jmp_insn_offset) {
2213 /* direct jump method */
2214 int gap;
2215 /* jump displacement must be aligned for atomic patching;
2216 * see if we need to add extra nops before jump
2217 */
2218 gap = QEMU_ALIGN_PTR_UP(s->code_ptr + 1, 4) - s->code_ptr;
2219 if (gap != 1) {
2220 tcg_out_nopn(s, gap - 1);
2221 }
2222 tcg_out8(s, OPC_JMP_long); /* jmp im */
2223 s->tb_jmp_insn_offset[a0] = tcg_current_code_size(s);
2224 tcg_out32(s, 0);
2225 } else {
2226 /* indirect jump method */
2227 tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, -1,
2228 (intptr_t)(s->tb_jmp_target_addr + a0));
2229 }
2230 set_jmp_reset_offset(s, a0);
2231 break;
2232 case INDEX_op_goto_ptr:
2233 /* jmp to the given host address (could be epilogue) */
2234 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, a0);
2235 break;
2236 case INDEX_op_br:
2237 tcg_out_jxx(s, JCC_JMP, arg_label(a0), 0);
2238 break;
2239 OP_32_64(ld8u):
2240 /* Note that we can ignore REXW for the zero-extend to 64-bit. */
2241 tcg_out_modrm_offset(s, OPC_MOVZBL, a0, a1, a2);
2242 break;
2243 OP_32_64(ld8s):
2244 tcg_out_modrm_offset(s, OPC_MOVSBL + rexw, a0, a1, a2);
2245 break;
2246 OP_32_64(ld16u):
2247 /* Note that we can ignore REXW for the zero-extend to 64-bit. */
2248 tcg_out_modrm_offset(s, OPC_MOVZWL, a0, a1, a2);
2249 break;
2250 OP_32_64(ld16s):
2251 tcg_out_modrm_offset(s, OPC_MOVSWL + rexw, a0, a1, a2);
2252 break;
2253 #if TCG_TARGET_REG_BITS == 64
2254 case INDEX_op_ld32u_i64:
2255 #endif
2256 case INDEX_op_ld_i32:
2257 tcg_out_ld(s, TCG_TYPE_I32, a0, a1, a2);
2258 break;
2259
2260 OP_32_64(st8):
2261 if (const_args[0]) {
2262 tcg_out_modrm_offset(s, OPC_MOVB_EvIz, 0, a1, a2);
2263 tcg_out8(s, a0);
2264 } else {
2265 tcg_out_modrm_offset(s, OPC_MOVB_EvGv | P_REXB_R, a0, a1, a2);
2266 }
2267 break;
2268 OP_32_64(st16):
2269 if (const_args[0]) {
2270 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_DATA16, 0, a1, a2);
2271 tcg_out16(s, a0);
2272 } else {
2273 tcg_out_modrm_offset(s, OPC_MOVL_EvGv | P_DATA16, a0, a1, a2);
2274 }
2275 break;
2276 #if TCG_TARGET_REG_BITS == 64
2277 case INDEX_op_st32_i64:
2278 #endif
2279 case INDEX_op_st_i32:
2280 if (const_args[0]) {
2281 tcg_out_modrm_offset(s, OPC_MOVL_EvIz, 0, a1, a2);
2282 tcg_out32(s, a0);
2283 } else {
2284 tcg_out_st(s, TCG_TYPE_I32, a0, a1, a2);
2285 }
2286 break;
2287
2288 OP_32_64(add):
2289 /* For 3-operand addition, use LEA. */
2290 if (a0 != a1) {
2291 TCGArg c3 = 0;
2292 if (const_a2) {
2293 c3 = a2, a2 = -1;
2294 } else if (a0 == a2) {
2295 /* Watch out for dest = src + dest, since we've removed
2296 the matching constraint on the add. */
2297 tgen_arithr(s, ARITH_ADD + rexw, a0, a1);
2298 break;
2299 }
2300
2301 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a2, 0, c3);
2302 break;
2303 }
2304 c = ARITH_ADD;
2305 goto gen_arith;
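/*
 * The LEA form above is the usual x86 idiom for a non-destructive
 * three-operand add: for example "lea 0x10(%rsi,%rdx,1), %rdi" computes
 * %rsi + %rdx + 0x10 into %rdi in a single instruction without touching
 * the flags (register names purely illustrative).
 */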
2306 OP_32_64(sub):
2307 c = ARITH_SUB;
2308 goto gen_arith;
2309 OP_32_64(and):
2310 c = ARITH_AND;
2311 goto gen_arith;
2312 OP_32_64(or):
2313 c = ARITH_OR;
2314 goto gen_arith;
2315 OP_32_64(xor):
2316 c = ARITH_XOR;
2317 goto gen_arith;
2318 gen_arith:
2319 if (const_a2) {
2320 tgen_arithi(s, c + rexw, a0, a2, 0);
2321 } else {
2322 tgen_arithr(s, c + rexw, a0, a2);
2323 }
2324 break;
2325
2326 OP_32_64(andc):
2327 if (const_a2) {
2328 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2329 tgen_arithi(s, ARITH_AND + rexw, a0, ~a2, 0);
2330 } else {
2331 tcg_out_vex_modrm(s, OPC_ANDN + rexw, a0, a2, a1);
2332 }
2333 break;
2334
2335 OP_32_64(mul):
2336 if (const_a2) {
2337 int32_t val;
2338 val = a2;
2339 if (val == (int8_t)val) {
2340 tcg_out_modrm(s, OPC_IMUL_GvEvIb + rexw, a0, a0);
2341 tcg_out8(s, val);
2342 } else {
2343 tcg_out_modrm(s, OPC_IMUL_GvEvIz + rexw, a0, a0);
2344 tcg_out32(s, val);
2345 }
2346 } else {
2347 tcg_out_modrm(s, OPC_IMUL_GvEv + rexw, a0, a2);
2348 }
2349 break;
2350
2351 OP_32_64(div2):
2352 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IDIV, args[4]);
2353 break;
2354 OP_32_64(divu2):
2355 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_DIV, args[4]);
2356 break;
2357
2358 OP_32_64(shl):
2359 /* For small constant 3-operand shift, use LEA. */
2360 if (const_a2 && a0 != a1 && (a2 - 1) < 3) {
2361 if (a2 - 1 == 0) {
2362 /* shl $1,a1,a0 -> lea (a1,a1),a0 */
2363 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, a1, a1, 0, 0);
2364 } else {
2365 /* shl $n,a1,a0 -> lea 0(,a1,n),a0 */
2366 tcg_out_modrm_sib_offset(s, OPC_LEA + rexw, a0, -1, a1, a2, 0);
2367 }
2368 break;
2369 }
2370 c = SHIFT_SHL;
2371 vexop = OPC_SHLX;
2372 goto gen_shift_maybe_vex;
2373 OP_32_64(shr):
2374 c = SHIFT_SHR;
2375 vexop = OPC_SHRX;
2376 goto gen_shift_maybe_vex;
2377 OP_32_64(sar):
2378 c = SHIFT_SAR;
2379 vexop = OPC_SARX;
2380 goto gen_shift_maybe_vex;
2381 OP_32_64(rotl):
2382 c = SHIFT_ROL;
2383 goto gen_shift;
2384 OP_32_64(rotr):
2385 c = SHIFT_ROR;
2386 goto gen_shift;
2387 gen_shift_maybe_vex:
2388 if (have_bmi2) {
2389 if (!const_a2) {
2390 tcg_out_vex_modrm(s, vexop + rexw, a0, a2, a1);
2391 break;
2392 }
2393 tcg_out_mov(s, rexw ? TCG_TYPE_I64 : TCG_TYPE_I32, a0, a1);
2394 }
2395 /* FALLTHRU */
2396 gen_shift:
2397 if (const_a2) {
2398 tcg_out_shifti(s, c + rexw, a0, a2);
2399 } else {
2400 tcg_out_modrm(s, OPC_SHIFT_cl + rexw, c, a0);
2401 }
2402 break;
2403
2404 OP_32_64(ctz):
2405 tcg_out_ctz(s, rexw, args[0], args[1], args[2], const_args[2]);
2406 break;
2407 OP_32_64(clz):
2408 tcg_out_clz(s, rexw, args[0], args[1], args[2], const_args[2]);
2409 break;
2410 OP_32_64(ctpop):
2411 tcg_out_modrm(s, OPC_POPCNT + rexw, a0, a1);
2412 break;
2413
2414 case INDEX_op_brcond_i32:
2415 tcg_out_brcond32(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2416 break;
2417 case INDEX_op_setcond_i32:
2418 tcg_out_setcond32(s, args[3], a0, a1, a2, const_a2);
2419 break;
2420 case INDEX_op_movcond_i32:
2421 tcg_out_movcond32(s, args[5], a0, a1, a2, const_a2, args[3]);
2422 break;
2423
2424 OP_32_64(bswap16):
2425 if (a2 & TCG_BSWAP_OS) {
2426 /* Output must be sign-extended. */
2427 if (rexw) {
2428 tcg_out_bswap64(s, a0);
2429 tcg_out_shifti(s, SHIFT_SAR + rexw, a0, 48);
2430 } else {
2431 tcg_out_bswap32(s, a0);
2432 tcg_out_shifti(s, SHIFT_SAR, a0, 16);
2433 }
2434 } else if ((a2 & (TCG_BSWAP_IZ | TCG_BSWAP_OZ)) == TCG_BSWAP_OZ) {
2435 /* Output must be zero-extended, but input isn't. */
2436 tcg_out_bswap32(s, a0);
2437 tcg_out_shifti(s, SHIFT_SHR, a0, 16);
2438 } else {
2439 tcg_out_rolw_8(s, a0);
2440 }
2441 break;
2442 OP_32_64(bswap32):
2443 tcg_out_bswap32(s, a0);
2444 if (rexw && (a2 & TCG_BSWAP_OS)) {
2445 tcg_out_ext32s(s, a0, a0);
2446 }
2447 break;
2448
2449 OP_32_64(neg):
2450 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NEG, a0);
2451 break;
2452 OP_32_64(not):
2453 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_NOT, a0);
2454 break;
2455
2456 OP_32_64(ext8s):
2457 tcg_out_ext8s(s, a0, a1, rexw);
2458 break;
2459 OP_32_64(ext16s):
2460 tcg_out_ext16s(s, a0, a1, rexw);
2461 break;
2462 OP_32_64(ext8u):
2463 tcg_out_ext8u(s, a0, a1);
2464 break;
2465 OP_32_64(ext16u):
2466 tcg_out_ext16u(s, a0, a1);
2467 break;
2468
2469 case INDEX_op_qemu_ld_i32:
2470 tcg_out_qemu_ld(s, args, 0);
2471 break;
2472 case INDEX_op_qemu_ld_i64:
2473 tcg_out_qemu_ld(s, args, 1);
2474 break;
2475 case INDEX_op_qemu_st_i32:
2476 case INDEX_op_qemu_st8_i32:
2477 tcg_out_qemu_st(s, args, 0);
2478 break;
2479 case INDEX_op_qemu_st_i64:
2480 tcg_out_qemu_st(s, args, 1);
2481 break;
2482
2483 OP_32_64(mulu2):
2484 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_MUL, args[3]);
2485 break;
2486 OP_32_64(muls2):
2487 tcg_out_modrm(s, OPC_GRP3_Ev + rexw, EXT3_IMUL, args[3]);
2488 break;
2489 OP_32_64(add2):
2490 if (const_args[4]) {
2491 tgen_arithi(s, ARITH_ADD + rexw, a0, args[4], 1);
2492 } else {
2493 tgen_arithr(s, ARITH_ADD + rexw, a0, args[4]);
2494 }
2495 if (const_args[5]) {
2496 tgen_arithi(s, ARITH_ADC + rexw, a1, args[5], 1);
2497 } else {
2498 tgen_arithr(s, ARITH_ADC + rexw, a1, args[5]);
2499 }
2500 break;
2501 OP_32_64(sub2):
2502 if (const_args[4]) {
2503 tgen_arithi(s, ARITH_SUB + rexw, a0, args[4], 1);
2504 } else {
2505 tgen_arithr(s, ARITH_SUB + rexw, a0, args[4]);
2506 }
2507 if (const_args[5]) {
2508 tgen_arithi(s, ARITH_SBB + rexw, a1, args[5], 1);
2509 } else {
2510 tgen_arithr(s, ARITH_SBB + rexw, a1, args[5]);
2511 }
2512 break;
2513
2514 #if TCG_TARGET_REG_BITS == 32
2515 case INDEX_op_brcond2_i32:
2516 tcg_out_brcond2(s, args, const_args, 0);
2517 break;
2518 case INDEX_op_setcond2_i32:
2519 tcg_out_setcond2(s, args, const_args);
2520 break;
2521 #else /* TCG_TARGET_REG_BITS == 64 */
2522 case INDEX_op_ld32s_i64:
2523 tcg_out_modrm_offset(s, OPC_MOVSLQ, a0, a1, a2);
2524 break;
2525 case INDEX_op_ld_i64:
2526 tcg_out_ld(s, TCG_TYPE_I64, a0, a1, a2);
2527 break;
2528 case INDEX_op_st_i64:
2529 if (const_args[0]) {
2530 tcg_out_modrm_offset(s, OPC_MOVL_EvIz | P_REXW, 0, a1, a2);
2531 tcg_out32(s, a0);
2532 } else {
2533 tcg_out_st(s, TCG_TYPE_I64, a0, a1, a2);
2534 }
2535 break;
2536
2537 case INDEX_op_brcond_i64:
2538 tcg_out_brcond64(s, a2, a0, a1, const_args[1], arg_label(args[3]), 0);
2539 break;
2540 case INDEX_op_setcond_i64:
2541 tcg_out_setcond64(s, args[3], a0, a1, a2, const_a2);
2542 break;
2543 case INDEX_op_movcond_i64:
2544 tcg_out_movcond64(s, args[5], a0, a1, a2, const_a2, args[3]);
2545 break;
2546
2547 case INDEX_op_bswap64_i64:
2548 tcg_out_bswap64(s, a0);
2549 break;
2550 case INDEX_op_extu_i32_i64:
2551 case INDEX_op_ext32u_i64:
2552 case INDEX_op_extrl_i64_i32:
2553 tcg_out_ext32u(s, a0, a1);
2554 break;
2555 case INDEX_op_ext_i32_i64:
2556 case INDEX_op_ext32s_i64:
2557 tcg_out_ext32s(s, a0, a1);
2558 break;
2559 case INDEX_op_extrh_i64_i32:
2560 tcg_out_shifti(s, SHIFT_SHR + P_REXW, a0, 32);
2561 break;
2562 #endif
2563
2564 OP_32_64(deposit):
2565 if (args[3] == 0 && args[4] == 8) {
2566 /* load bits 0..7 */
2567 tcg_out_modrm(s, OPC_MOVB_EvGv | P_REXB_R | P_REXB_RM, a2, a0);
2568 } else if (args[3] == 8 && args[4] == 8) {
2569 /* load bits 8..15 */
2570 tcg_out_modrm(s, OPC_MOVB_EvGv, a2, a0 + 4);
2571 } else if (args[3] == 0 && args[4] == 16) {
2572 /* load bits 0..15 */
2573 tcg_out_modrm(s, OPC_MOVL_EvGv | P_DATA16, a2, a0);
2574 } else {
2575 tcg_abort();
2576 }
2577 break;
2578
2579 case INDEX_op_extract_i64:
2580 if (a2 + args[3] == 32) {
2581 /* This is a 32-bit zero-extending right shift. */
2582 tcg_out_mov(s, TCG_TYPE_I32, a0, a1);
2583 tcg_out_shifti(s, SHIFT_SHR, a0, a2);
2584 break;
2585 }
2586 /* FALLTHRU */
2587 case INDEX_op_extract_i32:
2588 /* On the off-chance that we can use a high-byte register, do so.
2589 Otherwise we emit the same ext16 + shift pattern that we
2590 would have gotten from the normal tcg-op.c expansion. */
2591 tcg_debug_assert(a2 == 8 && args[3] == 8);
2592 if (a1 < 4 && a0 < 8) {
2593 tcg_out_modrm(s, OPC_MOVZBL, a0, a1 + 4);
2594 } else {
2595 tcg_out_ext16u(s, a0, a1);
2596 tcg_out_shifti(s, SHIFT_SHR, a0, 8);
2597 }
2598 break;
2599
2600 case INDEX_op_sextract_i32:
2601 /* We don't implement sextract_i64, as we cannot sign-extend to
2602 64 bits without using the REX prefix that explicitly excludes
2603 access to the high-byte registers. */
2604 tcg_debug_assert(a2 == 8 && args[3] == 8);
2605 if (a1 < 4 && a0 < 8) {
2606 tcg_out_modrm(s, OPC_MOVSBL, a0, a1 + 4);
2607 } else {
2608 tcg_out_ext16s(s, a0, a1, 0);
2609 tcg_out_shifti(s, SHIFT_SAR, a0, 8);
2610 }
2611 break;
2612
2613 OP_32_64(extract2):
2614 /* Note that SHRD outputs to the r/m operand. */
2615 tcg_out_modrm(s, OPC_SHRD_Ib + rexw, a2, a0);
2616 tcg_out8(s, args[3]);
2617 break;
2618
2619 case INDEX_op_mb:
2620 tcg_out_mb(s, a0);
2621 break;
2622 case INDEX_op_mov_i32: /* Always emitted via tcg_out_mov. */
2623 case INDEX_op_mov_i64:
2624 case INDEX_op_call: /* Always emitted via tcg_out_call. */
2625 default:
2626 tcg_abort();
2627 }
2628
2629 #undef OP_32_64
2630 }
2631
2632 static void tcg_out_vec_op(TCGContext *s, TCGOpcode opc,
2633 unsigned vecl, unsigned vece,
2634 const TCGArg args[TCG_MAX_OP_ARGS],
2635 const int const_args[TCG_MAX_OP_ARGS])
2636 {
2637 static int const add_insn[4] = {
2638 OPC_PADDB, OPC_PADDW, OPC_PADDD, OPC_PADDQ
2639 };
2640 static int const ssadd_insn[4] = {
2641 OPC_PADDSB, OPC_PADDSW, OPC_UD2, OPC_UD2
2642 };
2643 static int const usadd_insn[4] = {
2644 OPC_PADDUB, OPC_PADDUW, OPC_UD2, OPC_UD2
2645 };
2646 static int const sub_insn[4] = {
2647 OPC_PSUBB, OPC_PSUBW, OPC_PSUBD, OPC_PSUBQ
2648 };
2649 static int const sssub_insn[4] = {
2650 OPC_PSUBSB, OPC_PSUBSW, OPC_UD2, OPC_UD2
2651 };
2652 static int const ussub_insn[4] = {
2653 OPC_PSUBUB, OPC_PSUBUW, OPC_UD2, OPC_UD2
2654 };
2655 static int const mul_insn[4] = {
2656 OPC_UD2, OPC_PMULLW, OPC_PMULLD, OPC_UD2
2657 };
2658 static int const shift_imm_insn[4] = {
2659 OPC_UD2, OPC_PSHIFTW_Ib, OPC_PSHIFTD_Ib, OPC_PSHIFTQ_Ib
2660 };
2661 static int const cmpeq_insn[4] = {
2662 OPC_PCMPEQB, OPC_PCMPEQW, OPC_PCMPEQD, OPC_PCMPEQQ
2663 };
2664 static int const cmpgt_insn[4] = {
2665 OPC_PCMPGTB, OPC_PCMPGTW, OPC_PCMPGTD, OPC_PCMPGTQ
2666 };
2667 static int const punpckl_insn[4] = {
2668 OPC_PUNPCKLBW, OPC_PUNPCKLWD, OPC_PUNPCKLDQ, OPC_PUNPCKLQDQ
2669 };
2670 static int const punpckh_insn[4] = {
2671 OPC_PUNPCKHBW, OPC_PUNPCKHWD, OPC_PUNPCKHDQ, OPC_PUNPCKHQDQ
2672 };
2673 static int const packss_insn[4] = {
2674 OPC_PACKSSWB, OPC_PACKSSDW, OPC_UD2, OPC_UD2
2675 };
2676 static int const packus_insn[4] = {
2677 OPC_PACKUSWB, OPC_PACKUSDW, OPC_UD2, OPC_UD2
2678 };
2679 static int const smin_insn[4] = {
2680 OPC_PMINSB, OPC_PMINSW, OPC_PMINSD, OPC_UD2
2681 };
2682 static int const smax_insn[4] = {
2683 OPC_PMAXSB, OPC_PMAXSW, OPC_PMAXSD, OPC_UD2
2684 };
2685 static int const umin_insn[4] = {
2686 OPC_PMINUB, OPC_PMINUW, OPC_PMINUD, OPC_UD2
2687 };
2688 static int const umax_insn[4] = {
2689 OPC_PMAXUB, OPC_PMAXUW, OPC_PMAXUD, OPC_UD2
2690 };
2691 static int const shlv_insn[4] = {
2692 /* TODO: AVX512 adds support for MO_16. */
2693 OPC_UD2, OPC_UD2, OPC_VPSLLVD, OPC_VPSLLVQ
2694 };
2695 static int const shrv_insn[4] = {
2696 /* TODO: AVX512 adds support for MO_16. */
2697 OPC_UD2, OPC_UD2, OPC_VPSRLVD, OPC_VPSRLVQ
2698 };
2699 static int const sarv_insn[4] = {
2700 /* TODO: AVX512 adds support for MO_16, MO_64. */
2701 OPC_UD2, OPC_UD2, OPC_VPSRAVD, OPC_UD2
2702 };
2703 static int const shls_insn[4] = {
2704 OPC_UD2, OPC_PSLLW, OPC_PSLLD, OPC_PSLLQ
2705 };
2706 static int const shrs_insn[4] = {
2707 OPC_UD2, OPC_PSRLW, OPC_PSRLD, OPC_PSRLQ
2708 };
2709 static int const sars_insn[4] = {
2710 OPC_UD2, OPC_PSRAW, OPC_PSRAD, OPC_UD2
2711 };
2712 static int const abs_insn[4] = {
2713 /* TODO: AVX512 adds support for MO_64. */
2714 OPC_PABSB, OPC_PABSW, OPC_PABSD, OPC_UD2
2715 };
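/*
 * In the tables above OPC_UD2 marks element sizes for which no single
 * SSE/AVX2 instruction exists; tcg_can_emit_vec_op() is expected to filter
 * those combinations out, and the tcg_debug_assert at gen_simd below
 * double-checks that none slip through.
 */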
2716
2717 TCGType type = vecl + TCG_TYPE_V64;
2718 int insn, sub;
2719 TCGArg a0, a1, a2;
2720
2721 a0 = args[0];
2722 a1 = args[1];
2723 a2 = args[2];
2724
2725 switch (opc) {
2726 case INDEX_op_add_vec:
2727 insn = add_insn[vece];
2728 goto gen_simd;
2729 case INDEX_op_ssadd_vec:
2730 insn = ssadd_insn[vece];
2731 goto gen_simd;
2732 case INDEX_op_usadd_vec:
2733 insn = usadd_insn[vece];
2734 goto gen_simd;
2735 case INDEX_op_sub_vec:
2736 insn = sub_insn[vece];
2737 goto gen_simd;
2738 case INDEX_op_sssub_vec:
2739 insn = sssub_insn[vece];
2740 goto gen_simd;
2741 case INDEX_op_ussub_vec:
2742 insn = ussub_insn[vece];
2743 goto gen_simd;
2744 case INDEX_op_mul_vec:
2745 insn = mul_insn[vece];
2746 goto gen_simd;
2747 case INDEX_op_and_vec:
2748 insn = OPC_PAND;
2749 goto gen_simd;
2750 case INDEX_op_or_vec:
2751 insn = OPC_POR;
2752 goto gen_simd;
2753 case INDEX_op_xor_vec:
2754 insn = OPC_PXOR;
2755 goto gen_simd;
2756 case INDEX_op_smin_vec:
2757 insn = smin_insn[vece];
2758 goto gen_simd;
2759 case INDEX_op_umin_vec:
2760 insn = umin_insn[vece];
2761 goto gen_simd;
2762 case INDEX_op_smax_vec:
2763 insn = smax_insn[vece];
2764 goto gen_simd;
2765 case INDEX_op_umax_vec:
2766 insn = umax_insn[vece];
2767 goto gen_simd;
2768 case INDEX_op_shlv_vec:
2769 insn = shlv_insn[vece];
2770 goto gen_simd;
2771 case INDEX_op_shrv_vec:
2772 insn = shrv_insn[vece];
2773 goto gen_simd;
2774 case INDEX_op_sarv_vec:
2775 insn = sarv_insn[vece];
2776 goto gen_simd;
2777 case INDEX_op_shls_vec:
2778 insn = shls_insn[vece];
2779 goto gen_simd;
2780 case INDEX_op_shrs_vec:
2781 insn = shrs_insn[vece];
2782 goto gen_simd;
2783 case INDEX_op_sars_vec:
2784 insn = sars_insn[vece];
2785 goto gen_simd;
2786 case INDEX_op_x86_punpckl_vec:
2787 insn = punpckl_insn[vece];
2788 goto gen_simd;
2789 case INDEX_op_x86_punpckh_vec:
2790 insn = punpckh_insn[vece];
2791 goto gen_simd;
2792 case INDEX_op_x86_packss_vec:
2793 insn = packss_insn[vece];
2794 goto gen_simd;
2795 case INDEX_op_x86_packus_vec:
2796 insn = packus_insn[vece];
2797 goto gen_simd;
2798 #if TCG_TARGET_REG_BITS == 32
2799 case INDEX_op_dup2_vec:
2800 /* First merge the two 32-bit inputs to a single 64-bit element. */
2801 tcg_out_vex_modrm(s, OPC_PUNPCKLDQ, a0, a1, a2);
2802 /* Then replicate the 64-bit elements across the rest of the vector. */
2803 if (type != TCG_TYPE_V64) {
2804 tcg_out_dup_vec(s, type, MO_64, a0, a0);
2805 }
2806 break;
2807 #endif
2808 case INDEX_op_abs_vec:
2809 insn = abs_insn[vece];
2810 a2 = a1;
2811 a1 = 0;
2812 goto gen_simd;
2813 gen_simd:
2814 tcg_debug_assert(insn != OPC_UD2);
2815 if (type == TCG_TYPE_V256) {
2816 insn |= P_VEXL;
2817 }
2818 tcg_out_vex_modrm(s, insn, a0, a1, a2);
2819 break;
2820
2821 case INDEX_op_cmp_vec:
2822 sub = args[3];
2823 if (sub == TCG_COND_EQ) {
2824 insn = cmpeq_insn[vece];
2825 } else if (sub == TCG_COND_GT) {
2826 insn = cmpgt_insn[vece];
2827 } else {
2828 g_assert_not_reached();
2829 }
2830 goto gen_simd;
2831
2832 case INDEX_op_andc_vec:
2833 insn = OPC_PANDN;
2834 if (type == TCG_TYPE_V256) {
2835 insn |= P_VEXL;
2836 }
2837 tcg_out_vex_modrm(s, insn, a0, a2, a1);
2838 break;
2839
2840 case INDEX_op_shli_vec:
2841 sub = 6;
2842 goto gen_shift;
2843 case INDEX_op_shri_vec:
2844 sub = 2;
2845 goto gen_shift;
2846 case INDEX_op_sari_vec:
2847 tcg_debug_assert(vece != MO_64);
2848 sub = 4;
2849 gen_shift:
2850 tcg_debug_assert(vece != MO_8);
2851 insn = shift_imm_insn[vece];
2852 if (type == TCG_TYPE_V256) {
2853 insn |= P_VEXL;
2854 }
2855 tcg_out_vex_modrm(s, insn, sub, a0, a1);
2856 tcg_out8(s, a2);
2857 break;
2858
2859 case INDEX_op_ld_vec:
2860 tcg_out_ld(s, type, a0, a1, a2);
2861 break;
2862 case INDEX_op_st_vec:
2863 tcg_out_st(s, type, a0, a1, a2);
2864 break;
2865 case INDEX_op_dupm_vec:
2866 tcg_out_dupm_vec(s, type, vece, a0, a1, a2);
2867 break;
2868
2869 case INDEX_op_x86_shufps_vec:
2870 insn = OPC_SHUFPS;
2871 sub = args[3];
2872 goto gen_simd_imm8;
2873 case INDEX_op_x86_blend_vec:
2874 if (vece == MO_16) {
2875 insn = OPC_PBLENDW;
2876 } else if (vece == MO_32) {
2877 insn = (have_avx2 ? OPC_VPBLENDD : OPC_BLENDPS);
2878 } else {
2879 g_assert_not_reached();
2880 }
2881 sub = args[3];
2882 goto gen_simd_imm8;
2883 case INDEX_op_x86_vperm2i128_vec:
2884 insn = OPC_VPERM2I128;
2885 sub = args[3];
2886 goto gen_simd_imm8;
2887 gen_simd_imm8:
2888 if (type == TCG_TYPE_V256) {
2889 insn |= P_VEXL;
2890 }
2891 tcg_out_vex_modrm(s, insn, a0, a1, a2);
2892 tcg_out8(s, sub);
2893 break;
2894
2895 case INDEX_op_x86_vpblendvb_vec:
2896 insn = OPC_VPBLENDVB;
2897 if (type == TCG_TYPE_V256) {
2898 insn |= P_VEXL;
2899 }
2900 tcg_out_vex_modrm(s, insn, a0, a1, a2);
2901 tcg_out8(s, args[3] << 4);
2902 break;
2903
2904 case INDEX_op_x86_psrldq_vec:
2905 tcg_out_vex_modrm(s, OPC_GRP14, 3, a0, a1);
2906 tcg_out8(s, a2);
2907 break;
2908
2909 case INDEX_op_mov_vec: /* Always emitted via tcg_out_mov. */
2910 case INDEX_op_dup_vec: /* Always emitted via tcg_out_dup_vec. */
2911 default:
2912 g_assert_not_reached();
2913 }
2914 }
2915
2916 static TCGConstraintSetIndex tcg_target_op_def(TCGOpcode op)
2917 {
2918 switch (op) {
2919 case INDEX_op_goto_ptr:
2920 return C_O0_I1(r);
2921
2922 case INDEX_op_ld8u_i32:
2923 case INDEX_op_ld8u_i64:
2924 case INDEX_op_ld8s_i32:
2925 case INDEX_op_ld8s_i64:
2926 case INDEX_op_ld16u_i32:
2927 case INDEX_op_ld16u_i64:
2928 case INDEX_op_ld16s_i32:
2929 case INDEX_op_ld16s_i64:
2930 case INDEX_op_ld_i32:
2931 case INDEX_op_ld32u_i64:
2932 case INDEX_op_ld32s_i64:
2933 case INDEX_op_ld_i64:
2934 return C_O1_I1(r, r);
2935
2936 case INDEX_op_st8_i32:
2937 case INDEX_op_st8_i64:
2938 return C_O0_I2(qi, r);
2939
2940 case INDEX_op_st16_i32:
2941 case INDEX_op_st16_i64:
2942 case INDEX_op_st_i32:
2943 case INDEX_op_st32_i64:
2944 return C_O0_I2(ri, r);
2945
2946 case INDEX_op_st_i64:
2947 return C_O0_I2(re, r);
2948
2949 case INDEX_op_add_i32:
2950 case INDEX_op_add_i64:
2951 return C_O1_I2(r, r, re);
2952
2953 case INDEX_op_sub_i32:
2954 case INDEX_op_sub_i64:
2955 case INDEX_op_mul_i32:
2956 case INDEX_op_mul_i64:
2957 case INDEX_op_or_i32:
2958 case INDEX_op_or_i64:
2959 case INDEX_op_xor_i32:
2960 case INDEX_op_xor_i64:
2961 return C_O1_I2(r, 0, re);
2962
2963 case INDEX_op_and_i32:
2964 case INDEX_op_and_i64:
2965 return C_O1_I2(r, 0, reZ);
2966
2967 case INDEX_op_andc_i32:
2968 case INDEX_op_andc_i64:
2969 return C_O1_I2(r, r, rI);
2970
2971 case INDEX_op_shl_i32:
2972 case INDEX_op_shl_i64:
2973 case INDEX_op_shr_i32:
2974 case INDEX_op_shr_i64:
2975 case INDEX_op_sar_i32:
2976 case INDEX_op_sar_i64:
2977 return have_bmi2 ? C_O1_I2(r, r, ri) : C_O1_I2(r, 0, ci);
2978
2979 case INDEX_op_rotl_i32:
2980 case INDEX_op_rotl_i64:
2981 case INDEX_op_rotr_i32:
2982 case INDEX_op_rotr_i64:
2983 return C_O1_I2(r, 0, ci);
2984
2985 case INDEX_op_brcond_i32:
2986 case INDEX_op_brcond_i64:
2987 return C_O0_I2(r, re);
2988
2989 case INDEX_op_bswap16_i32:
2990 case INDEX_op_bswap16_i64:
2991 case INDEX_op_bswap32_i32:
2992 case INDEX_op_bswap32_i64:
2993 case INDEX_op_bswap64_i64:
2994 case INDEX_op_neg_i32:
2995 case INDEX_op_neg_i64:
2996 case INDEX_op_not_i32:
2997 case INDEX_op_not_i64:
2998 case INDEX_op_extrh_i64_i32:
2999 return C_O1_I1(r, 0);
3000
3001 case INDEX_op_ext8s_i32:
3002 case INDEX_op_ext8s_i64:
3003 case INDEX_op_ext8u_i32:
3004 case INDEX_op_ext8u_i64:
3005 return C_O1_I1(r, q);
3006
3007 case INDEX_op_ext16s_i32:
3008 case INDEX_op_ext16s_i64:
3009 case INDEX_op_ext16u_i32:
3010 case INDEX_op_ext16u_i64:
3011 case INDEX_op_ext32s_i64:
3012 case INDEX_op_ext32u_i64:
3013 case INDEX_op_ext_i32_i64:
3014 case INDEX_op_extu_i32_i64:
3015 case INDEX_op_extrl_i64_i32:
3016 case INDEX_op_extract_i32:
3017 case INDEX_op_extract_i64:
3018 case INDEX_op_sextract_i32:
3019 case INDEX_op_ctpop_i32:
3020 case INDEX_op_ctpop_i64:
3021 return C_O1_I1(r, r);
3022
3023 case INDEX_op_extract2_i32:
3024 case INDEX_op_extract2_i64:
3025 return C_O1_I2(r, 0, r);
3026
3027 case INDEX_op_deposit_i32:
3028 case INDEX_op_deposit_i64:
3029 return C_O1_I2(Q, 0, Q);
3030
3031 case INDEX_op_setcond_i32:
3032 case INDEX_op_setcond_i64:
3033 return C_O1_I2(q, r, re);
3034
3035 case INDEX_op_movcond_i32:
3036 case INDEX_op_movcond_i64:
3037 return C_O1_I4(r, r, re, r, 0);
3038
3039 case INDEX_op_div2_i32:
3040 case INDEX_op_div2_i64:
3041 case INDEX_op_divu2_i32:
3042 case INDEX_op_divu2_i64:
3043 return C_O2_I3(a, d, 0, 1, r);
3044
3045 case INDEX_op_mulu2_i32:
3046 case INDEX_op_mulu2_i64:
3047 case INDEX_op_muls2_i32:
3048 case INDEX_op_muls2_i64:
3049 return C_O2_I2(a, d, a, r);
3050
3051 case INDEX_op_add2_i32:
3052 case INDEX_op_add2_i64:
3053 case INDEX_op_sub2_i32:
3054 case INDEX_op_sub2_i64:
3055 return C_O2_I4(r, r, 0, 1, re, re);
3056
3057 case INDEX_op_ctz_i32:
3058 case INDEX_op_ctz_i64:
3059 return have_bmi1 ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3060
3061 case INDEX_op_clz_i32:
3062 case INDEX_op_clz_i64:
3063 return have_lzcnt ? C_N1_I2(r, r, rW) : C_N1_I2(r, r, r);
3064
3065 case INDEX_op_qemu_ld_i32:
3066 return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3067 ? C_O1_I1(r, L) : C_O1_I2(r, L, L));
3068
3069 case INDEX_op_qemu_st_i32:
3070 return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3071 ? C_O0_I2(L, L) : C_O0_I3(L, L, L));
3072 case INDEX_op_qemu_st8_i32:
3073 return (TARGET_LONG_BITS <= TCG_TARGET_REG_BITS
3074 ? C_O0_I2(s, L) : C_O0_I3(s, L, L));
3075
3076 case INDEX_op_qemu_ld_i64:
3077 return (TCG_TARGET_REG_BITS == 64 ? C_O1_I1(r, L)
3078 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O2_I1(r, r, L)
3079 : C_O2_I2(r, r, L, L));
3080
3081 case INDEX_op_qemu_st_i64:
3082 return (TCG_TARGET_REG_BITS == 64 ? C_O0_I2(L, L)
3083 : TARGET_LONG_BITS <= TCG_TARGET_REG_BITS ? C_O0_I3(L, L, L)
3084 : C_O0_I4(L, L, L, L));
3085
3086 case INDEX_op_brcond2_i32:
3087 return C_O0_I4(r, r, ri, ri);
3088
3089 case INDEX_op_setcond2_i32:
3090 return C_O1_I4(r, r, r, ri, ri);
3091
3092 case INDEX_op_ld_vec:
3093 case INDEX_op_dupm_vec:
3094 return C_O1_I1(x, r);
3095
3096 case INDEX_op_st_vec:
3097 return C_O0_I2(x, r);
3098
3099 case INDEX_op_add_vec:
3100 case INDEX_op_sub_vec:
3101 case INDEX_op_mul_vec:
3102 case INDEX_op_and_vec:
3103 case INDEX_op_or_vec:
3104 case INDEX_op_xor_vec:
3105 case INDEX_op_andc_vec:
3106 case INDEX_op_ssadd_vec:
3107 case INDEX_op_usadd_vec:
3108 case INDEX_op_sssub_vec:
3109 case INDEX_op_ussub_vec:
3110 case INDEX_op_smin_vec:
3111 case INDEX_op_umin_vec:
3112 case INDEX_op_smax_vec:
3113 case INDEX_op_umax_vec:
3114 case INDEX_op_shlv_vec:
3115 case INDEX_op_shrv_vec:
3116 case INDEX_op_sarv_vec:
3117 case INDEX_op_shls_vec:
3118 case INDEX_op_shrs_vec:
3119 case INDEX_op_sars_vec:
3120 case INDEX_op_rotls_vec:
3121 case INDEX_op_cmp_vec:
3122 case INDEX_op_x86_shufps_vec:
3123 case INDEX_op_x86_blend_vec:
3124 case INDEX_op_x86_packss_vec:
3125 case INDEX_op_x86_packus_vec:
3126 case INDEX_op_x86_vperm2i128_vec:
3127 case INDEX_op_x86_punpckl_vec:
3128 case INDEX_op_x86_punpckh_vec:
3129 #if TCG_TARGET_REG_BITS == 32
3130 case INDEX_op_dup2_vec:
3131 #endif
3132 return C_O1_I2(x, x, x);
3133
3134 case INDEX_op_abs_vec:
3135 case INDEX_op_dup_vec:
3136 case INDEX_op_shli_vec:
3137 case INDEX_op_shri_vec:
3138 case INDEX_op_sari_vec:
3139 case INDEX_op_x86_psrldq_vec:
3140 return C_O1_I1(x, x);
3141
3142 case INDEX_op_x86_vpblendvb_vec:
3143 return C_O1_I3(x, x, x, x);
3144
3145 default:
3146 g_assert_not_reached();
3147 }
3148 }
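/*
 * A rough legend for the constraint letters used above (the authoritative
 * definitions live in tcg-target-con-str.h): 'r' any general register,
 * 'x' any vector register, 'q' a register with a low-byte form, 'Q' one
 * with a high-byte form (eax/ebx/ecx/edx), 'L' and 's' general/byte
 * registers that avoid the softmmu TLB scratch registers, 'a'/'d'
 * specifically eax/edx, and '0'/'1' an input tied to that output.  For
 * immediates, 'i' is anything, 'e' and 'Z' are sign- and zero-extended
 * 32-bit values, and 'W' is a value equal to the operation width (used by
 * clz/ctz).
 */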
3149
3150 int tcg_can_emit_vec_op(TCGOpcode opc, TCGType type, unsigned vece)
3151 {
3152 switch (opc) {
3153 case INDEX_op_add_vec:
3154 case INDEX_op_sub_vec:
3155 case INDEX_op_and_vec:
3156 case INDEX_op_or_vec:
3157 case INDEX_op_xor_vec:
3158 case INDEX_op_andc_vec:
3159 return 1;
3160 case INDEX_op_rotli_vec:
3161 case INDEX_op_cmp_vec:
3162 case INDEX_op_cmpsel_vec:
3163 return -1;
3164
3165 case INDEX_op_shli_vec:
3166 case INDEX_op_shri_vec:
3167 /* We must expand the operation for MO_8. */
3168 return vece == MO_8 ? -1 : 1;
3169
3170 case INDEX_op_sari_vec:
3171 /* We must expand the operation for MO_8. */
3172 if (vece == MO_8) {
3173 return -1;
3174 }
3175 /* We can emulate this for MO_64, but it does not pay off
3176 unless we're producing at least 4 values. */
3177 if (vece == MO_64) {
3178 return type >= TCG_TYPE_V256 ? -1 : 0;
3179 }
3180 return 1;
3181
3182 case INDEX_op_shls_vec:
3183 case INDEX_op_shrs_vec:
3184 return vece >= MO_16;
3185 case INDEX_op_sars_vec:
3186 return vece >= MO_16 && vece <= MO_32;
3187 case INDEX_op_rotls_vec:
3188 return vece >= MO_16 ? -1 : 0;
3189
3190 case INDEX_op_shlv_vec:
3191 case INDEX_op_shrv_vec:
3192 return have_avx2 && vece >= MO_32;
3193 case INDEX_op_sarv_vec:
3194 return have_avx2 && vece == MO_32;
3195 case INDEX_op_rotlv_vec:
3196 case INDEX_op_rotrv_vec:
3197 return have_avx2 && vece >= MO_32 ? -1 : 0;
3198
3199 case INDEX_op_mul_vec:
3200 if (vece == MO_8) {
3201 /* We can expand the operation for MO_8. */
3202 return -1;
3203 }
3204 if (vece == MO_64) {
3205 return 0;
3206 }
3207 return 1;
3208
3209 case INDEX_op_ssadd_vec:
3210 case INDEX_op_usadd_vec:
3211 case INDEX_op_sssub_vec:
3212 case INDEX_op_ussub_vec:
3213 return vece <= MO_16;
3214 case INDEX_op_smin_vec:
3215 case INDEX_op_smax_vec:
3216 case INDEX_op_umin_vec:
3217 case INDEX_op_umax_vec:
3218 case INDEX_op_abs_vec:
3219 return vece <= MO_32;
3220
3221 default:
3222 return 0;
3223 }
3224 }
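/*
 * Return value convention, as elsewhere in TCG: 1 means the operation can be
 * emitted directly for this element size, 0 means it is not supported at
 * all, and -1 means it is supported but must be expanded into simpler
 * operations by tcg_expand_vec_op() below.
 */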
3225
3226 static void expand_vec_shi(TCGType type, unsigned vece, TCGOpcode opc,
3227 TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3228 {
3229 TCGv_vec t1, t2;
3230
3231 tcg_debug_assert(vece == MO_8);
3232
3233 t1 = tcg_temp_new_vec(type);
3234 t2 = tcg_temp_new_vec(type);
3235
3236 /*
3237 * Unpack to W, shift, and repack. Tricky bits:
3238 * (1) Use punpck*bw x,x to produce DDCCBBAA,
3239 * i.e. duplicate in other half of the 16-bit lane.
3240 * (2) For right-shift, add 8 so that the high half of the lane
3241 * becomes zero. For left-shift, and left-rotate, we must
3242 * shift up and down again.
3243 * (3) Step 2 leaves high half zero such that PACKUSWB
3244 * (pack with unsigned saturation) does not modify
3245 * the quantity.
3246 */
3247 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3248 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3249 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3250 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3251
3252 if (opc != INDEX_op_rotli_vec) {
3253 imm += 8;
3254 }
3255 if (opc == INDEX_op_shri_vec) {
3256 tcg_gen_shri_vec(MO_16, t1, t1, imm);
3257 tcg_gen_shri_vec(MO_16, t2, t2, imm);
3258 } else {
3259 tcg_gen_shli_vec(MO_16, t1, t1, imm);
3260 tcg_gen_shli_vec(MO_16, t2, t2, imm);
3261 tcg_gen_shri_vec(MO_16, t1, t1, 8);
3262 tcg_gen_shri_vec(MO_16, t2, t2, 8);
3263 }
3264
3265 vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3266 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3267 tcg_temp_free_vec(t1);
3268 tcg_temp_free_vec(t2);
3269 }
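/*
 * Worked example of the unpack/shift/repack trick, for a logical right shift
 * of a byte B by c: punpcklbw x,x turns each lane into the 16-bit value B:B,
 * shifting that right by c+8 yields 0:(B>>c), and PACKUSWB keeps just the
 * low byte; the zero high byte guarantees the unsigned saturation never
 * triggers.
 */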
3270
3271 static void expand_vec_sari(TCGType type, unsigned vece,
3272 TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3273 {
3274 TCGv_vec t1, t2;
3275
3276 switch (vece) {
3277 case MO_8:
3278 /* Unpack to W, shift, and repack, as in expand_vec_shi. */
3279 t1 = tcg_temp_new_vec(type);
3280 t2 = tcg_temp_new_vec(type);
3281 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3282 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3283 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3284 tcgv_vec_arg(t2), tcgv_vec_arg(v1), tcgv_vec_arg(v1));
3285 tcg_gen_sari_vec(MO_16, t1, t1, imm + 8);
3286 tcg_gen_sari_vec(MO_16, t2, t2, imm + 8);
3287 vec_gen_3(INDEX_op_x86_packss_vec, type, MO_8,
3288 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t2));
3289 tcg_temp_free_vec(t1);
3290 tcg_temp_free_vec(t2);
3291 break;
3292
3293 case MO_64:
3294 if (imm <= 32) {
3295 /*
3296 * We can emulate a small sign extend by performing an arithmetic
3297 * 32-bit shift and overwriting the high half of a 64-bit logical
3298 * shift. Note that the ISA says shift of 32 is valid, but TCG
3299 * does not, so we have to bound the smaller shift -- we get the
3300 * same result in the high half either way.
3301 */
3302 t1 = tcg_temp_new_vec(type);
3303 tcg_gen_sari_vec(MO_32, t1, v1, MIN(imm, 31));
3304 tcg_gen_shri_vec(MO_64, v0, v1, imm);
3305 vec_gen_4(INDEX_op_x86_blend_vec, type, MO_32,
3306 tcgv_vec_arg(v0), tcgv_vec_arg(v0),
3307 tcgv_vec_arg(t1), 0xaa);
3308 tcg_temp_free_vec(t1);
3309 } else {
3310 /* Otherwise we will need to use a compare vs 0 to produce
3311 * the sign-extend, shift and merge.
3312 */
3313 t1 = tcg_const_zeros_vec(type);
3314 tcg_gen_cmp_vec(TCG_COND_GT, MO_64, t1, t1, v1);
3315 tcg_gen_shri_vec(MO_64, v0, v1, imm);
3316 tcg_gen_shli_vec(MO_64, t1, t1, 64 - imm);
3317 tcg_gen_or_vec(MO_64, v0, v0, t1);
3318 tcg_temp_free_vec(t1);
3319 }
3320 break;
3321
3322 default:
3323 g_assert_not_reached();
3324 }
3325 }
3326
3327 static void expand_vec_rotli(TCGType type, unsigned vece,
3328 TCGv_vec v0, TCGv_vec v1, TCGArg imm)
3329 {
3330 TCGv_vec t;
3331
3332 if (vece == MO_8) {
3333 expand_vec_shi(type, vece, INDEX_op_rotli_vec, v0, v1, imm);
3334 return;
3335 }
3336
3337 t = tcg_temp_new_vec(type);
3338 tcg_gen_shli_vec(vece, t, v1, imm);
3339 tcg_gen_shri_vec(vece, v0, v1, (8 << vece) - imm);
3340 tcg_gen_or_vec(vece, v0, v0, t);
3341 tcg_temp_free_vec(t);
3342 }
3343
3344 static void expand_vec_rotls(TCGType type, unsigned vece,
3345 TCGv_vec v0, TCGv_vec v1, TCGv_i32 lsh)
3346 {
3347 TCGv_i32 rsh;
3348 TCGv_vec t;
3349
3350 tcg_debug_assert(vece != MO_8);
3351
3352 t = tcg_temp_new_vec(type);
3353 rsh = tcg_temp_new_i32();
3354
3355 tcg_gen_neg_i32(rsh, lsh);
3356 tcg_gen_andi_i32(rsh, rsh, (8 << vece) - 1);
3357 tcg_gen_shls_vec(vece, t, v1, lsh);
3358 tcg_gen_shrs_vec(vece, v0, v1, rsh);
3359 tcg_gen_or_vec(vece, v0, v0, t);
3360 tcg_temp_free_vec(t);
3361 tcg_temp_free_i32(rsh);
3362 }
3363
3364 static void expand_vec_rotv(TCGType type, unsigned vece, TCGv_vec v0,
3365 TCGv_vec v1, TCGv_vec sh, bool right)
3366 {
3367 TCGv_vec t = tcg_temp_new_vec(type);
3368
3369 tcg_gen_dupi_vec(vece, t, 8 << vece);
3370 tcg_gen_sub_vec(vece, t, t, sh);
3371 if (right) {
3372 tcg_gen_shlv_vec(vece, t, v1, t);
3373 tcg_gen_shrv_vec(vece, v0, v1, sh);
3374 } else {
3375 tcg_gen_shrv_vec(vece, t, v1, t);
3376 tcg_gen_shlv_vec(vece, v0, v1, sh);
3377 }
3378 tcg_gen_or_vec(vece, v0, v0, t);
3379 tcg_temp_free_vec(t);
3380 }
3381
3382 static void expand_vec_mul(TCGType type, unsigned vece,
3383 TCGv_vec v0, TCGv_vec v1, TCGv_vec v2)
3384 {
3385 TCGv_vec t1, t2, t3, t4, zero;
3386
3387 tcg_debug_assert(vece == MO_8);
3388
3389 /*
3390 * Unpack v1 bytes to words, 0 | x.
3391 * Unpack v2 bytes to words, y | 0.
3392 * This leaves the 8-bit result, x * y, with 8 bits of right padding.
3393 * Shift logical right by 8 bits to clear the high 8 bits of each word before
3394 * using an unsigned saturated pack.
3395 *
3396 * The difference between the V64, V128 and V256 cases is merely how
3397 * we distribute the expansion between temporaries.
3398 */
3399 switch (type) {
3400 case TCG_TYPE_V64:
3401 t1 = tcg_temp_new_vec(TCG_TYPE_V128);
3402 t2 = tcg_temp_new_vec(TCG_TYPE_V128);
3403 zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3404 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3405 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3406 vec_gen_3(INDEX_op_x86_punpckl_vec, TCG_TYPE_V128, MO_8,
3407 tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3408 tcg_gen_mul_vec(MO_16, t1, t1, t2);
3409 tcg_gen_shri_vec(MO_16, t1, t1, 8);
3410 vec_gen_3(INDEX_op_x86_packus_vec, TCG_TYPE_V128, MO_8,
3411 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t1));
3412 tcg_temp_free_vec(t1);
3413 tcg_temp_free_vec(t2);
3414 break;
3415
3416 case TCG_TYPE_V128:
3417 case TCG_TYPE_V256:
3418 t1 = tcg_temp_new_vec(type);
3419 t2 = tcg_temp_new_vec(type);
3420 t3 = tcg_temp_new_vec(type);
3421 t4 = tcg_temp_new_vec(type);
3422 zero = tcg_constant_vec(TCG_TYPE_V128, MO_8, 0);
3423 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3424 tcgv_vec_arg(t1), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3425 vec_gen_3(INDEX_op_x86_punpckl_vec, type, MO_8,
3426 tcgv_vec_arg(t2), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3427 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3428 tcgv_vec_arg(t3), tcgv_vec_arg(v1), tcgv_vec_arg(zero));
3429 vec_gen_3(INDEX_op_x86_punpckh_vec, type, MO_8,
3430 tcgv_vec_arg(t4), tcgv_vec_arg(zero), tcgv_vec_arg(v2));
3431 tcg_gen_mul_vec(MO_16, t1, t1, t2);
3432 tcg_gen_mul_vec(MO_16, t3, t3, t4);
3433 tcg_gen_shri_vec(MO_16, t1, t1, 8);
3434 tcg_gen_shri_vec(MO_16, t3, t3, 8);
3435 vec_gen_3(INDEX_op_x86_packus_vec, type, MO_8,
3436 tcgv_vec_arg(v0), tcgv_vec_arg(t1), tcgv_vec_arg(t3));
3437 tcg_temp_free_vec(t1);
3438 tcg_temp_free_vec(t2);
3439 tcg_temp_free_vec(t3);
3440 tcg_temp_free_vec(t4);
3441 break;
3442
3443 default:
3444 g_assert_not_reached();
3445 }
3446 }
3447
3448 static bool expand_vec_cmp_noinv(TCGType type, unsigned vece, TCGv_vec v0,
3449 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3450 {
3451 enum {
3452 NEED_INV = 1,
3453 NEED_SWAP = 2,
3454 NEED_BIAS = 4,
3455 NEED_UMIN = 8,
3456 NEED_UMAX = 16,
3457 };
3458 TCGv_vec t1, t2, t3;
3459 uint8_t fixup;
3460
3461 switch (cond) {
3462 case TCG_COND_EQ:
3463 case TCG_COND_GT:
3464 fixup = 0;
3465 break;
3466 case TCG_COND_NE:
3467 case TCG_COND_LE:
3468 fixup = NEED_INV;
3469 break;
3470 case TCG_COND_LT:
3471 fixup = NEED_SWAP;
3472 break;
3473 case TCG_COND_GE:
3474 fixup = NEED_SWAP | NEED_INV;
3475 break;
3476 case TCG_COND_LEU:
3477 if (vece <= MO_32) {
3478 fixup = NEED_UMIN;
3479 } else {
3480 fixup = NEED_BIAS | NEED_INV;
3481 }
3482 break;
3483 case TCG_COND_GTU:
3484 if (vece <= MO_32) {
3485 fixup = NEED_UMIN | NEED_INV;
3486 } else {
3487 fixup = NEED_BIAS;
3488 }
3489 break;
3490 case TCG_COND_GEU:
3491 if (vece <= MO_32) {
3492 fixup = NEED_UMAX;
3493 } else {
3494 fixup = NEED_BIAS | NEED_SWAP | NEED_INV;
3495 }
3496 break;
3497 case TCG_COND_LTU:
3498 if (vece <= MO_32) {
3499 fixup = NEED_UMAX | NEED_INV;
3500 } else {
3501 fixup = NEED_BIAS | NEED_SWAP;
3502 }
3503 break;
3504 default:
3505 g_assert_not_reached();
3506 }
3507
3508 if (fixup & NEED_INV) {
3509 cond = tcg_invert_cond(cond);
3510 }
3511 if (fixup & NEED_SWAP) {
3512 t1 = v1, v1 = v2, v2 = t1;
3513 cond = tcg_swap_cond(cond);
3514 }
3515
3516 t1 = t2 = NULL;
3517 if (fixup & (NEED_UMIN | NEED_UMAX)) {
3518 t1 = tcg_temp_new_vec(type);
3519 if (fixup & NEED_UMIN) {
3520 tcg_gen_umin_vec(vece, t1, v1, v2);
3521 } else {
3522 tcg_gen_umax_vec(vece, t1, v1, v2);
3523 }
3524 v2 = t1;
3525 cond = TCG_COND_EQ;
3526 } else if (fixup & NEED_BIAS) {
3527 t1 = tcg_temp_new_vec(type);
3528 t2 = tcg_temp_new_vec(type);
3529 t3 = tcg_constant_vec(type, vece, 1ull << ((8 << vece) - 1));
3530 tcg_gen_sub_vec(vece, t1, v1, t3);
3531 tcg_gen_sub_vec(vece, t2, v2, t3);
3532 v1 = t1;
3533 v2 = t2;
3534 cond = tcg_signed_cond(cond);
3535 }
3536
3537 tcg_debug_assert(cond == TCG_COND_EQ || cond == TCG_COND_GT);
3538 /* Expand directly; do not recurse. */
3539 vec_gen_4(INDEX_op_cmp_vec, type, vece,
3540 tcgv_vec_arg(v0), tcgv_vec_arg(v1), tcgv_vec_arg(v2), cond);
3541
3542 if (t1) {
3543 tcg_temp_free_vec(t1);
3544 if (t2) {
3545 tcg_temp_free_vec(t2);
3546 }
3547 }
3548 return fixup & NEED_INV;
3549 }
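/*
 * The unsigned fixups above rely on two identities: where PMINU/PMAXU exist,
 * x <=u y iff umin(x, y) == x and x >=u y iff umax(x, y) == x, which reduces
 * the test to PCMPEQ; otherwise, subtracting the sign bit (0x80...0) from
 * both operands converts the unsigned comparison into the signed form that
 * PCMPGT implements.
 */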
3550
3551 static void expand_vec_cmp(TCGType type, unsigned vece, TCGv_vec v0,
3552 TCGv_vec v1, TCGv_vec v2, TCGCond cond)
3553 {
3554 if (expand_vec_cmp_noinv(type, vece, v0, v1, v2, cond)) {
3555 tcg_gen_not_vec(vece, v0, v0);
3556 }
3557 }
3558
3559 static void expand_vec_cmpsel(TCGType type, unsigned vece, TCGv_vec v0,
3560 TCGv_vec c1, TCGv_vec c2,
3561 TCGv_vec v3, TCGv_vec v4, TCGCond cond)
3562 {
3563 TCGv_vec t = tcg_temp_new_vec(type);
3564
3565 if (expand_vec_cmp_noinv(type, vece, t, c1, c2, cond)) {
3566 /* Invert the sense of the compare by swapping arguments. */
3567 TCGv_vec x;
3568 x = v3, v3 = v4, v4 = x;
3569 }
3570 vec_gen_4(INDEX_op_x86_vpblendvb_vec, type, vece,
3571 tcgv_vec_arg(v0), tcgv_vec_arg(v4),
3572 tcgv_vec_arg(v3), tcgv_vec_arg(t));
3573 tcg_temp_free_vec(t);
3574 }
3575
3576 void tcg_expand_vec_op(TCGOpcode opc, TCGType type, unsigned vece,
3577 TCGArg a0, ...)
3578 {
3579 va_list va;
3580 TCGArg a2;
3581 TCGv_vec v0, v1, v2, v3, v4;
3582
3583 va_start(va, a0);
3584 v0 = temp_tcgv_vec(arg_temp(a0));
3585 v1 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3586 a2 = va_arg(va, TCGArg);
3587
3588 switch (opc) {
3589 case INDEX_op_shli_vec:
3590 case INDEX_op_shri_vec:
3591 expand_vec_shi(type, vece, opc, v0, v1, a2);
3592 break;
3593
3594 case INDEX_op_sari_vec:
3595 expand_vec_sari(type, vece, v0, v1, a2);
3596 break;
3597
3598 case INDEX_op_rotli_vec:
3599 expand_vec_rotli(type, vece, v0, v1, a2);
3600 break;
3601
3602 case INDEX_op_rotls_vec:
3603 expand_vec_rotls(type, vece, v0, v1, temp_tcgv_i32(arg_temp(a2)));
3604 break;
3605
3606 case INDEX_op_rotlv_vec:
3607 v2 = temp_tcgv_vec(arg_temp(a2));
3608 expand_vec_rotv(type, vece, v0, v1, v2, false);
3609 break;
3610 case INDEX_op_rotrv_vec:
3611 v2 = temp_tcgv_vec(arg_temp(a2));
3612 expand_vec_rotv(type, vece, v0, v1, v2, true);
3613 break;
3614
3615 case INDEX_op_mul_vec:
3616 v2 = temp_tcgv_vec(arg_temp(a2));
3617 expand_vec_mul(type, vece, v0, v1, v2);
3618 break;
3619
3620 case INDEX_op_cmp_vec:
3621 v2 = temp_tcgv_vec(arg_temp(a2));
3622 expand_vec_cmp(type, vece, v0, v1, v2, va_arg(va, TCGArg));
3623 break;
3624
3625 case INDEX_op_cmpsel_vec:
3626 v2 = temp_tcgv_vec(arg_temp(a2));
3627 v3 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3628 v4 = temp_tcgv_vec(arg_temp(va_arg(va, TCGArg)));
3629 expand_vec_cmpsel(type, vece, v0, v1, v2, v3, v4, va_arg(va, TCGArg));
3630 break;
3631
3632 default:
3633 break;
3634 }
3635
3636 va_end(va);
3637 }
3638
3639 static const int tcg_target_callee_save_regs[] = {
3640 #if TCG_TARGET_REG_BITS == 64
3641 TCG_REG_RBP,
3642 TCG_REG_RBX,
3643 #if defined(_WIN64)
3644 TCG_REG_RDI,
3645 TCG_REG_RSI,
3646 #endif
3647 TCG_REG_R12,
3648 TCG_REG_R13,
3649 TCG_REG_R14, /* Currently used for the global env. */
3650 TCG_REG_R15,
3651 #else
3652 TCG_REG_EBP, /* Currently used for the global env. */
3653 TCG_REG_EBX,
3654 TCG_REG_ESI,
3655 TCG_REG_EDI,
3656 #endif
3657 };
3658
3659 /* Compute frame size via macros, to share between tcg_target_qemu_prologue
3660 and tcg_register_jit. */
3661
3662 #define PUSH_SIZE \
3663 ((1 + ARRAY_SIZE(tcg_target_callee_save_regs)) \
3664 * (TCG_TARGET_REG_BITS / 8))
3665
3666 #define FRAME_SIZE \
3667 ((PUSH_SIZE \
3668 + TCG_STATIC_CALL_ARGS_SIZE \
3669 + CPU_TEMP_BUF_NLONGS * sizeof(long) \
3670 + TCG_TARGET_STACK_ALIGN - 1) \
3671 & ~(TCG_TARGET_STACK_ALIGN - 1))
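/*
 * Concrete illustration, assuming the usual values of the constants defined
 * elsewhere (TCG_STATIC_CALL_ARGS_SIZE == 128, CPU_TEMP_BUF_NLONGS == 128,
 * TCG_TARGET_STACK_ALIGN == 16): on x86-64 Linux there are 6 callee-saved
 * registers, so PUSH_SIZE = (1 + 6) * 8 = 56 bytes including the return
 * address, and FRAME_SIZE rounds 56 + 128 + 1024 up to 1216 bytes.
 */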
3672
3673 /* Generate global QEMU prologue and epilogue code */
3674 static void tcg_target_qemu_prologue(TCGContext *s)
3675 {
3676 int i, stack_addend;
3677
3678 /* TB prologue */
3679
3680 /* Reserve some stack space, also for TCG temps. */
3681 stack_addend = FRAME_SIZE - PUSH_SIZE;
3682 tcg_set_frame(s, TCG_REG_CALL_STACK, TCG_STATIC_CALL_ARGS_SIZE,
3683 CPU_TEMP_BUF_NLONGS * sizeof(long));
3684
3685 /* Save all callee saved registers. */
3686 for (i = 0; i < ARRAY_SIZE(tcg_target_callee_save_regs); i++) {
3687 tcg_out_push(s, tcg_target_callee_save_regs[i]);
3688 }
3689
3690 #if TCG_TARGET_REG_BITS == 32
3691 tcg_out_ld(s, TCG_TYPE_PTR, TCG_AREG0, TCG_REG_ESP,
3692 (ARRAY_SIZE(tcg_target_callee_save_regs) + 1) * 4);
3693 tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3694 /* jmp *tb. */
3695 tcg_out_modrm_offset(s, OPC_GRP5, EXT5_JMPN_Ev, TCG_REG_ESP,
3696 (ARRAY_SIZE(tcg_target_callee_save_regs) + 2) * 4
3697 + stack_addend);
3698 #else
3699 # if !defined(CONFIG_SOFTMMU) && TCG_TARGET_REG_BITS == 64
3700 if (guest_base) {
3701 int seg = setup_guest_base_seg();
3702 if (seg != 0) {
3703 x86_guest_base_seg = seg;
3704 } else if (guest_base == (int32_t)guest_base) {
3705 x86_guest_base_offset = guest_base;
3706 } else {
3707 /* Choose R12 because, as a base, it requires a SIB byte. */
3708 x86_guest_base_index = TCG_REG_R12;
3709 tcg_out_movi(s, TCG_TYPE_PTR, x86_guest_base_index, guest_base);
3710 tcg_regset_set_reg(s->reserved_regs, x86_guest_base_index);
3711 }
3712 }
3713 # endif
3714 tcg_out_mov(s, TCG_TYPE_PTR, TCG_AREG0, tcg_target_call_iarg_regs[0]);
3715 tcg_out_addi(s, TCG_REG_ESP, -stack_addend);
3716 /* jmp *tb. */
3717 tcg_out_modrm(s, OPC_GRP5, EXT5_JMPN_Ev, tcg_target_call_iarg_regs[1]);
3718 #endif
3719
3720 /*
3721 * Return path for goto_ptr. Set return value to 0, a-la exit_tb,
3722 * and fall through to the rest of the epilogue.
3723 */
3724 tcg_code_gen_epilogue = tcg_splitwx_to_rx(s->code_ptr);
3725 tcg_out_movi(s, TCG_TYPE_REG, TCG_REG_EAX, 0);
3726
3727 /* TB epilogue */
3728 tb_ret_addr = tcg_splitwx_to_rx(s->code_ptr);
3729
3730 tcg_out_addi(s, TCG_REG_CALL_STACK, stack_addend);
3731
3732 if (have_avx2) {
3733 tcg_out_vex_opc(s, OPC_VZEROUPPER, 0, 0, 0, 0);
3734 }
3735 for (i = ARRAY_SIZE(tcg_target_callee_save_regs) - 1; i >= 0; i--) {
3736 tcg_out_pop(s, tcg_target_callee_save_regs[i]);
3737 }
3738 tcg_out_opc(s, OPC_RET, 0, 0, 0);
3739 }
3740
3741 static void tcg_out_nop_fill(tcg_insn_unit *p, int count)
3742 {
3743 memset(p, 0x90, count);
3744 }
3745
3746 static void tcg_target_init(TCGContext *s)
3747 {
3748 #ifdef CONFIG_CPUID_H
3749 unsigned a, b, c, d, b7 = 0;
3750 int max = __get_cpuid_max(0, 0);
3751
3752 if (max >= 7) {
3753 /* BMI1 is available on AMD Piledriver and Intel Haswell CPUs. */
3754 __cpuid_count(7, 0, a, b7, c, d);
3755 have_bmi1 = (b7 & bit_BMI) != 0;
3756 have_bmi2 = (b7 & bit_BMI2) != 0;
3757 }
3758
3759 if (max >= 1) {
3760 __cpuid(1, a, b, c, d);
3761 #ifndef have_cmov
3762 /* For 32-bit, it is almost certain that the host supports cmov,
3763 but we still need to check. If cmov is not available, we fall
3764 back to a small forward branch. */
3765 have_cmov = (d & bit_CMOV) != 0;
3766 #endif
3767
3768 /* MOVBE is only available on Intel Atom and Haswell CPUs, so we
3769 need to probe for it. */
3770 have_movbe = (c & bit_MOVBE) != 0;
3771 have_popcnt = (c & bit_POPCNT) != 0;
3772
3773 /* There are a number of things we must check before we can be
3774 sure of not hitting an invalid opcode. */
3775 if (c & bit_OSXSAVE) {
3776 unsigned xcrl, xcrh;
3777 /* The xgetbv instruction is not recognized by older assemblers,
3778 * so we encode the instruction manually.
3779 */
3780 asm(".byte 0x0f, 0x01, 0xd0" : "=a" (xcrl), "=d" (xcrh) : "c" (0));
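/* XCR0 bit 1 is XMM state, bit 2 is YMM state; both must be enabled
   by the OS before AVX instructions may be used. */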
3781 if ((xcrl & 6) == 6) {
3782 have_avx1 = (c & bit_AVX) != 0;
3783 have_avx2 = (b7 & bit_AVX2) != 0;
3784 }
3785 }
3786 }
3787
3788 max = __get_cpuid_max(0x80000000, 0);
3789 if (max >= 1) {
3790 __cpuid(0x80000001, a, b, c, d);
3791 /* LZCNT was introduced with AMD Barcelona and Intel Haswell CPUs. */
3792 have_lzcnt = (c & bit_LZCNT) != 0;
3793 }
3794 #endif /* CONFIG_CPUID_H */
3795
3796 tcg_target_available_regs[TCG_TYPE_I32] = ALL_GENERAL_REGS;
3797 if (TCG_TARGET_REG_BITS == 64) {
3798 tcg_target_available_regs[TCG_TYPE_I64] = ALL_GENERAL_REGS;
3799 }
3800 if (have_avx1) {
3801 tcg_target_available_regs[TCG_TYPE_V64] = ALL_VECTOR_REGS;
3802 tcg_target_available_regs[TCG_TYPE_V128] = ALL_VECTOR_REGS;
3803 }
3804 if (have_avx2) {
3805 tcg_target_available_regs[TCG_TYPE_V256] = ALL_VECTOR_REGS;
3806 }
3807
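/*
 * Call-clobbered set: every vector register plus the usual integer
 * argument/scratch registers.  The callee-saved integer registers are
 * deliberately omitted, so values held in them survive helper calls.
 */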
3808 tcg_target_call_clobber_regs = ALL_VECTOR_REGS;
3809 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EAX);
3810 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_EDX);
3811 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_ECX);
3812 if (TCG_TARGET_REG_BITS == 64) {
3813 #if !defined(_WIN64)
3814 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RDI);
3815 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_RSI);
3816 #endif
3817 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R8);
3818 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R9);
3819 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R10);
3820 tcg_regset_set_reg(tcg_target_call_clobber_regs, TCG_REG_R11);
3821 }
3822
3823 s->reserved_regs = 0;
3824 tcg_regset_set_reg(s->reserved_regs, TCG_REG_CALL_STACK);
3825 }
3826
3827 typedef struct {
3828 DebugFrameHeader h;
3829 uint8_t fde_def_cfa[4];
3830 uint8_t fde_reg_ofs[14];
3831 } DebugFrame;
3832
3833 /* We're expecting a 2-byte uleb128 encoded value. */
3834 QEMU_BUILD_BUG_ON(FRAME_SIZE >= (1 << 14));
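/* FRAME_SIZE is emitted below as exactly two uleb128 bytes,
   (FRAME_SIZE & 0x7f) | 0x80 followed by FRAME_SIZE >> 7, which caps
   the representable size at (1 << 14) - 1; hence the assertion above. */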
3835
3836 #if !defined(__ELF__)
3837 /* Host machine without ELF. */
3838 #elif TCG_TARGET_REG_BITS == 64
3839 #define ELF_HOST_MACHINE EM_X86_64
3840 static const DebugFrame debug_frame = {
3841 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3842 .h.cie.id = -1,
3843 .h.cie.version = 1,
3844 .h.cie.code_align = 1,
3845 .h.cie.data_align = 0x78, /* sleb128 -8 */
3846 .h.cie.return_column = 16,
3847
3848 /* Total FDE size does not include the "len" member. */
3849 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3850
3851 .fde_def_cfa = {
3852 12, 7, /* DW_CFA_def_cfa %rsp, ... */
3853 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */
3854 (FRAME_SIZE >> 7)
3855 },
3856 .fde_reg_ofs = {
3857 0x90, 1, /* DW_CFA_offset, %rip, -8 */
3858 /* The following ordering must match tcg_target_callee_save_regs. */
3859 0x86, 2, /* DW_CFA_offset, %rbp, -16 */
3860 0x83, 3, /* DW_CFA_offset, %rbx, -24 */
3861 0x8c, 4, /* DW_CFA_offset, %r12, -32 */
3862 0x8d, 5, /* DW_CFA_offset, %r13, -40 */
3863 0x8e, 6, /* DW_CFA_offset, %r14, -48 */
3864 0x8f, 7, /* DW_CFA_offset, %r15, -56 */
3865 }
3866 };
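/*
 * Decoding aid: each fde_reg_ofs pair is DW_CFA_offset (0x80 or'd with
 * the DWARF register number) followed by a factored offset, e.g.
 * 0x86, 2 is register 6 (%rbp) at 2 * data_align = -16 from the CFA,
 * matching the push order in tcg_target_qemu_prologue.
 */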
3867 #else
3868 #define ELF_HOST_MACHINE EM_386
3869 static const DebugFrame debug_frame = {
3870 .h.cie.len = sizeof(DebugFrameCIE)-4, /* length after .len member */
3871 .h.cie.id = -1,
3872 .h.cie.version = 1,
3873 .h.cie.code_align = 1,
3874 .h.cie.data_align = 0x7c, /* sleb128 -4 */
3875 .h.cie.return_column = 8,
3876
3877 /* Total FDE size does not include the "len" member. */
3878 .h.fde.len = sizeof(DebugFrame) - offsetof(DebugFrame, h.fde.cie_offset),
3879
3880 .fde_def_cfa = {
3881 12, 4, /* DW_CFA_def_cfa %esp, ... */
3882 (FRAME_SIZE & 0x7f) | 0x80, /* ... uleb128 FRAME_SIZE */
3883 (FRAME_SIZE >> 7)
3884 },
3885 .fde_reg_ofs = {
3886 0x88, 1, /* DW_CFA_offset, %eip, -4 */
3887 /* The following ordering must match tcg_target_callee_save_regs. */
3888 0x85, 2, /* DW_CFA_offset, %ebp, -8 */
3889 0x83, 3, /* DW_CFA_offset, %ebx, -12 */
3890 0x86, 4, /* DW_CFA_offset, %esi, -16 */
3891 0x87, 5, /* DW_CFA_offset, %edi, -20 */
3892 }
3893 };
3894 #endif
3895
3896 #if defined(ELF_HOST_MACHINE)
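/* Hand the generated-code region and the unwind description above to
   the common TCG code, which exposes them through the GDB JIT
   interface so debuggers can unwind through generated code. */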
3897 void tcg_register_jit(const void *buf, size_t buf_size)
3898 {
3899 tcg_register_jit_int(buf, buf_size, &debug_frame, sizeof(debug_frame));
3900 }
3901 #endif