/** @file
 * IPRT - Assembly Routines for Optimizing some Integer Math Operations.
 */

/*
 * Copyright (C) 2006-2016 Oracle Corporation
 *
 * This file is part of VirtualBox Open Source Edition (OSE), as
 * available from http://www.virtualbox.org. This file is free software;
 * you can redistribute it and/or modify it under the terms of the GNU
 * General Public License (GPL) as published by the Free Software
 * Foundation, in version 2 as it comes in the "COPYING" file of the
 * VirtualBox OSE distribution. VirtualBox OSE is distributed in the
 * hope that it will be useful, but WITHOUT ANY WARRANTY of any kind.
 *
 * The contents of this file may alternatively be used under the terms
 * of the Common Development and Distribution License Version 1.0
 * (CDDL) only, as it comes in the "COPYING.CDDL" file of the
 * VirtualBox OSE distribution, in which case the provisions of the
 * CDDL are applicable instead of those of the GPL.
 *
 * You may elect to license modified versions of this file under the
 * terms and conditions of either the GPL or the CDDL or both.
 */

#ifndef ___iprt_asm_math_h
#define ___iprt_asm_math_h

#include <iprt/types.h>

#if defined(_MSC_VER) && RT_INLINE_ASM_USES_INTRIN
# pragma warning(push)
# pragma warning(disable:4668) /* Several incorrect __cplusplus uses. */
# pragma warning(disable:4255) /* Incorrect __slwpcb prototype. */
# include <intrin.h>
# pragma warning(pop)
/* Emit the intrinsics at all optimization levels. */
# pragma intrinsic(__emul)
# pragma intrinsic(__emulu)
# ifdef RT_ARCH_AMD64
#  pragma intrinsic(_mul128)
#  pragma intrinsic(_umul128)
# endif
#endif


/** @defgroup grp_rt_asm_math Integer Math Optimizations
 * @ingroup grp_rt_asm
 * @{ */

/**
 * Multiplies two unsigned 32-bit values, returning an unsigned 64-bit result.
 *
 * @returns u32F1 * u32F2.
 */
#if RT_INLINE_ASM_EXTERNAL && !RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_X86)
DECLASM(uint64_t) ASMMult2xU32RetU64(uint32_t u32F1, uint32_t u32F2);
#else
DECLINLINE(uint64_t) ASMMult2xU32RetU64(uint32_t u32F1, uint32_t u32F2)
{
# ifdef RT_ARCH_X86
    uint64_t u64;
#  if RT_INLINE_ASM_GNU_STYLE
    __asm__ __volatile__("mull %%edx"
                         : "=A" (u64)
                         : "a" (u32F2), "d" (u32F1));
#  elif RT_INLINE_ASM_USES_INTRIN
    u64 = __emulu(u32F1, u32F2);
#  else
    __asm
    {
        mov     edx, [u32F1]
        mov     eax, [u32F2]
        mul     edx
        mov     dword ptr [u64], eax
        mov     dword ptr [u64 + 4], edx
    }
#  endif
    return u64;
# else  /* generic: */
    return (uint64_t)u32F1 * u32F2;
# endif
}
#endif
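
/* Usage sketch (illustrative, not part of the original header): the point of
 * the routine is getting the full 64-bit product of two 32-bit factors, e.g.:
 *     uint64_t const u64Prod = ASMMult2xU32RetU64(UINT32_MAX, UINT32_MAX);
 *     Assert(u64Prod == UINT64_C(0xfffffffe00000001));
 */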


/**
 * Multiplies two signed 32-bit values, returning a signed 64-bit result.
 *
 * @returns i32F1 * i32F2.
 */
#if RT_INLINE_ASM_EXTERNAL && !RT_INLINE_ASM_USES_INTRIN && defined(RT_ARCH_X86)
DECLASM(int64_t) ASMMult2xS32RetS64(int32_t i32F1, int32_t i32F2);
#else
DECLINLINE(int64_t) ASMMult2xS32RetS64(int32_t i32F1, int32_t i32F2)
{
# ifdef RT_ARCH_X86
    int64_t i64;
#  if RT_INLINE_ASM_GNU_STYLE
    __asm__ __volatile__("imull %%edx"
                         : "=A" (i64)
                         : "a" (i32F2), "d" (i32F1));
#  elif RT_INLINE_ASM_USES_INTRIN
    i64 = __emul(i32F1, i32F2);
#  else
    __asm
    {
        mov     edx, [i32F1]
        mov     eax, [i32F2]
        imul    edx
        mov     dword ptr [i64], eax
        mov     dword ptr [i64 + 4], edx
    }
#  endif
    return i64;
# else  /* generic: */
    return (int64_t)i32F1 * i32F2;
# endif
}
#endif
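
/* Usage sketch (illustrative, not part of the original header): the widening
 * multiply avoids the classic INT32_MIN * -1 overflow of a 32-bit imul:
 *     int64_t const i64Prod = ASMMult2xS32RetS64(INT32_MIN, -1);
 *     Assert(i64Prod == INT64_C(2147483648));
 */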


/**
 * Multiplies two unsigned 64-bit values, returning the full 128-bit product in
 * two 64-bit halves.
 *
 * @returns The low 64 bits of the product.
 * @param   u64F1       The first factor.
 * @param   u64F2       The second factor.
 * @param   pu64ProdHi  Where to store the high 64 bits of the product.
 */
#if ARCH_BITS == 64
DECLINLINE(uint64_t) ASMMult2xU64Ret2xU64(uint64_t u64F1, uint64_t u64F2, uint64_t *pu64ProdHi)
{
# if defined(RT_ARCH_AMD64) && (RT_INLINE_ASM_GNU_STYLE || RT_INLINE_ASM_USES_INTRIN)
#  if RT_INLINE_ASM_GNU_STYLE
    uint64_t u64Low, u64High;
    __asm__ __volatile__("mulq %%rdx"
                         : "=a" (u64Low), "=d" (u64High)
                         : "0" (u64F1), "1" (u64F2));
    *pu64ProdHi = u64High;
    return u64Low;
#  elif RT_INLINE_ASM_USES_INTRIN
    return _umul128(u64F1, u64F2, pu64ProdHi);
#  else
#   error "Unsupported inline assembly / intrinsics combination."
#  endif
# else  /* generic: */
    /*
     * F1 * F2 = Prod
     * -- --
     * ab * cd = b*d + a*d*10 + b*c*10 + a*c*100
     *
     * Where a, b, c and d are 'digits', and 10 is max digit + 1.
     *
     * Our digits are 32-bit wide, so instead of 10 we multiply by 4G.
     * Prod = F1.s.Lo*F2.s.Lo + F1.s.Hi*F2.s.Lo*4G
     *      + F1.s.Lo*F2.s.Hi*4G + F1.s.Hi*F2.s.Hi*4G*4G
     */
    RTUINT128U Prod;
    RTUINT64U  Tmp1;
    uint64_t   u64Tmp;
    RTUINT64U  F1, F2;
    F1.u = u64F1;
    F2.u = u64F2;

    Prod.s.Lo = ASMMult2xU32RetU64(F1.s.Lo, F2.s.Lo);

    Tmp1.u  = ASMMult2xU32RetU64(F1.s.Hi, F2.s.Lo);
    u64Tmp  = (uint64_t)Prod.DWords.dw1 + Tmp1.s.Lo;
    Prod.DWords.dw1 = (uint32_t)u64Tmp;
    Prod.s.Hi  = Tmp1.s.Hi;
    Prod.s.Hi += u64Tmp >> 32; /* carry */

    Tmp1.u  = ASMMult2xU32RetU64(F1.s.Lo, F2.s.Hi);
    u64Tmp  = (uint64_t)Prod.DWords.dw1 + Tmp1.s.Lo;
    Prod.DWords.dw1 = (uint32_t)u64Tmp;
    u64Tmp >>= 32; /* carry */
    u64Tmp += Prod.DWords.dw2;
    u64Tmp += Tmp1.s.Hi;
    Prod.DWords.dw2  = (uint32_t)u64Tmp;
    Prod.DWords.dw3 += u64Tmp >> 32; /* carry */

    Prod.s.Hi += ASMMult2xU32RetU64(F1.s.Hi, F2.s.Hi);
    *pu64ProdHi = Prod.s.Hi;
    return Prod.s.Lo;
# endif
}
#endif
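
/* Usage sketch (illustrative, 64-bit targets only): squaring UINT64_MAX gives
 * a 128-bit result with high part 0xfffffffffffffffe and low part 1:
 *     uint64_t u64Hi;
 *     uint64_t const u64Lo = ASMMult2xU64Ret2xU64(UINT64_MAX, UINT64_MAX, &u64Hi);
 *     Assert(u64Lo == 1 && u64Hi == UINT64_C(0xfffffffffffffffe));
 */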


/**
 * Divides a 64-bit unsigned value by a 32-bit unsigned value, returning an
 * unsigned 32-bit result.
 *
 * @returns u64 / u32.
 *
 * @remarks It is important that the quotient fits in 32 bits, or the x86 DIV
 *          instruction will raise a hardware exception.
 */
#if RT_INLINE_ASM_EXTERNAL && defined(RT_ARCH_X86)
DECLASM(uint32_t) ASMDivU64ByU32RetU32(uint64_t u64, uint32_t u32);
#else
DECLINLINE(uint32_t) ASMDivU64ByU32RetU32(uint64_t u64, uint32_t u32)
{
# ifdef RT_ARCH_X86
#  if RT_INLINE_ASM_GNU_STYLE
    RTCCUINTREG uDummy;
    __asm__ __volatile__("divl %3"
                         : "=a" (u32), "=d" (uDummy)
                         : "A" (u64), "r" (u32));
#  else
    __asm
    {
        mov     eax, dword ptr [u64]
        mov     edx, dword ptr [u64 + 4]
        mov     ecx, [u32]
        div     ecx
        mov     [u32], eax
    }
#  endif
    return u32;
# else  /* generic: */
    return (uint32_t)(u64 / u32);
# endif
}
#endif
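
/* Usage sketch (illustrative, not part of the original header): the quotient
 * must fit in 32 bits; UINT64_C(0xffffffff0) / 16 == UINT32_MAX is about the
 * largest quotient that is still safe:
 *     uint32_t const u32Quot = ASMDivU64ByU32RetU32(UINT64_C(0xffffffff0), 16);
 *     Assert(u32Quot == UINT32_MAX);
 */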


/**
 * Divides a 64-bit signed value by a 32-bit signed value, returning a signed
 * 32-bit result.
 *
 * @returns i64 / i32.
 *
 * @remarks It is important that the quotient fits in 32 bits, or the x86 IDIV
 *          instruction will raise a hardware exception.
 */
#if RT_INLINE_ASM_EXTERNAL && defined(RT_ARCH_X86)
DECLASM(int32_t) ASMDivS64ByS32RetS32(int64_t i64, int32_t i32);
#else
DECLINLINE(int32_t) ASMDivS64ByS32RetS32(int64_t i64, int32_t i32)
{
# ifdef RT_ARCH_X86
#  if RT_INLINE_ASM_GNU_STYLE
    RTCCUINTREG iDummy;
    __asm__ __volatile__("idivl %3"
                         : "=a" (i32), "=d" (iDummy)
                         : "A" (i64), "r" (i32));
#  else
    __asm
    {
        mov     eax, dword ptr [i64]
        mov     edx, dword ptr [i64 + 4]
        mov     ecx, [i32]
        idiv    ecx
        mov     [i32], eax
    }
#  endif
    return i32;
# else  /* generic: */
    return (int32_t)(i64 / i32);
# endif
}
#endif
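
/* Usage sketch (illustrative): a 64-bit dividend whose quotient still fits in
 * a signed 32-bit value:
 *     int32_t const i32Quot = ASMDivS64ByS32RetS32(INT64_C(-6000000000), 3);
 *     Assert(i32Quot == -2000000000);
 */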


/**
 * Divides a 64-bit unsigned value by a 32-bit unsigned value, returning the
 * 32-bit remainder.
 *
 * @returns u64 % u32.
 *
 * @remarks It is important that the quotient u64 / u32 fits in 32 bits, or the
 *          x86 DIV instruction will raise a hardware exception even though only
 *          the remainder is wanted.
 */
#if RT_INLINE_ASM_EXTERNAL && defined(RT_ARCH_X86)
DECLASM(uint32_t) ASMModU64ByU32RetU32(uint64_t u64, uint32_t u32);
#else
DECLINLINE(uint32_t) ASMModU64ByU32RetU32(uint64_t u64, uint32_t u32)
{
# ifdef RT_ARCH_X86
#  if RT_INLINE_ASM_GNU_STYLE
    RTCCUINTREG uDummy;
    __asm__ __volatile__("divl %3"
                         : "=a" (uDummy), "=d" (u32)
                         : "A" (u64), "r" (u32));
#  else
    __asm
    {
        mov     eax, dword ptr [u64]
        mov     edx, dword ptr [u64 + 4]
        mov     ecx, [u32]
        div     ecx
        mov     [u32], edx
    }
#  endif
    return u32;
# else  /* generic: */
    return (uint32_t)(u64 % u32);
# endif
}
#endif
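
/* Usage sketch (illustrative): the quotient (here 1000000000) must fit in 32
 * bits for the x86 path to be safe:
 *     uint32_t const u32Rem = ASMModU64ByU32RetU32(UINT64_C(10000000007), 10);
 *     Assert(u32Rem == 7);
 */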


/**
 * Divides a 64-bit signed value by a 32-bit signed value, returning the 32-bit
 * remainder.
 *
 * @returns i64 % i32.
 *
 * @remarks It is important that the quotient i64 / i32 fits in 32 bits, or the
 *          x86 IDIV instruction will raise a hardware exception even though
 *          only the remainder is wanted.
 */
#if RT_INLINE_ASM_EXTERNAL && defined(RT_ARCH_X86)
DECLASM(int32_t) ASMModS64ByS32RetS32(int64_t i64, int32_t i32);
#else
DECLINLINE(int32_t) ASMModS64ByS32RetS32(int64_t i64, int32_t i32)
{
# ifdef RT_ARCH_X86
#  if RT_INLINE_ASM_GNU_STYLE
    RTCCUINTREG iDummy;
    __asm__ __volatile__("idivl %3"
                         : "=a" (iDummy), "=d" (i32)
                         : "A" (i64), "r" (i32));
#  else
    __asm
    {
        mov     eax, dword ptr [i64]
        mov     edx, dword ptr [i64 + 4]
        mov     ecx, [i32]
        idiv    ecx
        mov     [i32], edx
    }
#  endif
    return i32;
# else  /* generic: */
    return (int32_t)(i64 % i32);
# endif
}
#endif
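
/* Usage sketch (illustrative): like the C '%' operator, the remainder takes
 * the sign of the dividend (truncation towards zero):
 *     int32_t const i32Rem = ASMModS64ByS32RetS32(INT64_C(-10000000007), 10);
 *     Assert(i32Rem == -7);
 */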


/**
 * Multiplies a 32-bit value by a 32-bit value and divides the result by a
 * 32-bit value, using a 64-bit intermediate result.
 *
 * @returns (u32A * u32B) / u32C.
 * @param   u32A    The 32-bit value (A).
 * @param   u32B    The 32-bit value to multiply A by.
 * @param   u32C    The 32-bit value to divide A*B by.
 *
 * @remarks Architecture specific.
 * @remarks Make sure the result won't ever exceed 32 bits, because a hardware
 *          exception may be raised if it does.
 * @remarks On x86 this may be used to avoid dragging in 64-bit builtin
 *          arithmetic functions.
 */
#if RT_INLINE_ASM_EXTERNAL && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86))
DECLASM(uint32_t) ASMMultU32ByU32DivByU32(uint32_t u32A, uint32_t u32B, uint32_t u32C);
#else
DECLINLINE(uint32_t) ASMMultU32ByU32DivByU32(uint32_t u32A, uint32_t u32B, uint32_t u32C)
{
# if RT_INLINE_ASM_GNU_STYLE && (defined(RT_ARCH_AMD64) || defined(RT_ARCH_X86))
    uint32_t u32Result, u32Spill;
    __asm__ __volatile__("mull %2\n\t"
                         "divl %3\n\t"
                         : "=&a" (u32Result),
                           "=&d" (u32Spill)
                         : "r" (u32B),
                           "r" (u32C),
                           "0" (u32A));
    return u32Result;
# else
    return (uint32_t)(((uint64_t)u32A * u32B) / u32C);
# endif
}
#endif
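
/* Usage sketch (illustrative): scaling by a 3/4 ratio where the intermediate
 * product overflows 32 bits but the final result does not:
 *     uint32_t const u32 = ASMMultU32ByU32DivByU32(UINT32_C(0x80000000), 3, 4);
 *     Assert(u32 == UINT32_C(0x60000000));
 */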


/**
 * Multiplies a 64-bit value by a 32-bit value and divides the result by a
 * 32-bit value, using a 96-bit intermediate result.
 *
 * @returns (u64A * u32B) / u32C.
 * @param   u64A    The 64-bit value.
 * @param   u32B    The 32-bit value to multiply A by.
 * @param   u32C    The 32-bit value to divide A*B by.
 *
 * @remarks Architecture specific.
 * @remarks Make sure the result won't ever exceed 64 bits, because a hardware
 *          exception may be raised if it does.
 * @remarks On x86 this may be used to avoid dragging in 64-bit builtin
 *          arithmetic functions.
 */
#if RT_INLINE_ASM_EXTERNAL || !defined(__GNUC__) || (!defined(RT_ARCH_AMD64) && !defined(RT_ARCH_X86))
DECLASM(uint64_t) ASMMultU64ByU32DivByU32(uint64_t u64A, uint32_t u32B, uint32_t u32C);
#else
DECLINLINE(uint64_t) ASMMultU64ByU32DivByU32(uint64_t u64A, uint32_t u32B, uint32_t u32C)
{
# if RT_INLINE_ASM_GNU_STYLE
#  ifdef RT_ARCH_AMD64
    uint64_t u64Result, u64Spill;
    __asm__ __volatile__("mulq %2\n\t"
                         "divq %3\n\t"
                         : "=&a" (u64Result),
                           "=&d" (u64Spill)
                         : "r" ((uint64_t)u32B),
                           "r" ((uint64_t)u32C),
                           "0" (u64A));
    return u64Result;
#  else
    uint32_t u32Dummy;
    uint64_t u64Result;
    __asm__ __volatile__("mull %%ecx       \n\t" /* eax = u64Lo.lo = (u64A.lo * u32B).lo
                                                    edx = u64Lo.hi = (u64A.lo * u32B).hi */
                         "xchg %%eax,%%esi \n\t" /* esi = u64Lo.lo
                                                    eax = u64A.hi */
402 | "xchg %%edx,%%edi \n\t" /* edi = u64Low.hi | |
403 | edx = u32C */ | |
404 | "xchg %%edx,%%ecx \n\t" /* ecx = u32C | |
405 | edx = u32B */ | |
406 | "mull %%edx \n\t" /* eax = u64Hi.lo = (u64A.hi * u32B).lo | |
407 | edx = u64Hi.hi = (u64A.hi * u32B).hi */ | |
408 | "addl %%edi,%%eax \n\t" /* u64Hi.lo += u64Lo.hi */ | |
409 | "adcl $0,%%edx \n\t" /* u64Hi.hi += carry */ | |
410 | "divl %%ecx \n\t" /* eax = u64Hi / u32C | |
411 | edx = u64Hi % u32C */ | |
412 | "movl %%eax,%%edi \n\t" /* edi = u64Result.hi = u64Hi / u32C */ | |
413 | "movl %%esi,%%eax \n\t" /* eax = u64Lo.lo */ | |
414 | "divl %%ecx \n\t" /* u64Result.lo */ | |
415 | "movl %%edi,%%edx \n\t" /* u64Result.hi */ | |
416 | : "=A"(u64Result), "=c"(u32Dummy), | |
417 | "=S"(u32Dummy), "=D"(u32Dummy) | |
418 | : "a"((uint32_t)u64A), | |
419 | "S"((uint32_t)(u64A >> 32)), | |
420 | "c"(u32B), | |
421 | "D"(u32C)); | |
422 | return u64Result; | |
423 | # endif | |
424 | # else | |
425 | RTUINT64U u; | |
426 | uint64_t u64Lo = (uint64_t)(u64A & 0xffffffff) * u32B; | |
427 | uint64_t u64Hi = (uint64_t)(u64A >> 32) * u32B; | |
428 | u64Hi += (u64Lo >> 32); | |
429 | u.s.Hi = (uint32_t)(u64Hi / u32C); | |
430 | u.s.Lo = (uint32_t)((((u64Hi % u32C) << 32) + (u64Lo & 0xffffffff)) / u32C); | |
431 | return u.u; | |
432 | # endif | |
433 | } | |
434 | #endif | |
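
/* Usage sketch (illustrative): e.g. rescaling a 64-bit tick count as
 * u64Ticks * uDstHz / uSrcHz; the 96-bit intermediate keeps the multiplication
 * from overflowing as long as the final result fits in 64 bits:
 *     uint64_t const u64 = ASMMultU64ByU32DivByU32(UINT64_C(0x100000000), 6, 4);
 *     Assert(u64 == UINT64_C(0x180000000));
 */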

/** @} */
#endif
