1 // TODO: when `unsafe_block_in_unsafe_fn` is stabilized, remove this
2 #![allow(unused_unsafe)]
4 //! This `specialized_div_rem` module is originally from version 1.0.0 of the
5 //! `specialized-div-rem` crate. Note that `for` loops with ranges are not used in this
6 //! module, since unoptimized compilation may generate references to `memcpy`.
//! The purpose of these macros is to easily change both the division algorithm used
9 //! for a given integer size and the half division used by that algorithm. The way
10 //! functions call each other is also constructed such that linkers will find the chain of
11 //! software and hardware divisions needed for every size of signed and unsigned division.
12 //! For example, most target compilations do the following:
14 //! - Many 128 bit division functions like `u128::wrapping_div` use
15 //! `std::intrinsics::unchecked_div`, which gets replaced by `__udivti3` because there
16 //! is not a 128 bit by 128 bit hardware division function in most architectures.
17 //! `__udivti3` uses `u128_div_rem` (this extra level of function calls exists because
18 //! `__umodti3` and `__udivmodti4` also exist, and `specialized_div_rem` supplies just
//! one function to calculate both the quotient and remainder). If configuration flags
20 //! enable it, `impl_trifecta!` defines `u128_div_rem` to use the trifecta algorithm,
21 //! which requires the half sized division `u64_by_u64_div_rem`. If the architecture
22 //! supplies a 64 bit hardware division instruction, `u64_by_u64_div_rem` will be
23 //! reduced to those instructions. Note that we do not specify the half size division
24 //! directly to be `__udivdi3`, because hardware division would never be introduced.
25 //! - If the architecture does not supply a 64 bit hardware division instruction, u64
26 //! divisions will use functions such as `__udivdi3`. This will call `u64_div_rem`
27 //! which is defined by `impl_delegate!`. The half division for this algorithm is
28 //! `u32_by_u32_div_rem` which in turn becomes hardware division instructions or more
29 //! software division algorithms.
30 //! - If the architecture does not supply a 32 bit hardware instruction, linkers will
31 //! look for `__udivsi3`. `impl_binary_long!` is used, but this algorithm uses no half
32 //! division, so the chain of calls ends here.
34 //! On some architectures like x86_64, an asymmetrically sized division is supplied, in
35 //! which 128 bit numbers can be divided by 64 bit numbers. `impl_asymmetric!` is used to
36 //! extend the 128 by 64 bit division to a full 128 by 128 bit division.
38 // `allow(dead_code)` is used in various places, because the configuration code would otherwise be
39 // ridiculously complex
49 pub use self::delegate
::u128_divide_sparc
;
/// The behavior of all divisions by zero is controlled by this function. This function should be
/// impossible to reach by Rust users, unless `compiler-builtins` public division functions or
/// `core/std::unchecked_div/rem` are directly used without a zero check in front.
fn zero_div_fn() -> ! {
    // SAFETY: as documented above, this function is unreachable unless a caller has already
    // violated the no-zero-divisor contract, which is library UB; the hint adds no new
    // obligation on well-behaved callers.
    unsafe { core::hint::unreachable_unchecked() }
}
// Whether a count-leading-zeros style instruction can be assumed on the target; each arm
// below ties the decision to the target feature that introduces CLZ for that architecture.
// NOTE(review): the `else` arms and terminator were lost in extraction and have been
// restored to match the surviving per-branch comments.
const USE_LZ: bool = {
    if cfg!(target_arch = "arm") {
        if cfg!(target_feature = "thumb-mode") {
            // ARM thumb targets have CLZ instructions if the instruction set of ARMv6T2 is
            // supported. This is needed to successfully differentiate between targets like
            // `thumbv8.base` and `thumbv8.main`.
            cfg!(target_feature = "v6t2")
        } else {
            // Regular ARM targets have CLZ instructions if the ARMv5TE instruction set is
            // supported. Technically, ARMv5T was the first to have CLZ, but the "v5t" target
            // feature does not seem to work.
            cfg!(target_feature = "v5te")
        }
    } else if cfg!(any(target_arch = "sparc", target_arch = "sparc64")) {
        // LZD or LZCNT on SPARC only exists for the VIS 3 extension and later.
        cfg!(target_feature = "vis3")
    } else if cfg!(any(target_arch = "riscv32", target_arch = "riscv64")) {
        // The `B` extension on RISC-V determines if a CLZ assembly instruction exists
        cfg!(target_feature = "b")
    } else {
        // All other common targets Rust supports should have CLZ instructions
        true
    }
};
89 impl_normalization_shift
!(
90 u32_normalization_shift
,
97 impl_normalization_shift
!(
98 u64_normalization_shift
,
106 /// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
107 /// `checked_div` and `checked_rem` are used to avoid bringing in panic function
110 fn u64_by_u64_div_rem(duo
: u64, div
: u64) -> (u64, u64) {
111 if let Some(quo
) = duo
.checked_div(div
) {
112 if let Some(rem
) = duo
.checked_rem(div
) {
119 // Whether `trifecta` or `delegate` is faster for 128 bit division depends on the speed at which a
120 // microarchitecture can multiply and divide. We decide to be optimistic and assume `trifecta` is
121 // faster if the target pointer width is at least 64.
123 not(any(target_pointer_width
= "16", target_pointer_width
= "32")),
124 not(all(not(feature
= "no-asm"), target_arch
= "x86_64")),
125 not(any(target_arch
= "sparc", target_arch
= "sparc64"))
137 // If the pointer width less than 64, then the target architecture almost certainly does not have
138 // the fast 64 to 128 bit widening multiplication needed for `trifecta` to be faster.
140 any(target_pointer_width
= "16", target_pointer_width
= "32"),
141 not(all(not(feature
= "no-asm"), target_arch
= "x86_64")),
142 not(any(target_arch
= "sparc", target_arch
= "sparc64"))
147 u64_normalization_shift
,
/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
///
/// # Safety
///
/// If the quotient does not fit in a `u64`, a floating point exception occurs.
/// If `div == 0`, then a division by zero exception occurs.
#[cfg(all(not(feature = "no-asm"), target_arch = "x86_64"))]
#[inline]
unsafe fn u128_by_u64_div_rem(duo: u128, div: u64) -> (u64, u64) {
    let duo_lo = duo as u64;
    let duo_hi = (duo >> 64) as u64;
    let quo: u64;
    let rem: u64;
    // SAFETY: sound as long as the caller upholds the `# Safety` contract above; the
    // operands pin `duo` into rdx:rax and read back quotient/remainder from rax/rdx.
    unsafe {
        // divides the combined registers rdx:rax (`duo` is split into two 64 bit parts to do this)
        // by `div`. The quotient is stored in rax and the remainder in rdx.
        // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
        core::arch::asm!(
            "div {0}",
            in(reg) div,
            inlateout("rax") duo_lo => quo,
            inlateout("rdx") duo_hi => rem,
            options(att_syntax, pure, nomem, nostack)
        );
    }
    (quo, rem)
}
184 // use `asymmetric` instead of `trifecta` on x86_64
185 #[cfg(all(not(feature = "no-asm"), target_arch = "x86_64"))]
197 /// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
198 /// `checked_div` and `checked_rem` are used to avoid bringing in panic function
202 fn u32_by_u32_div_rem(duo
: u32, div
: u32) -> (u32, u32) {
203 if let Some(quo
) = duo
.checked_div(div
) {
204 if let Some(rem
) = duo
.checked_rem(div
) {
211 // When not on x86 and the pointer width is not 64, use `delegate` since the division size is larger
212 // than register size.
214 not(all(not(feature
= "no-asm"), target_arch
= "x86")),
215 not(target_pointer_width
= "64")
220 u32_normalization_shift
,
229 // When not on x86 and the pointer width is 64, use `binary_long`.
231 not(all(not(feature
= "no-asm"), target_arch
= "x86")),
232 target_pointer_width
= "64"
237 u64_normalization_shift
,
/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
///
/// # Safety
///
/// If the quotient does not fit in a `u32`, a floating point exception occurs.
/// If `div == 0`, then a division by zero exception occurs.
#[cfg(all(not(feature = "no-asm"), target_arch = "x86"))]
#[inline]
unsafe fn u64_by_u32_div_rem(duo: u64, div: u32) -> (u32, u32) {
    let duo_lo = duo as u32;
    let duo_hi = (duo >> 32) as u32;
    let quo: u32;
    let rem: u32;
    // SAFETY: sound as long as the caller upholds the `# Safety` contract above; the
    // operands pin `duo` into the d:a register pair and read back quotient/remainder.
    unsafe {
        // divides the combined registers rdx:rax (`duo` is split into two 32 bit parts to do this)
        // by `div`. The quotient is stored in rax and the remainder in rdx.
        // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
        core::arch::asm!(
            "div {0}",
            in(reg) div,
            inlateout("rax") duo_lo => quo,
            inlateout("rdx") duo_hi => rem,
            options(att_syntax, pure, nomem, nostack)
        );
    }
    (quo, rem)
}
271 // use `asymmetric` instead of `delegate` on x86
272 #[cfg(all(not(feature = "no-asm"), target_arch = "x86"))]
284 // 32 bits is the smallest division used by `compiler-builtins`, so we end with binary long division
288 u32_normalization_shift
,