]> git.proxmox.com Git - rustc.git/blob - vendor/compiler_builtins/src/int/specialized_div_rem/mod.rs
New upstream version 1.51.0+dfsg1
[rustc.git] / vendor / compiler_builtins / src / int / specialized_div_rem / mod.rs
1 // TODO: when `unsafe_block_in_unsafe_fn` is stabilized, remove this
2 #![allow(unused_unsafe)]
3
4 //! This `specialized_div_rem` module is originally from version 1.0.0 of the
5 //! `specialized-div-rem` crate. Note that `for` loops with ranges are not used in this
6 //! module, since unoptimized compilation may generate references to `memcpy`.
7 //!
//! The purpose of these macros is to easily change both the division algorithm used
9 //! for a given integer size and the half division used by that algorithm. The way
10 //! functions call each other is also constructed such that linkers will find the chain of
11 //! software and hardware divisions needed for every size of signed and unsigned division.
12 //! For example, most target compilations do the following:
13 //!
14 //! - Many 128 bit division functions like `u128::wrapping_div` use
15 //! `std::intrinsics::unchecked_div`, which gets replaced by `__udivti3` because there
16 //! is not a 128 bit by 128 bit hardware division function in most architectures.
17 //! `__udivti3` uses `u128_div_rem` (this extra level of function calls exists because
18 //! `__umodti3` and `__udivmodti4` also exist, and `specialized_div_rem` supplies just
//! one function to calculate both the quotient and remainder). If configuration flags
20 //! enable it, `impl_trifecta!` defines `u128_div_rem` to use the trifecta algorithm,
21 //! which requires the half sized division `u64_by_u64_div_rem`. If the architecture
22 //! supplies a 64 bit hardware division instruction, `u64_by_u64_div_rem` will be
23 //! reduced to those instructions. Note that we do not specify the half size division
24 //! directly to be `__udivdi3`, because hardware division would never be introduced.
25 //! - If the architecture does not supply a 64 bit hardware division instruction, u64
26 //! divisions will use functions such as `__udivdi3`. This will call `u64_div_rem`
27 //! which is defined by `impl_delegate!`. The half division for this algorithm is
28 //! `u32_by_u32_div_rem` which in turn becomes hardware division instructions or more
29 //! software division algorithms.
//! - If the architecture does not supply a 32 bit hardware division instruction, linkers will
31 //! look for `__udivsi3`. `impl_binary_long!` is used, but this algorithm uses no half
32 //! division, so the chain of calls ends here.
33 //!
34 //! On some architectures like x86_64, an asymmetrically sized division is supplied, in
35 //! which 128 bit numbers can be divided by 64 bit numbers. `impl_asymmetric!` is used to
36 //! extend the 128 by 64 bit division to a full 128 by 128 bit division.
37
38 // `allow(dead_code)` is used in various places, because the configuration code would otherwise be
39 // ridiculously complex
40
41 #[macro_use]
42 mod norm_shift;
43
44 #[macro_use]
45 mod binary_long;
46
47 #[macro_use]
48 mod delegate;
49 pub use self::delegate::u128_divide_sparc;
50
51 #[macro_use]
52 mod trifecta;
53
54 #[macro_use]
55 mod asymmetric;
56
/// The behavior of all divisions by zero is controlled by this function. This function should be
/// impossible to reach by Rust users, unless `compiler-builtins` public division functions or
/// `core/std::unchecked_div/rem` are directly used without a zero check in front.
fn zero_div_fn() -> ! {
    // SAFETY: callers are required to have already checked `div != 0` (see the doc comment
    // above). Marking this path unreachable avoids pulling panic machinery into
    // `compiler-builtins`, consistent with the `checked_div`/`checked_rem` usage below.
    unsafe { core::hint::unreachable_unchecked() }
}
63
/// Whether the compilation target is expected to have a hardware count-leading-zeros style
/// instruction (CLZ/LZD/LZCNT). Passed to the `impl_normalization_shift!` invocations below,
/// which presumably select between a leading-zeros-based normalization shift and a software
/// fallback (see the `norm_shift` module).
const USE_LZ: bool = {
    if cfg!(target_arch = "arm") {
        if cfg!(target_feature = "thumb-mode") {
            // ARM thumb targets have CLZ instructions if the instruction set of ARMv6T2 is
            // supported. This is needed to successfully differentiate between targets like
            // `thumbv8.base` and `thumbv8.main`.
            cfg!(target_feature = "v6t2")
        } else {
            // Regular ARM targets have CLZ instructions if the ARMv5TE instruction set is
            // supported. Technically, ARMv5T was the first to have CLZ, but the "v5t" target
            // feature does not seem to work.
            cfg!(target_feature = "v5te")
        }
    } else if cfg!(any(target_arch = "sparc", target_arch = "sparc64")) {
        // LZD or LZCNT on SPARC only exists for the VIS 3 extension and later.
        cfg!(target_feature = "vis3")
    } else if cfg!(any(target_arch = "riscv32", target_arch = "riscv64")) {
        // The `B` extension on RISC-V determines if a CLZ assembly instruction exists
        cfg!(target_feature = "b")
    } else {
        // All other common targets Rust supports should have CLZ instructions
        true
    }
};
88
// Expand the normalization-shift helpers (defined in the `norm_shift` module) used by the
// `impl_delegate!` invocations below. `USE_LZ` selects whether they rely on a leading-zeros
// instruction; `allow(dead_code)` is needed because which helpers are actually referenced
// depends on the `cfg` selection of division algorithms further down.
impl_normalization_shift!(
    u32_normalization_shift,
    USE_LZ,
    32,
    u32,
    i32,
    allow(dead_code)
);
impl_normalization_shift!(
    u64_normalization_shift,
    USE_LZ,
    64,
    u64,
    i64,
    allow(dead_code)
);
105
106 /// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
107 /// `checked_div` and `checked_rem` are used to avoid bringing in panic function
108 /// dependencies.
109 #[inline]
110 fn u64_by_u64_div_rem(duo: u64, div: u64) -> (u64, u64) {
111 if let Some(quo) = duo.checked_div(div) {
112 if let Some(rem) = duo.checked_rem(div) {
113 return (quo, rem);
114 }
115 }
116 zero_div_fn()
117 }
118
// Whether `trifecta` or `delegate` is faster for 128 bit division depends on the speed at which a
// microarchitecture can multiply and divide. We decide to be optimistic and assume `trifecta` is
// faster if the target pointer width is at least 64. x86_64 (when asm is enabled) is excluded
// because it uses `impl_asymmetric!` below; SPARC is excluded as well (see the
// `u128_divide_sparc` re-export at the top of this file).
#[cfg(all(
    not(any(target_pointer_width = "16", target_pointer_width = "32")),
    not(all(not(feature = "no-asm"), target_arch = "x86_64")),
    not(any(target_arch = "sparc", target_arch = "sparc64"))
))]
impl_trifecta!(
    u128_div_rem,
    zero_div_fn,
    u64_by_u64_div_rem,
    32,
    u32,
    u64,
    u128
);
136
// If the pointer width is less than 64, then the target architecture almost certainly does not
// have the fast 64 to 128 bit widening multiplication needed for `trifecta` to be faster, so
// fall back to the `delegate` algorithm with the same 64-bit half division.
#[cfg(all(
    any(target_pointer_width = "16", target_pointer_width = "32"),
    not(all(not(feature = "no-asm"), target_arch = "x86_64")),
    not(any(target_arch = "sparc", target_arch = "sparc64"))
))]
impl_delegate!(
    u128_div_rem,
    zero_div_fn,
    u64_normalization_shift,
    u64_by_u64_div_rem,
    32,
    u32,
    u64,
    u128,
    i128
);
155
/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
///
/// # Safety
///
/// If the quotient does not fit in a `u64`, a floating point exception occurs.
/// If `div == 0`, then a division by zero exception occurs.
#[cfg(all(not(feature = "no-asm"), target_arch = "x86_64"))]
#[inline]
unsafe fn u128_by_u64_div_rem(duo: u128, div: u64) -> (u64, u64) {
    let duo_lo = duo as u64;
    let duo_hi = (duo >> 64) as u64;
    let quo: u64;
    let rem: u64;
    // SAFETY: the caller upholds the `# Safety` contract above — the hardware `div`
    // instruction faults if `div == 0` or if the quotient overflows 64 bits, so callers
    // must guarantee neither can happen.
    unsafe {
        // divides the combined registers rdx:rax (`duo` is split into two 64 bit parts to do this)
        // by `div`. The quotient is stored in rax and the remainder in rdx.
        // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
        asm!(
            "div {0}",
            in(reg) div,
            inlateout("rax") duo_lo => quo,
            inlateout("rdx") duo_hi => rem,
            options(att_syntax, pure, nomem, nostack)
        );
    }
    (quo, rem)
}
183
// Use `asymmetric` instead of `trifecta` on x86_64, because the architecture supplies a
// 128 by 64 bit hardware division (wrapped by `u128_by_u64_div_rem` above) that can be
// extended to a full 128 by 128 bit division (see the module docs).
#[cfg(all(not(feature = "no-asm"), target_arch = "x86_64"))]
impl_asymmetric!(
    u128_div_rem,
    zero_div_fn,
    u64_by_u64_div_rem,
    u128_by_u64_div_rem,
    32,
    u32,
    u64,
    u128
);
196
197 /// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
198 /// `checked_div` and `checked_rem` are used to avoid bringing in panic function
199 /// dependencies.
200 #[inline]
201 #[allow(dead_code)]
202 fn u32_by_u32_div_rem(duo: u32, div: u32) -> (u32, u32) {
203 if let Some(quo) = duo.checked_div(div) {
204 if let Some(rem) = duo.checked_rem(div) {
205 return (quo, rem);
206 }
207 }
208 zero_div_fn()
209 }
210
// When not on x86 and the pointer width is not 64, use `delegate`, since the division size is
// larger than the native register size; the half division `u32_by_u32_div_rem` reduces to
// hardware instructions or further software division.
#[cfg(all(
    not(all(not(feature = "no-asm"), target_arch = "x86")),
    not(target_pointer_width = "64")
))]
impl_delegate!(
    u64_div_rem,
    zero_div_fn,
    u32_normalization_shift,
    u32_by_u32_div_rem,
    16,
    u16,
    u32,
    u64,
    i64
);
228
// When not on x86 and the pointer width is 64, use `binary_long`: the division fits in native
// registers, and `binary_long` needs no half division (see the module docs above).
#[cfg(all(
    not(all(not(feature = "no-asm"), target_arch = "x86")),
    target_pointer_width = "64"
))]
impl_binary_long!(
    u64_div_rem,
    zero_div_fn,
    u64_normalization_shift,
    64,
    u64,
    i64
);
242
/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
///
/// # Safety
///
/// If the quotient does not fit in a `u32`, a floating point exception occurs.
/// If `div == 0`, then a division by zero exception occurs.
#[cfg(all(not(feature = "no-asm"), target_arch = "x86"))]
#[inline]
unsafe fn u64_by_u32_div_rem(duo: u64, div: u32) -> (u32, u32) {
    let duo_lo = duo as u32;
    let duo_hi = (duo >> 32) as u32;
    let quo: u32;
    let rem: u32;
    // SAFETY: the caller upholds the `# Safety` contract above — the hardware `div`
    // instruction faults if `div == 0` or if the quotient overflows 32 bits.
    unsafe {
        // divides the combined registers edx:eax (`duo` is split into two 32 bit parts to do this)
        // by `div`. The quotient is stored in eax and the remainder in edx.
        // NOTE(review): the `"rax"`/`"rdx"` operand names below appear to be size-agnostic
        // register aliases that resolve to eax/edx on 32-bit x86 — confirm against the
        // `asm!` register-name rules.
        // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
        asm!(
            "div {0}",
            in(reg) div,
            inlateout("rax") duo_lo => quo,
            inlateout("rdx") duo_hi => rem,
            options(att_syntax, pure, nomem, nostack)
        );
    }
    (quo, rem)
}
270
// Use `asymmetric` instead of `delegate` on x86, because the architecture supplies a
// 64 by 32 bit hardware division (wrapped by `u64_by_u32_div_rem` above) that can be
// extended to a full 64 by 64 bit division.
#[cfg(all(not(feature = "no-asm"), target_arch = "x86"))]
impl_asymmetric!(
    u64_div_rem,
    zero_div_fn,
    u32_by_u32_div_rem,
    u64_by_u32_div_rem,
    16,
    u16,
    u32,
    u64
);
283
// 32 bits is the smallest division used by `compiler-builtins`, so we end with binary long
// division, which uses no half division — this is the end of the call chain described in the
// module docs.
impl_binary_long!(
    u32_div_rem,
    zero_div_fn,
    u32_normalization_shift,
    32,
    u32,
    i32
);