1 // TODO: when `unsafe_block_in_unsafe_fn` is stabilized, remove this
2 #![allow(unused_unsafe)]
4 //! This `specialized_div_rem` module is originally from version 1.0.0 of the
5 //! `specialized-div-rem` crate. Note that `for` loops with ranges are not used in this
6 //! module, since unoptimized compilation may generate references to `memcpy`.
//! The purpose of these macros is to easily change both the division algorithm used
9 //! for a given integer size and the half division used by that algorithm. The way
10 //! functions call each other is also constructed such that linkers will find the chain of
11 //! software and hardware divisions needed for every size of signed and unsigned division.
12 //! For example, most target compilations do the following:
14 //! - Many 128 bit division functions like `u128::wrapping_div` use
15 //! `std::intrinsics::unchecked_div`, which gets replaced by `__udivti3` because there
16 //! is not a 128 bit by 128 bit hardware division function in most architectures.
17 //! `__udivti3` uses `u128_div_rem` (this extra level of function calls exists because
18 //! `__umodti3` and `__udivmodti4` also exist, and `specialized_div_rem` supplies just
//! one function to calculate both the quotient and remainder). If configuration flags
20 //! enable it, `impl_trifecta!` defines `u128_div_rem` to use the trifecta algorithm,
21 //! which requires the half sized division `u64_by_u64_div_rem`. If the architecture
22 //! supplies a 64 bit hardware division instruction, `u64_by_u64_div_rem` will be
23 //! reduced to those instructions. Note that we do not specify the half size division
24 //! directly to be `__udivdi3`, because hardware division would never be introduced.
25 //! - If the architecture does not supply a 64 bit hardware division instruction, u64
26 //! divisions will use functions such as `__udivdi3`. This will call `u64_div_rem`
27 //! which is defined by `impl_delegate!`. The half division for this algorithm is
28 //! `u32_by_u32_div_rem` which in turn becomes hardware division instructions or more
29 //! software division algorithms.
30 //! - If the architecture does not supply a 32 bit hardware instruction, linkers will
31 //! look for `__udivsi3`. `impl_binary_long!` is used, but this algorithm uses no half
32 //! division, so the chain of calls ends here.
34 //! On some architectures like x86_64, an asymmetrically sized division is supplied, in
35 //! which 128 bit numbers can be divided by 64 bit numbers. `impl_asymmetric!` is used to
36 //! extend the 128 by 64 bit division to a full 128 by 128 bit division.
38 // `allow(dead_code)` is used in various places, because the configuration code would otherwise be
39 // ridiculously complex
49 pub use self::delegate
::u128_divide_sparc
;
/// The behavior of all divisions by zero is controlled by this function. This function should be
/// impossible to reach by Rust users, unless `compiler-builtins` public division functions or
/// `core/std::unchecked_div/rem` are directly used without a zero check in front.
fn zero_div_fn() -> ! {
    // SAFETY: as documented above, this function is unreachable unless a caller has already
    // violated the no-zero-divisor contract, which is library UB; the hint adds no new
    // obligation on well-behaved callers.
    unsafe { core::hint::unreachable_unchecked() }
}
// Whether a count-leading-zeros style instruction can be assumed on the target; each arm
// below ties the decision to the target feature that introduces CLZ for that architecture.
// NOTE(review): the `else` arms and terminator were lost in extraction and have been
// restored to match the surviving per-branch comments.
const USE_LZ: bool = {
    if cfg!(target_arch = "arm") {
        if cfg!(target_feature = "thumb-mode") {
            // ARM thumb targets have CLZ instructions if the instruction set of ARMv6T2 is
            // supported. This is needed to successfully differentiate between targets like
            // `thumbv8.base` and `thumbv8.main`.
            cfg!(target_feature = "v6t2")
        } else {
            // Regular ARM targets have CLZ instructions if the ARMv5TE instruction set is
            // supported. Technically, ARMv5T was the first to have CLZ, but the "v5t" target
            // feature does not seem to work.
            cfg!(target_feature = "v5te")
        }
    } else if cfg!(any(target_arch = "sparc", target_arch = "sparc64")) {
        // LZD or LZCNT on SPARC only exists for the VIS 3 extension and later.
        cfg!(target_feature = "vis3")
    } else if cfg!(any(target_arch = "riscv32", target_arch = "riscv64")) {
        // The `B` extension on RISC-V determines if a CLZ assembly instruction exists
        cfg!(target_feature = "b")
    } else {
        // All other common targets Rust supports should have CLZ instructions
        true
    }
};
89 impl_normalization_shift
!(
90 u32_normalization_shift
,
97 impl_normalization_shift
!(
98 u64_normalization_shift
,
106 /// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
107 /// `checked_div` and `checked_rem` are used to avoid bringing in panic function
110 fn u64_by_u64_div_rem(duo
: u64, div
: u64) -> (u64, u64) {
111 if let Some(quo
) = duo
.checked_div(div
) {
112 if let Some(rem
) = duo
.checked_rem(div
) {
119 // Whether `trifecta` or `delegate` is faster for 128 bit division depends on the speed at which a
120 // microarchitecture can multiply and divide. We decide to be optimistic and assume `trifecta` is
121 // faster if the target pointer width is at least 64.
123 not(any(target_pointer_width
= "16", target_pointer_width
= "32")),
124 not(all(not(feature
= "no-asm"), target_arch
= "x86_64")),
125 not(any(target_arch
= "sparc", target_arch
= "sparc64"))
137 // If the pointer width less than 64, then the target architecture almost certainly does not have
138 // the fast 64 to 128 bit widening multiplication needed for `trifecta` to be faster.
140 any(target_pointer_width
= "16", target_pointer_width
= "32"),
141 not(all(not(feature
= "no-asm"), target_arch
= "x86_64")),
142 not(any(target_arch
= "sparc", target_arch
= "sparc64"))
147 u64_normalization_shift
,
/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
///
/// # Safety
///
/// If the quotient does not fit in a `u64`, a floating point exception occurs.
/// If `div == 0`, then a division by zero exception occurs.
#[cfg(all(not(feature = "no-asm"), target_arch = "x86_64"))]
#[inline]
unsafe fn u128_by_u64_div_rem(duo: u128, div: u64) -> (u64, u64) {
    let duo_lo = duo as u64;
    let duo_hi = (duo >> 64) as u64;
    let quo: u64;
    let rem: u64;
    // SAFETY: sound as long as the caller upholds the `# Safety` contract above; the
    // operands pin `duo` into rdx:rax and read back quotient/remainder from rax/rdx.
    unsafe {
        // divides the combined registers rdx:rax (`duo` is split into two 64 bit parts to do this)
        // by `div`. The quotient is stored in rax and the remainder in rdx.
        // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
        core::arch::asm!(
            "div {0}",
            in(reg) div,
            inlateout("rax") duo_lo => quo,
            inlateout("rdx") duo_hi => rem,
            options(att_syntax, pure, nomem, nostack)
        );
    }
    (quo, rem)
}
184 // use `asymmetric` instead of `trifecta` on x86_64
185 #[cfg(all(not(feature = "no-asm"), target_arch = "x86_64"))]
197 /// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
198 /// `checked_div` and `checked_rem` are used to avoid bringing in panic function
202 fn u32_by_u32_div_rem(duo
: u32, div
: u32) -> (u32, u32) {
203 if let Some(quo
) = duo
.checked_div(div
) {
204 if let Some(rem
) = duo
.checked_rem(div
) {
211 // When not on x86 and the pointer width is not 64, use `delegate` since the division size is larger
212 // than register size.
214 not(all(not(feature
= "no-asm"), target_arch
= "x86")),
215 not(target_pointer_width
= "64")
220 u32_normalization_shift
,
229 // When not on x86 and the pointer width is 64, use `binary_long`.
231 not(all(not(feature
= "no-asm"), target_arch
= "x86")),
232 target_pointer_width
= "64"
237 u64_normalization_shift
,
/// Divides `duo` by `div` and returns a tuple of the quotient and the remainder.
///
/// # Safety
///
/// If the quotient does not fit in a `u32`, a floating point exception occurs.
/// If `div == 0`, then a division by zero exception occurs.
#[cfg(all(not(feature = "no-asm"), target_arch = "x86"))]
#[inline]
unsafe fn u64_by_u32_div_rem(duo: u64, div: u32) -> (u32, u32) {
    let duo_lo = duo as u32;
    let duo_hi = (duo >> 32) as u32;
    let quo: u32;
    let rem: u32;
    // SAFETY: sound as long as the caller upholds the `# Safety` contract above; the
    // operands pin `duo` into the d:a register pair and read back quotient/remainder.
    unsafe {
        // divides the combined registers rdx:rax (`duo` is split into two 32 bit parts to do this)
        // by `div`. The quotient is stored in rax and the remainder in rdx.
        // FIXME: Use the Intel syntax once we drop LLVM 9 support on rust-lang/rust.
        core::arch::asm!(
            "div {0}",
            in(reg) div,
            inlateout("rax") duo_lo => quo,
            inlateout("rdx") duo_hi => rem,
            options(att_syntax, pure, nomem, nostack)
        );
    }
    (quo, rem)
}
271 // use `asymmetric` instead of `delegate` on x86
272 #[cfg(all(not(feature = "no-asm"), target_arch = "x86"))]
284 // 32 bits is the smallest division used by `compiler-builtins`, so we end with binary long division
288 u32_normalization_shift
,