library/core/src/num/dec2flt/rawfp.rs

   1 //! Bit fiddling on positive IEEE 754 floats. Negative numbers aren't and needn't be handled.
   2 //! Normal floating point numbers have a canonical representation as (frac, exp) such that the
   3 //! value is 2<sup>exp</sup> * (1 + sum(frac[N-i] / 2<sup>i</sup>)) where N is the number of bits.
   4 //! Subnormals are slightly different and weird, but the same principle applies.
   5 //!
   6 //! Here, however, we represent them as (sig, k) with f positive, such that the value is f *
   7 //! 2<sup>e</sup>. Besides making the "hidden bit" explicit, this changes the exponent by the
   8 //! so-called mantissa shift.
   9 //!
  10 //! Put another way, normally floats are written as (1) but here they are written as (2):
  11 //!
  12 //! 1. `1.101100...11 * 2^m`
  13 //! 2. `1101100...11 * 2^n`
  14 //!
  15 //! We call (1) the **fractional representation** and (2) the **integral representation**.
  16 //!
  17 //! Many functions in this module only handle normal numbers. The dec2flt routines conservatively
  18 //! take the universally-correct slow path (Algorithm M) for very small and very large numbers.
  19 //! That algorithm needs only next_float() which does handle subnormals and zeros.
  20 use crate::cmp::Ordering::{Equal, Greater, Less};
  21 use crate::convert::{TryFrom, TryInto};
  22 use crate::fmt::{Debug, LowerExp};
  23 use crate::num::dec2flt::num::{self, Big};
  24 use crate::num::dec2flt::table;
  25 use crate::num::diy_float::Fp;
  26 use crate::num::FpCategory;
  27 use crate::num::FpCategory::{Infinite, Nan, Normal, Subnormal, Zero};
  28 use crate::ops::{Add, Div, Mul, Neg};
  29
  30 #[derive(Copy, Clone, Debug)]
  31 pub struct Unpacked {
  32     pub sig: u64,
  33     pub k: i16,
  34 }
  35
  36 impl Unpacked {
  37     pub fn new(sig: u64, k: i16) -> Self {
  38         Unpacked { sig, k }
  39     }
  40 }
  41
  42 /// A helper trait to avoid duplicating basically all the conversion code for `f32` and `f64`.
  43 ///
  44 /// See the parent module's doc comment for why this is necessary.
  45 ///
  46 /// Should **never ever** be implemented for other types or be used outside the dec2flt module.
  47 pub trait RawFloat:
  48     Copy + Debug + LowerExp + Mul<Output = Self> + Div<Output = Self> + Neg<Output = Self>
  49 {
  50     const INFINITY: Self;
  51     const NAN: Self;
  52     const ZERO: Self;
  53
  54     /// Type used by `to_bits` and `from_bits`.
  55     type Bits: Add<Output = Self::Bits> + From<u8> + TryFrom<u64>;
  56
  57     /// Performs a raw transmutation to an integer.
  58     fn to_bits(self) -> Self::Bits;
  59
  60     /// Performs a raw transmutation from an integer.
  61     fn from_bits(v: Self::Bits) -> Self;
  62
  63     /// Returns the category that this number falls into.
  64     fn classify(self) -> FpCategory;
  65
  66     /// Returns the mantissa, exponent and sign as integers.
  67     fn integer_decode(self) -> (u64, i16, i8);
  68
  69     /// Decodes the float.
  70     fn unpack(self) -> Unpacked;
  71
  72     /// Casts from a small integer that can be represented exactly. Panic if the integer can't be
  73     /// represented, the other code in this module makes sure to never let that happen.
  74     fn from_int(x: u64) -> Self;
  75
  76     /// Gets the value 10<sup>e</sup> from a pre-computed table.
  77     /// Panics for `e >= CEIL_LOG5_OF_MAX_SIG`.
  78     fn short_fast_pow10(e: usize) -> Self;
  79
  80     /// What the name says. It's easier to hard code than juggling intrinsics and
  81     /// hoping LLVM constant folds it.
  82     const CEIL_LOG5_OF_MAX_SIG: i16;
  83
  84     // A conservative bound on the decimal digits of inputs that can't produce overflow or zero or
  85     /// subnormals. Probably the decimal exponent of the maximum normal value, hence the name.
  86     const MAX_NORMAL_DIGITS: usize;
  87
  88     /// When the most significant decimal digit has a place value greater than this, the number
  89     /// is certainly rounded to infinity.
  90     const INF_CUTOFF: i64;
  91
  92     /// When the most significant decimal digit has a place value less than this, the number
  93     /// is certainly rounded to zero.
  94     const ZERO_CUTOFF: i64;
  95
  96     /// The number of bits in the exponent.
  97     const EXP_BITS: u8;
  98
  99     /// The number of bits in the significand, *including* the hidden bit.
 100     const SIG_BITS: u8;
 101
 102     /// The number of bits in the significand, *excluding* the hidden bit.
 103     const EXPLICIT_SIG_BITS: u8;
 104
 105     /// The maximum legal exponent in fractional representation.
 106     const MAX_EXP: i16;
 107
 108     /// The minimum legal exponent in fractional representation, excluding subnormals.
 109     const MIN_EXP: i16;
 110
 111     /// `MAX_EXP` for integral representation, i.e., with the shift applied.
 112     const MAX_EXP_INT: i16;
 113
 114     /// `MAX_EXP` encoded (i.e., with offset bias)
 115     const MAX_ENCODED_EXP: i16;
 116
 117     /// `MIN_EXP` for integral representation, i.e., with the shift applied.
 118     const MIN_EXP_INT: i16;
 119
 120     /// The maximum normalized significand in integral representation.
 121     const MAX_SIG: u64;
 122
 123     /// The minimal normalized significand in integral representation.
 124     const MIN_SIG: u64;
 125 }
 126
 127 // Mostly a workaround for #34344.
 128 macro_rules! other_constants {
 129     ($type: ident) => {
 130         const EXPLICIT_SIG_BITS: u8 = Self::SIG_BITS - 1;
 131         const MAX_EXP: i16 = (1 << (Self::EXP_BITS - 1)) - 1;
 132         const MIN_EXP: i16 = -<Self as RawFloat>::MAX_EXP + 1;
 133         const MAX_EXP_INT: i16 = <Self as RawFloat>::MAX_EXP - (Self::SIG_BITS as i16 - 1);
 134         const MAX_ENCODED_EXP: i16 = (1 << Self::EXP_BITS) - 1;
 135         const MIN_EXP_INT: i16 = <Self as RawFloat>::MIN_EXP - (Self::SIG_BITS as i16 - 1);
 136         const MAX_SIG: u64 = (1 << Self::SIG_BITS) - 1;
 137         const MIN_SIG: u64 = 1 << (Self::SIG_BITS - 1);
 138
 139         const INFINITY: Self = $type::INFINITY;
 140         const NAN: Self = $type::NAN;
 141         const ZERO: Self = 0.0;
 142     };
 143 }
 144
 145 impl RawFloat for f32 {
 146     type Bits = u32;
 147
 148     const SIG_BITS: u8 = 24;
 149     const EXP_BITS: u8 = 8;
 150     const CEIL_LOG5_OF_MAX_SIG: i16 = 11;
 151     const MAX_NORMAL_DIGITS: usize = 35;
 152     const INF_CUTOFF: i64 = 40;
 153     const ZERO_CUTOFF: i64 = -48;
 154     other_constants!(f32);
 155
 156     /// Returns the mantissa, exponent and sign as integers.
 157     fn integer_decode(self) -> (u64, i16, i8) {
 158         let bits = self.to_bits();
 159         let sign: i8 = if bits >> 31 == 0 { 1 } else { -1 };
 160         let mut exponent: i16 = ((bits >> 23) & 0xff) as i16;
 161         let mantissa =
 162             if exponent == 0 { (bits & 0x7fffff) << 1 } else { (bits & 0x7fffff) | 0x800000 };
 163         // Exponent bias + mantissa shift
 164         exponent -= 127 + 23;
 165         (mantissa as u64, exponent, sign)
 166     }
 167
 168     fn unpack(self) -> Unpacked {
 169         let (sig, exp, _sig) = self.integer_decode();
 170         Unpacked::new(sig, exp)
 171     }
 172
 173     fn from_int(x: u64) -> f32 {
 174         // rkruppe is uncertain whether `as` rounds correctly on all platforms.
 175         debug_assert!(x as f32 == fp_to_float(Fp { f: x, e: 0 }));
 176         x as f32
 177     }
 178
 179     fn short_fast_pow10(e: usize) -> Self {
 180         table::F32_SHORT_POWERS[e]
 181     }
 182
 183     fn classify(self) -> FpCategory {
 184         self.classify()
 185     }
 186     fn to_bits(self) -> Self::Bits {
 187         self.to_bits()
 188     }
 189     fn from_bits(v: Self::Bits) -> Self {
 190         Self::from_bits(v)
 191     }
 192 }
 193
 194 impl RawFloat for f64 {
 195     type Bits = u64;
 196
 197     const SIG_BITS: u8 = 53;
 198     const EXP_BITS: u8 = 11;
 199     const CEIL_LOG5_OF_MAX_SIG: i16 = 23;
 200     const MAX_NORMAL_DIGITS: usize = 305;
 201     const INF_CUTOFF: i64 = 310;
 202     const ZERO_CUTOFF: i64 = -326;
 203     other_constants!(f64);
 204
 205     /// Returns the mantissa, exponent and sign as integers.
 206     fn integer_decode(self) -> (u64, i16, i8) {
 207         let bits = self.to_bits();
 208         let sign: i8 = if bits >> 63 == 0 { 1 } else { -1 };
 209         let mut exponent: i16 = ((bits >> 52) & 0x7ff) as i16;
 210         let mantissa = if exponent == 0 {
 211             (bits & 0xfffffffffffff) << 1
 212         } else {
 213             (bits & 0xfffffffffffff) | 0x10000000000000
 214         };
 215         // Exponent bias + mantissa shift
 216         exponent -= 1023 + 52;
 217         (mantissa, exponent, sign)
 218     }
 219
 220     fn unpack(self) -> Unpacked {
 221         let (sig, exp, _sig) = self.integer_decode();
 222         Unpacked::new(sig, exp)
 223     }
 224
 225     fn from_int(x: u64) -> f64 {
 226         // rkruppe is uncertain whether `as` rounds correctly on all platforms.
 227         debug_assert!(x as f64 == fp_to_float(Fp { f: x, e: 0 }));
 228         x as f64
 229     }
 230
 231     fn short_fast_pow10(e: usize) -> Self {
 232         table::F64_SHORT_POWERS[e]
 233     }
 234
 235     fn classify(self) -> FpCategory {
 236         self.classify()
 237     }
 238     fn to_bits(self) -> Self::Bits {
 239         self.to_bits()
 240     }
 241     fn from_bits(v: Self::Bits) -> Self {
 242         Self::from_bits(v)
 243     }
 244 }
 245
 246 /// Converts an `Fp` to the closest machine float type.
 247 /// Does not handle subnormal results.
 248 pub fn fp_to_float<T: RawFloat>(x: Fp) -> T {
 249     let x = x.normalize();
 250     // x.f is 64 bit, so x.e has a mantissa shift of 63
 251     let e = x.e + 63;
 252     if e > T::MAX_EXP {
 253         panic!("fp_to_float: exponent {} too large", e)
 254     } else if e > T::MIN_EXP {
 255         encode_normal(round_normal::<T>(x))
 256     } else {
 257         panic!("fp_to_float: exponent {} too small", e)
 258     }
 259 }
 260
 261 /// Round the 64-bit significand to T::SIG_BITS bits with half-to-even.
 262 /// Does not handle exponent overflow.
 263 pub fn round_normal<T: RawFloat>(x: Fp) -> Unpacked {
 264     let excess = 64 - T::SIG_BITS as i16;
 265     let half: u64 = 1 << (excess - 1);
 266     let (q, rem) = (x.f >> excess, x.f & ((1 << excess) - 1));
 267     assert_eq!(q << excess | rem, x.f);
 268     // Adjust mantissa shift
 269     let k = x.e + excess;
 270     if rem < half {
 271         Unpacked::new(q, k)
 272     } else if rem == half && (q % 2) == 0 {
 273         Unpacked::new(q, k)
 274     } else if q == T::MAX_SIG {
 275         Unpacked::new(T::MIN_SIG, k + 1)
 276     } else {
 277         Unpacked::new(q + 1, k)
 278     }
 279 }
 280
 281 /// Inverse of `RawFloat::unpack()` for normalized numbers.
 282 /// Panics if the significand or exponent are not valid for normalized numbers.
 283 pub fn encode_normal<T: RawFloat>(x: Unpacked) -> T {
 284     debug_assert!(
 285         T::MIN_SIG <= x.sig && x.sig <= T::MAX_SIG,
 286         "encode_normal: significand not normalized"
 287     );
 288     // Remove the hidden bit
 289     let sig_enc = x.sig & !(1 << T::EXPLICIT_SIG_BITS);
 290     // Adjust the exponent for exponent bias and mantissa shift
 291     let k_enc = x.k + T::MAX_EXP + T::EXPLICIT_SIG_BITS as i16;
 292     debug_assert!(k_enc != 0 && k_enc < T::MAX_ENCODED_EXP, "encode_normal: exponent out of range");
 293     // Leave sign bit at 0 ("+"), our numbers are all positive
 294     let bits = (k_enc as u64) << T::EXPLICIT_SIG_BITS | sig_enc;
 295     T::from_bits(bits.try_into().unwrap_or_else(|_| unreachable!()))
 296 }
 297
 298 /// Construct a subnormal. A mantissa of 0 is allowed and constructs zero.
 299 pub fn encode_subnormal<T: RawFloat>(significand: u64) -> T {
 300     assert!(significand < T::MIN_SIG, "encode_subnormal: not actually subnormal");
 301     // Encoded exponent is 0, the sign bit is 0, so we just have to reinterpret the bits.
 302     T::from_bits(significand.try_into().unwrap_or_else(|_| unreachable!()))
 303 }
 304
 305 /// Approximate a bignum with an Fp. Rounds within 0.5 ULP with half-to-even.
 306 pub fn big_to_fp(f: &Big) -> Fp {
 307     let end = f.bit_length();
 308     assert!(end != 0, "big_to_fp: unexpectedly, input is zero");
 309     let start = end.saturating_sub(64);
 310     let leading = num::get_bits(f, start, end);
 311     // We cut off all bits prior to the index `start`, i.e., we effectively right-shift by
 312     // an amount of `start`, so this is also the exponent we need.
 313     let e = start as i16;
 314     let rounded_down = Fp { f: leading, e }.normalize();
 315     // Round (half-to-even) depending on the truncated bits.
 316     match num::compare_with_half_ulp(f, start) {
 317         Less => rounded_down,
 318         Equal if leading % 2 == 0 => rounded_down,
 319         Equal | Greater => match leading.checked_add(1) {
 320             Some(f) => Fp { f, e }.normalize(),
 321             None => Fp { f: 1 << 63, e: e + 1 },
 322         },
 323     }
 324 }
 325
 326 /// Finds the largest floating point number strictly smaller than the argument.
 327 /// Does not handle subnormals, zero, or exponent underflow.
 328 pub fn prev_float<T: RawFloat>(x: T) -> T {
 329     match x.classify() {
 330         Infinite => panic!("prev_float: argument is infinite"),
 331         Nan => panic!("prev_float: argument is NaN"),
 332         Subnormal => panic!("prev_float: argument is subnormal"),
 333         Zero => panic!("prev_float: argument is zero"),
 334         Normal => {
 335             let Unpacked { sig, k } = x.unpack();
 336             if sig == T::MIN_SIG {
 337                 encode_normal(Unpacked::new(T::MAX_SIG, k - 1))
 338             } else {
 339                 encode_normal(Unpacked::new(sig - 1, k))
 340             }
 341         }
 342     }
 343 }
 344
 345 // Find the smallest floating point number strictly larger than the argument.
 346 // This operation is saturating, i.e., next_float(inf) == inf.
 347 // Unlike most code in this module, this function does handle zero, subnormals, and infinities.
 348 // However, like all other code here, it does not deal with NaN and negative numbers.
 349 pub fn next_float<T: RawFloat>(x: T) -> T {
 350     match x.classify() {
 351         Nan => panic!("next_float: argument is NaN"),
 352         Infinite => T::INFINITY,
 353         // This seems too good to be true, but it works.
 354         // 0.0 is encoded as the all-zero word. Subnormals are 0x000m...m where m is the mantissa.
 355         // In particular, the smallest subnormal is 0x0...01 and the largest is 0x000F...F.
 356         // The smallest normal number is 0x0010...0, so this corner case works as well.
 357         // If the increment overflows the mantissa, the carry bit increments the exponent as we
 358         // want, and the mantissa bits become zero. Because of the hidden bit convention, this
 359         // too is exactly what we want!
 360         // Finally, f64::MAX + 1 = 7eff...f + 1 = 7ff0...0 = f64::INFINITY.
 361         Zero | Subnormal | Normal => T::from_bits(x.to_bits() + T::Bits::from(1u8)),
 362     }
 363 }