1 //! Bit fiddling on positive IEEE 754 floats. Negative numbers aren't and needn't be handled.
2 //! Normal floating point numbers have a canonical representation as (frac, exp) such that the
3 //! value is 2<sup>exp</sup> * (1 + sum(frac[N-i] / 2<sup>i</sup>)) where N is the number of bits.
4 //! Subnormals are slightly different and weird, but the same principle applies.
6 //! Here, however, we represent them as (sig, k) with f positive, such that the value is f *
7 //! 2<sup>e</sup>. Besides making the "hidden bit" explicit, this changes the exponent by the
8 //! so-called mantissa shift.
10 //! Put another way, normally floats are written as (1) but here they are written as (2):
12 //! 1. `1.101100...11 * 2^m`
13 //! 2. `1101100...11 * 2^n`
15 //! We call (1) the **fractional representation** and (2) the **integral representation**.
17 //! Many functions in this module only handle normal numbers. The dec2flt routines conservatively
18 //! take the universally-correct slow path (Algorithm M) for very small and very large numbers.
19 //! That algorithm needs only next_float() which does handle subnormals and zeros.
20 use crate::cmp
::Ordering
::{Equal, Greater, Less}
;
21 use crate::convert
::{TryFrom, TryInto}
;
22 use crate::fmt
::{Debug, LowerExp}
;
23 use crate::num
::dec2flt
::num
::{self, Big}
;
24 use crate::num
::dec2flt
::table
;
25 use crate::num
::diy_float
::Fp
;
26 use crate::num
::FpCategory
;
27 use crate::num
::FpCategory
::{Infinite, Nan, Normal, Subnormal, Zero}
;
28 use crate::ops
::{Add, Div, Mul, Neg}
;
30 #[derive(Copy, Clone, Debug)]
37 pub fn new(sig
: u64, k
: i16) -> Self {
42 /// A helper trait to avoid duplicating basically all the conversion code for `f32` and `f64`.
44 /// See the parent module's doc comment for why this is necessary.
46 /// Should **never ever** be implemented for other types or be used outside the dec2flt module.
48 Copy
+ Debug
+ LowerExp
+ Mul
<Output
= Self> + Div
<Output
= Self> + Neg
<Output
= Self>
54 /// Type used by `to_bits` and `from_bits`.
55 type Bits
: Add
<Output
= Self::Bits
> + From
<u8> + TryFrom
<u64>;
57 /// Performs a raw transmutation to an integer.
58 fn to_bits(self) -> Self::Bits
;
60 /// Performs a raw transmutation from an integer.
61 fn from_bits(v
: Self::Bits
) -> Self;
63 /// Returns the category that this number falls into.
64 fn classify(self) -> FpCategory
;
66 /// Returns the mantissa, exponent and sign as integers.
67 fn integer_decode(self) -> (u64, i16, i8);
69 /// Decodes the float.
70 fn unpack(self) -> Unpacked
;
72 /// Casts from a small integer that can be represented exactly. Panic if the integer can't be
73 /// represented, the other code in this module makes sure to never let that happen.
74 fn from_int(x
: u64) -> Self;
76 /// Gets the value 10<sup>e</sup> from a pre-computed table.
77 /// Panics for `e >= CEIL_LOG5_OF_MAX_SIG`.
78 fn short_fast_pow10(e
: usize) -> Self;
80 /// What the name says. It's easier to hard code than juggling intrinsics and
81 /// hoping LLVM constant folds it.
82 const CEIL_LOG5_OF_MAX_SIG
: i16;
84 // A conservative bound on the decimal digits of inputs that can't produce overflow or zero or
85 /// subnormals. Probably the decimal exponent of the maximum normal value, hence the name.
86 const MAX_NORMAL_DIGITS
: usize;
88 /// When the most significant decimal digit has a place value greater than this, the number
89 /// is certainly rounded to infinity.
90 const INF_CUTOFF
: i64;
92 /// When the most significant decimal digit has a place value less than this, the number
93 /// is certainly rounded to zero.
94 const ZERO_CUTOFF
: i64;
96 /// The number of bits in the exponent.
99 /// The number of bits in the significand, *including* the hidden bit.
102 /// The number of bits in the significand, *excluding* the hidden bit.
103 const EXPLICIT_SIG_BITS
: u8;
105 /// The maximum legal exponent in fractional representation.
108 /// The minimum legal exponent in fractional representation, excluding subnormals.
111 /// `MAX_EXP` for integral representation, i.e., with the shift applied.
112 const MAX_EXP_INT
: i16;
114 /// `MAX_EXP` encoded (i.e., with offset bias)
115 const MAX_ENCODED_EXP
: i16;
117 /// `MIN_EXP` for integral representation, i.e., with the shift applied.
118 const MIN_EXP_INT
: i16;
120 /// The maximum normalized significand in integral representation.
123 /// The minimal normalized significand in integral representation.
127 // Mostly a workaround for #34344.
128 macro_rules
! other_constants
{
130 const EXPLICIT_SIG_BITS
: u8 = Self::SIG_BITS
- 1;
131 const MAX_EXP
: i16 = (1 << (Self::EXP_BITS
- 1)) - 1;
132 const MIN_EXP
: i16 = -<Self as RawFloat
>::MAX_EXP
+ 1;
133 const MAX_EXP_INT
: i16 = <Self as RawFloat
>::MAX_EXP
- (Self::SIG_BITS
as i16 - 1);
134 const MAX_ENCODED_EXP
: i16 = (1 << Self::EXP_BITS
) - 1;
135 const MIN_EXP_INT
: i16 = <Self as RawFloat
>::MIN_EXP
- (Self::SIG_BITS
as i16 - 1);
136 const MAX_SIG
: u64 = (1 << Self::SIG_BITS
) - 1;
137 const MIN_SIG
: u64 = 1 << (Self::SIG_BITS
- 1);
139 const INFINITY
: Self = $
type::INFINITY
;
140 const NAN
: Self = $
type::NAN
;
141 const ZERO
: Self = 0.0;
145 impl RawFloat
for f32 {
148 const SIG_BITS
: u8 = 24;
149 const EXP_BITS
: u8 = 8;
150 const CEIL_LOG5_OF_MAX_SIG
: i16 = 11;
151 const MAX_NORMAL_DIGITS
: usize = 35;
152 const INF_CUTOFF
: i64 = 40;
153 const ZERO_CUTOFF
: i64 = -48;
154 other_constants
!(f32);
156 /// Returns the mantissa, exponent and sign as integers.
157 fn integer_decode(self) -> (u64, i16, i8) {
158 let bits
= self.to_bits();
159 let sign
: i8 = if bits
>> 31 == 0 { 1 }
else { -1 }
;
160 let mut exponent
: i16 = ((bits
>> 23) & 0xff) as i16;
162 if exponent
== 0 { (bits & 0x7fffff) << 1 }
else { (bits & 0x7fffff) | 0x800000 }
;
163 // Exponent bias + mantissa shift
164 exponent
-= 127 + 23;
165 (mantissa
as u64, exponent
, sign
)
168 fn unpack(self) -> Unpacked
{
169 let (sig
, exp
, _sig
) = self.integer_decode();
170 Unpacked
::new(sig
, exp
)
173 fn from_int(x
: u64) -> f32 {
174 // rkruppe is uncertain whether `as` rounds correctly on all platforms.
175 debug_assert
!(x
as f32 == fp_to_float(Fp { f: x, e: 0 }
));
179 fn short_fast_pow10(e
: usize) -> Self {
180 table
::F32_SHORT_POWERS
[e
]
183 fn classify(self) -> FpCategory
{
186 fn to_bits(self) -> Self::Bits
{
189 fn from_bits(v
: Self::Bits
) -> Self {
194 impl RawFloat
for f64 {
197 const SIG_BITS
: u8 = 53;
198 const EXP_BITS
: u8 = 11;
199 const CEIL_LOG5_OF_MAX_SIG
: i16 = 23;
200 const MAX_NORMAL_DIGITS
: usize = 305;
201 const INF_CUTOFF
: i64 = 310;
202 const ZERO_CUTOFF
: i64 = -326;
203 other_constants
!(f64);
205 /// Returns the mantissa, exponent and sign as integers.
206 fn integer_decode(self) -> (u64, i16, i8) {
207 let bits
= self.to_bits();
208 let sign
: i8 = if bits
>> 63 == 0 { 1 }
else { -1 }
;
209 let mut exponent
: i16 = ((bits
>> 52) & 0x7ff) as i16;
210 let mantissa
= if exponent
== 0 {
211 (bits
& 0xfffffffffffff) << 1
213 (bits
& 0xfffffffffffff) | 0x10000000000000
215 // Exponent bias + mantissa shift
216 exponent
-= 1023 + 52;
217 (mantissa
, exponent
, sign
)
220 fn unpack(self) -> Unpacked
{
221 let (sig
, exp
, _sig
) = self.integer_decode();
222 Unpacked
::new(sig
, exp
)
225 fn from_int(x
: u64) -> f64 {
226 // rkruppe is uncertain whether `as` rounds correctly on all platforms.
227 debug_assert
!(x
as f64 == fp_to_float(Fp { f: x, e: 0 }
));
231 fn short_fast_pow10(e
: usize) -> Self {
232 table
::F64_SHORT_POWERS
[e
]
235 fn classify(self) -> FpCategory
{
238 fn to_bits(self) -> Self::Bits
{
241 fn from_bits(v
: Self::Bits
) -> Self {
246 /// Converts an `Fp` to the closest machine float type.
247 /// Does not handle subnormal results.
248 pub fn fp_to_float
<T
: RawFloat
>(x
: Fp
) -> T
{
249 let x
= x
.normalize();
250 // x.f is 64 bit, so x.e has a mantissa shift of 63
253 panic
!("fp_to_float: exponent {} too large", e
)
254 } else if e
> T
::MIN_EXP
{
255 encode_normal(round_normal
::<T
>(x
))
257 panic
!("fp_to_float: exponent {} too small", e
)
261 /// Round the 64-bit significand to T::SIG_BITS bits with half-to-even.
262 /// Does not handle exponent overflow.
263 pub fn round_normal
<T
: RawFloat
>(x
: Fp
) -> Unpacked
{
264 let excess
= 64 - T
::SIG_BITS
as i16;
265 let half
: u64 = 1 << (excess
- 1);
266 let (q
, rem
) = (x
.f
>> excess
, x
.f
& ((1 << excess
) - 1));
267 assert_eq
!(q
<< excess
| rem
, x
.f
);
268 // Adjust mantissa shift
269 let k
= x
.e
+ excess
;
272 } else if rem
== half
&& (q
% 2) == 0 {
274 } else if q
== T
::MAX_SIG
{
275 Unpacked
::new(T
::MIN_SIG
, k
+ 1)
277 Unpacked
::new(q
+ 1, k
)
281 /// Inverse of `RawFloat::unpack()` for normalized numbers.
282 /// Panics if the significand or exponent are not valid for normalized numbers.
283 pub fn encode_normal
<T
: RawFloat
>(x
: Unpacked
) -> T
{
285 T
::MIN_SIG
<= x
.sig
&& x
.sig
<= T
::MAX_SIG
,
286 "encode_normal: significand not normalized"
288 // Remove the hidden bit
289 let sig_enc
= x
.sig
& !(1 << T
::EXPLICIT_SIG_BITS
);
290 // Adjust the exponent for exponent bias and mantissa shift
291 let k_enc
= x
.k
+ T
::MAX_EXP
+ T
::EXPLICIT_SIG_BITS
as i16;
292 debug_assert
!(k_enc
!= 0 && k_enc
< T
::MAX_ENCODED_EXP
, "encode_normal: exponent out of range");
293 // Leave sign bit at 0 ("+"), our numbers are all positive
294 let bits
= (k_enc
as u64) << T
::EXPLICIT_SIG_BITS
| sig_enc
;
295 T
::from_bits(bits
.try_into().unwrap_or_else(|_
| unreachable
!()))
298 /// Construct a subnormal. A mantissa of 0 is allowed and constructs zero.
299 pub fn encode_subnormal
<T
: RawFloat
>(significand
: u64) -> T
{
300 assert
!(significand
< T
::MIN_SIG
, "encode_subnormal: not actually subnormal");
301 // Encoded exponent is 0, the sign bit is 0, so we just have to reinterpret the bits.
302 T
::from_bits(significand
.try_into().unwrap_or_else(|_
| unreachable
!()))
305 /// Approximate a bignum with an Fp. Rounds within 0.5 ULP with half-to-even.
306 pub fn big_to_fp(f
: &Big
) -> Fp
{
307 let end
= f
.bit_length();
308 assert
!(end
!= 0, "big_to_fp: unexpectedly, input is zero");
309 let start
= end
.saturating_sub(64);
310 let leading
= num
::get_bits(f
, start
, end
);
311 // We cut off all bits prior to the index `start`, i.e., we effectively right-shift by
312 // an amount of `start`, so this is also the exponent we need.
313 let e
= start
as i16;
314 let rounded_down
= Fp { f: leading, e }
.normalize();
315 // Round (half-to-even) depending on the truncated bits.
316 match num
::compare_with_half_ulp(f
, start
) {
317 Less
=> rounded_down
,
318 Equal
if leading
% 2 == 0 => rounded_down
,
319 Equal
| Greater
=> match leading
.checked_add(1) {
320 Some(f
) => Fp { f, e }
.normalize(),
321 None
=> Fp { f: 1 << 63, e: e + 1 }
,
326 /// Finds the largest floating point number strictly smaller than the argument.
327 /// Does not handle subnormals, zero, or exponent underflow.
328 pub fn prev_float
<T
: RawFloat
>(x
: T
) -> T
{
330 Infinite
=> panic
!("prev_float: argument is infinite"),
331 Nan
=> panic
!("prev_float: argument is NaN"),
332 Subnormal
=> panic
!("prev_float: argument is subnormal"),
333 Zero
=> panic
!("prev_float: argument is zero"),
335 let Unpacked { sig, k }
= x
.unpack();
336 if sig
== T
::MIN_SIG
{
337 encode_normal(Unpacked
::new(T
::MAX_SIG
, k
- 1))
339 encode_normal(Unpacked
::new(sig
- 1, k
))
345 // Find the smallest floating point number strictly larger than the argument.
346 // This operation is saturating, i.e., next_float(inf) == inf.
347 // Unlike most code in this module, this function does handle zero, subnormals, and infinities.
348 // However, like all other code here, it does not deal with NaN and negative numbers.
349 pub fn next_float
<T
: RawFloat
>(x
: T
) -> T
{
351 Nan
=> panic
!("next_float: argument is NaN"),
352 Infinite
=> T
::INFINITY
,
353 // This seems too good to be true, but it works.
354 // 0.0 is encoded as the all-zero word. Subnormals are 0x000m...m where m is the mantissa.
355 // In particular, the smallest subnormal is 0x0...01 and the largest is 0x000F...F.
356 // The smallest normal number is 0x0010...0, so this corner case works as well.
357 // If the increment overflows the mantissa, the carry bit increments the exponent as we
358 // want, and the mantissa bits become zero. Because of the hidden bit convention, this
359 // too is exactly what we want!
360 // Finally, f64::MAX + 1 = 7eff...f + 1 = 7ff0...0 = f64::INFINITY.
361 Zero
| Subnormal
| Normal
=> T
::from_bits(x
.to_bits() + T
::Bits
::from(1u8)),