1 // Copyright 2015 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
11 //! Bit fiddling on positive IEEE 754 floats. Negative numbers aren't and needn't be handled.
12 //! Normal floating point numbers have a canonical representation as (frac, exp) such that the
13 //! value is 2^exp * (1 + sum(frac[N-i] / 2^i)) where N is the number of bits. Subnormals are
14 //! slightly different and weird, but the same principle applies.
16 //! Here, however, we represent them as (sig, k) with f positive, such that the value is f * 2^e.
17 //! Besides making the "hidden bit" explicit, this changes the exponent by the so-called
20 //! Put another way, normally floats are written as (1) but here they are written as (2):
22 //! 1. `1.101100...11 * 2^m`
23 //! 2. `1101100...11 * 2^n`
25 //! We call (1) the **fractional representation** and (2) the **integral representation**.
27 //! Many functions in this module only handle normal numbers. The dec2flt routines conservatively
28 //! take the universally-correct slow path (Algorithm M) for very small and very large numbers.
29 //! That algorithm needs only next_float() which does handle subnormals and zeros.
31 use cmp
::Ordering
::{Less, Equal, Greater}
;
32 use ops
::{Mul, Div, Neg}
;
33 use fmt
::{Debug, LowerExp}
;
35 use num
::diy_float
::Fp
;
36 use num
::FpCategory
::{Infinite, Zero, Subnormal, Normal, Nan}
;
38 use num
::dec2flt
::num
::{self, Big}
;
39 use num
::dec2flt
::table
;
41 #[derive(Copy, Clone, Debug)]
48 pub fn new(sig
: u64, k
: i16) -> Self {
49 Unpacked { sig: sig, k: k }
53 /// A helper trait to avoid duplicating basically all the conversion code for `f32` and `f64`.
55 /// See the parent module's doc comment for why this is necessary.
57 /// Should **never ever** be implemented for other types or be used outside the dec2flt module.
58 /// Inherits from `Float` because there is some overlap, but all the reused methods are trivial.
59 /// The "methods" (pseudo-constants) with default implementation should not be overriden.
60 pub trait RawFloat
: Float
+ Copy
+ Debug
+ LowerExp
61 + Mul
<Output
=Self> + Div
<Output
=Self> + Neg
<Output
=Self>
63 // suffix of "2" because Float::infinity is deprecated
65 fn infinity2() -> Self {
69 // suffix of "2" because Float::nan is deprecated
75 // suffix of "2" because Float::zero is deprecated
78 // suffix of "2" because Float::integer_decode is deprecated
80 fn integer_decode2(self) -> (u64, i16, i8) {
81 Float
::integer_decode(self)
84 /// Get the raw binary representation of the float.
85 fn transmute(self) -> u64;
87 /// Transmute the raw binary representation into a float.
88 fn from_bits(bits
: u64) -> Self;
91 fn unpack(self) -> Unpacked
;
93 /// Cast from a small integer that can be represented exactly. Panic if the integer can't be
94 /// represented, the other code in this module makes sure to never let that happen.
95 fn from_int(x
: u64) -> Self;
97 /// Get the value 10^e from a pre-computed table. Panics for e >= ceil_log5_of_max_sig().
98 fn short_fast_pow10(e
: usize) -> Self;
100 // FIXME Everything that follows should be associated constants, but taking the value of an
101 // associated constant from a type parameter does not work (yet?)
102 // A possible workaround is having a `FloatInfo` struct for all the constants, but so far
103 // the methods aren't painful enough to rewrite.
105 /// What the name says. It's easier to hard code than juggling intrinsics and
106 /// hoping LLVM constant folds it.
107 fn ceil_log5_of_max_sig() -> i16;
109 // A conservative bound on the decimal digits of inputs that can't produce overflow or zero or
110 /// subnormals. Probably the decimal exponent of the maximum normal value, hence the name.
111 fn max_normal_digits() -> usize;
113 /// When the most significant decimal digit has a place value greater than this, the number
114 /// is certainly rounded to infinity.
115 fn inf_cutoff() -> i64;
117 /// When the most significant decimal digit has a place value less than this, the number
118 /// is certainly rounded to zero.
119 fn zero_cutoff() -> i64;
121 /// The number of bits in the exponent.
124 /// The number of bits in the singificand, *including* the hidden bit.
127 /// The number of bits in the singificand, *excluding* the hidden bit.
128 fn explicit_sig_bits() -> u8 {
132 /// The maximum legal exponent in fractional representation.
133 fn max_exp() -> i16 {
134 (1 << (Self::exp_bits() - 1)) - 1
137 /// The minimum legal exponent in fractional representation, excluding subnormals.
138 fn min_exp() -> i16 {
142 /// `MAX_EXP` for integral representation, i.e., with the shift applied.
143 fn max_exp_int() -> i16 {
144 Self::max_exp() - (Self::sig_bits() as i16 - 1)
147 /// `MAX_EXP` encoded (i.e., with offset bias)
148 fn max_encoded_exp() -> i16 {
149 (1 << Self::exp_bits()) - 1
152 /// `MIN_EXP` for integral representation, i.e., with the shift applied.
153 fn min_exp_int() -> i16 {
154 Self::min_exp() - (Self::sig_bits() as i16 - 1)
157 /// The maximum normalized singificand in integral representation.
158 fn max_sig() -> u64 {
159 (1 << Self::sig_bits()) - 1
162 /// The minimal normalized significand in integral representation.
163 fn min_sig() -> u64 {
164 1 << (Self::sig_bits() - 1)
168 impl RawFloat
for f32 {
173 fn sig_bits() -> u8 {
177 fn exp_bits() -> u8 {
181 fn ceil_log5_of_max_sig() -> i16 {
185 fn transmute(self) -> u64 {
186 let bits
: u32 = unsafe { transmute(self) }
;
190 fn from_bits(bits
: u64) -> f32 {
191 assert
!(bits
< u32::MAX
as u64, "f32::from_bits: too many bits");
192 unsafe { transmute(bits as u32) }
195 fn unpack(self) -> Unpacked
{
196 let (sig
, exp
, _sig
) = self.integer_decode2();
197 Unpacked
::new(sig
, exp
)
200 fn from_int(x
: u64) -> f32 {
201 // rkruppe is uncertain whether `as` rounds correctly on all platforms.
202 debug_assert
!(x
as f32 == fp_to_float(Fp { f: x, e: 0 }
));
206 fn short_fast_pow10(e
: usize) -> Self {
207 table
::F32_SHORT_POWERS
[e
]
210 fn max_normal_digits() -> usize {
214 fn inf_cutoff() -> i64 {
218 fn zero_cutoff() -> i64 {
224 impl RawFloat
for f64 {
229 fn sig_bits() -> u8 {
233 fn exp_bits() -> u8 {
237 fn ceil_log5_of_max_sig() -> i16 {
241 fn transmute(self) -> u64 {
242 let bits
: u64 = unsafe { transmute(self) }
;
246 fn from_bits(bits
: u64) -> f64 {
247 unsafe { transmute(bits) }
250 fn unpack(self) -> Unpacked
{
251 let (sig
, exp
, _sig
) = self.integer_decode2();
252 Unpacked
::new(sig
, exp
)
255 fn from_int(x
: u64) -> f64 {
256 // rkruppe is uncertain whether `as` rounds correctly on all platforms.
257 debug_assert
!(x
as f64 == fp_to_float(Fp { f: x, e: 0 }
));
261 fn short_fast_pow10(e
: usize) -> Self {
262 table
::F64_SHORT_POWERS
[e
]
265 fn max_normal_digits() -> usize {
269 fn inf_cutoff() -> i64 {
273 fn zero_cutoff() -> i64 {
279 /// Convert an Fp to the closest f64. Only handles number that fit into a normalized f64.
280 pub fn fp_to_float
<T
: RawFloat
>(x
: Fp
) -> T
{
281 let x
= x
.normalize();
282 // x.f is 64 bit, so x.e has a mantissa shift of 63
284 if e
> T
::max_exp() {
285 panic
!("fp_to_float: exponent {} too large", e
)
286 } else if e
> T
::min_exp() {
287 encode_normal(round_normal
::<T
>(x
))
289 panic
!("fp_to_float: exponent {} too small", e
)
293 /// Round the 64-bit significand to 53 bit with half-to-even. Does not handle exponent overflow.
294 pub fn round_normal
<T
: RawFloat
>(x
: Fp
) -> Unpacked
{
295 let excess
= 64 - T
::sig_bits() as i16;
296 let half
: u64 = 1 << (excess
- 1);
297 let (q
, rem
) = (x
.f
>> excess
, x
.f
& ((1 << excess
) - 1));
298 assert_eq
!(q
<< excess
| rem
, x
.f
);
299 // Adjust mantissa shift
300 let k
= x
.e
+ excess
;
303 } else if rem
== half
&& (q
% 2) == 0 {
305 } else if q
== T
::max_sig() {
306 Unpacked
::new(T
::min_sig(), k
+ 1)
308 Unpacked
::new(q
+ 1, k
)
312 /// Inverse of `RawFloat::unpack()` for normalized numbers.
313 /// Panics if the significand or exponent are not valid for normalized numbers.
314 pub fn encode_normal
<T
: RawFloat
>(x
: Unpacked
) -> T
{
315 debug_assert
!(T
::min_sig() <= x
.sig
&& x
.sig
<= T
::max_sig(),
316 "encode_normal: significand not normalized");
317 // Remove the hidden bit
318 let sig_enc
= x
.sig
& !(1 << T
::explicit_sig_bits());
319 // Adjust the exponent for exponent bias and mantissa shift
320 let k_enc
= x
.k
+ T
::max_exp() + T
::explicit_sig_bits() as i16;
321 debug_assert
!(k_enc
!= 0 && k_enc
< T
::max_encoded_exp(),
322 "encode_normal: exponent out of range");
323 // Leave sign bit at 0 ("+"), our numbers are all positive
324 let bits
= (k_enc
as u64) << T
::explicit_sig_bits() | sig_enc
;
328 /// Construct the subnormal. A mantissa of 0 is allowed and constructs zero.
329 pub fn encode_subnormal
<T
: RawFloat
>(significand
: u64) -> T
{
330 assert
!(significand
< T
::min_sig(), "encode_subnormal: not actually subnormal");
331 // Encoded exponent is 0, the sign bit is 0, so we just have to reinterpret the bits.
332 T
::from_bits(significand
)
335 /// Approximate a bignum with an Fp. Rounds within 0.5 ULP with half-to-even.
336 pub fn big_to_fp(f
: &Big
) -> Fp
{
337 let end
= f
.bit_length();
338 assert
!(end
!= 0, "big_to_fp: unexpectedly, input is zero");
339 let start
= end
.saturating_sub(64);
340 let leading
= num
::get_bits(f
, start
, end
);
341 // We cut off all bits prior to the index `start`, i.e., we effectively right-shift by
342 // an amount of `start`, so this is also the exponent we need.
343 let e
= start
as i16;
344 let rounded_down
= Fp { f: leading, e: e }
.normalize();
345 // Round (half-to-even) depending on the truncated bits.
346 match num
::compare_with_half_ulp(f
, start
) {
347 Less
=> rounded_down
,
348 Equal
if leading
% 2 == 0 => rounded_down
,
349 Equal
| Greater
=> match leading
.checked_add(1) {
350 Some(f
) => Fp { f: f, e: e }
.normalize(),
351 None
=> Fp { f: 1 << 63, e: e + 1 }
,
356 /// Find the largest floating point number strictly smaller than the argument.
357 /// Does not handle subnormals, zero, or exponent underflow.
358 pub fn prev_float
<T
: RawFloat
>(x
: T
) -> T
{
360 Infinite
=> panic
!("prev_float: argument is infinite"),
361 Nan
=> panic
!("prev_float: argument is NaN"),
362 Subnormal
=> panic
!("prev_float: argument is subnormal"),
363 Zero
=> panic
!("prev_float: argument is zero"),
365 let Unpacked { sig, k }
= x
.unpack();
366 if sig
== T
::min_sig() {
367 encode_normal(Unpacked
::new(T
::max_sig(), k
- 1))
369 encode_normal(Unpacked
::new(sig
- 1, k
))
375 // Find the smallest floating point number strictly larger than the argument.
376 // This operation is saturating, i.e. next_float(inf) == inf.
377 // Unlike most code in this module, this function does handle zero, subnormals, and infinities.
378 // However, like all other code here, it does not deal with NaN and negative numbers.
379 pub fn next_float
<T
: RawFloat
>(x
: T
) -> T
{
381 Nan
=> panic
!("next_float: argument is NaN"),
382 Infinite
=> T
::infinity2(),
383 // This seems too good to be true, but it works.
384 // 0.0 is encoded as the all-zero word. Subnormals are 0x000m...m where m is the mantissa.
385 // In particular, the smallest subnormal is 0x0...01 and the largest is 0x000F...F.
386 // The smallest normal number is 0x0010...0, so this corner case works as well.
387 // If the increment overflows the mantissa, the carry bit increments the exponent as we
388 // want, and the mantissa bits become zero. Because of the hidden bit convention, this
389 // too is exactly what we want!
390 // Finally, f64::MAX + 1 = 7eff...f + 1 = 7ff0...0 = f64::INFINITY.
391 Zero
| Subnormal
| Normal
=> {
392 let bits
: u64 = x
.transmute();
393 T
::from_bits(bits
+ 1)