]>
Commit | Line | Data |
---|---|---|
e9174d1e SL |
1 | // Copyright 2015 The Rust Project Developers. See the COPYRIGHT |
2 | // file at the top-level directory of this distribution and at | |
3 | // http://rust-lang.org/COPYRIGHT. | |
4 | // | |
5 | // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or | |
6 | // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license | |
7 | // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your | |
8 | // option. This file may not be copied, modified, or distributed | |
9 | // except according to those terms. | |
10 | ||
11 | //! Bit fiddling on positive IEEE 754 floats. Negative numbers aren't and needn't be handled. | |
12 | //! Normal floating point numbers have a canonical representation as (frac, exp) such that the | |
13 | //! value is 2^exp * (1 + sum(frac[N-i] / 2^i)) where N is the number of bits. Subnormals are | |
14 | //! slightly different and weird, but the same principle applies. | |
15 | //! | |
16 | //! Here, however, we represent them as (sig, k) with f positive, such that the value is f * 2^e. | |
17 | //! Besides making the "hidden bit" explicit, this changes the exponent by the so-called | |
18 | //! mantissa shift. | |
19 | //! | |
20 | //! Put another way, normally floats are written as (1) but here they are written as (2): | |
21 | //! | |
22 | //! 1. `1.101100...11 * 2^m` | |
23 | //! 2. `1101100...11 * 2^n` | |
24 | //! | |
25 | //! We call (1) the **fractional representation** and (2) the **integral representation**. | |
26 | //! | |
27 | //! Many functions in this module only handle normal numbers. The dec2flt routines conservatively | |
28 | //! take the universally-correct slow path (Algorithm M) for very small and very large numbers. | |
29 | //! That algorithm needs only next_float() which does handle subnormals and zeros. | |
e9174d1e SL |
30 | use u32; |
31 | use cmp::Ordering::{Less, Equal, Greater}; | |
32 | use ops::{Mul, Div, Neg}; | |
33 | use fmt::{Debug, LowerExp}; | |
34 | use mem::transmute; | |
b039eaaf | 35 | use num::diy_float::Fp; |
e9174d1e SL |
36 | use num::FpCategory::{Infinite, Zero, Subnormal, Normal, Nan}; |
37 | use num::Float; | |
b039eaaf | 38 | use num::dec2flt::num::{self, Big}; |
9cc50fc6 | 39 | use num::dec2flt::table; |
e9174d1e SL |
40 | |
41 | #[derive(Copy, Clone, Debug)] | |
42 | pub struct Unpacked { | |
43 | pub sig: u64, | |
44 | pub k: i16, | |
45 | } | |
46 | ||
47 | impl Unpacked { | |
48 | pub fn new(sig: u64, k: i16) -> Self { | |
49 | Unpacked { sig: sig, k: k } | |
50 | } | |
51 | } | |
52 | ||
53 | /// A helper trait to avoid duplicating basically all the conversion code for `f32` and `f64`. | |
54 | /// | |
55 | /// See the parent module's doc comment for why this is necessary. | |
56 | /// | |
57 | /// Should **never ever** be implemented for other types or be used outside the dec2flt module. | |
58 | /// Inherits from `Float` because there is some overlap, but all the reused methods are trivial. | |
59 | /// The "methods" (pseudo-constants) with default implementation should not be overriden. | |
60 | pub trait RawFloat : Float + Copy + Debug + LowerExp | |
61 | + Mul<Output=Self> + Div<Output=Self> + Neg<Output=Self> | |
62 | { | |
3157f602 XL |
63 | // suffix of "2" because Float::infinity is deprecated |
64 | #[allow(deprecated)] | |
65 | fn infinity2() -> Self { | |
66 | Float::infinity() | |
67 | } | |
68 | ||
69 | // suffix of "2" because Float::nan is deprecated | |
70 | #[allow(deprecated)] | |
71 | fn nan2() -> Self { | |
72 | Float::nan() | |
73 | } | |
74 | ||
75 | // suffix of "2" because Float::zero is deprecated | |
76 | fn zero2() -> Self; | |
77 | ||
78 | // suffix of "2" because Float::integer_decode is deprecated | |
79 | #[allow(deprecated)] | |
80 | fn integer_decode2(self) -> (u64, i16, i8) { | |
81 | Float::integer_decode(self) | |
82 | } | |
83 | ||
e9174d1e SL |
84 | /// Get the raw binary representation of the float. |
85 | fn transmute(self) -> u64; | |
86 | ||
87 | /// Transmute the raw binary representation into a float. | |
88 | fn from_bits(bits: u64) -> Self; | |
89 | ||
90 | /// Decode the float. | |
91 | fn unpack(self) -> Unpacked; | |
92 | ||
93 | /// Cast from a small integer that can be represented exactly. Panic if the integer can't be | |
94 | /// represented, the other code in this module makes sure to never let that happen. | |
95 | fn from_int(x: u64) -> Self; | |
96 | ||
9cc50fc6 SL |
97 | /// Get the value 10^e from a pre-computed table. Panics for e >= ceil_log5_of_max_sig(). |
98 | fn short_fast_pow10(e: usize) -> Self; | |
99 | ||
e9174d1e SL |
100 | // FIXME Everything that follows should be associated constants, but taking the value of an |
101 | // associated constant from a type parameter does not work (yet?) | |
102 | // A possible workaround is having a `FloatInfo` struct for all the constants, but so far | |
103 | // the methods aren't painful enough to rewrite. | |
104 | ||
105 | /// What the name says. It's easier to hard code than juggling intrinsics and | |
106 | /// hoping LLVM constant folds it. | |
107 | fn ceil_log5_of_max_sig() -> i16; | |
108 | ||
109 | // A conservative bound on the decimal digits of inputs that can't produce overflow or zero or | |
110 | /// subnormals. Probably the decimal exponent of the maximum normal value, hence the name. | |
111 | fn max_normal_digits() -> usize; | |
112 | ||
113 | /// When the most significant decimal digit has a place value greater than this, the number | |
114 | /// is certainly rounded to infinity. | |
115 | fn inf_cutoff() -> i64; | |
116 | ||
117 | /// When the most significant decimal digit has a place value less than this, the number | |
118 | /// is certainly rounded to zero. | |
119 | fn zero_cutoff() -> i64; | |
120 | ||
121 | /// The number of bits in the exponent. | |
122 | fn exp_bits() -> u8; | |
123 | ||
124 | /// The number of bits in the singificand, *including* the hidden bit. | |
125 | fn sig_bits() -> u8; | |
126 | ||
127 | /// The number of bits in the singificand, *excluding* the hidden bit. | |
128 | fn explicit_sig_bits() -> u8 { | |
129 | Self::sig_bits() - 1 | |
130 | } | |
131 | ||
132 | /// The maximum legal exponent in fractional representation. | |
133 | fn max_exp() -> i16 { | |
134 | (1 << (Self::exp_bits() - 1)) - 1 | |
135 | } | |
136 | ||
137 | /// The minimum legal exponent in fractional representation, excluding subnormals. | |
138 | fn min_exp() -> i16 { | |
139 | -Self::max_exp() + 1 | |
140 | } | |
141 | ||
142 | /// `MAX_EXP` for integral representation, i.e., with the shift applied. | |
143 | fn max_exp_int() -> i16 { | |
144 | Self::max_exp() - (Self::sig_bits() as i16 - 1) | |
145 | } | |
146 | ||
147 | /// `MAX_EXP` encoded (i.e., with offset bias) | |
148 | fn max_encoded_exp() -> i16 { | |
149 | (1 << Self::exp_bits()) - 1 | |
150 | } | |
151 | ||
152 | /// `MIN_EXP` for integral representation, i.e., with the shift applied. | |
153 | fn min_exp_int() -> i16 { | |
154 | Self::min_exp() - (Self::sig_bits() as i16 - 1) | |
155 | } | |
156 | ||
157 | /// The maximum normalized singificand in integral representation. | |
158 | fn max_sig() -> u64 { | |
159 | (1 << Self::sig_bits()) - 1 | |
160 | } | |
161 | ||
162 | /// The minimal normalized significand in integral representation. | |
163 | fn min_sig() -> u64 { | |
164 | 1 << (Self::sig_bits() - 1) | |
165 | } | |
166 | } | |
167 | ||
168 | impl RawFloat for f32 { | |
3157f602 XL |
169 | fn zero2() -> Self { |
170 | 0.0 | |
171 | } | |
172 | ||
e9174d1e SL |
173 | fn sig_bits() -> u8 { |
174 | 24 | |
175 | } | |
176 | ||
177 | fn exp_bits() -> u8 { | |
178 | 8 | |
179 | } | |
180 | ||
181 | fn ceil_log5_of_max_sig() -> i16 { | |
182 | 11 | |
183 | } | |
184 | ||
185 | fn transmute(self) -> u64 { | |
186 | let bits: u32 = unsafe { transmute(self) }; | |
187 | bits as u64 | |
188 | } | |
189 | ||
190 | fn from_bits(bits: u64) -> f32 { | |
191 | assert!(bits < u32::MAX as u64, "f32::from_bits: too many bits"); | |
192 | unsafe { transmute(bits as u32) } | |
193 | } | |
194 | ||
195 | fn unpack(self) -> Unpacked { | |
3157f602 | 196 | let (sig, exp, _sig) = self.integer_decode2(); |
e9174d1e SL |
197 | Unpacked::new(sig, exp) |
198 | } | |
199 | ||
200 | fn from_int(x: u64) -> f32 { | |
201 | // rkruppe is uncertain whether `as` rounds correctly on all platforms. | |
202 | debug_assert!(x as f32 == fp_to_float(Fp { f: x, e: 0 })); | |
203 | x as f32 | |
204 | } | |
205 | ||
9cc50fc6 SL |
206 | fn short_fast_pow10(e: usize) -> Self { |
207 | table::F32_SHORT_POWERS[e] | |
208 | } | |
209 | ||
e9174d1e SL |
210 | fn max_normal_digits() -> usize { |
211 | 35 | |
212 | } | |
213 | ||
214 | fn inf_cutoff() -> i64 { | |
215 | 40 | |
216 | } | |
217 | ||
218 | fn zero_cutoff() -> i64 { | |
219 | -48 | |
220 | } | |
221 | } | |
222 | ||
223 | ||
224 | impl RawFloat for f64 { | |
3157f602 XL |
225 | fn zero2() -> Self { |
226 | 0.0 | |
227 | } | |
228 | ||
e9174d1e SL |
229 | fn sig_bits() -> u8 { |
230 | 53 | |
231 | } | |
232 | ||
233 | fn exp_bits() -> u8 { | |
234 | 11 | |
235 | } | |
236 | ||
237 | fn ceil_log5_of_max_sig() -> i16 { | |
238 | 23 | |
239 | } | |
240 | ||
241 | fn transmute(self) -> u64 { | |
242 | let bits: u64 = unsafe { transmute(self) }; | |
243 | bits | |
244 | } | |
245 | ||
246 | fn from_bits(bits: u64) -> f64 { | |
247 | unsafe { transmute(bits) } | |
248 | } | |
249 | ||
250 | fn unpack(self) -> Unpacked { | |
3157f602 | 251 | let (sig, exp, _sig) = self.integer_decode2(); |
e9174d1e SL |
252 | Unpacked::new(sig, exp) |
253 | } | |
254 | ||
255 | fn from_int(x: u64) -> f64 { | |
256 | // rkruppe is uncertain whether `as` rounds correctly on all platforms. | |
257 | debug_assert!(x as f64 == fp_to_float(Fp { f: x, e: 0 })); | |
258 | x as f64 | |
259 | } | |
260 | ||
9cc50fc6 SL |
261 | fn short_fast_pow10(e: usize) -> Self { |
262 | table::F64_SHORT_POWERS[e] | |
263 | } | |
264 | ||
e9174d1e SL |
265 | fn max_normal_digits() -> usize { |
266 | 305 | |
267 | } | |
268 | ||
269 | fn inf_cutoff() -> i64 { | |
270 | 310 | |
271 | } | |
272 | ||
273 | fn zero_cutoff() -> i64 { | |
274 | -326 | |
275 | } | |
276 | ||
277 | } | |
278 | ||
279 | /// Convert an Fp to the closest f64. Only handles number that fit into a normalized f64. | |
280 | pub fn fp_to_float<T: RawFloat>(x: Fp) -> T { | |
281 | let x = x.normalize(); | |
282 | // x.f is 64 bit, so x.e has a mantissa shift of 63 | |
283 | let e = x.e + 63; | |
284 | if e > T::max_exp() { | |
285 | panic!("fp_to_float: exponent {} too large", e) | |
286 | } else if e > T::min_exp() { | |
287 | encode_normal(round_normal::<T>(x)) | |
288 | } else { | |
289 | panic!("fp_to_float: exponent {} too small", e) | |
290 | } | |
291 | } | |
292 | ||
293 | /// Round the 64-bit significand to 53 bit with half-to-even. Does not handle exponent overflow. | |
294 | pub fn round_normal<T: RawFloat>(x: Fp) -> Unpacked { | |
295 | let excess = 64 - T::sig_bits() as i16; | |
296 | let half: u64 = 1 << (excess - 1); | |
297 | let (q, rem) = (x.f >> excess, x.f & ((1 << excess) - 1)); | |
298 | assert_eq!(q << excess | rem, x.f); | |
299 | // Adjust mantissa shift | |
300 | let k = x.e + excess; | |
301 | if rem < half { | |
302 | Unpacked::new(q, k) | |
303 | } else if rem == half && (q % 2) == 0 { | |
304 | Unpacked::new(q, k) | |
305 | } else if q == T::max_sig() { | |
306 | Unpacked::new(T::min_sig(), k + 1) | |
307 | } else { | |
308 | Unpacked::new(q + 1, k) | |
309 | } | |
310 | } | |
311 | ||
312 | /// Inverse of `RawFloat::unpack()` for normalized numbers. | |
313 | /// Panics if the significand or exponent are not valid for normalized numbers. | |
314 | pub fn encode_normal<T: RawFloat>(x: Unpacked) -> T { | |
315 | debug_assert!(T::min_sig() <= x.sig && x.sig <= T::max_sig(), | |
316 | "encode_normal: significand not normalized"); | |
317 | // Remove the hidden bit | |
318 | let sig_enc = x.sig & !(1 << T::explicit_sig_bits()); | |
319 | // Adjust the exponent for exponent bias and mantissa shift | |
320 | let k_enc = x.k + T::max_exp() + T::explicit_sig_bits() as i16; | |
321 | debug_assert!(k_enc != 0 && k_enc < T::max_encoded_exp(), | |
322 | "encode_normal: exponent out of range"); | |
323 | // Leave sign bit at 0 ("+"), our numbers are all positive | |
324 | let bits = (k_enc as u64) << T::explicit_sig_bits() | sig_enc; | |
325 | T::from_bits(bits) | |
326 | } | |
327 | ||
328 | /// Construct the subnormal. A mantissa of 0 is allowed and constructs zero. | |
329 | pub fn encode_subnormal<T: RawFloat>(significand: u64) -> T { | |
330 | assert!(significand < T::min_sig(), "encode_subnormal: not actually subnormal"); | |
9cc50fc6 | 331 | // Encoded exponent is 0, the sign bit is 0, so we just have to reinterpret the bits. |
e9174d1e SL |
332 | T::from_bits(significand) |
333 | } | |
334 | ||
335 | /// Approximate a bignum with an Fp. Rounds within 0.5 ULP with half-to-even. | |
336 | pub fn big_to_fp(f: &Big) -> Fp { | |
337 | let end = f.bit_length(); | |
338 | assert!(end != 0, "big_to_fp: unexpectedly, input is zero"); | |
339 | let start = end.saturating_sub(64); | |
340 | let leading = num::get_bits(f, start, end); | |
341 | // We cut off all bits prior to the index `start`, i.e., we effectively right-shift by | |
342 | // an amount of `start`, so this is also the exponent we need. | |
343 | let e = start as i16; | |
344 | let rounded_down = Fp { f: leading, e: e }.normalize(); | |
345 | // Round (half-to-even) depending on the truncated bits. | |
346 | match num::compare_with_half_ulp(f, start) { | |
347 | Less => rounded_down, | |
348 | Equal if leading % 2 == 0 => rounded_down, | |
349 | Equal | Greater => match leading.checked_add(1) { | |
350 | Some(f) => Fp { f: f, e: e }.normalize(), | |
351 | None => Fp { f: 1 << 63, e: e + 1 }, | |
352 | } | |
353 | } | |
354 | } | |
355 | ||
356 | /// Find the largest floating point number strictly smaller than the argument. | |
357 | /// Does not handle subnormals, zero, or exponent underflow. | |
358 | pub fn prev_float<T: RawFloat>(x: T) -> T { | |
359 | match x.classify() { | |
360 | Infinite => panic!("prev_float: argument is infinite"), | |
361 | Nan => panic!("prev_float: argument is NaN"), | |
362 | Subnormal => panic!("prev_float: argument is subnormal"), | |
363 | Zero => panic!("prev_float: argument is zero"), | |
364 | Normal => { | |
365 | let Unpacked { sig, k } = x.unpack(); | |
366 | if sig == T::min_sig() { | |
367 | encode_normal(Unpacked::new(T::max_sig(), k - 1)) | |
368 | } else { | |
369 | encode_normal(Unpacked::new(sig - 1, k)) | |
370 | } | |
371 | } | |
372 | } | |
373 | } | |
374 | ||
375 | // Find the smallest floating point number strictly larger than the argument. | |
376 | // This operation is saturating, i.e. next_float(inf) == inf. | |
377 | // Unlike most code in this module, this function does handle zero, subnormals, and infinities. | |
378 | // However, like all other code here, it does not deal with NaN and negative numbers. | |
379 | pub fn next_float<T: RawFloat>(x: T) -> T { | |
380 | match x.classify() { | |
381 | Nan => panic!("next_float: argument is NaN"), | |
3157f602 | 382 | Infinite => T::infinity2(), |
e9174d1e SL |
383 | // This seems too good to be true, but it works. |
384 | // 0.0 is encoded as the all-zero word. Subnormals are 0x000m...m where m is the mantissa. | |
385 | // In particular, the smallest subnormal is 0x0...01 and the largest is 0x000F...F. | |
386 | // The smallest normal number is 0x0010...0, so this corner case works as well. | |
387 | // If the increment overflows the mantissa, the carry bit increments the exponent as we | |
388 | // want, and the mantissa bits become zero. Because of the hidden bit convention, this | |
389 | // too is exactly what we want! | |
390 | // Finally, f64::MAX + 1 = 7eff...f + 1 = 7ff0...0 = f64::INFINITY. | |
391 | Zero | Subnormal | Normal => { | |
392 | let bits: u64 = x.transmute(); | |
393 | T::from_bits(bits + 1) | |
394 | } | |
395 | } | |
396 | } |