]> git.proxmox.com Git - rustc.git/blame - src/libcore/num/dec2flt/rawfp.rs
New upstream version 1.13.0+dfsg1
[rustc.git] / src / libcore / num / dec2flt / rawfp.rs
CommitLineData
e9174d1e
SL
1// Copyright 2015 The Rust Project Developers. See the COPYRIGHT
2// file at the top-level directory of this distribution and at
3// http://rust-lang.org/COPYRIGHT.
4//
5// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8// option. This file may not be copied, modified, or distributed
9// except according to those terms.
10
11//! Bit fiddling on positive IEEE 754 floats. Negative numbers aren't and needn't be handled.
12//! Normal floating point numbers have a canonical representation as (frac, exp) such that the
13//! value is 2^exp * (1 + sum(frac[N-i] / 2^i)) where N is the number of bits. Subnormals are
14//! slightly different and weird, but the same principle applies.
15//!
16//! Here, however, we represent them as (sig, k) with f positive, such that the value is f * 2^e.
17//! Besides making the "hidden bit" explicit, this changes the exponent by the so-called
18//! mantissa shift.
19//!
20//! Put another way, normally floats are written as (1) but here they are written as (2):
21//!
22//! 1. `1.101100...11 * 2^m`
23//! 2. `1101100...11 * 2^n`
24//!
25//! We call (1) the **fractional representation** and (2) the **integral representation**.
26//!
27//! Many functions in this module only handle normal numbers. The dec2flt routines conservatively
28//! take the universally-correct slow path (Algorithm M) for very small and very large numbers.
29//! That algorithm needs only next_float() which does handle subnormals and zeros.
e9174d1e
SL
30use u32;
31use cmp::Ordering::{Less, Equal, Greater};
32use ops::{Mul, Div, Neg};
33use fmt::{Debug, LowerExp};
34use mem::transmute;
b039eaaf 35use num::diy_float::Fp;
e9174d1e
SL
36use num::FpCategory::{Infinite, Zero, Subnormal, Normal, Nan};
37use num::Float;
b039eaaf 38use num::dec2flt::num::{self, Big};
9cc50fc6 39use num::dec2flt::table;
e9174d1e
SL
40
41#[derive(Copy, Clone, Debug)]
42pub struct Unpacked {
43 pub sig: u64,
44 pub k: i16,
45}
46
47impl Unpacked {
48 pub fn new(sig: u64, k: i16) -> Self {
49 Unpacked { sig: sig, k: k }
50 }
51}
52
53/// A helper trait to avoid duplicating basically all the conversion code for `f32` and `f64`.
54///
55/// See the parent module's doc comment for why this is necessary.
56///
57/// Should **never ever** be implemented for other types or be used outside the dec2flt module.
58/// Inherits from `Float` because there is some overlap, but all the reused methods are trivial.
59/// The "methods" (pseudo-constants) with default implementation should not be overriden.
60pub trait RawFloat : Float + Copy + Debug + LowerExp
61 + Mul<Output=Self> + Div<Output=Self> + Neg<Output=Self>
62{
3157f602
XL
63 // suffix of "2" because Float::infinity is deprecated
64 #[allow(deprecated)]
65 fn infinity2() -> Self {
66 Float::infinity()
67 }
68
69 // suffix of "2" because Float::nan is deprecated
70 #[allow(deprecated)]
71 fn nan2() -> Self {
72 Float::nan()
73 }
74
75 // suffix of "2" because Float::zero is deprecated
76 fn zero2() -> Self;
77
78 // suffix of "2" because Float::integer_decode is deprecated
79 #[allow(deprecated)]
80 fn integer_decode2(self) -> (u64, i16, i8) {
81 Float::integer_decode(self)
82 }
83
e9174d1e
SL
84 /// Get the raw binary representation of the float.
85 fn transmute(self) -> u64;
86
87 /// Transmute the raw binary representation into a float.
88 fn from_bits(bits: u64) -> Self;
89
90 /// Decode the float.
91 fn unpack(self) -> Unpacked;
92
93 /// Cast from a small integer that can be represented exactly. Panic if the integer can't be
94 /// represented, the other code in this module makes sure to never let that happen.
95 fn from_int(x: u64) -> Self;
96
9cc50fc6
SL
97 /// Get the value 10^e from a pre-computed table. Panics for e >= ceil_log5_of_max_sig().
98 fn short_fast_pow10(e: usize) -> Self;
99
e9174d1e
SL
100 // FIXME Everything that follows should be associated constants, but taking the value of an
101 // associated constant from a type parameter does not work (yet?)
102 // A possible workaround is having a `FloatInfo` struct for all the constants, but so far
103 // the methods aren't painful enough to rewrite.
104
105 /// What the name says. It's easier to hard code than juggling intrinsics and
106 /// hoping LLVM constant folds it.
107 fn ceil_log5_of_max_sig() -> i16;
108
109 // A conservative bound on the decimal digits of inputs that can't produce overflow or zero or
110 /// subnormals. Probably the decimal exponent of the maximum normal value, hence the name.
111 fn max_normal_digits() -> usize;
112
113 /// When the most significant decimal digit has a place value greater than this, the number
114 /// is certainly rounded to infinity.
115 fn inf_cutoff() -> i64;
116
117 /// When the most significant decimal digit has a place value less than this, the number
118 /// is certainly rounded to zero.
119 fn zero_cutoff() -> i64;
120
121 /// The number of bits in the exponent.
122 fn exp_bits() -> u8;
123
124 /// The number of bits in the singificand, *including* the hidden bit.
125 fn sig_bits() -> u8;
126
127 /// The number of bits in the singificand, *excluding* the hidden bit.
128 fn explicit_sig_bits() -> u8 {
129 Self::sig_bits() - 1
130 }
131
132 /// The maximum legal exponent in fractional representation.
133 fn max_exp() -> i16 {
134 (1 << (Self::exp_bits() - 1)) - 1
135 }
136
137 /// The minimum legal exponent in fractional representation, excluding subnormals.
138 fn min_exp() -> i16 {
139 -Self::max_exp() + 1
140 }
141
142 /// `MAX_EXP` for integral representation, i.e., with the shift applied.
143 fn max_exp_int() -> i16 {
144 Self::max_exp() - (Self::sig_bits() as i16 - 1)
145 }
146
147 /// `MAX_EXP` encoded (i.e., with offset bias)
148 fn max_encoded_exp() -> i16 {
149 (1 << Self::exp_bits()) - 1
150 }
151
152 /// `MIN_EXP` for integral representation, i.e., with the shift applied.
153 fn min_exp_int() -> i16 {
154 Self::min_exp() - (Self::sig_bits() as i16 - 1)
155 }
156
157 /// The maximum normalized singificand in integral representation.
158 fn max_sig() -> u64 {
159 (1 << Self::sig_bits()) - 1
160 }
161
162 /// The minimal normalized significand in integral representation.
163 fn min_sig() -> u64 {
164 1 << (Self::sig_bits() - 1)
165 }
166}
167
168impl RawFloat for f32 {
3157f602
XL
169 fn zero2() -> Self {
170 0.0
171 }
172
e9174d1e
SL
173 fn sig_bits() -> u8 {
174 24
175 }
176
177 fn exp_bits() -> u8 {
178 8
179 }
180
181 fn ceil_log5_of_max_sig() -> i16 {
182 11
183 }
184
185 fn transmute(self) -> u64 {
186 let bits: u32 = unsafe { transmute(self) };
187 bits as u64
188 }
189
190 fn from_bits(bits: u64) -> f32 {
191 assert!(bits < u32::MAX as u64, "f32::from_bits: too many bits");
192 unsafe { transmute(bits as u32) }
193 }
194
195 fn unpack(self) -> Unpacked {
3157f602 196 let (sig, exp, _sig) = self.integer_decode2();
e9174d1e
SL
197 Unpacked::new(sig, exp)
198 }
199
200 fn from_int(x: u64) -> f32 {
201 // rkruppe is uncertain whether `as` rounds correctly on all platforms.
202 debug_assert!(x as f32 == fp_to_float(Fp { f: x, e: 0 }));
203 x as f32
204 }
205
9cc50fc6
SL
206 fn short_fast_pow10(e: usize) -> Self {
207 table::F32_SHORT_POWERS[e]
208 }
209
e9174d1e
SL
210 fn max_normal_digits() -> usize {
211 35
212 }
213
214 fn inf_cutoff() -> i64 {
215 40
216 }
217
218 fn zero_cutoff() -> i64 {
219 -48
220 }
221}
222
223
224impl RawFloat for f64 {
3157f602
XL
225 fn zero2() -> Self {
226 0.0
227 }
228
e9174d1e
SL
229 fn sig_bits() -> u8 {
230 53
231 }
232
233 fn exp_bits() -> u8 {
234 11
235 }
236
237 fn ceil_log5_of_max_sig() -> i16 {
238 23
239 }
240
241 fn transmute(self) -> u64 {
242 let bits: u64 = unsafe { transmute(self) };
243 bits
244 }
245
246 fn from_bits(bits: u64) -> f64 {
247 unsafe { transmute(bits) }
248 }
249
250 fn unpack(self) -> Unpacked {
3157f602 251 let (sig, exp, _sig) = self.integer_decode2();
e9174d1e
SL
252 Unpacked::new(sig, exp)
253 }
254
255 fn from_int(x: u64) -> f64 {
256 // rkruppe is uncertain whether `as` rounds correctly on all platforms.
257 debug_assert!(x as f64 == fp_to_float(Fp { f: x, e: 0 }));
258 x as f64
259 }
260
9cc50fc6
SL
261 fn short_fast_pow10(e: usize) -> Self {
262 table::F64_SHORT_POWERS[e]
263 }
264
e9174d1e
SL
265 fn max_normal_digits() -> usize {
266 305
267 }
268
269 fn inf_cutoff() -> i64 {
270 310
271 }
272
273 fn zero_cutoff() -> i64 {
274 -326
275 }
276
277}
278
279/// Convert an Fp to the closest f64. Only handles number that fit into a normalized f64.
280pub fn fp_to_float<T: RawFloat>(x: Fp) -> T {
281 let x = x.normalize();
282 // x.f is 64 bit, so x.e has a mantissa shift of 63
283 let e = x.e + 63;
284 if e > T::max_exp() {
285 panic!("fp_to_float: exponent {} too large", e)
286 } else if e > T::min_exp() {
287 encode_normal(round_normal::<T>(x))
288 } else {
289 panic!("fp_to_float: exponent {} too small", e)
290 }
291}
292
293/// Round the 64-bit significand to 53 bit with half-to-even. Does not handle exponent overflow.
294pub fn round_normal<T: RawFloat>(x: Fp) -> Unpacked {
295 let excess = 64 - T::sig_bits() as i16;
296 let half: u64 = 1 << (excess - 1);
297 let (q, rem) = (x.f >> excess, x.f & ((1 << excess) - 1));
298 assert_eq!(q << excess | rem, x.f);
299 // Adjust mantissa shift
300 let k = x.e + excess;
301 if rem < half {
302 Unpacked::new(q, k)
303 } else if rem == half && (q % 2) == 0 {
304 Unpacked::new(q, k)
305 } else if q == T::max_sig() {
306 Unpacked::new(T::min_sig(), k + 1)
307 } else {
308 Unpacked::new(q + 1, k)
309 }
310}
311
312/// Inverse of `RawFloat::unpack()` for normalized numbers.
313/// Panics if the significand or exponent are not valid for normalized numbers.
314pub fn encode_normal<T: RawFloat>(x: Unpacked) -> T {
315 debug_assert!(T::min_sig() <= x.sig && x.sig <= T::max_sig(),
316 "encode_normal: significand not normalized");
317 // Remove the hidden bit
318 let sig_enc = x.sig & !(1 << T::explicit_sig_bits());
319 // Adjust the exponent for exponent bias and mantissa shift
320 let k_enc = x.k + T::max_exp() + T::explicit_sig_bits() as i16;
321 debug_assert!(k_enc != 0 && k_enc < T::max_encoded_exp(),
322 "encode_normal: exponent out of range");
323 // Leave sign bit at 0 ("+"), our numbers are all positive
324 let bits = (k_enc as u64) << T::explicit_sig_bits() | sig_enc;
325 T::from_bits(bits)
326}
327
328/// Construct the subnormal. A mantissa of 0 is allowed and constructs zero.
329pub fn encode_subnormal<T: RawFloat>(significand: u64) -> T {
330 assert!(significand < T::min_sig(), "encode_subnormal: not actually subnormal");
9cc50fc6 331 // Encoded exponent is 0, the sign bit is 0, so we just have to reinterpret the bits.
e9174d1e
SL
332 T::from_bits(significand)
333}
334
335/// Approximate a bignum with an Fp. Rounds within 0.5 ULP with half-to-even.
336pub fn big_to_fp(f: &Big) -> Fp {
337 let end = f.bit_length();
338 assert!(end != 0, "big_to_fp: unexpectedly, input is zero");
339 let start = end.saturating_sub(64);
340 let leading = num::get_bits(f, start, end);
341 // We cut off all bits prior to the index `start`, i.e., we effectively right-shift by
342 // an amount of `start`, so this is also the exponent we need.
343 let e = start as i16;
344 let rounded_down = Fp { f: leading, e: e }.normalize();
345 // Round (half-to-even) depending on the truncated bits.
346 match num::compare_with_half_ulp(f, start) {
347 Less => rounded_down,
348 Equal if leading % 2 == 0 => rounded_down,
349 Equal | Greater => match leading.checked_add(1) {
350 Some(f) => Fp { f: f, e: e }.normalize(),
351 None => Fp { f: 1 << 63, e: e + 1 },
352 }
353 }
354}
355
356/// Find the largest floating point number strictly smaller than the argument.
357/// Does not handle subnormals, zero, or exponent underflow.
358pub fn prev_float<T: RawFloat>(x: T) -> T {
359 match x.classify() {
360 Infinite => panic!("prev_float: argument is infinite"),
361 Nan => panic!("prev_float: argument is NaN"),
362 Subnormal => panic!("prev_float: argument is subnormal"),
363 Zero => panic!("prev_float: argument is zero"),
364 Normal => {
365 let Unpacked { sig, k } = x.unpack();
366 if sig == T::min_sig() {
367 encode_normal(Unpacked::new(T::max_sig(), k - 1))
368 } else {
369 encode_normal(Unpacked::new(sig - 1, k))
370 }
371 }
372 }
373}
374
375// Find the smallest floating point number strictly larger than the argument.
376// This operation is saturating, i.e. next_float(inf) == inf.
377// Unlike most code in this module, this function does handle zero, subnormals, and infinities.
378// However, like all other code here, it does not deal with NaN and negative numbers.
379pub fn next_float<T: RawFloat>(x: T) -> T {
380 match x.classify() {
381 Nan => panic!("next_float: argument is NaN"),
3157f602 382 Infinite => T::infinity2(),
e9174d1e
SL
383 // This seems too good to be true, but it works.
384 // 0.0 is encoded as the all-zero word. Subnormals are 0x000m...m where m is the mantissa.
385 // In particular, the smallest subnormal is 0x0...01 and the largest is 0x000F...F.
386 // The smallest normal number is 0x0010...0, so this corner case works as well.
387 // If the increment overflows the mantissa, the carry bit increments the exponent as we
388 // want, and the mantissa bits become zero. Because of the hidden bit convention, this
389 // too is exactly what we want!
390 // Finally, f64::MAX + 1 = 7eff...f + 1 = 7ff0...0 = f64::INFINITY.
391 Zero | Subnormal | Normal => {
392 let bits: u64 = x.transmute();
393 T::from_bits(bits + 1)
394 }
395 }
396}