]> git.proxmox.com Git - rustc.git/blob - library/core/src/num/dec2flt/rawfp.rs
New upstream version 1.54.0+dfsg1
[rustc.git] / library / core / src / num / dec2flt / rawfp.rs
1 //! Bit fiddling on positive IEEE 754 floats. Negative numbers aren't and needn't be handled.
2 //! Normal floating point numbers have a canonical representation as (frac, exp) such that the
3 //! value is 2<sup>exp</sup> * (1 + sum(frac[N-i] / 2<sup>i</sup>)) where N is the number of bits.
4 //! Subnormals are slightly different and weird, but the same principle applies.
5 //!
6 //! Here, however, we represent them as (sig, k) with f positive, such that the value is f *
7 //! 2<sup>e</sup>. Besides making the "hidden bit" explicit, this changes the exponent by the
8 //! so-called mantissa shift.
9 //!
10 //! Put another way, normally floats are written as (1) but here they are written as (2):
11 //!
12 //! 1. `1.101100...11 * 2^m`
13 //! 2. `1101100...11 * 2^n`
14 //!
15 //! We call (1) the **fractional representation** and (2) the **integral representation**.
16 //!
17 //! Many functions in this module only handle normal numbers. The dec2flt routines conservatively
18 //! take the universally-correct slow path (Algorithm M) for very small and very large numbers.
19 //! That algorithm needs only next_float() which does handle subnormals and zeros.
20 use crate::cmp::Ordering::{Equal, Greater, Less};
21 use crate::convert::{TryFrom, TryInto};
22 use crate::fmt::{Debug, LowerExp};
23 use crate::num::dec2flt::num::{self, Big};
24 use crate::num::dec2flt::table;
25 use crate::num::diy_float::Fp;
26 use crate::num::FpCategory;
27 use crate::num::FpCategory::{Infinite, Nan, Normal, Subnormal, Zero};
28 use crate::ops::{Add, Div, Mul, Neg};
29
30 #[derive(Copy, Clone, Debug)]
31 pub struct Unpacked {
32 pub sig: u64,
33 pub k: i16,
34 }
35
36 impl Unpacked {
37 pub fn new(sig: u64, k: i16) -> Self {
38 Unpacked { sig, k }
39 }
40 }
41
42 /// A helper trait to avoid duplicating basically all the conversion code for `f32` and `f64`.
43 ///
44 /// See the parent module's doc comment for why this is necessary.
45 ///
46 /// Should **never ever** be implemented for other types or be used outside the dec2flt module.
47 pub trait RawFloat:
48 Copy + Debug + LowerExp + Mul<Output = Self> + Div<Output = Self> + Neg<Output = Self>
49 {
50 const INFINITY: Self;
51 const NAN: Self;
52 const ZERO: Self;
53
54 /// Type used by `to_bits` and `from_bits`.
55 type Bits: Add<Output = Self::Bits> + From<u8> + TryFrom<u64>;
56
57 /// Performs a raw transmutation to an integer.
58 fn to_bits(self) -> Self::Bits;
59
60 /// Performs a raw transmutation from an integer.
61 fn from_bits(v: Self::Bits) -> Self;
62
63 /// Returns the category that this number falls into.
64 fn classify(self) -> FpCategory;
65
66 /// Returns the mantissa, exponent and sign as integers.
67 fn integer_decode(self) -> (u64, i16, i8);
68
69 /// Decodes the float.
70 fn unpack(self) -> Unpacked;
71
72 /// Casts from a small integer that can be represented exactly. Panic if the integer can't be
73 /// represented, the other code in this module makes sure to never let that happen.
74 fn from_int(x: u64) -> Self;
75
76 /// Gets the value 10<sup>e</sup> from a pre-computed table.
77 /// Panics for `e >= CEIL_LOG5_OF_MAX_SIG`.
78 fn short_fast_pow10(e: usize) -> Self;
79
80 /// What the name says. It's easier to hard code than juggling intrinsics and
81 /// hoping LLVM constant folds it.
82 const CEIL_LOG5_OF_MAX_SIG: i16;
83
84 // A conservative bound on the decimal digits of inputs that can't produce overflow or zero or
85 /// subnormals. Probably the decimal exponent of the maximum normal value, hence the name.
86 const MAX_NORMAL_DIGITS: usize;
87
88 /// When the most significant decimal digit has a place value greater than this, the number
89 /// is certainly rounded to infinity.
90 const INF_CUTOFF: i64;
91
92 /// When the most significant decimal digit has a place value less than this, the number
93 /// is certainly rounded to zero.
94 const ZERO_CUTOFF: i64;
95
96 /// The number of bits in the exponent.
97 const EXP_BITS: u8;
98
99 /// The number of bits in the significand, *including* the hidden bit.
100 const SIG_BITS: u8;
101
102 /// The number of bits in the significand, *excluding* the hidden bit.
103 const EXPLICIT_SIG_BITS: u8;
104
105 /// The maximum legal exponent in fractional representation.
106 const MAX_EXP: i16;
107
108 /// The minimum legal exponent in fractional representation, excluding subnormals.
109 const MIN_EXP: i16;
110
111 /// `MAX_EXP` for integral representation, i.e., with the shift applied.
112 const MAX_EXP_INT: i16;
113
114 /// `MAX_EXP` encoded (i.e., with offset bias)
115 const MAX_ENCODED_EXP: i16;
116
117 /// `MIN_EXP` for integral representation, i.e., with the shift applied.
118 const MIN_EXP_INT: i16;
119
120 /// The maximum normalized significand in integral representation.
121 const MAX_SIG: u64;
122
123 /// The minimal normalized significand in integral representation.
124 const MIN_SIG: u64;
125 }
126
127 // Mostly a workaround for #34344.
128 macro_rules! other_constants {
129 ($type: ident) => {
130 const EXPLICIT_SIG_BITS: u8 = Self::SIG_BITS - 1;
131 const MAX_EXP: i16 = (1 << (Self::EXP_BITS - 1)) - 1;
132 const MIN_EXP: i16 = -<Self as RawFloat>::MAX_EXP + 1;
133 const MAX_EXP_INT: i16 = <Self as RawFloat>::MAX_EXP - (Self::SIG_BITS as i16 - 1);
134 const MAX_ENCODED_EXP: i16 = (1 << Self::EXP_BITS) - 1;
135 const MIN_EXP_INT: i16 = <Self as RawFloat>::MIN_EXP - (Self::SIG_BITS as i16 - 1);
136 const MAX_SIG: u64 = (1 << Self::SIG_BITS) - 1;
137 const MIN_SIG: u64 = 1 << (Self::SIG_BITS - 1);
138
139 const INFINITY: Self = $type::INFINITY;
140 const NAN: Self = $type::NAN;
141 const ZERO: Self = 0.0;
142 };
143 }
144
145 impl RawFloat for f32 {
146 type Bits = u32;
147
148 const SIG_BITS: u8 = 24;
149 const EXP_BITS: u8 = 8;
150 const CEIL_LOG5_OF_MAX_SIG: i16 = 11;
151 const MAX_NORMAL_DIGITS: usize = 35;
152 const INF_CUTOFF: i64 = 40;
153 const ZERO_CUTOFF: i64 = -48;
154 other_constants!(f32);
155
156 /// Returns the mantissa, exponent and sign as integers.
157 fn integer_decode(self) -> (u64, i16, i8) {
158 let bits = self.to_bits();
159 let sign: i8 = if bits >> 31 == 0 { 1 } else { -1 };
160 let mut exponent: i16 = ((bits >> 23) & 0xff) as i16;
161 let mantissa =
162 if exponent == 0 { (bits & 0x7fffff) << 1 } else { (bits & 0x7fffff) | 0x800000 };
163 // Exponent bias + mantissa shift
164 exponent -= 127 + 23;
165 (mantissa as u64, exponent, sign)
166 }
167
168 fn unpack(self) -> Unpacked {
169 let (sig, exp, _sig) = self.integer_decode();
170 Unpacked::new(sig, exp)
171 }
172
173 fn from_int(x: u64) -> f32 {
174 // rkruppe is uncertain whether `as` rounds correctly on all platforms.
175 debug_assert!(x as f32 == fp_to_float(Fp { f: x, e: 0 }));
176 x as f32
177 }
178
179 fn short_fast_pow10(e: usize) -> Self {
180 table::F32_SHORT_POWERS[e]
181 }
182
183 fn classify(self) -> FpCategory {
184 self.classify()
185 }
186 fn to_bits(self) -> Self::Bits {
187 self.to_bits()
188 }
189 fn from_bits(v: Self::Bits) -> Self {
190 Self::from_bits(v)
191 }
192 }
193
194 impl RawFloat for f64 {
195 type Bits = u64;
196
197 const SIG_BITS: u8 = 53;
198 const EXP_BITS: u8 = 11;
199 const CEIL_LOG5_OF_MAX_SIG: i16 = 23;
200 const MAX_NORMAL_DIGITS: usize = 305;
201 const INF_CUTOFF: i64 = 310;
202 const ZERO_CUTOFF: i64 = -326;
203 other_constants!(f64);
204
205 /// Returns the mantissa, exponent and sign as integers.
206 fn integer_decode(self) -> (u64, i16, i8) {
207 let bits = self.to_bits();
208 let sign: i8 = if bits >> 63 == 0 { 1 } else { -1 };
209 let mut exponent: i16 = ((bits >> 52) & 0x7ff) as i16;
210 let mantissa = if exponent == 0 {
211 (bits & 0xfffffffffffff) << 1
212 } else {
213 (bits & 0xfffffffffffff) | 0x10000000000000
214 };
215 // Exponent bias + mantissa shift
216 exponent -= 1023 + 52;
217 (mantissa, exponent, sign)
218 }
219
220 fn unpack(self) -> Unpacked {
221 let (sig, exp, _sig) = self.integer_decode();
222 Unpacked::new(sig, exp)
223 }
224
225 fn from_int(x: u64) -> f64 {
226 // rkruppe is uncertain whether `as` rounds correctly on all platforms.
227 debug_assert!(x as f64 == fp_to_float(Fp { f: x, e: 0 }));
228 x as f64
229 }
230
231 fn short_fast_pow10(e: usize) -> Self {
232 table::F64_SHORT_POWERS[e]
233 }
234
235 fn classify(self) -> FpCategory {
236 self.classify()
237 }
238 fn to_bits(self) -> Self::Bits {
239 self.to_bits()
240 }
241 fn from_bits(v: Self::Bits) -> Self {
242 Self::from_bits(v)
243 }
244 }
245
246 /// Converts an `Fp` to the closest machine float type.
247 /// Does not handle subnormal results.
248 pub fn fp_to_float<T: RawFloat>(x: Fp) -> T {
249 let x = x.normalize();
250 // x.f is 64 bit, so x.e has a mantissa shift of 63
251 let e = x.e + 63;
252 if e > T::MAX_EXP {
253 panic!("fp_to_float: exponent {} too large", e)
254 } else if e > T::MIN_EXP {
255 encode_normal(round_normal::<T>(x))
256 } else {
257 panic!("fp_to_float: exponent {} too small", e)
258 }
259 }
260
261 /// Round the 64-bit significand to T::SIG_BITS bits with half-to-even.
262 /// Does not handle exponent overflow.
263 pub fn round_normal<T: RawFloat>(x: Fp) -> Unpacked {
264 let excess = 64 - T::SIG_BITS as i16;
265 let half: u64 = 1 << (excess - 1);
266 let (q, rem) = (x.f >> excess, x.f & ((1 << excess) - 1));
267 assert_eq!(q << excess | rem, x.f);
268 // Adjust mantissa shift
269 let k = x.e + excess;
270 if rem < half {
271 Unpacked::new(q, k)
272 } else if rem == half && (q % 2) == 0 {
273 Unpacked::new(q, k)
274 } else if q == T::MAX_SIG {
275 Unpacked::new(T::MIN_SIG, k + 1)
276 } else {
277 Unpacked::new(q + 1, k)
278 }
279 }
280
281 /// Inverse of `RawFloat::unpack()` for normalized numbers.
282 /// Panics if the significand or exponent are not valid for normalized numbers.
283 pub fn encode_normal<T: RawFloat>(x: Unpacked) -> T {
284 debug_assert!(
285 T::MIN_SIG <= x.sig && x.sig <= T::MAX_SIG,
286 "encode_normal: significand not normalized"
287 );
288 // Remove the hidden bit
289 let sig_enc = x.sig & !(1 << T::EXPLICIT_SIG_BITS);
290 // Adjust the exponent for exponent bias and mantissa shift
291 let k_enc = x.k + T::MAX_EXP + T::EXPLICIT_SIG_BITS as i16;
292 debug_assert!(k_enc != 0 && k_enc < T::MAX_ENCODED_EXP, "encode_normal: exponent out of range");
293 // Leave sign bit at 0 ("+"), our numbers are all positive
294 let bits = (k_enc as u64) << T::EXPLICIT_SIG_BITS | sig_enc;
295 T::from_bits(bits.try_into().unwrap_or_else(|_| unreachable!()))
296 }
297
298 /// Construct a subnormal. A mantissa of 0 is allowed and constructs zero.
299 pub fn encode_subnormal<T: RawFloat>(significand: u64) -> T {
300 assert!(significand < T::MIN_SIG, "encode_subnormal: not actually subnormal");
301 // Encoded exponent is 0, the sign bit is 0, so we just have to reinterpret the bits.
302 T::from_bits(significand.try_into().unwrap_or_else(|_| unreachable!()))
303 }
304
305 /// Approximate a bignum with an Fp. Rounds within 0.5 ULP with half-to-even.
306 pub fn big_to_fp(f: &Big) -> Fp {
307 let end = f.bit_length();
308 assert!(end != 0, "big_to_fp: unexpectedly, input is zero");
309 let start = end.saturating_sub(64);
310 let leading = num::get_bits(f, start, end);
311 // We cut off all bits prior to the index `start`, i.e., we effectively right-shift by
312 // an amount of `start`, so this is also the exponent we need.
313 let e = start as i16;
314 let rounded_down = Fp { f: leading, e }.normalize();
315 // Round (half-to-even) depending on the truncated bits.
316 match num::compare_with_half_ulp(f, start) {
317 Less => rounded_down,
318 Equal if leading % 2 == 0 => rounded_down,
319 Equal | Greater => match leading.checked_add(1) {
320 Some(f) => Fp { f, e }.normalize(),
321 None => Fp { f: 1 << 63, e: e + 1 },
322 },
323 }
324 }
325
326 /// Finds the largest floating point number strictly smaller than the argument.
327 /// Does not handle subnormals, zero, or exponent underflow.
328 pub fn prev_float<T: RawFloat>(x: T) -> T {
329 match x.classify() {
330 Infinite => panic!("prev_float: argument is infinite"),
331 Nan => panic!("prev_float: argument is NaN"),
332 Subnormal => panic!("prev_float: argument is subnormal"),
333 Zero => panic!("prev_float: argument is zero"),
334 Normal => {
335 let Unpacked { sig, k } = x.unpack();
336 if sig == T::MIN_SIG {
337 encode_normal(Unpacked::new(T::MAX_SIG, k - 1))
338 } else {
339 encode_normal(Unpacked::new(sig - 1, k))
340 }
341 }
342 }
343 }
344
345 // Find the smallest floating point number strictly larger than the argument.
346 // This operation is saturating, i.e., next_float(inf) == inf.
347 // Unlike most code in this module, this function does handle zero, subnormals, and infinities.
348 // However, like all other code here, it does not deal with NaN and negative numbers.
349 pub fn next_float<T: RawFloat>(x: T) -> T {
350 match x.classify() {
351 Nan => panic!("next_float: argument is NaN"),
352 Infinite => T::INFINITY,
353 // This seems too good to be true, but it works.
354 // 0.0 is encoded as the all-zero word. Subnormals are 0x000m...m where m is the mantissa.
355 // In particular, the smallest subnormal is 0x0...01 and the largest is 0x000F...F.
356 // The smallest normal number is 0x0010...0, so this corner case works as well.
357 // If the increment overflows the mantissa, the carry bit increments the exponent as we
358 // want, and the mantissa bits become zero. Because of the hidden bit convention, this
359 // too is exactly what we want!
360 // Finally, f64::MAX + 1 = 7eff...f + 1 = 7ff0...0 = f64::INFINITY.
361 Zero | Subnormal | Normal => T::from_bits(x.to_bits() + T::Bits::from(1u8)),
362 }
363 }