]> git.proxmox.com Git - mirror_qemu.git/blame - fpu/softfloat.c
Merge remote-tracking branch 'remotes/armbru/tags/pull-error-2015-02-05' into staging
[mirror_qemu.git] / fpu / softfloat.c
CommitLineData
8d725fac
AF
1/*
2 * QEMU float support
3 *
16017c48
PM
4 * The code in this source file is derived from release 2a of the SoftFloat
5 * IEC/IEEE Floating-point Arithmetic Package. Those parts of the code (and
6 * some later contributions) are provided under that license, as detailed below.
7 * It has subsequently been modified by contributors to the QEMU Project,
8 * so some portions are provided under:
9 * the SoftFloat-2a license
10 * the BSD license
11 * GPL-v2-or-later
12 *
13 * Any future contributions to this file after December 1st 2014 will be
14 * taken to be licensed under the Softfloat-2a license unless specifically
15 * indicated otherwise.
8d725fac 16 */
158142c2 17
a7d1ac78
PM
18/*
19===============================================================================
20This C source file is part of the SoftFloat IEC/IEEE Floating-point
21Arithmetic Package, Release 2a.
158142c2
FB
22
23Written by John R. Hauser. This work was made possible in part by the
24International Computer Science Institute, located at Suite 600, 1947 Center
25Street, Berkeley, California 94704. Funding was partially provided by the
26National Science Foundation under grant MIP-9311980. The original version
27of this code was written as part of a project to build a fixed-point vector
28processor in collaboration with the University of California at Berkeley,
29overseen by Profs. Nelson Morgan and John Wawrzynek. More information
a7d1ac78 30is available through the Web page `http://HTTP.CS.Berkeley.EDU/~jhauser/
158142c2
FB
31arithmetic/SoftFloat.html'.
32
a7d1ac78
PM
33THIS SOFTWARE IS DISTRIBUTED AS IS, FOR FREE. Although reasonable effort
34has been made to avoid it, THIS SOFTWARE MAY CONTAIN FAULTS THAT WILL AT
35TIMES RESULT IN INCORRECT BEHAVIOR. USE OF THIS SOFTWARE IS RESTRICTED TO
36PERSONS AND ORGANIZATIONS WHO CAN AND WILL TAKE FULL RESPONSIBILITY FOR ANY
37AND ALL LOSSES, COSTS, OR OTHER PROBLEMS ARISING FROM ITS USE.
158142c2
FB
38
39Derivative works are acceptable, even for commercial purposes, so long as
a7d1ac78
PM
40(1) they include prominent notice that the work is derivative, and (2) they
41include prominent notice akin to these four paragraphs for those parts of
42this code that are retained.
158142c2 43
a7d1ac78
PM
44===============================================================================
45*/
158142c2 46
16017c48
PM
47/* BSD licensing:
48 * Copyright (c) 2006, Fabrice Bellard
49 * All rights reserved.
50 *
51 * Redistribution and use in source and binary forms, with or without
52 * modification, are permitted provided that the following conditions are met:
53 *
54 * 1. Redistributions of source code must retain the above copyright notice,
55 * this list of conditions and the following disclaimer.
56 *
57 * 2. Redistributions in binary form must reproduce the above copyright notice,
58 * this list of conditions and the following disclaimer in the documentation
59 * and/or other materials provided with the distribution.
60 *
61 * 3. Neither the name of the copyright holder nor the names of its contributors
62 * may be used to endorse or promote products derived from this software without
63 * specific prior written permission.
64 *
65 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
66 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
67 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
68 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
69 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
70 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
71 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
72 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
73 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
74 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
75 * THE POSSIBILITY OF SUCH DAMAGE.
76 */
77
78/* Portions of this work are licensed under the terms of the GNU GPL,
79 * version 2 or later. See the COPYING file in the top-level directory.
80 */
81
2ac8bd03
PM
82/* softfloat (and in particular the code in softfloat-specialize.h) is
83 * target-dependent and needs the TARGET_* macros.
84 */
85#include "config.h"
86
6b4c305c 87#include "fpu/softfloat.h"
158142c2 88
dc355b76
PM
89/* We only need stdlib for abort() */
90#include <stdlib.h>
91
158142c2
FB
92/*----------------------------------------------------------------------------
93| Primitive arithmetic functions, including multi-word arithmetic, and
94| division and square root approximations. (Can be specialized to target if
95| desired.)
96*----------------------------------------------------------------------------*/
97#include "softfloat-macros.h"
98
99/*----------------------------------------------------------------------------
100| Functions and definitions to determine: (1) whether tininess for underflow
101| is detected before or after rounding by default, (2) what (if anything)
102| happens when exceptions are raised, (3) how signaling NaNs are distinguished
103| from quiet NaNs, (4) the default generated quiet NaNs, and (5) how NaNs
104| are propagated from function inputs to output. These details are target-
105| specific.
106*----------------------------------------------------------------------------*/
107#include "softfloat-specialize.h"
108
bb4d4bb3
PM
109/*----------------------------------------------------------------------------
110| Returns the fraction bits of the half-precision floating-point value `a'.
111*----------------------------------------------------------------------------*/
112
a49db98d 113static inline uint32_t extractFloat16Frac(float16 a)
bb4d4bb3
PM
114{
115 return float16_val(a) & 0x3ff;
116}
117
118/*----------------------------------------------------------------------------
119| Returns the exponent bits of the half-precision floating-point value `a'.
120*----------------------------------------------------------------------------*/
121
a49db98d 122static inline int_fast16_t extractFloat16Exp(float16 a)
bb4d4bb3
PM
123{
124 return (float16_val(a) >> 10) & 0x1f;
125}
126
127/*----------------------------------------------------------------------------
128| Returns the sign bit of the single-precision floating-point value `a'.
129*----------------------------------------------------------------------------*/
130
a49db98d 131static inline flag extractFloat16Sign(float16 a)
bb4d4bb3
PM
132{
133 return float16_val(a)>>15;
134}
135
158142c2
FB
136/*----------------------------------------------------------------------------
137| Takes a 64-bit fixed-point value `absZ' with binary point between bits 6
138| and 7, and returns the properly rounded 32-bit integer corresponding to the
139| input. If `zSign' is 1, the input is negated before being converted to an
140| integer. Bit 63 of `absZ' must be zero. Ordinarily, the fixed-point input
141| is simply rounded to an integer, with the inexact exception raised if the
142| input cannot be represented exactly as an integer. However, if the fixed-
143| point input is too large, the invalid exception is raised and the largest
144| positive or negative integer is returned.
145*----------------------------------------------------------------------------*/
146
bb98fe42 147static int32 roundAndPackInt32( flag zSign, uint64_t absZ STATUS_PARAM)
158142c2
FB
148{
149 int8 roundingMode;
150 flag roundNearestEven;
151 int8 roundIncrement, roundBits;
760e1416 152 int32_t z;
158142c2
FB
153
154 roundingMode = STATUS(float_rounding_mode);
155 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
156 switch (roundingMode) {
157 case float_round_nearest_even:
f9288a76 158 case float_round_ties_away:
dc355b76
PM
159 roundIncrement = 0x40;
160 break;
161 case float_round_to_zero:
162 roundIncrement = 0;
163 break;
164 case float_round_up:
165 roundIncrement = zSign ? 0 : 0x7f;
166 break;
167 case float_round_down:
168 roundIncrement = zSign ? 0x7f : 0;
169 break;
170 default:
171 abort();
158142c2
FB
172 }
173 roundBits = absZ & 0x7F;
174 absZ = ( absZ + roundIncrement )>>7;
175 absZ &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
176 z = absZ;
177 if ( zSign ) z = - z;
178 if ( ( absZ>>32 ) || ( z && ( ( z < 0 ) ^ zSign ) ) ) {
179 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 180 return zSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
181 }
182 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
183 return z;
184
185}
186
187/*----------------------------------------------------------------------------
188| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
189| `absZ1', with binary point between bits 63 and 64 (between the input words),
190| and returns the properly rounded 64-bit integer corresponding to the input.
191| If `zSign' is 1, the input is negated before being converted to an integer.
192| Ordinarily, the fixed-point input is simply rounded to an integer, with
193| the inexact exception raised if the input cannot be represented exactly as
194| an integer. However, if the fixed-point input is too large, the invalid
195| exception is raised and the largest positive or negative integer is
196| returned.
197*----------------------------------------------------------------------------*/
198
bb98fe42 199static int64 roundAndPackInt64( flag zSign, uint64_t absZ0, uint64_t absZ1 STATUS_PARAM)
158142c2
FB
200{
201 int8 roundingMode;
202 flag roundNearestEven, increment;
760e1416 203 int64_t z;
158142c2
FB
204
205 roundingMode = STATUS(float_rounding_mode);
206 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
207 switch (roundingMode) {
208 case float_round_nearest_even:
f9288a76 209 case float_round_ties_away:
dc355b76
PM
210 increment = ((int64_t) absZ1 < 0);
211 break;
212 case float_round_to_zero:
213 increment = 0;
214 break;
215 case float_round_up:
216 increment = !zSign && absZ1;
217 break;
218 case float_round_down:
219 increment = zSign && absZ1;
220 break;
221 default:
222 abort();
158142c2
FB
223 }
224 if ( increment ) {
225 ++absZ0;
226 if ( absZ0 == 0 ) goto overflow;
bb98fe42 227 absZ0 &= ~ ( ( (uint64_t) ( absZ1<<1 ) == 0 ) & roundNearestEven );
158142c2
FB
228 }
229 z = absZ0;
230 if ( zSign ) z = - z;
231 if ( z && ( ( z < 0 ) ^ zSign ) ) {
232 overflow:
233 float_raise( float_flag_invalid STATUS_VAR);
234 return
bb98fe42 235 zSign ? (int64_t) LIT64( 0x8000000000000000 )
158142c2
FB
236 : LIT64( 0x7FFFFFFFFFFFFFFF );
237 }
238 if ( absZ1 ) STATUS(float_exception_flags) |= float_flag_inexact;
239 return z;
240
241}
242
fb3ea83a
TM
243/*----------------------------------------------------------------------------
244| Takes the 128-bit fixed-point value formed by concatenating `absZ0' and
245| `absZ1', with binary point between bits 63 and 64 (between the input words),
246| and returns the properly rounded 64-bit unsigned integer corresponding to the
247| input. Ordinarily, the fixed-point input is simply rounded to an integer,
248| with the inexact exception raised if the input cannot be represented exactly
249| as an integer. However, if the fixed-point input is too large, the invalid
250| exception is raised and the largest unsigned integer is returned.
251*----------------------------------------------------------------------------*/
252
253static int64 roundAndPackUint64(flag zSign, uint64_t absZ0,
254 uint64_t absZ1 STATUS_PARAM)
255{
256 int8 roundingMode;
257 flag roundNearestEven, increment;
258
259 roundingMode = STATUS(float_rounding_mode);
260 roundNearestEven = (roundingMode == float_round_nearest_even);
dc355b76
PM
261 switch (roundingMode) {
262 case float_round_nearest_even:
f9288a76 263 case float_round_ties_away:
dc355b76
PM
264 increment = ((int64_t)absZ1 < 0);
265 break;
266 case float_round_to_zero:
267 increment = 0;
268 break;
269 case float_round_up:
270 increment = !zSign && absZ1;
271 break;
272 case float_round_down:
273 increment = zSign && absZ1;
274 break;
275 default:
276 abort();
fb3ea83a
TM
277 }
278 if (increment) {
279 ++absZ0;
280 if (absZ0 == 0) {
281 float_raise(float_flag_invalid STATUS_VAR);
282 return LIT64(0xFFFFFFFFFFFFFFFF);
283 }
284 absZ0 &= ~(((uint64_t)(absZ1<<1) == 0) & roundNearestEven);
285 }
286
287 if (zSign && absZ0) {
288 float_raise(float_flag_invalid STATUS_VAR);
289 return 0;
290 }
291
292 if (absZ1) {
293 STATUS(float_exception_flags) |= float_flag_inexact;
294 }
295 return absZ0;
296}
297
158142c2
FB
298/*----------------------------------------------------------------------------
299| Returns the fraction bits of the single-precision floating-point value `a'.
300*----------------------------------------------------------------------------*/
301
a49db98d 302static inline uint32_t extractFloat32Frac( float32 a )
158142c2
FB
303{
304
f090c9d4 305 return float32_val(a) & 0x007FFFFF;
158142c2
FB
306
307}
308
309/*----------------------------------------------------------------------------
310| Returns the exponent bits of the single-precision floating-point value `a'.
311*----------------------------------------------------------------------------*/
312
a49db98d 313static inline int_fast16_t extractFloat32Exp(float32 a)
158142c2
FB
314{
315
f090c9d4 316 return ( float32_val(a)>>23 ) & 0xFF;
158142c2
FB
317
318}
319
320/*----------------------------------------------------------------------------
321| Returns the sign bit of the single-precision floating-point value `a'.
322*----------------------------------------------------------------------------*/
323
a49db98d 324static inline flag extractFloat32Sign( float32 a )
158142c2
FB
325{
326
f090c9d4 327 return float32_val(a)>>31;
158142c2
FB
328
329}
330
37d18660
PM
331/*----------------------------------------------------------------------------
332| If `a' is denormal and we are in flush-to-zero mode then set the
333| input-denormal exception and return zero. Otherwise just return the value.
334*----------------------------------------------------------------------------*/
7baeabce 335float32 float32_squash_input_denormal(float32 a STATUS_PARAM)
37d18660
PM
336{
337 if (STATUS(flush_inputs_to_zero)) {
338 if (extractFloat32Exp(a) == 0 && extractFloat32Frac(a) != 0) {
339 float_raise(float_flag_input_denormal STATUS_VAR);
340 return make_float32(float32_val(a) & 0x80000000);
341 }
342 }
343 return a;
344}
345
158142c2
FB
346/*----------------------------------------------------------------------------
347| Normalizes the subnormal single-precision floating-point value represented
348| by the denormalized significand `aSig'. The normalized exponent and
349| significand are stored at the locations pointed to by `zExpPtr' and
350| `zSigPtr', respectively.
351*----------------------------------------------------------------------------*/
352
353static void
94a49d86 354 normalizeFloat32Subnormal(uint32_t aSig, int_fast16_t *zExpPtr, uint32_t *zSigPtr)
158142c2
FB
355{
356 int8 shiftCount;
357
358 shiftCount = countLeadingZeros32( aSig ) - 8;
359 *zSigPtr = aSig<<shiftCount;
360 *zExpPtr = 1 - shiftCount;
361
362}
363
364/*----------------------------------------------------------------------------
365| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
366| single-precision floating-point value, returning the result. After being
367| shifted into the proper positions, the three fields are simply added
368| together to form the result. This means that any integer portion of `zSig'
369| will be added into the exponent. Since a properly normalized significand
370| will have an integer portion equal to 1, the `zExp' input should be 1 less
371| than the desired result exponent whenever `zSig' is a complete, normalized
372| significand.
373*----------------------------------------------------------------------------*/
374
a49db98d 375static inline float32 packFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig)
158142c2
FB
376{
377
f090c9d4 378 return make_float32(
bb98fe42 379 ( ( (uint32_t) zSign )<<31 ) + ( ( (uint32_t) zExp )<<23 ) + zSig);
158142c2
FB
380
381}
382
383/*----------------------------------------------------------------------------
384| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
385| and significand `zSig', and returns the proper single-precision floating-
386| point value corresponding to the abstract input. Ordinarily, the abstract
387| value is simply rounded and packed into the single-precision format, with
388| the inexact exception raised if the abstract input cannot be represented
389| exactly. However, if the abstract value is too large, the overflow and
390| inexact exceptions are raised and an infinity or maximal finite value is
391| returned. If the abstract value is too small, the input value is rounded to
392| a subnormal number, and the underflow and inexact exceptions are raised if
393| the abstract input cannot be represented exactly as a subnormal single-
394| precision floating-point number.
395| The input significand `zSig' has its binary point between bits 30
396| and 29, which is 7 bits to the left of the usual location. This shifted
397| significand must be normalized or smaller. If `zSig' is not normalized,
398| `zExp' must be 0; in that case, the result returned is a subnormal number,
399| and it must not require rounding. In the usual case that `zSig' is
400| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
401| The handling of underflow and overflow follows the IEC/IEEE Standard for
402| Binary Floating-Point Arithmetic.
403*----------------------------------------------------------------------------*/
404
94a49d86 405static float32 roundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
158142c2
FB
406{
407 int8 roundingMode;
408 flag roundNearestEven;
409 int8 roundIncrement, roundBits;
410 flag isTiny;
411
412 roundingMode = STATUS(float_rounding_mode);
413 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
414 switch (roundingMode) {
415 case float_round_nearest_even:
f9288a76 416 case float_round_ties_away:
dc355b76
PM
417 roundIncrement = 0x40;
418 break;
419 case float_round_to_zero:
420 roundIncrement = 0;
421 break;
422 case float_round_up:
423 roundIncrement = zSign ? 0 : 0x7f;
424 break;
425 case float_round_down:
426 roundIncrement = zSign ? 0x7f : 0;
427 break;
428 default:
429 abort();
430 break;
158142c2
FB
431 }
432 roundBits = zSig & 0x7F;
bb98fe42 433 if ( 0xFD <= (uint16_t) zExp ) {
158142c2
FB
434 if ( ( 0xFD < zExp )
435 || ( ( zExp == 0xFD )
bb98fe42 436 && ( (int32_t) ( zSig + roundIncrement ) < 0 ) )
158142c2
FB
437 ) {
438 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
f090c9d4 439 return packFloat32( zSign, 0xFF, - ( roundIncrement == 0 ));
158142c2
FB
440 }
441 if ( zExp < 0 ) {
e6afc87f
PM
442 if (STATUS(flush_to_zero)) {
443 float_raise(float_flag_output_denormal STATUS_VAR);
444 return packFloat32(zSign, 0, 0);
445 }
158142c2
FB
446 isTiny =
447 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
448 || ( zExp < -1 )
449 || ( zSig + roundIncrement < 0x80000000 );
450 shift32RightJamming( zSig, - zExp, &zSig );
451 zExp = 0;
452 roundBits = zSig & 0x7F;
453 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
454 }
455 }
456 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
457 zSig = ( zSig + roundIncrement )>>7;
458 zSig &= ~ ( ( ( roundBits ^ 0x40 ) == 0 ) & roundNearestEven );
459 if ( zSig == 0 ) zExp = 0;
460 return packFloat32( zSign, zExp, zSig );
461
462}
463
464/*----------------------------------------------------------------------------
465| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
466| and significand `zSig', and returns the proper single-precision floating-
467| point value corresponding to the abstract input. This routine is just like
468| `roundAndPackFloat32' except that `zSig' does not have to be normalized.
469| Bit 31 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
470| floating-point exponent.
471*----------------------------------------------------------------------------*/
472
473static float32
94a49d86 474 normalizeRoundAndPackFloat32(flag zSign, int_fast16_t zExp, uint32_t zSig STATUS_PARAM)
158142c2
FB
475{
476 int8 shiftCount;
477
478 shiftCount = countLeadingZeros32( zSig ) - 1;
479 return roundAndPackFloat32( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
480
481}
482
483/*----------------------------------------------------------------------------
484| Returns the fraction bits of the double-precision floating-point value `a'.
485*----------------------------------------------------------------------------*/
486
a49db98d 487static inline uint64_t extractFloat64Frac( float64 a )
158142c2
FB
488{
489
f090c9d4 490 return float64_val(a) & LIT64( 0x000FFFFFFFFFFFFF );
158142c2
FB
491
492}
493
494/*----------------------------------------------------------------------------
495| Returns the exponent bits of the double-precision floating-point value `a'.
496*----------------------------------------------------------------------------*/
497
a49db98d 498static inline int_fast16_t extractFloat64Exp(float64 a)
158142c2
FB
499{
500
f090c9d4 501 return ( float64_val(a)>>52 ) & 0x7FF;
158142c2
FB
502
503}
504
505/*----------------------------------------------------------------------------
506| Returns the sign bit of the double-precision floating-point value `a'.
507*----------------------------------------------------------------------------*/
508
a49db98d 509static inline flag extractFloat64Sign( float64 a )
158142c2
FB
510{
511
f090c9d4 512 return float64_val(a)>>63;
158142c2
FB
513
514}
515
37d18660
PM
516/*----------------------------------------------------------------------------
517| If `a' is denormal and we are in flush-to-zero mode then set the
518| input-denormal exception and return zero. Otherwise just return the value.
519*----------------------------------------------------------------------------*/
7baeabce 520float64 float64_squash_input_denormal(float64 a STATUS_PARAM)
37d18660
PM
521{
522 if (STATUS(flush_inputs_to_zero)) {
523 if (extractFloat64Exp(a) == 0 && extractFloat64Frac(a) != 0) {
524 float_raise(float_flag_input_denormal STATUS_VAR);
525 return make_float64(float64_val(a) & (1ULL << 63));
526 }
527 }
528 return a;
529}
530
158142c2
FB
531/*----------------------------------------------------------------------------
532| Normalizes the subnormal double-precision floating-point value represented
533| by the denormalized significand `aSig'. The normalized exponent and
534| significand are stored at the locations pointed to by `zExpPtr' and
535| `zSigPtr', respectively.
536*----------------------------------------------------------------------------*/
537
538static void
94a49d86 539 normalizeFloat64Subnormal(uint64_t aSig, int_fast16_t *zExpPtr, uint64_t *zSigPtr)
158142c2
FB
540{
541 int8 shiftCount;
542
543 shiftCount = countLeadingZeros64( aSig ) - 11;
544 *zSigPtr = aSig<<shiftCount;
545 *zExpPtr = 1 - shiftCount;
546
547}
548
549/*----------------------------------------------------------------------------
550| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
551| double-precision floating-point value, returning the result. After being
552| shifted into the proper positions, the three fields are simply added
553| together to form the result. This means that any integer portion of `zSig'
554| will be added into the exponent. Since a properly normalized significand
555| will have an integer portion equal to 1, the `zExp' input should be 1 less
556| than the desired result exponent whenever `zSig' is a complete, normalized
557| significand.
558*----------------------------------------------------------------------------*/
559
a49db98d 560static inline float64 packFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig)
158142c2
FB
561{
562
f090c9d4 563 return make_float64(
bb98fe42 564 ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<52 ) + zSig);
158142c2
FB
565
566}
567
568/*----------------------------------------------------------------------------
569| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
570| and significand `zSig', and returns the proper double-precision floating-
571| point value corresponding to the abstract input. Ordinarily, the abstract
572| value is simply rounded and packed into the double-precision format, with
573| the inexact exception raised if the abstract input cannot be represented
574| exactly. However, if the abstract value is too large, the overflow and
575| inexact exceptions are raised and an infinity or maximal finite value is
a7d1ac78
PM
576| returned. If the abstract value is too small, the input value is rounded to
577| a subnormal number, and the underflow and inexact exceptions are raised if
578| the abstract input cannot be represented exactly as a subnormal double-
158142c2
FB
579| precision floating-point number.
580| The input significand `zSig' has its binary point between bits 62
581| and 61, which is 10 bits to the left of the usual location. This shifted
582| significand must be normalized or smaller. If `zSig' is not normalized,
583| `zExp' must be 0; in that case, the result returned is a subnormal number,
584| and it must not require rounding. In the usual case that `zSig' is
585| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
586| The handling of underflow and overflow follows the IEC/IEEE Standard for
587| Binary Floating-Point Arithmetic.
588*----------------------------------------------------------------------------*/
589
94a49d86 590static float64 roundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
158142c2
FB
591{
592 int8 roundingMode;
593 flag roundNearestEven;
94a49d86 594 int_fast16_t roundIncrement, roundBits;
158142c2
FB
595 flag isTiny;
596
597 roundingMode = STATUS(float_rounding_mode);
598 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
599 switch (roundingMode) {
600 case float_round_nearest_even:
f9288a76 601 case float_round_ties_away:
dc355b76
PM
602 roundIncrement = 0x200;
603 break;
604 case float_round_to_zero:
605 roundIncrement = 0;
606 break;
607 case float_round_up:
608 roundIncrement = zSign ? 0 : 0x3ff;
609 break;
610 case float_round_down:
611 roundIncrement = zSign ? 0x3ff : 0;
612 break;
613 default:
614 abort();
158142c2
FB
615 }
616 roundBits = zSig & 0x3FF;
bb98fe42 617 if ( 0x7FD <= (uint16_t) zExp ) {
158142c2
FB
618 if ( ( 0x7FD < zExp )
619 || ( ( zExp == 0x7FD )
bb98fe42 620 && ( (int64_t) ( zSig + roundIncrement ) < 0 ) )
158142c2
FB
621 ) {
622 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
f090c9d4 623 return packFloat64( zSign, 0x7FF, - ( roundIncrement == 0 ));
158142c2
FB
624 }
625 if ( zExp < 0 ) {
e6afc87f
PM
626 if (STATUS(flush_to_zero)) {
627 float_raise(float_flag_output_denormal STATUS_VAR);
628 return packFloat64(zSign, 0, 0);
629 }
158142c2
FB
630 isTiny =
631 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
632 || ( zExp < -1 )
633 || ( zSig + roundIncrement < LIT64( 0x8000000000000000 ) );
634 shift64RightJamming( zSig, - zExp, &zSig );
635 zExp = 0;
636 roundBits = zSig & 0x3FF;
637 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
638 }
639 }
640 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
641 zSig = ( zSig + roundIncrement )>>10;
642 zSig &= ~ ( ( ( roundBits ^ 0x200 ) == 0 ) & roundNearestEven );
643 if ( zSig == 0 ) zExp = 0;
644 return packFloat64( zSign, zExp, zSig );
645
646}
647
648/*----------------------------------------------------------------------------
649| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
650| and significand `zSig', and returns the proper double-precision floating-
651| point value corresponding to the abstract input. This routine is just like
652| `roundAndPackFloat64' except that `zSig' does not have to be normalized.
653| Bit 63 of `zSig' must be zero, and `zExp' must be 1 less than the ``true''
654| floating-point exponent.
655*----------------------------------------------------------------------------*/
656
657static float64
94a49d86 658 normalizeRoundAndPackFloat64(flag zSign, int_fast16_t zExp, uint64_t zSig STATUS_PARAM)
158142c2
FB
659{
660 int8 shiftCount;
661
662 shiftCount = countLeadingZeros64( zSig ) - 1;
663 return roundAndPackFloat64( zSign, zExp - shiftCount, zSig<<shiftCount STATUS_VAR);
664
665}
666
158142c2
FB
667/*----------------------------------------------------------------------------
668| Returns the fraction bits of the extended double-precision floating-point
669| value `a'.
670*----------------------------------------------------------------------------*/
671
a49db98d 672static inline uint64_t extractFloatx80Frac( floatx80 a )
158142c2
FB
673{
674
675 return a.low;
676
677}
678
679/*----------------------------------------------------------------------------
680| Returns the exponent bits of the extended double-precision floating-point
681| value `a'.
682*----------------------------------------------------------------------------*/
683
a49db98d 684static inline int32 extractFloatx80Exp( floatx80 a )
158142c2
FB
685{
686
687 return a.high & 0x7FFF;
688
689}
690
691/*----------------------------------------------------------------------------
692| Returns the sign bit of the extended double-precision floating-point value
693| `a'.
694*----------------------------------------------------------------------------*/
695
a49db98d 696static inline flag extractFloatx80Sign( floatx80 a )
158142c2
FB
697{
698
699 return a.high>>15;
700
701}
702
703/*----------------------------------------------------------------------------
704| Normalizes the subnormal extended double-precision floating-point value
705| represented by the denormalized significand `aSig'. The normalized exponent
706| and significand are stored at the locations pointed to by `zExpPtr' and
707| `zSigPtr', respectively.
708*----------------------------------------------------------------------------*/
709
710static void
bb98fe42 711 normalizeFloatx80Subnormal( uint64_t aSig, int32 *zExpPtr, uint64_t *zSigPtr )
158142c2
FB
712{
713 int8 shiftCount;
714
715 shiftCount = countLeadingZeros64( aSig );
716 *zSigPtr = aSig<<shiftCount;
717 *zExpPtr = 1 - shiftCount;
718
719}
720
721/*----------------------------------------------------------------------------
722| Packs the sign `zSign', exponent `zExp', and significand `zSig' into an
723| extended double-precision floating-point value, returning the result.
724*----------------------------------------------------------------------------*/
725
a49db98d 726static inline floatx80 packFloatx80( flag zSign, int32 zExp, uint64_t zSig )
158142c2
FB
727{
728 floatx80 z;
729
730 z.low = zSig;
bb98fe42 731 z.high = ( ( (uint16_t) zSign )<<15 ) + zExp;
158142c2
FB
732 return z;
733
734}
735
736/*----------------------------------------------------------------------------
737| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
738| and extended significand formed by the concatenation of `zSig0' and `zSig1',
739| and returns the proper extended double-precision floating-point value
740| corresponding to the abstract input. Ordinarily, the abstract value is
741| rounded and packed into the extended double-precision format, with the
742| inexact exception raised if the abstract input cannot be represented
743| exactly. However, if the abstract value is too large, the overflow and
744| inexact exceptions are raised and an infinity or maximal finite value is
745| returned. If the abstract value is too small, the input value is rounded to
746| a subnormal number, and the underflow and inexact exceptions are raised if
747| the abstract input cannot be represented exactly as a subnormal extended
748| double-precision floating-point number.
749| If `roundingPrecision' is 32 or 64, the result is rounded to the same
750| number of bits as single or double precision, respectively. Otherwise, the
751| result is rounded to the full precision of the extended double-precision
752| format.
753| The input significand must be normalized or smaller. If the input
754| significand is not normalized, `zExp' must be 0; in that case, the result
755| returned is a subnormal number, and it must not require rounding. The
756| handling of underflow and overflow follows the IEC/IEEE Standard for Binary
757| Floating-Point Arithmetic.
758*----------------------------------------------------------------------------*/
759
/* Rounds the abstract value (zSign, zExp, zSig0:zSig1) and packs it as a
 * floatx80.  A roundingPrecision of 64 or 32 rounds zSig0 with a bit mask;
 * 80 (or any other value) takes the `precision80' path, which rounds on
 * zSig1.  The rounding mode, tininess-detection mode and flush-to-zero
 * setting are read through STATUS(); exception flags are raised with
 * float_raise() or OR-ed directly into STATUS(float_exception_flags). */
760static floatx80
761 roundAndPackFloatx80(
bb98fe42 762 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
158142c2
FB
763 STATUS_PARAM)
764{
765 int8 roundingMode;
766 flag roundNearestEven, increment, isTiny;
767 int64 roundIncrement, roundMask, roundBits;
768
769 roundingMode = STATUS(float_rounding_mode);
770 roundNearestEven = ( roundingMode == float_round_nearest_even );
771 if ( roundingPrecision == 80 ) goto precision80;
/* Reduced precision: roundMask covers the low bits of zSig0 that will be
 * discarded (11 bits for 64, 40 bits for 32) and roundIncrement starts as
 * half of the last retained unit. */
772 if ( roundingPrecision == 64 ) {
773 roundIncrement = LIT64( 0x0000000000000400 );
774 roundMask = LIT64( 0x00000000000007FF );
775 }
776 else if ( roundingPrecision == 32 ) {
777 roundIncrement = LIT64( 0x0000008000000000 );
778 roundMask = LIT64( 0x000000FFFFFFFFFF );
779 }
780 else {
781 goto precision80;
782 }
/* Fold any non-zero zSig1 into the lowest bit of zSig0 as a sticky bit. */
783 zSig0 |= ( zSig1 != 0 );
dc355b76
PM
/* Nearest modes keep the half-unit increment; round-to-zero adds nothing;
 * the directed modes add the full mask when the sign matches the rounding
 * direction, and nothing otherwise. */
784 switch (roundingMode) {
785 case float_round_nearest_even:
f9288a76 786 case float_round_ties_away:
dc355b76
PM
787 break;
788 case float_round_to_zero:
789 roundIncrement = 0;
790 break;
791 case float_round_up:
792 roundIncrement = zSign ? 0 : roundMask;
793 break;
794 case float_round_down:
795 roundIncrement = zSign ? roundMask : 0;
796 break;
797 default:
798 abort();
158142c2
FB
799 }
800 roundBits = zSig0 & roundMask;
/* The unsigned compare is true when zExp <= 0 (zExp - 1 wraps to a large
 * value) or when zExp >= 0x7FFE, i.e. for possible underflow or overflow. */
bb98fe42 801 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
158142c2
FB
/* Overflow: exponent too large, or exponent at 0x7FFE and adding the
 * rounding increment carries out of zSig0 (unsigned wrap-around). */
802 if ( ( 0x7FFE < zExp )
803 || ( ( zExp == 0x7FFE ) && ( zSig0 + roundIncrement < zSig0 ) )
804 ) {
805 goto overflow;
806 }
807 if ( zExp <= 0 ) {
e6afc87f
PM
/* With flush-to-zero enabled, raise output_denormal and return a zero
 * carrying the sign. */
808 if (STATUS(flush_to_zero)) {
809 float_raise(float_flag_output_denormal STATUS_VAR);
810 return packFloatx80(zSign, 0, 0);
811 }
158142c2
FB
/* isTiny: tininess is detected before rounding, or the exponent is
 * negative, or adding the increment does not carry out of zSig0. */
812 isTiny =
813 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
814 || ( zExp < 0 )
815 || ( zSig0 <= zSig0 + roundIncrement );
/* Shift right by (1 - zExp) via shift64RightJamming (defined elsewhere;
 * presumably keeps shifted-out bits as a sticky bit), then recompute the
 * round bits for the shifted value. */
816 shift64RightJamming( zSig0, 1 - zExp, &zSig0 );
817 zExp = 0;
818 roundBits = zSig0 & roundMask;
819 if ( isTiny && roundBits ) float_raise( float_flag_underflow STATUS_VAR);
820 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
821 zSig0 += roundIncrement;
/* If rounding set bit 63 of zSig0, the packed exponent becomes 1. */
bb98fe42 822 if ( (int64_t) zSig0 < 0 ) zExp = 1;
158142c2
FB
/* On an exact tie in nearest-even mode, widen the mask by one bit so the
 * lowest retained bit is cleared as well. */
823 roundIncrement = roundMask + 1;
824 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
825 roundMask |= roundIncrement;
826 }
827 zSig0 &= ~ roundMask;
828 return packFloatx80( zSign, zExp, zSig0 );
829 }
830 }
/* Normal range for reduced precision: flag inexact, add the increment, and
 * on carry out of zSig0 bump the exponent and set the significand to
 * 0x8000000000000000. */
831 if ( roundBits ) STATUS(float_exception_flags) |= float_flag_inexact;
832 zSig0 += roundIncrement;
833 if ( zSig0 < roundIncrement ) {
834 ++zExp;
835 zSig0 = LIT64( 0x8000000000000000 );
836 }
837 roundIncrement = roundMask + 1;
838 if ( roundNearestEven && ( roundBits<<1 == roundIncrement ) ) {
839 roundMask |= roundIncrement;
840 }
841 zSig0 &= ~ roundMask;
842 if ( zSig0 == 0 ) zExp = 0;
843 return packFloatx80( zSign, zExp, zSig0 );
/* Full 80-bit precision: zSig0 is kept whole and zSig1 holds the bits
 * below it.  `increment' says whether zSig0 must be bumped by one: for the
 * nearest modes when the top bit of zSig1 is set, for the directed modes
 * when zSig1 is non-zero and the sign matches the direction. */
844 precision80:
dc355b76
PM
845 switch (roundingMode) {
846 case float_round_nearest_even:
f9288a76 847 case float_round_ties_away:
dc355b76
PM
848 increment = ((int64_t)zSig1 < 0);
849 break;
850 case float_round_to_zero:
851 increment = 0;
852 break;
853 case float_round_up:
854 increment = !zSign && zSig1;
855 break;
856 case float_round_down:
857 increment = zSign && zSig1;
858 break;
859 default:
860 abort();
158142c2 861 }
/* Same range test as in the reduced-precision path above. */
bb98fe42 862 if ( 0x7FFD <= (uint32_t) ( zExp - 1 ) ) {
158142c2
FB
863 if ( ( 0x7FFE < zExp )
864 || ( ( zExp == 0x7FFE )
865 && ( zSig0 == LIT64( 0xFFFFFFFFFFFFFFFF ) )
866 && increment
867 )
868 ) {
/* roundMask = 0 makes ~roundMask all ones for the 80-bit case.  The
 * reduced-precision path jumps straight to `overflow' with its own mask
 * still set, so ~roundMask is then the largest significand at that
 * precision. */
869 roundMask = 0;
870 overflow:
871 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
/* Modes that round away from the overflow direction return the largest
 * finite value (exponent 0x7FFE); all others return infinity (exponent
 * 0x7FFF, significand 0x8000000000000000). */
872 if ( ( roundingMode == float_round_to_zero )
873 || ( zSign && ( roundingMode == float_round_up ) )
874 || ( ! zSign && ( roundingMode == float_round_down ) )
875 ) {
876 return packFloatx80( zSign, 0x7FFE, ~ roundMask );
877 }
878 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
879 }
880 if ( zExp <= 0 ) {
/* NOTE(review): unlike the reduced-precision path, this branch does not
 * test STATUS(flush_to_zero) -- confirm that this is intended. */
881 isTiny =
882 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
883 || ( zExp < 0 )
884 || ! increment
885 || ( zSig0 < LIT64( 0xFFFFFFFFFFFFFFFF ) );
886 shift64ExtraRightJamming( zSig0, zSig1, 1 - zExp, &zSig0, &zSig1 );
887 zExp = 0;
888 if ( isTiny && zSig1 ) float_raise( float_flag_underflow STATUS_VAR);
889 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
dc355b76
PM
/* zSig1 changed in the shift above, so the increment decision is redone. */
890 switch (roundingMode) {
891 case float_round_nearest_even:
f9288a76 892 case float_round_ties_away:
dc355b76
PM
893 increment = ((int64_t)zSig1 < 0);
894 break;
895 case float_round_to_zero:
896 increment = 0;
897 break;
898 case float_round_up:
899 increment = !zSign && zSig1;
900 break;
901 case float_round_down:
902 increment = zSign && zSig1;
903 break;
904 default:
905 abort();
158142c2
FB
906 }
907 if ( increment ) {
908 ++zSig0;
/* In nearest-even mode, clear the lowest bit of zSig0 when zSig1 has no
 * bits set below its top bit (zSig1<<1 == 0), i.e. on an exact tie. */
909 zSig0 &=
bb98fe42
AF
910 ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
911 if ( (int64_t) zSig0 < 0 ) zExp = 1;
158142c2
FB
912 }
913 return packFloatx80( zSign, zExp, zSig0 );
914 }
915 }
/* Normal range for full precision. */
916 if ( zSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
917 if ( increment ) {
918 ++zSig0;
/* zSig0 wrapped to zero: carry into the exponent and set the significand
 * to 0x8000000000000000. */
919 if ( zSig0 == 0 ) {
920 ++zExp;
921 zSig0 = LIT64( 0x8000000000000000 );
922 }
923 else {
bb98fe42 924 zSig0 &= ~ ( ( (uint64_t) ( zSig1<<1 ) == 0 ) & roundNearestEven );
158142c2
FB
925 }
926 }
927 else {
928 if ( zSig0 == 0 ) zExp = 0;
929 }
930 return packFloatx80( zSign, zExp, zSig0 );
931
932}
933
934/*----------------------------------------------------------------------------
935| Takes an abstract floating-point value having sign `zSign', exponent
936| `zExp', and significand formed by the concatenation of `zSig0' and `zSig1',
937| and returns the proper extended double-precision floating-point value
938| corresponding to the abstract input. This routine is just like
939| `roundAndPackFloatx80' except that the input significand does not have to be
940| normalized.
941*----------------------------------------------------------------------------*/
942
943static floatx80
944 normalizeRoundAndPackFloatx80(
bb98fe42 945 int8 roundingPrecision, flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1
158142c2
FB
946 STATUS_PARAM)
947{
948 int8 shiftCount;
949
950 if ( zSig0 == 0 ) {
951 zSig0 = zSig1;
952 zSig1 = 0;
953 zExp -= 64;
954 }
955 shiftCount = countLeadingZeros64( zSig0 );
956 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
957 zExp -= shiftCount;
958 return
959 roundAndPackFloatx80( roundingPrecision, zSign, zExp, zSig0, zSig1 STATUS_VAR);
960
961}
962
158142c2
FB
963/*----------------------------------------------------------------------------
964| Returns the least-significant 64 fraction bits of the quadruple-precision
965| floating-point value `a'.
966*----------------------------------------------------------------------------*/
967
a49db98d 968static inline uint64_t extractFloat128Frac1( float128 a )
158142c2
FB
969{
970
971 return a.low;
972
973}
974
975/*----------------------------------------------------------------------------
976| Returns the most-significant 48 fraction bits of the quadruple-precision
977| floating-point value `a'.
978*----------------------------------------------------------------------------*/
979
a49db98d 980static inline uint64_t extractFloat128Frac0( float128 a )
158142c2
FB
981{
982
983 return a.high & LIT64( 0x0000FFFFFFFFFFFF );
984
985}
986
987/*----------------------------------------------------------------------------
988| Returns the exponent bits of the quadruple-precision floating-point value
989| `a'.
990*----------------------------------------------------------------------------*/
991
a49db98d 992static inline int32 extractFloat128Exp( float128 a )
158142c2
FB
993{
994
995 return ( a.high>>48 ) & 0x7FFF;
996
997}
998
999/*----------------------------------------------------------------------------
1000| Returns the sign bit of the quadruple-precision floating-point value `a'.
1001*----------------------------------------------------------------------------*/
1002
a49db98d 1003static inline flag extractFloat128Sign( float128 a )
158142c2
FB
1004{
1005
1006 return a.high>>63;
1007
1008}
1009
1010/*----------------------------------------------------------------------------
1011| Normalizes the subnormal quadruple-precision floating-point value
1012| represented by the denormalized significand formed by the concatenation of
1013| `aSig0' and `aSig1'. The normalized exponent is stored at the location
1014| pointed to by `zExpPtr'. The most significant 49 bits of the normalized
1015| significand are stored at the location pointed to by `zSig0Ptr', and the
1016| least significant 64 bits of the normalized significand are stored at the
1017| location pointed to by `zSig1Ptr'.
1018*----------------------------------------------------------------------------*/
1019
1020static void
1021 normalizeFloat128Subnormal(
bb98fe42
AF
1022 uint64_t aSig0,
1023 uint64_t aSig1,
158142c2 1024 int32 *zExpPtr,
bb98fe42
AF
1025 uint64_t *zSig0Ptr,
1026 uint64_t *zSig1Ptr
158142c2
FB
1027 )
1028{
1029 int8 shiftCount;
1030
1031 if ( aSig0 == 0 ) {
1032 shiftCount = countLeadingZeros64( aSig1 ) - 15;
1033 if ( shiftCount < 0 ) {
1034 *zSig0Ptr = aSig1>>( - shiftCount );
1035 *zSig1Ptr = aSig1<<( shiftCount & 63 );
1036 }
1037 else {
1038 *zSig0Ptr = aSig1<<shiftCount;
1039 *zSig1Ptr = 0;
1040 }
1041 *zExpPtr = - shiftCount - 63;
1042 }
1043 else {
1044 shiftCount = countLeadingZeros64( aSig0 ) - 15;
1045 shortShift128Left( aSig0, aSig1, shiftCount, zSig0Ptr, zSig1Ptr );
1046 *zExpPtr = 1 - shiftCount;
1047 }
1048
1049}
1050
1051/*----------------------------------------------------------------------------
1052| Packs the sign `zSign', the exponent `zExp', and the significand formed
1053| by the concatenation of `zSig0' and `zSig1' into a quadruple-precision
1054| floating-point value, returning the result. After being shifted into the
1055| proper positions, the three fields `zSign', `zExp', and `zSig0' are simply
1056| added together to form the most significant 32 bits of the result. This
1057| means that any integer portion of `zSig0' will be added into the exponent.
1058| Since a properly normalized significand will have an integer portion equal
1059| to 1, the `zExp' input should be 1 less than the desired result exponent
1060| whenever `zSig0' and `zSig1' concatenated form a complete, normalized
1061| significand.
1062*----------------------------------------------------------------------------*/
1063
a49db98d 1064static inline float128
bb98fe42 1065 packFloat128( flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 )
158142c2
FB
1066{
1067 float128 z;
1068
1069 z.low = zSig1;
bb98fe42 1070 z.high = ( ( (uint64_t) zSign )<<63 ) + ( ( (uint64_t) zExp )<<48 ) + zSig0;
158142c2
FB
1071 return z;
1072
1073}
1074
1075/*----------------------------------------------------------------------------
1076| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1077| and extended significand formed by the concatenation of `zSig0', `zSig1',
1078| and `zSig2', and returns the proper quadruple-precision floating-point value
1079| corresponding to the abstract input. Ordinarily, the abstract value is
1080| simply rounded and packed into the quadruple-precision format, with the
1081| inexact exception raised if the abstract input cannot be represented
1082| exactly. However, if the abstract value is too large, the overflow and
1083| inexact exceptions are raised and an infinity or maximal finite value is
1084| returned. If the abstract value is too small, the input value is rounded to
1085| a subnormal number, and the underflow and inexact exceptions are raised if
1086| the abstract input cannot be represented exactly as a subnormal quadruple-
1087| precision floating-point number.
1088| The input significand must be normalized or smaller. If the input
1089| significand is not normalized, `zExp' must be 0; in that case, the result
1090| returned is a subnormal number, and it must not require rounding. In the
1091| usual case that the input significand is normalized, `zExp' must be 1 less
1092| than the ``true'' floating-point exponent. The handling of underflow and
1093| overflow follows the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1094*----------------------------------------------------------------------------*/
1095
/* Rounds the abstract value (zSign, zExp, zSig0:zSig1:zSig2) and packs it
 * as a float128.  zSig2 holds the bits below the retained significand and
 * drives the rounding decision.  The rounding mode, tininess-detection mode
 * and flush-to-zero setting are read through STATUS(). */
1096static float128
1097 roundAndPackFloat128(
bb98fe42 1098 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1, uint64_t zSig2 STATUS_PARAM)
158142c2
FB
1099{
1100 int8 roundingMode;
1101 flag roundNearestEven, increment, isTiny;
1102
1103 roundingMode = STATUS(float_rounding_mode);
1104 roundNearestEven = ( roundingMode == float_round_nearest_even );
dc355b76
PM
/* `increment' says whether zSig0:zSig1 must be bumped by one: for the
 * nearest modes when the top bit of zSig2 is set, for the directed modes
 * when zSig2 is non-zero and the sign matches the direction. */
1105 switch (roundingMode) {
1106 case float_round_nearest_even:
f9288a76 1107 case float_round_ties_away:
dc355b76
PM
1108 increment = ((int64_t)zSig2 < 0);
1109 break;
1110 case float_round_to_zero:
1111 increment = 0;
1112 break;
1113 case float_round_up:
1114 increment = !zSign && zSig2;
1115 break;
1116 case float_round_down:
1117 increment = zSign && zSig2;
1118 break;
1119 default:
1120 abort();
158142c2 1121 }
/* The unsigned compare is true when zExp is negative (it converts to a
 * large unsigned value) or when zExp >= 0x7FFD. */
bb98fe42 1122 if ( 0x7FFD <= (uint32_t) zExp ) {
158142c2
FB
/* Overflow: exponent too large, or exponent at 0x7FFD with the
 * significand at 0x0001FFFFFFFFFFFF:0xFFFFFFFFFFFFFFFF and an increment
 * pending. */
1123 if ( ( 0x7FFD < zExp )
1124 || ( ( zExp == 0x7FFD )
1125 && eq128(
1126 LIT64( 0x0001FFFFFFFFFFFF ),
1127 LIT64( 0xFFFFFFFFFFFFFFFF ),
1128 zSig0,
1129 zSig1
1130 )
1131 && increment
1132 )
1133 ) {
1134 float_raise( float_flag_overflow | float_flag_inexact STATUS_VAR);
/* Modes that round away from the overflow direction return the largest
 * finite value; all others return infinity (exponent 0x7FFF, zero
 * fraction). */
1135 if ( ( roundingMode == float_round_to_zero )
1136 || ( zSign && ( roundingMode == float_round_up ) )
1137 || ( ! zSign && ( roundingMode == float_round_down ) )
1138 ) {
1139 return
1140 packFloat128(
1141 zSign,
1142 0x7FFE,
1143 LIT64( 0x0000FFFFFFFFFFFF ),
1144 LIT64( 0xFFFFFFFFFFFFFFFF )
1145 );
1146 }
1147 return packFloat128( zSign, 0x7FFF, 0, 0 );
1148 }
1149 if ( zExp < 0 ) {
e6afc87f
PM
/* With flush-to-zero enabled, raise output_denormal and return a zero
 * carrying the sign. */
1150 if (STATUS(flush_to_zero)) {
1151 float_raise(float_flag_output_denormal STATUS_VAR);
1152 return packFloat128(zSign, 0, 0, 0);
1153 }
158142c2
FB
/* isTiny: tininess is detected before rounding, or zExp < -1, or no
 * increment is pending, or the significand is below
 * 0x0001FFFFFFFFFFFF:0xFFFFFFFFFFFFFFFF. */
1154 isTiny =
1155 ( STATUS(float_detect_tininess) == float_tininess_before_rounding )
1156 || ( zExp < -1 )
1157 || ! increment
1158 || lt128(
1159 zSig0,
1160 zSig1,
1161 LIT64( 0x0001FFFFFFFFFFFF ),
1162 LIT64( 0xFFFFFFFFFFFFFFFF )
1163 );
/* Shift right by -zExp via shift128ExtraRightJamming (defined elsewhere;
 * presumably keeps shifted-out bits in zSig2 as sticky bits). */
1164 shift128ExtraRightJamming(
1165 zSig0, zSig1, zSig2, - zExp, &zSig0, &zSig1, &zSig2 );
1166 zExp = 0;
1167 if ( isTiny && zSig2 ) float_raise( float_flag_underflow STATUS_VAR);
dc355b76
PM
/* zSig2 changed in the shift above, so the increment decision is redone. */
1168 switch (roundingMode) {
1169 case float_round_nearest_even:
f9288a76 1170 case float_round_ties_away:
dc355b76
PM
1171 increment = ((int64_t)zSig2 < 0);
1172 break;
1173 case float_round_to_zero:
1174 increment = 0;
1175 break;
1176 case float_round_up:
1177 increment = !zSign && zSig2;
1178 break;
1179 case float_round_down:
1180 increment = zSign && zSig2;
1181 break;
1182 default:
1183 abort();
158142c2
FB
1184 }
1185 }
1186 }
/* Common tail, also reached by the subnormal case: flag inexact, then
 * apply the increment. */
1187 if ( zSig2 ) STATUS(float_exception_flags) |= float_flag_inexact;
1188 if ( increment ) {
1189 add128( zSig0, zSig1, 0, 1, &zSig0, &zSig1 );
/* In nearest-even mode, clear the lowest bit of zSig1 when zSig2 + zSig2
 * wraps to zero.  With an increment pending in that mode the top bit of
 * zSig2 is set, so this is the exact-tie case. */
1190 zSig1 &= ~ ( ( zSig2 + zSig2 == 0 ) & roundNearestEven );
1191 }
1192 else {
1193 if ( ( zSig0 | zSig1 ) == 0 ) zExp = 0;
1194 }
1195 return packFloat128( zSign, zExp, zSig0, zSig1 );
1196
1197}
1198
1199/*----------------------------------------------------------------------------
1200| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
1201| and significand formed by the concatenation of `zSig0' and `zSig1', and
1202| returns the proper quadruple-precision floating-point value corresponding
1203| to the abstract input. This routine is just like `roundAndPackFloat128'
1204| except that the input significand has fewer bits and does not have to be
1205| normalized. In all cases, `zExp' must be 1 less than the ``true'' floating-
1206| point exponent.
1207*----------------------------------------------------------------------------*/
1208
1209static float128
1210 normalizeRoundAndPackFloat128(
bb98fe42 1211 flag zSign, int32 zExp, uint64_t zSig0, uint64_t zSig1 STATUS_PARAM)
158142c2
FB
1212{
1213 int8 shiftCount;
bb98fe42 1214 uint64_t zSig2;
158142c2
FB
1215
1216 if ( zSig0 == 0 ) {
1217 zSig0 = zSig1;
1218 zSig1 = 0;
1219 zExp -= 64;
1220 }
1221 shiftCount = countLeadingZeros64( zSig0 ) - 15;
1222 if ( 0 <= shiftCount ) {
1223 zSig2 = 0;
1224 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1225 }
1226 else {
1227 shift128ExtraRightJamming(
1228 zSig0, zSig1, 0, - shiftCount, &zSig0, &zSig1, &zSig2 );
1229 }
1230 zExp -= shiftCount;
1231 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR);
1232
1233}
1234
158142c2
FB
1235/*----------------------------------------------------------------------------
1236| Returns the result of converting the 32-bit two's complement integer `a'
1237| to the single-precision floating-point format. The conversion is performed
1238| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1239*----------------------------------------------------------------------------*/
1240
c4850f9e 1241float32 int32_to_float32(int32_t a STATUS_PARAM)
158142c2
FB
1242{
1243 flag zSign;
1244
f090c9d4 1245 if ( a == 0 ) return float32_zero;
bb98fe42 1246 if ( a == (int32_t) 0x80000000 ) return packFloat32( 1, 0x9E, 0 );
158142c2
FB
1247 zSign = ( a < 0 );
1248 return normalizeRoundAndPackFloat32( zSign, 0x9C, zSign ? - a : a STATUS_VAR );
1249
1250}
1251
1252/*----------------------------------------------------------------------------
1253| Returns the result of converting the 32-bit two's complement integer `a'
1254| to the double-precision floating-point format. The conversion is performed
1255| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1256*----------------------------------------------------------------------------*/
1257
c4850f9e 1258float64 int32_to_float64(int32_t a STATUS_PARAM)
158142c2
FB
1259{
1260 flag zSign;
1261 uint32 absA;
1262 int8 shiftCount;
bb98fe42 1263 uint64_t zSig;
158142c2 1264
f090c9d4 1265 if ( a == 0 ) return float64_zero;
158142c2
FB
1266 zSign = ( a < 0 );
1267 absA = zSign ? - a : a;
1268 shiftCount = countLeadingZeros32( absA ) + 21;
1269 zSig = absA;
1270 return packFloat64( zSign, 0x432 - shiftCount, zSig<<shiftCount );
1271
1272}
1273
158142c2
FB
1274/*----------------------------------------------------------------------------
1275| Returns the result of converting the 32-bit two's complement integer `a'
1276| to the extended double-precision floating-point format. The conversion
1277| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1278| Arithmetic.
1279*----------------------------------------------------------------------------*/
1280
c4850f9e 1281floatx80 int32_to_floatx80(int32_t a STATUS_PARAM)
158142c2
FB
1282{
1283 flag zSign;
1284 uint32 absA;
1285 int8 shiftCount;
bb98fe42 1286 uint64_t zSig;
158142c2
FB
1287
1288 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1289 zSign = ( a < 0 );
1290 absA = zSign ? - a : a;
1291 shiftCount = countLeadingZeros32( absA ) + 32;
1292 zSig = absA;
1293 return packFloatx80( zSign, 0x403E - shiftCount, zSig<<shiftCount );
1294
1295}
1296
158142c2
FB
1297/*----------------------------------------------------------------------------
1298| Returns the result of converting the 32-bit two's complement integer `a' to
1299| the quadruple-precision floating-point format. The conversion is performed
1300| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1301*----------------------------------------------------------------------------*/
1302
c4850f9e 1303float128 int32_to_float128(int32_t a STATUS_PARAM)
158142c2
FB
1304{
1305 flag zSign;
1306 uint32 absA;
1307 int8 shiftCount;
bb98fe42 1308 uint64_t zSig0;
158142c2
FB
1309
1310 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1311 zSign = ( a < 0 );
1312 absA = zSign ? - a : a;
1313 shiftCount = countLeadingZeros32( absA ) + 17;
1314 zSig0 = absA;
1315 return packFloat128( zSign, 0x402E - shiftCount, zSig0<<shiftCount, 0 );
1316
1317}
1318
158142c2
FB
1319/*----------------------------------------------------------------------------
1320| Returns the result of converting the 64-bit two's complement integer `a'
1321| to the single-precision floating-point format. The conversion is performed
1322| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1323*----------------------------------------------------------------------------*/
1324
c4850f9e 1325float32 int64_to_float32(int64_t a STATUS_PARAM)
158142c2
FB
1326{
1327 flag zSign;
1328 uint64 absA;
1329 int8 shiftCount;
1330
f090c9d4 1331 if ( a == 0 ) return float32_zero;
158142c2
FB
1332 zSign = ( a < 0 );
1333 absA = zSign ? - a : a;
1334 shiftCount = countLeadingZeros64( absA ) - 40;
1335 if ( 0 <= shiftCount ) {
1336 return packFloat32( zSign, 0x95 - shiftCount, absA<<shiftCount );
1337 }
1338 else {
1339 shiftCount += 7;
1340 if ( shiftCount < 0 ) {
1341 shift64RightJamming( absA, - shiftCount, &absA );
1342 }
1343 else {
1344 absA <<= shiftCount;
1345 }
1346 return roundAndPackFloat32( zSign, 0x9C - shiftCount, absA STATUS_VAR );
1347 }
1348
1349}
1350
1351/*----------------------------------------------------------------------------
1352| Returns the result of converting the 64-bit two's complement integer `a'
1353| to the double-precision floating-point format. The conversion is performed
1354| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1355*----------------------------------------------------------------------------*/
1356
c4850f9e 1357float64 int64_to_float64(int64_t a STATUS_PARAM)
158142c2
FB
1358{
1359 flag zSign;
1360
f090c9d4 1361 if ( a == 0 ) return float64_zero;
bb98fe42 1362 if ( a == (int64_t) LIT64( 0x8000000000000000 ) ) {
158142c2
FB
1363 return packFloat64( 1, 0x43E, 0 );
1364 }
1365 zSign = ( a < 0 );
1366 return normalizeRoundAndPackFloat64( zSign, 0x43C, zSign ? - a : a STATUS_VAR );
1367
1368}
1369
158142c2
FB
1370/*----------------------------------------------------------------------------
1371| Returns the result of converting the 64-bit two's complement integer `a'
1372| to the extended double-precision floating-point format. The conversion
1373| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1374| Arithmetic.
1375*----------------------------------------------------------------------------*/
1376
c4850f9e 1377floatx80 int64_to_floatx80(int64_t a STATUS_PARAM)
158142c2
FB
1378{
1379 flag zSign;
1380 uint64 absA;
1381 int8 shiftCount;
1382
1383 if ( a == 0 ) return packFloatx80( 0, 0, 0 );
1384 zSign = ( a < 0 );
1385 absA = zSign ? - a : a;
1386 shiftCount = countLeadingZeros64( absA );
1387 return packFloatx80( zSign, 0x403E - shiftCount, absA<<shiftCount );
1388
1389}
1390
158142c2
FB
1391/*----------------------------------------------------------------------------
1392| Returns the result of converting the 64-bit two's complement integer `a' to
1393| the quadruple-precision floating-point format. The conversion is performed
1394| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1395*----------------------------------------------------------------------------*/
1396
c4850f9e 1397float128 int64_to_float128(int64_t a STATUS_PARAM)
158142c2
FB
1398{
1399 flag zSign;
1400 uint64 absA;
1401 int8 shiftCount;
1402 int32 zExp;
bb98fe42 1403 uint64_t zSig0, zSig1;
158142c2
FB
1404
1405 if ( a == 0 ) return packFloat128( 0, 0, 0, 0 );
1406 zSign = ( a < 0 );
1407 absA = zSign ? - a : a;
1408 shiftCount = countLeadingZeros64( absA ) + 49;
1409 zExp = 0x406E - shiftCount;
1410 if ( 64 <= shiftCount ) {
1411 zSig1 = 0;
1412 zSig0 = absA;
1413 shiftCount -= 64;
1414 }
1415 else {
1416 zSig1 = absA;
1417 zSig0 = 0;
1418 }
1419 shortShift128Left( zSig0, zSig1, shiftCount, &zSig0, &zSig1 );
1420 return packFloat128( zSign, zExp, zSig0, zSig1 );
1421
1422}
1423
6bb8e0f1
PM
1424/*----------------------------------------------------------------------------
1425| Returns the result of converting the 64-bit unsigned integer `a'
1426| to the single-precision floating-point format. The conversion is performed
1427| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1428*----------------------------------------------------------------------------*/
1429
1430float32 uint64_to_float32(uint64_t a STATUS_PARAM)
1431{
1432 int shiftcount;
1433
1434 if (a == 0) {
1435 return float32_zero;
1436 }
1437
1438 /* Determine (left) shift needed to put first set bit into bit posn 23
1439 * (since packFloat32() expects the binary point between bits 23 and 22);
1440 * this is the fast case for smallish numbers.
1441 */
1442 shiftcount = countLeadingZeros64(a) - 40;
1443 if (shiftcount >= 0) {
1444 return packFloat32(0, 0x95 - shiftcount, a << shiftcount);
1445 }
1446 /* Otherwise we need to do a round-and-pack. roundAndPackFloat32()
1447 * expects the binary point between bits 30 and 29, hence the + 7.
1448 */
1449 shiftcount += 7;
1450 if (shiftcount < 0) {
1451 shift64RightJamming(a, -shiftcount, &a);
1452 } else {
1453 a <<= shiftcount;
1454 }
1455
1456 return roundAndPackFloat32(0, 0x9c - shiftcount, a STATUS_VAR);
1457}
1458
1459/*----------------------------------------------------------------------------
1460| Returns the result of converting the 64-bit unsigned integer `a'
1461| to the double-precision floating-point format. The conversion is performed
1462| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1463*----------------------------------------------------------------------------*/
1464
1465float64 uint64_to_float64(uint64_t a STATUS_PARAM)
1466{
1467 int exp = 0x43C;
1468 int shiftcount;
1469
1470 if (a == 0) {
1471 return float64_zero;
1472 }
1473
1474 shiftcount = countLeadingZeros64(a) - 1;
1475 if (shiftcount < 0) {
1476 shift64RightJamming(a, -shiftcount, &a);
1477 } else {
1478 a <<= shiftcount;
1479 }
1480 return roundAndPackFloat64(0, exp - shiftcount, a STATUS_VAR);
1481}
1482
1483/*----------------------------------------------------------------------------
1484| Returns the result of converting the 64-bit unsigned integer `a'
1485| to the quadruple-precision floating-point format. The conversion is performed
1486| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
1487*----------------------------------------------------------------------------*/
1488
c4850f9e 1489float128 uint64_to_float128(uint64_t a STATUS_PARAM)
1e397ead
RH
1490{
1491 if (a == 0) {
1492 return float128_zero;
1493 }
1494 return normalizeRoundAndPackFloat128(0, 0x406E, a, 0 STATUS_VAR);
1495}
1496
158142c2
FB
1497/*----------------------------------------------------------------------------
1498| Returns the result of converting the single-precision floating-point value
1499| `a' to the 32-bit two's complement integer format. The conversion is
1500| performed according to the IEC/IEEE Standard for Binary Floating-Point
1501| Arithmetic---which means in particular that the conversion is rounded
1502| according to the current rounding mode. If `a' is a NaN, the largest
1503| positive integer is returned. Otherwise, if the conversion overflows, the
1504| largest integer with the same sign as `a' is returned.
1505*----------------------------------------------------------------------------*/
1506
1507int32 float32_to_int32( float32 a STATUS_PARAM )
1508{
1509 flag aSign;
94a49d86 1510 int_fast16_t aExp, shiftCount;
bb98fe42
AF
1511 uint32_t aSig;
1512 uint64_t aSig64;
158142c2 1513
37d18660 1514 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1515 aSig = extractFloat32Frac( a );
1516 aExp = extractFloat32Exp( a );
1517 aSign = extractFloat32Sign( a );
1518 if ( ( aExp == 0xFF ) && aSig ) aSign = 0;
1519 if ( aExp ) aSig |= 0x00800000;
1520 shiftCount = 0xAF - aExp;
1521 aSig64 = aSig;
1522 aSig64 <<= 32;
1523 if ( 0 < shiftCount ) shift64RightJamming( aSig64, shiftCount, &aSig64 );
1524 return roundAndPackInt32( aSign, aSig64 STATUS_VAR );
1525
1526}
1527
1528/*----------------------------------------------------------------------------
1529| Returns the result of converting the single-precision floating-point value
1530| `a' to the 32-bit two's complement integer format. The conversion is
1531| performed according to the IEC/IEEE Standard for Binary Floating-Point
1532| Arithmetic, except that the conversion is always rounded toward zero.
1533| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1534| the conversion overflows, the largest integer with the same sign as `a' is
1535| returned.
1536*----------------------------------------------------------------------------*/
1537
1538int32 float32_to_int32_round_to_zero( float32 a STATUS_PARAM )
1539{
1540 flag aSign;
94a49d86 1541 int_fast16_t aExp, shiftCount;
bb98fe42 1542 uint32_t aSig;
b3a6a2e0 1543 int32_t z;
37d18660 1544 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1545
1546 aSig = extractFloat32Frac( a );
1547 aExp = extractFloat32Exp( a );
1548 aSign = extractFloat32Sign( a );
1549 shiftCount = aExp - 0x9E;
1550 if ( 0 <= shiftCount ) {
f090c9d4 1551 if ( float32_val(a) != 0xCF000000 ) {
158142c2
FB
1552 float_raise( float_flag_invalid STATUS_VAR);
1553 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) return 0x7FFFFFFF;
1554 }
bb98fe42 1555 return (int32_t) 0x80000000;
158142c2
FB
1556 }
1557 else if ( aExp <= 0x7E ) {
1558 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1559 return 0;
1560 }
1561 aSig = ( aSig | 0x00800000 )<<8;
1562 z = aSig>>( - shiftCount );
bb98fe42 1563 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
158142c2
FB
1564 STATUS(float_exception_flags) |= float_flag_inexact;
1565 }
1566 if ( aSign ) z = - z;
1567 return z;
1568
1569}
1570
cbcef455
PM
1571/*----------------------------------------------------------------------------
1572| Returns the result of converting the single-precision floating-point value
1573| `a' to the 16-bit two's complement integer format. The conversion is
1574| performed according to the IEC/IEEE Standard for Binary Floating-Point
1575| Arithmetic, except that the conversion is always rounded toward zero.
1576| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
1577| the conversion overflows, the largest integer with the same sign as `a' is
1578| returned.
1579*----------------------------------------------------------------------------*/
1580
94a49d86 1581int_fast16_t float32_to_int16_round_to_zero(float32 a STATUS_PARAM)
cbcef455
PM
1582{
1583 flag aSign;
94a49d86 1584 int_fast16_t aExp, shiftCount;
bb98fe42 1585 uint32_t aSig;
cbcef455
PM
1586 int32 z;
1587
1588 aSig = extractFloat32Frac( a );
1589 aExp = extractFloat32Exp( a );
1590 aSign = extractFloat32Sign( a );
1591 shiftCount = aExp - 0x8E;
1592 if ( 0 <= shiftCount ) {
1593 if ( float32_val(a) != 0xC7000000 ) {
1594 float_raise( float_flag_invalid STATUS_VAR);
1595 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1596 return 0x7FFF;
1597 }
1598 }
bb98fe42 1599 return (int32_t) 0xffff8000;
cbcef455
PM
1600 }
1601 else if ( aExp <= 0x7E ) {
1602 if ( aExp | aSig ) {
1603 STATUS(float_exception_flags) |= float_flag_inexact;
1604 }
1605 return 0;
1606 }
1607 shiftCount -= 0x10;
1608 aSig = ( aSig | 0x00800000 )<<8;
1609 z = aSig>>( - shiftCount );
bb98fe42 1610 if ( (uint32_t) ( aSig<<( shiftCount & 31 ) ) ) {
cbcef455
PM
1611 STATUS(float_exception_flags) |= float_flag_inexact;
1612 }
1613 if ( aSign ) {
1614 z = - z;
1615 }
1616 return z;
1617
1618}
1619
158142c2
FB
1620/*----------------------------------------------------------------------------
1621| Returns the result of converting the single-precision floating-point value
1622| `a' to the 64-bit two's complement integer format. The conversion is
1623| performed according to the IEC/IEEE Standard for Binary Floating-Point
1624| Arithmetic---which means in particular that the conversion is rounded
1625| according to the current rounding mode. If `a' is a NaN, the largest
1626| positive integer is returned. Otherwise, if the conversion overflows, the
1627| largest integer with the same sign as `a' is returned.
1628*----------------------------------------------------------------------------*/
1629
1630int64 float32_to_int64( float32 a STATUS_PARAM )
1631{
1632 flag aSign;
94a49d86 1633 int_fast16_t aExp, shiftCount;
bb98fe42
AF
1634 uint32_t aSig;
1635 uint64_t aSig64, aSigExtra;
37d18660 1636 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1637
1638 aSig = extractFloat32Frac( a );
1639 aExp = extractFloat32Exp( a );
1640 aSign = extractFloat32Sign( a );
1641 shiftCount = 0xBE - aExp;
1642 if ( shiftCount < 0 ) {
1643 float_raise( float_flag_invalid STATUS_VAR);
1644 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1645 return LIT64( 0x7FFFFFFFFFFFFFFF );
1646 }
bb98fe42 1647 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
1648 }
1649 if ( aExp ) aSig |= 0x00800000;
1650 aSig64 = aSig;
1651 aSig64 <<= 40;
1652 shift64ExtraRightJamming( aSig64, 0, shiftCount, &aSig64, &aSigExtra );
1653 return roundAndPackInt64( aSign, aSig64, aSigExtra STATUS_VAR );
1654
1655}
1656
2f18bbf9
TM
1657/*----------------------------------------------------------------------------
1658| Returns the result of converting the single-precision floating-point value
1659| `a' to the 64-bit unsigned integer format. The conversion is
1660| performed according to the IEC/IEEE Standard for Binary Floating-Point
1661| Arithmetic---which means in particular that the conversion is rounded
1662| according to the current rounding mode. If `a' is a NaN, the largest
1663| unsigned integer is returned. Otherwise, if the conversion overflows, the
1664| largest unsigned integer is returned. If the 'a' is negative, the result
1665| is rounded and zero is returned; values that do not round to zero will
1666| raise the inexact exception flag.
1667*----------------------------------------------------------------------------*/
1668
1669uint64 float32_to_uint64(float32 a STATUS_PARAM)
1670{
1671 flag aSign;
1672 int_fast16_t aExp, shiftCount;
1673 uint32_t aSig;
1674 uint64_t aSig64, aSigExtra;
1675 a = float32_squash_input_denormal(a STATUS_VAR);
1676
1677 aSig = extractFloat32Frac(a);
1678 aExp = extractFloat32Exp(a);
1679 aSign = extractFloat32Sign(a);
1680 if ((aSign) && (aExp > 126)) {
1681 float_raise(float_flag_invalid STATUS_VAR);
1682 if (float32_is_any_nan(a)) {
1683 return LIT64(0xFFFFFFFFFFFFFFFF);
1684 } else {
1685 return 0;
1686 }
1687 }
1688 shiftCount = 0xBE - aExp;
1689 if (aExp) {
1690 aSig |= 0x00800000;
1691 }
1692 if (shiftCount < 0) {
1693 float_raise(float_flag_invalid STATUS_VAR);
1694 return LIT64(0xFFFFFFFFFFFFFFFF);
1695 }
1696
1697 aSig64 = aSig;
1698 aSig64 <<= 40;
1699 shift64ExtraRightJamming(aSig64, 0, shiftCount, &aSig64, &aSigExtra);
1700 return roundAndPackUint64(aSign, aSig64, aSigExtra STATUS_VAR);
1701}
1702
a13d4489
TM
1703/*----------------------------------------------------------------------------
1704| Returns the result of converting the single-precision floating-point value
1705| `a' to the 64-bit unsigned integer format. The conversion is
1706| performed according to the IEC/IEEE Standard for Binary Floating-Point
1707| Arithmetic, except that the conversion is always rounded toward zero. If
1708| `a' is a NaN, the largest unsigned integer is returned. Otherwise, if the
1709| conversion overflows, the largest unsigned integer is returned. If the
1710| 'a' is negative, the result is rounded and zero is returned; values that do
1711| not round to zero will raise the inexact flag.
1712*----------------------------------------------------------------------------*/
1713
1714uint64 float32_to_uint64_round_to_zero(float32 a STATUS_PARAM)
1715{
1716 signed char current_rounding_mode = STATUS(float_rounding_mode);
1717 set_float_rounding_mode(float_round_to_zero STATUS_VAR);
1718 int64_t v = float32_to_uint64(a STATUS_VAR);
1719 set_float_rounding_mode(current_rounding_mode STATUS_VAR);
1720 return v;
1721}
1722
158142c2
FB
1723/*----------------------------------------------------------------------------
1724| Returns the result of converting the single-precision floating-point value
1725| `a' to the 64-bit two's complement integer format. The conversion is
1726| performed according to the IEC/IEEE Standard for Binary Floating-Point
1727| Arithmetic, except that the conversion is always rounded toward zero. If
1728| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
1729| conversion overflows, the largest integer with the same sign as `a' is
1730| returned.
1731*----------------------------------------------------------------------------*/
1732
1733int64 float32_to_int64_round_to_zero( float32 a STATUS_PARAM )
1734{
1735 flag aSign;
94a49d86 1736 int_fast16_t aExp, shiftCount;
bb98fe42
AF
1737 uint32_t aSig;
1738 uint64_t aSig64;
158142c2 1739 int64 z;
37d18660 1740 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1741
1742 aSig = extractFloat32Frac( a );
1743 aExp = extractFloat32Exp( a );
1744 aSign = extractFloat32Sign( a );
1745 shiftCount = aExp - 0xBE;
1746 if ( 0 <= shiftCount ) {
f090c9d4 1747 if ( float32_val(a) != 0xDF000000 ) {
158142c2
FB
1748 float_raise( float_flag_invalid STATUS_VAR);
1749 if ( ! aSign || ( ( aExp == 0xFF ) && aSig ) ) {
1750 return LIT64( 0x7FFFFFFFFFFFFFFF );
1751 }
1752 }
bb98fe42 1753 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
1754 }
1755 else if ( aExp <= 0x7E ) {
1756 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
1757 return 0;
1758 }
1759 aSig64 = aSig | 0x00800000;
1760 aSig64 <<= 40;
1761 z = aSig64>>( - shiftCount );
bb98fe42 1762 if ( (uint64_t) ( aSig64<<( shiftCount & 63 ) ) ) {
158142c2
FB
1763 STATUS(float_exception_flags) |= float_flag_inexact;
1764 }
1765 if ( aSign ) z = - z;
1766 return z;
1767
1768}
1769
1770/*----------------------------------------------------------------------------
1771| Returns the result of converting the single-precision floating-point value
1772| `a' to the double-precision floating-point format. The conversion is
1773| performed according to the IEC/IEEE Standard for Binary Floating-Point
1774| Arithmetic.
1775*----------------------------------------------------------------------------*/
1776
1777float64 float32_to_float64( float32 a STATUS_PARAM )
1778{
1779 flag aSign;
94a49d86 1780 int_fast16_t aExp;
bb98fe42 1781 uint32_t aSig;
37d18660 1782 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1783
1784 aSig = extractFloat32Frac( a );
1785 aExp = extractFloat32Exp( a );
1786 aSign = extractFloat32Sign( a );
1787 if ( aExp == 0xFF ) {
bcd4d9af 1788 if ( aSig ) return commonNaNToFloat64( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
1789 return packFloat64( aSign, 0x7FF, 0 );
1790 }
1791 if ( aExp == 0 ) {
1792 if ( aSig == 0 ) return packFloat64( aSign, 0, 0 );
1793 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1794 --aExp;
1795 }
bb98fe42 1796 return packFloat64( aSign, aExp + 0x380, ( (uint64_t) aSig )<<29 );
158142c2
FB
1797
1798}
1799
158142c2
FB
1800/*----------------------------------------------------------------------------
1801| Returns the result of converting the single-precision floating-point value
1802| `a' to the extended double-precision floating-point format. The conversion
1803| is performed according to the IEC/IEEE Standard for Binary Floating-Point
1804| Arithmetic.
1805*----------------------------------------------------------------------------*/
1806
1807floatx80 float32_to_floatx80( float32 a STATUS_PARAM )
1808{
1809 flag aSign;
94a49d86 1810 int_fast16_t aExp;
bb98fe42 1811 uint32_t aSig;
158142c2 1812
37d18660 1813 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1814 aSig = extractFloat32Frac( a );
1815 aExp = extractFloat32Exp( a );
1816 aSign = extractFloat32Sign( a );
1817 if ( aExp == 0xFF ) {
bcd4d9af 1818 if ( aSig ) return commonNaNToFloatx80( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
1819 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
1820 }
1821 if ( aExp == 0 ) {
1822 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
1823 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1824 }
1825 aSig |= 0x00800000;
bb98fe42 1826 return packFloatx80( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<40 );
158142c2
FB
1827
1828}
1829
158142c2
FB
1830/*----------------------------------------------------------------------------
1831| Returns the result of converting the single-precision floating-point value
1832| `a' to the double-precision floating-point format. The conversion is
1833| performed according to the IEC/IEEE Standard for Binary Floating-Point
1834| Arithmetic.
1835*----------------------------------------------------------------------------*/
1836
1837float128 float32_to_float128( float32 a STATUS_PARAM )
1838{
1839 flag aSign;
94a49d86 1840 int_fast16_t aExp;
bb98fe42 1841 uint32_t aSig;
158142c2 1842
37d18660 1843 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1844 aSig = extractFloat32Frac( a );
1845 aExp = extractFloat32Exp( a );
1846 aSign = extractFloat32Sign( a );
1847 if ( aExp == 0xFF ) {
bcd4d9af 1848 if ( aSig ) return commonNaNToFloat128( float32ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
1849 return packFloat128( aSign, 0x7FFF, 0, 0 );
1850 }
1851 if ( aExp == 0 ) {
1852 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
1853 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
1854 --aExp;
1855 }
bb98fe42 1856 return packFloat128( aSign, aExp + 0x3F80, ( (uint64_t) aSig )<<25, 0 );
158142c2
FB
1857
1858}
1859
158142c2
FB
1860/*----------------------------------------------------------------------------
1861| Rounds the single-precision floating-point value `a' to an integer, and
1862| returns the result as a single-precision floating-point value. The
1863| operation is performed according to the IEC/IEEE Standard for Binary
1864| Floating-Point Arithmetic.
1865*----------------------------------------------------------------------------*/
1866
1867float32 float32_round_to_int( float32 a STATUS_PARAM)
1868{
1869 flag aSign;
94a49d86 1870 int_fast16_t aExp;
bb98fe42 1871 uint32_t lastBitMask, roundBitsMask;
bb98fe42 1872 uint32_t z;
37d18660 1873 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
1874
1875 aExp = extractFloat32Exp( a );
1876 if ( 0x96 <= aExp ) {
1877 if ( ( aExp == 0xFF ) && extractFloat32Frac( a ) ) {
1878 return propagateFloat32NaN( a, a STATUS_VAR );
1879 }
1880 return a;
1881 }
1882 if ( aExp <= 0x7E ) {
bb98fe42 1883 if ( (uint32_t) ( float32_val(a)<<1 ) == 0 ) return a;
158142c2
FB
1884 STATUS(float_exception_flags) |= float_flag_inexact;
1885 aSign = extractFloat32Sign( a );
1886 switch ( STATUS(float_rounding_mode) ) {
1887 case float_round_nearest_even:
1888 if ( ( aExp == 0x7E ) && extractFloat32Frac( a ) ) {
1889 return packFloat32( aSign, 0x7F, 0 );
1890 }
1891 break;
f9288a76
PM
1892 case float_round_ties_away:
1893 if (aExp == 0x7E) {
1894 return packFloat32(aSign, 0x7F, 0);
1895 }
1896 break;
158142c2 1897 case float_round_down:
f090c9d4 1898 return make_float32(aSign ? 0xBF800000 : 0);
158142c2 1899 case float_round_up:
f090c9d4 1900 return make_float32(aSign ? 0x80000000 : 0x3F800000);
158142c2
FB
1901 }
1902 return packFloat32( aSign, 0, 0 );
1903 }
1904 lastBitMask = 1;
1905 lastBitMask <<= 0x96 - aExp;
1906 roundBitsMask = lastBitMask - 1;
f090c9d4 1907 z = float32_val(a);
dc355b76
PM
1908 switch (STATUS(float_rounding_mode)) {
1909 case float_round_nearest_even:
158142c2 1910 z += lastBitMask>>1;
dc355b76
PM
1911 if ((z & roundBitsMask) == 0) {
1912 z &= ~lastBitMask;
1913 }
1914 break;
f9288a76
PM
1915 case float_round_ties_away:
1916 z += lastBitMask >> 1;
1917 break;
dc355b76
PM
1918 case float_round_to_zero:
1919 break;
1920 case float_round_up:
1921 if (!extractFloat32Sign(make_float32(z))) {
1922 z += roundBitsMask;
1923 }
1924 break;
1925 case float_round_down:
1926 if (extractFloat32Sign(make_float32(z))) {
158142c2
FB
1927 z += roundBitsMask;
1928 }
dc355b76
PM
1929 break;
1930 default:
1931 abort();
158142c2
FB
1932 }
1933 z &= ~ roundBitsMask;
f090c9d4
PB
1934 if ( z != float32_val(a) ) STATUS(float_exception_flags) |= float_flag_inexact;
1935 return make_float32(z);
158142c2
FB
1936
1937}
1938
1939/*----------------------------------------------------------------------------
1940| Returns the result of adding the absolute values of the single-precision
1941| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
1942| before being returned. `zSign' is ignored if the result is a NaN.
1943| The addition is performed according to the IEC/IEEE Standard for Binary
1944| Floating-Point Arithmetic.
1945*----------------------------------------------------------------------------*/
1946
1947static float32 addFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
1948{
94a49d86 1949 int_fast16_t aExp, bExp, zExp;
bb98fe42 1950 uint32_t aSig, bSig, zSig;
94a49d86 1951 int_fast16_t expDiff;
158142c2
FB
1952
1953 aSig = extractFloat32Frac( a );
1954 aExp = extractFloat32Exp( a );
1955 bSig = extractFloat32Frac( b );
1956 bExp = extractFloat32Exp( b );
1957 expDiff = aExp - bExp;
1958 aSig <<= 6;
1959 bSig <<= 6;
1960 if ( 0 < expDiff ) {
1961 if ( aExp == 0xFF ) {
1962 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1963 return a;
1964 }
1965 if ( bExp == 0 ) {
1966 --expDiff;
1967 }
1968 else {
1969 bSig |= 0x20000000;
1970 }
1971 shift32RightJamming( bSig, expDiff, &bSig );
1972 zExp = aExp;
1973 }
1974 else if ( expDiff < 0 ) {
1975 if ( bExp == 0xFF ) {
1976 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1977 return packFloat32( zSign, 0xFF, 0 );
1978 }
1979 if ( aExp == 0 ) {
1980 ++expDiff;
1981 }
1982 else {
1983 aSig |= 0x20000000;
1984 }
1985 shift32RightJamming( aSig, - expDiff, &aSig );
1986 zExp = bExp;
1987 }
1988 else {
1989 if ( aExp == 0xFF ) {
1990 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
1991 return a;
1992 }
fe76d976 1993 if ( aExp == 0 ) {
e6afc87f
PM
1994 if (STATUS(flush_to_zero)) {
1995 if (aSig | bSig) {
1996 float_raise(float_flag_output_denormal STATUS_VAR);
1997 }
1998 return packFloat32(zSign, 0, 0);
1999 }
fe76d976
PB
2000 return packFloat32( zSign, 0, ( aSig + bSig )>>6 );
2001 }
158142c2
FB
2002 zSig = 0x40000000 + aSig + bSig;
2003 zExp = aExp;
2004 goto roundAndPack;
2005 }
2006 aSig |= 0x20000000;
2007 zSig = ( aSig + bSig )<<1;
2008 --zExp;
bb98fe42 2009 if ( (int32_t) zSig < 0 ) {
158142c2
FB
2010 zSig = aSig + bSig;
2011 ++zExp;
2012 }
2013 roundAndPack:
2014 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2015
2016}
2017
2018/*----------------------------------------------------------------------------
2019| Returns the result of subtracting the absolute values of the single-
2020| precision floating-point values `a' and `b'. If `zSign' is 1, the
2021| difference is negated before being returned. `zSign' is ignored if the
2022| result is a NaN. The subtraction is performed according to the IEC/IEEE
2023| Standard for Binary Floating-Point Arithmetic.
2024*----------------------------------------------------------------------------*/
2025
2026static float32 subFloat32Sigs( float32 a, float32 b, flag zSign STATUS_PARAM)
2027{
94a49d86 2028 int_fast16_t aExp, bExp, zExp;
bb98fe42 2029 uint32_t aSig, bSig, zSig;
94a49d86 2030 int_fast16_t expDiff;
158142c2
FB
2031
2032 aSig = extractFloat32Frac( a );
2033 aExp = extractFloat32Exp( a );
2034 bSig = extractFloat32Frac( b );
2035 bExp = extractFloat32Exp( b );
2036 expDiff = aExp - bExp;
2037 aSig <<= 7;
2038 bSig <<= 7;
2039 if ( 0 < expDiff ) goto aExpBigger;
2040 if ( expDiff < 0 ) goto bExpBigger;
2041 if ( aExp == 0xFF ) {
2042 if ( aSig | bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2043 float_raise( float_flag_invalid STATUS_VAR);
2044 return float32_default_nan;
2045 }
2046 if ( aExp == 0 ) {
2047 aExp = 1;
2048 bExp = 1;
2049 }
2050 if ( bSig < aSig ) goto aBigger;
2051 if ( aSig < bSig ) goto bBigger;
2052 return packFloat32( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
2053 bExpBigger:
2054 if ( bExp == 0xFF ) {
2055 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2056 return packFloat32( zSign ^ 1, 0xFF, 0 );
2057 }
2058 if ( aExp == 0 ) {
2059 ++expDiff;
2060 }
2061 else {
2062 aSig |= 0x40000000;
2063 }
2064 shift32RightJamming( aSig, - expDiff, &aSig );
2065 bSig |= 0x40000000;
2066 bBigger:
2067 zSig = bSig - aSig;
2068 zExp = bExp;
2069 zSign ^= 1;
2070 goto normalizeRoundAndPack;
2071 aExpBigger:
2072 if ( aExp == 0xFF ) {
2073 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2074 return a;
2075 }
2076 if ( bExp == 0 ) {
2077 --expDiff;
2078 }
2079 else {
2080 bSig |= 0x40000000;
2081 }
2082 shift32RightJamming( bSig, expDiff, &bSig );
2083 aSig |= 0x40000000;
2084 aBigger:
2085 zSig = aSig - bSig;
2086 zExp = aExp;
2087 normalizeRoundAndPack:
2088 --zExp;
2089 return normalizeRoundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2090
2091}
2092
2093/*----------------------------------------------------------------------------
2094| Returns the result of adding the single-precision floating-point values `a'
2095| and `b'. The operation is performed according to the IEC/IEEE Standard for
2096| Binary Floating-Point Arithmetic.
2097*----------------------------------------------------------------------------*/
2098
2099float32 float32_add( float32 a, float32 b STATUS_PARAM )
2100{
2101 flag aSign, bSign;
37d18660
PM
2102 a = float32_squash_input_denormal(a STATUS_VAR);
2103 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2104
2105 aSign = extractFloat32Sign( a );
2106 bSign = extractFloat32Sign( b );
2107 if ( aSign == bSign ) {
2108 return addFloat32Sigs( a, b, aSign STATUS_VAR);
2109 }
2110 else {
2111 return subFloat32Sigs( a, b, aSign STATUS_VAR );
2112 }
2113
2114}
2115
2116/*----------------------------------------------------------------------------
2117| Returns the result of subtracting the single-precision floating-point values
2118| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2119| for Binary Floating-Point Arithmetic.
2120*----------------------------------------------------------------------------*/
2121
2122float32 float32_sub( float32 a, float32 b STATUS_PARAM )
2123{
2124 flag aSign, bSign;
37d18660
PM
2125 a = float32_squash_input_denormal(a STATUS_VAR);
2126 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2127
2128 aSign = extractFloat32Sign( a );
2129 bSign = extractFloat32Sign( b );
2130 if ( aSign == bSign ) {
2131 return subFloat32Sigs( a, b, aSign STATUS_VAR );
2132 }
2133 else {
2134 return addFloat32Sigs( a, b, aSign STATUS_VAR );
2135 }
2136
2137}
2138
2139/*----------------------------------------------------------------------------
2140| Returns the result of multiplying the single-precision floating-point values
2141| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
2142| for Binary Floating-Point Arithmetic.
2143*----------------------------------------------------------------------------*/
2144
2145float32 float32_mul( float32 a, float32 b STATUS_PARAM )
2146{
2147 flag aSign, bSign, zSign;
94a49d86 2148 int_fast16_t aExp, bExp, zExp;
bb98fe42
AF
2149 uint32_t aSig, bSig;
2150 uint64_t zSig64;
2151 uint32_t zSig;
158142c2 2152
37d18660
PM
2153 a = float32_squash_input_denormal(a STATUS_VAR);
2154 b = float32_squash_input_denormal(b STATUS_VAR);
2155
158142c2
FB
2156 aSig = extractFloat32Frac( a );
2157 aExp = extractFloat32Exp( a );
2158 aSign = extractFloat32Sign( a );
2159 bSig = extractFloat32Frac( b );
2160 bExp = extractFloat32Exp( b );
2161 bSign = extractFloat32Sign( b );
2162 zSign = aSign ^ bSign;
2163 if ( aExp == 0xFF ) {
2164 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2165 return propagateFloat32NaN( a, b STATUS_VAR );
2166 }
2167 if ( ( bExp | bSig ) == 0 ) {
2168 float_raise( float_flag_invalid STATUS_VAR);
2169 return float32_default_nan;
2170 }
2171 return packFloat32( zSign, 0xFF, 0 );
2172 }
2173 if ( bExp == 0xFF ) {
2174 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2175 if ( ( aExp | aSig ) == 0 ) {
2176 float_raise( float_flag_invalid STATUS_VAR);
2177 return float32_default_nan;
2178 }
2179 return packFloat32( zSign, 0xFF, 0 );
2180 }
2181 if ( aExp == 0 ) {
2182 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2183 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2184 }
2185 if ( bExp == 0 ) {
2186 if ( bSig == 0 ) return packFloat32( zSign, 0, 0 );
2187 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2188 }
2189 zExp = aExp + bExp - 0x7F;
2190 aSig = ( aSig | 0x00800000 )<<7;
2191 bSig = ( bSig | 0x00800000 )<<8;
bb98fe42 2192 shift64RightJamming( ( (uint64_t) aSig ) * bSig, 32, &zSig64 );
158142c2 2193 zSig = zSig64;
bb98fe42 2194 if ( 0 <= (int32_t) ( zSig<<1 ) ) {
158142c2
FB
2195 zSig <<= 1;
2196 --zExp;
2197 }
2198 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2199
2200}
2201
2202/*----------------------------------------------------------------------------
2203| Returns the result of dividing the single-precision floating-point value `a'
2204| by the corresponding value `b'. The operation is performed according to the
2205| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2206*----------------------------------------------------------------------------*/
2207
2208float32 float32_div( float32 a, float32 b STATUS_PARAM )
2209{
2210 flag aSign, bSign, zSign;
94a49d86 2211 int_fast16_t aExp, bExp, zExp;
bb98fe42 2212 uint32_t aSig, bSig, zSig;
37d18660
PM
2213 a = float32_squash_input_denormal(a STATUS_VAR);
2214 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2215
2216 aSig = extractFloat32Frac( a );
2217 aExp = extractFloat32Exp( a );
2218 aSign = extractFloat32Sign( a );
2219 bSig = extractFloat32Frac( b );
2220 bExp = extractFloat32Exp( b );
2221 bSign = extractFloat32Sign( b );
2222 zSign = aSign ^ bSign;
2223 if ( aExp == 0xFF ) {
2224 if ( aSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2225 if ( bExp == 0xFF ) {
2226 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2227 float_raise( float_flag_invalid STATUS_VAR);
2228 return float32_default_nan;
2229 }
2230 return packFloat32( zSign, 0xFF, 0 );
2231 }
2232 if ( bExp == 0xFF ) {
2233 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2234 return packFloat32( zSign, 0, 0 );
2235 }
2236 if ( bExp == 0 ) {
2237 if ( bSig == 0 ) {
2238 if ( ( aExp | aSig ) == 0 ) {
2239 float_raise( float_flag_invalid STATUS_VAR);
2240 return float32_default_nan;
2241 }
2242 float_raise( float_flag_divbyzero STATUS_VAR);
2243 return packFloat32( zSign, 0xFF, 0 );
2244 }
2245 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2246 }
2247 if ( aExp == 0 ) {
2248 if ( aSig == 0 ) return packFloat32( zSign, 0, 0 );
2249 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2250 }
2251 zExp = aExp - bExp + 0x7D;
2252 aSig = ( aSig | 0x00800000 )<<7;
2253 bSig = ( bSig | 0x00800000 )<<8;
2254 if ( bSig <= ( aSig + aSig ) ) {
2255 aSig >>= 1;
2256 ++zExp;
2257 }
bb98fe42 2258 zSig = ( ( (uint64_t) aSig )<<32 ) / bSig;
158142c2 2259 if ( ( zSig & 0x3F ) == 0 ) {
bb98fe42 2260 zSig |= ( (uint64_t) bSig * zSig != ( (uint64_t) aSig )<<32 );
158142c2
FB
2261 }
2262 return roundAndPackFloat32( zSign, zExp, zSig STATUS_VAR );
2263
2264}
2265
2266/*----------------------------------------------------------------------------
2267| Returns the remainder of the single-precision floating-point value `a'
2268| with respect to the corresponding value `b'. The operation is performed
2269| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2270*----------------------------------------------------------------------------*/
2271
2272float32 float32_rem( float32 a, float32 b STATUS_PARAM )
2273{
ed086f3d 2274 flag aSign, zSign;
94a49d86 2275 int_fast16_t aExp, bExp, expDiff;
bb98fe42
AF
2276 uint32_t aSig, bSig;
2277 uint32_t q;
2278 uint64_t aSig64, bSig64, q64;
2279 uint32_t alternateASig;
2280 int32_t sigMean;
37d18660
PM
2281 a = float32_squash_input_denormal(a STATUS_VAR);
2282 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2283
2284 aSig = extractFloat32Frac( a );
2285 aExp = extractFloat32Exp( a );
2286 aSign = extractFloat32Sign( a );
2287 bSig = extractFloat32Frac( b );
2288 bExp = extractFloat32Exp( b );
158142c2
FB
2289 if ( aExp == 0xFF ) {
2290 if ( aSig || ( ( bExp == 0xFF ) && bSig ) ) {
2291 return propagateFloat32NaN( a, b STATUS_VAR );
2292 }
2293 float_raise( float_flag_invalid STATUS_VAR);
2294 return float32_default_nan;
2295 }
2296 if ( bExp == 0xFF ) {
2297 if ( bSig ) return propagateFloat32NaN( a, b STATUS_VAR );
2298 return a;
2299 }
2300 if ( bExp == 0 ) {
2301 if ( bSig == 0 ) {
2302 float_raise( float_flag_invalid STATUS_VAR);
2303 return float32_default_nan;
2304 }
2305 normalizeFloat32Subnormal( bSig, &bExp, &bSig );
2306 }
2307 if ( aExp == 0 ) {
2308 if ( aSig == 0 ) return a;
2309 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2310 }
2311 expDiff = aExp - bExp;
2312 aSig |= 0x00800000;
2313 bSig |= 0x00800000;
2314 if ( expDiff < 32 ) {
2315 aSig <<= 8;
2316 bSig <<= 8;
2317 if ( expDiff < 0 ) {
2318 if ( expDiff < -1 ) return a;
2319 aSig >>= 1;
2320 }
2321 q = ( bSig <= aSig );
2322 if ( q ) aSig -= bSig;
2323 if ( 0 < expDiff ) {
bb98fe42 2324 q = ( ( (uint64_t) aSig )<<32 ) / bSig;
158142c2
FB
2325 q >>= 32 - expDiff;
2326 bSig >>= 2;
2327 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
2328 }
2329 else {
2330 aSig >>= 2;
2331 bSig >>= 2;
2332 }
2333 }
2334 else {
2335 if ( bSig <= aSig ) aSig -= bSig;
bb98fe42
AF
2336 aSig64 = ( (uint64_t) aSig )<<40;
2337 bSig64 = ( (uint64_t) bSig )<<40;
158142c2
FB
2338 expDiff -= 64;
2339 while ( 0 < expDiff ) {
2340 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2341 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2342 aSig64 = - ( ( bSig * q64 )<<38 );
2343 expDiff -= 62;
2344 }
2345 expDiff += 64;
2346 q64 = estimateDiv128To64( aSig64, 0, bSig64 );
2347 q64 = ( 2 < q64 ) ? q64 - 2 : 0;
2348 q = q64>>( 64 - expDiff );
2349 bSig <<= 6;
2350 aSig = ( ( aSig64>>33 )<<( expDiff - 1 ) ) - bSig * q;
2351 }
2352 do {
2353 alternateASig = aSig;
2354 ++q;
2355 aSig -= bSig;
bb98fe42 2356 } while ( 0 <= (int32_t) aSig );
158142c2
FB
2357 sigMean = aSig + alternateASig;
2358 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
2359 aSig = alternateASig;
2360 }
bb98fe42 2361 zSign = ( (int32_t) aSig < 0 );
158142c2
FB
2362 if ( zSign ) aSig = - aSig;
2363 return normalizeRoundAndPackFloat32( aSign ^ zSign, bExp, aSig STATUS_VAR );
2364
2365}
2366
369be8f6
PM
2367/*----------------------------------------------------------------------------
2368| Returns the result of multiplying the single-precision floating-point values
2369| `a' and `b' then adding 'c', with no intermediate rounding step after the
2370| multiplication. The operation is performed according to the IEC/IEEE
2371| Standard for Binary Floating-Point Arithmetic 754-2008.
2372| The flags argument allows the caller to select negation of the
2373| addend, the intermediate product, or the final result. (The difference
2374| between this and having the caller do a separate negation is that negating
2375| externally will flip the sign bit on NaNs.)
2376*----------------------------------------------------------------------------*/
2377
2378float32 float32_muladd(float32 a, float32 b, float32 c, int flags STATUS_PARAM)
2379{
2380 flag aSign, bSign, cSign, zSign;
94a49d86 2381 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
369be8f6
PM
2382 uint32_t aSig, bSig, cSig;
2383 flag pInf, pZero, pSign;
2384 uint64_t pSig64, cSig64, zSig64;
2385 uint32_t pSig;
2386 int shiftcount;
2387 flag signflip, infzero;
2388
2389 a = float32_squash_input_denormal(a STATUS_VAR);
2390 b = float32_squash_input_denormal(b STATUS_VAR);
2391 c = float32_squash_input_denormal(c STATUS_VAR);
2392 aSig = extractFloat32Frac(a);
2393 aExp = extractFloat32Exp(a);
2394 aSign = extractFloat32Sign(a);
2395 bSig = extractFloat32Frac(b);
2396 bExp = extractFloat32Exp(b);
2397 bSign = extractFloat32Sign(b);
2398 cSig = extractFloat32Frac(c);
2399 cExp = extractFloat32Exp(c);
2400 cSign = extractFloat32Sign(c);
2401
2402 infzero = ((aExp == 0 && aSig == 0 && bExp == 0xff && bSig == 0) ||
2403 (aExp == 0xff && aSig == 0 && bExp == 0 && bSig == 0));
2404
2405 /* It is implementation-defined whether the cases of (0,inf,qnan)
2406 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
2407 * they return if they do), so we have to hand this information
2408 * off to the target-specific pick-a-NaN routine.
2409 */
2410 if (((aExp == 0xff) && aSig) ||
2411 ((bExp == 0xff) && bSig) ||
2412 ((cExp == 0xff) && cSig)) {
2413 return propagateFloat32MulAddNaN(a, b, c, infzero STATUS_VAR);
2414 }
2415
2416 if (infzero) {
2417 float_raise(float_flag_invalid STATUS_VAR);
2418 return float32_default_nan;
2419 }
2420
2421 if (flags & float_muladd_negate_c) {
2422 cSign ^= 1;
2423 }
2424
2425 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
2426
2427 /* Work out the sign and type of the product */
2428 pSign = aSign ^ bSign;
2429 if (flags & float_muladd_negate_product) {
2430 pSign ^= 1;
2431 }
2432 pInf = (aExp == 0xff) || (bExp == 0xff);
2433 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
2434
2435 if (cExp == 0xff) {
2436 if (pInf && (pSign ^ cSign)) {
2437 /* addition of opposite-signed infinities => InvalidOperation */
2438 float_raise(float_flag_invalid STATUS_VAR);
2439 return float32_default_nan;
2440 }
2441 /* Otherwise generate an infinity of the same sign */
2442 return packFloat32(cSign ^ signflip, 0xff, 0);
2443 }
2444
2445 if (pInf) {
2446 return packFloat32(pSign ^ signflip, 0xff, 0);
2447 }
2448
2449 if (pZero) {
2450 if (cExp == 0) {
2451 if (cSig == 0) {
2452 /* Adding two exact zeroes */
2453 if (pSign == cSign) {
2454 zSign = pSign;
2455 } else if (STATUS(float_rounding_mode) == float_round_down) {
2456 zSign = 1;
2457 } else {
2458 zSign = 0;
2459 }
2460 return packFloat32(zSign ^ signflip, 0, 0);
2461 }
2462 /* Exact zero plus a denorm */
2463 if (STATUS(flush_to_zero)) {
2464 float_raise(float_flag_output_denormal STATUS_VAR);
2465 return packFloat32(cSign ^ signflip, 0, 0);
2466 }
2467 }
2468 /* Zero plus something non-zero : just return the something */
67d43538
PM
2469 if (flags & float_muladd_halve_result) {
2470 if (cExp == 0) {
2471 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2472 }
2473 /* Subtract one to halve, and one again because roundAndPackFloat32
2474 * wants one less than the true exponent.
2475 */
2476 cExp -= 2;
2477 cSig = (cSig | 0x00800000) << 7;
2478 return roundAndPackFloat32(cSign ^ signflip, cExp, cSig STATUS_VAR);
2479 }
a6e7c184 2480 return packFloat32(cSign ^ signflip, cExp, cSig);
369be8f6
PM
2481 }
2482
2483 if (aExp == 0) {
2484 normalizeFloat32Subnormal(aSig, &aExp, &aSig);
2485 }
2486 if (bExp == 0) {
2487 normalizeFloat32Subnormal(bSig, &bExp, &bSig);
2488 }
2489
2490 /* Calculate the actual result a * b + c */
2491
2492 /* Multiply first; this is easy. */
2493 /* NB: we subtract 0x7e where float32_mul() subtracts 0x7f
2494 * because we want the true exponent, not the "one-less-than"
2495 * flavour that roundAndPackFloat32() takes.
2496 */
2497 pExp = aExp + bExp - 0x7e;
2498 aSig = (aSig | 0x00800000) << 7;
2499 bSig = (bSig | 0x00800000) << 8;
2500 pSig64 = (uint64_t)aSig * bSig;
2501 if ((int64_t)(pSig64 << 1) >= 0) {
2502 pSig64 <<= 1;
2503 pExp--;
2504 }
2505
2506 zSign = pSign ^ signflip;
2507
2508 /* Now pSig64 is the significand of the multiply, with the explicit bit in
2509 * position 62.
2510 */
2511 if (cExp == 0) {
2512 if (!cSig) {
2513 /* Throw out the special case of c being an exact zero now */
2514 shift64RightJamming(pSig64, 32, &pSig64);
2515 pSig = pSig64;
67d43538
PM
2516 if (flags & float_muladd_halve_result) {
2517 pExp--;
2518 }
369be8f6
PM
2519 return roundAndPackFloat32(zSign, pExp - 1,
2520 pSig STATUS_VAR);
2521 }
2522 normalizeFloat32Subnormal(cSig, &cExp, &cSig);
2523 }
2524
2525 cSig64 = (uint64_t)cSig << (62 - 23);
2526 cSig64 |= LIT64(0x4000000000000000);
2527 expDiff = pExp - cExp;
2528
2529 if (pSign == cSign) {
2530 /* Addition */
2531 if (expDiff > 0) {
2532 /* scale c to match p */
2533 shift64RightJamming(cSig64, expDiff, &cSig64);
2534 zExp = pExp;
2535 } else if (expDiff < 0) {
2536 /* scale p to match c */
2537 shift64RightJamming(pSig64, -expDiff, &pSig64);
2538 zExp = cExp;
2539 } else {
2540 /* no scaling needed */
2541 zExp = cExp;
2542 }
2543 /* Add significands and make sure explicit bit ends up in posn 62 */
2544 zSig64 = pSig64 + cSig64;
2545 if ((int64_t)zSig64 < 0) {
2546 shift64RightJamming(zSig64, 1, &zSig64);
2547 } else {
2548 zExp--;
2549 }
2550 } else {
2551 /* Subtraction */
2552 if (expDiff > 0) {
2553 shift64RightJamming(cSig64, expDiff, &cSig64);
2554 zSig64 = pSig64 - cSig64;
2555 zExp = pExp;
2556 } else if (expDiff < 0) {
2557 shift64RightJamming(pSig64, -expDiff, &pSig64);
2558 zSig64 = cSig64 - pSig64;
2559 zExp = cExp;
2560 zSign ^= 1;
2561 } else {
2562 zExp = pExp;
2563 if (cSig64 < pSig64) {
2564 zSig64 = pSig64 - cSig64;
2565 } else if (pSig64 < cSig64) {
2566 zSig64 = cSig64 - pSig64;
2567 zSign ^= 1;
2568 } else {
2569 /* Exact zero */
2570 zSign = signflip;
2571 if (STATUS(float_rounding_mode) == float_round_down) {
2572 zSign ^= 1;
2573 }
2574 return packFloat32(zSign, 0, 0);
2575 }
2576 }
2577 --zExp;
2578 /* Normalize to put the explicit bit back into bit 62. */
2579 shiftcount = countLeadingZeros64(zSig64) - 1;
2580 zSig64 <<= shiftcount;
2581 zExp -= shiftcount;
2582 }
67d43538
PM
2583 if (flags & float_muladd_halve_result) {
2584 zExp--;
2585 }
2586
369be8f6
PM
2587 shift64RightJamming(zSig64, 32, &zSig64);
2588 return roundAndPackFloat32(zSign, zExp, zSig64 STATUS_VAR);
2589}
2590
2591
158142c2
FB
2592/*----------------------------------------------------------------------------
2593| Returns the square root of the single-precision floating-point value `a'.
2594| The operation is performed according to the IEC/IEEE Standard for Binary
2595| Floating-Point Arithmetic.
2596*----------------------------------------------------------------------------*/
2597
2598float32 float32_sqrt( float32 a STATUS_PARAM )
2599{
2600 flag aSign;
94a49d86 2601 int_fast16_t aExp, zExp;
bb98fe42
AF
2602 uint32_t aSig, zSig;
2603 uint64_t rem, term;
37d18660 2604 a = float32_squash_input_denormal(a STATUS_VAR);
158142c2
FB
2605
2606 aSig = extractFloat32Frac( a );
2607 aExp = extractFloat32Exp( a );
2608 aSign = extractFloat32Sign( a );
2609 if ( aExp == 0xFF ) {
f090c9d4 2610 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
158142c2
FB
2611 if ( ! aSign ) return a;
2612 float_raise( float_flag_invalid STATUS_VAR);
2613 return float32_default_nan;
2614 }
2615 if ( aSign ) {
2616 if ( ( aExp | aSig ) == 0 ) return a;
2617 float_raise( float_flag_invalid STATUS_VAR);
2618 return float32_default_nan;
2619 }
2620 if ( aExp == 0 ) {
f090c9d4 2621 if ( aSig == 0 ) return float32_zero;
158142c2
FB
2622 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2623 }
2624 zExp = ( ( aExp - 0x7F )>>1 ) + 0x7E;
2625 aSig = ( aSig | 0x00800000 )<<8;
2626 zSig = estimateSqrt32( aExp, aSig ) + 2;
2627 if ( ( zSig & 0x7F ) <= 5 ) {
2628 if ( zSig < 2 ) {
2629 zSig = 0x7FFFFFFF;
2630 goto roundAndPack;
2631 }
2632 aSig >>= aExp & 1;
bb98fe42
AF
2633 term = ( (uint64_t) zSig ) * zSig;
2634 rem = ( ( (uint64_t) aSig )<<32 ) - term;
2635 while ( (int64_t) rem < 0 ) {
158142c2 2636 --zSig;
bb98fe42 2637 rem += ( ( (uint64_t) zSig )<<1 ) | 1;
158142c2
FB
2638 }
2639 zSig |= ( rem != 0 );
2640 }
2641 shift32RightJamming( zSig, 1, &zSig );
2642 roundAndPack:
2643 return roundAndPackFloat32( 0, zExp, zSig STATUS_VAR );
2644
2645}
2646
8229c991
AJ
2647/*----------------------------------------------------------------------------
2648| Returns the binary exponential of the single-precision floating-point value
2649| `a'. The operation is performed according to the IEC/IEEE Standard for
2650| Binary Floating-Point Arithmetic.
2651|
2652| Uses the following identities:
2653|
2654| 1. -------------------------------------------------------------------------
2655| x x*ln(2)
2656| 2 = e
2657|
2658| 2. -------------------------------------------------------------------------
2659| 2 3 4 5 n
2660| x x x x x x x
2661| e = 1 + --- + --- + --- + --- + --- + ... + --- + ...
2662| 1! 2! 3! 4! 5! n!
2663*----------------------------------------------------------------------------*/
2664
2665static const float64 float32_exp2_coefficients[15] =
2666{
d5138cf4
PM
2667 const_float64( 0x3ff0000000000000ll ), /* 1 */
2668 const_float64( 0x3fe0000000000000ll ), /* 2 */
2669 const_float64( 0x3fc5555555555555ll ), /* 3 */
2670 const_float64( 0x3fa5555555555555ll ), /* 4 */
2671 const_float64( 0x3f81111111111111ll ), /* 5 */
2672 const_float64( 0x3f56c16c16c16c17ll ), /* 6 */
2673 const_float64( 0x3f2a01a01a01a01all ), /* 7 */
2674 const_float64( 0x3efa01a01a01a01all ), /* 8 */
2675 const_float64( 0x3ec71de3a556c734ll ), /* 9 */
2676 const_float64( 0x3e927e4fb7789f5cll ), /* 10 */
2677 const_float64( 0x3e5ae64567f544e4ll ), /* 11 */
2678 const_float64( 0x3e21eed8eff8d898ll ), /* 12 */
2679 const_float64( 0x3de6124613a86d09ll ), /* 13 */
2680 const_float64( 0x3da93974a8c07c9dll ), /* 14 */
2681 const_float64( 0x3d6ae7f3e733b81fll ), /* 15 */
8229c991
AJ
2682};
2683
2684float32 float32_exp2( float32 a STATUS_PARAM )
2685{
2686 flag aSign;
94a49d86 2687 int_fast16_t aExp;
bb98fe42 2688 uint32_t aSig;
8229c991
AJ
2689 float64 r, x, xn;
2690 int i;
37d18660 2691 a = float32_squash_input_denormal(a STATUS_VAR);
8229c991
AJ
2692
2693 aSig = extractFloat32Frac( a );
2694 aExp = extractFloat32Exp( a );
2695 aSign = extractFloat32Sign( a );
2696
2697 if ( aExp == 0xFF) {
2698 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2699 return (aSign) ? float32_zero : a;
2700 }
2701 if (aExp == 0) {
2702 if (aSig == 0) return float32_one;
2703 }
2704
2705 float_raise( float_flag_inexact STATUS_VAR);
2706
2707 /* ******************************* */
2708 /* using float64 for approximation */
2709 /* ******************************* */
2710 x = float32_to_float64(a STATUS_VAR);
2711 x = float64_mul(x, float64_ln2 STATUS_VAR);
2712
2713 xn = x;
2714 r = float64_one;
2715 for (i = 0 ; i < 15 ; i++) {
2716 float64 f;
2717
2718 f = float64_mul(xn, float32_exp2_coefficients[i] STATUS_VAR);
2719 r = float64_add(r, f STATUS_VAR);
2720
2721 xn = float64_mul(xn, x STATUS_VAR);
2722 }
2723
2724 return float64_to_float32(r, status);
2725}
2726
374dfc33
AJ
2727/*----------------------------------------------------------------------------
2728| Returns the binary log of the single-precision floating-point value `a'.
2729| The operation is performed according to the IEC/IEEE Standard for Binary
2730| Floating-Point Arithmetic.
2731*----------------------------------------------------------------------------*/
2732float32 float32_log2( float32 a STATUS_PARAM )
2733{
2734 flag aSign, zSign;
94a49d86 2735 int_fast16_t aExp;
bb98fe42 2736 uint32_t aSig, zSig, i;
374dfc33 2737
37d18660 2738 a = float32_squash_input_denormal(a STATUS_VAR);
374dfc33
AJ
2739 aSig = extractFloat32Frac( a );
2740 aExp = extractFloat32Exp( a );
2741 aSign = extractFloat32Sign( a );
2742
2743 if ( aExp == 0 ) {
2744 if ( aSig == 0 ) return packFloat32( 1, 0xFF, 0 );
2745 normalizeFloat32Subnormal( aSig, &aExp, &aSig );
2746 }
2747 if ( aSign ) {
2748 float_raise( float_flag_invalid STATUS_VAR);
2749 return float32_default_nan;
2750 }
2751 if ( aExp == 0xFF ) {
2752 if ( aSig ) return propagateFloat32NaN( a, float32_zero STATUS_VAR );
2753 return a;
2754 }
2755
2756 aExp -= 0x7F;
2757 aSig |= 0x00800000;
2758 zSign = aExp < 0;
2759 zSig = aExp << 23;
2760
2761 for (i = 1 << 22; i > 0; i >>= 1) {
bb98fe42 2762 aSig = ( (uint64_t)aSig * aSig ) >> 23;
374dfc33
AJ
2763 if ( aSig & 0x01000000 ) {
2764 aSig >>= 1;
2765 zSig |= i;
2766 }
2767 }
2768
2769 if ( zSign )
2770 zSig = -zSig;
2771
2772 return normalizeRoundAndPackFloat32( zSign, 0x85, zSig STATUS_VAR );
2773}
2774
158142c2
FB
2775/*----------------------------------------------------------------------------
2776| Returns 1 if the single-precision floating-point value `a' is equal to
b689362d
AJ
2777| the corresponding value `b', and 0 otherwise. The invalid exception is
2778| raised if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
2779| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2780*----------------------------------------------------------------------------*/
2781
b689362d 2782int float32_eq( float32 a, float32 b STATUS_PARAM )
158142c2 2783{
b689362d 2784 uint32_t av, bv;
37d18660
PM
2785 a = float32_squash_input_denormal(a STATUS_VAR);
2786 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2787
2788 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2789 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2790 ) {
b689362d 2791 float_raise( float_flag_invalid STATUS_VAR);
158142c2
FB
2792 return 0;
2793 }
b689362d
AJ
2794 av = float32_val(a);
2795 bv = float32_val(b);
2796 return ( av == bv ) || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
2797}
2798
2799/*----------------------------------------------------------------------------
2800| Returns 1 if the single-precision floating-point value `a' is less than
f5a64251
AJ
2801| or equal to the corresponding value `b', and 0 otherwise. The invalid
2802| exception is raised if either operand is a NaN. The comparison is performed
2803| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
2804*----------------------------------------------------------------------------*/
2805
750afe93 2806int float32_le( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2807{
2808 flag aSign, bSign;
bb98fe42 2809 uint32_t av, bv;
37d18660
PM
2810 a = float32_squash_input_denormal(a STATUS_VAR);
2811 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2812
2813 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2814 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2815 ) {
2816 float_raise( float_flag_invalid STATUS_VAR);
2817 return 0;
2818 }
2819 aSign = extractFloat32Sign( a );
2820 bSign = extractFloat32Sign( b );
f090c9d4
PB
2821 av = float32_val(a);
2822 bv = float32_val(b);
bb98fe42 2823 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 2824 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
2825
2826}
2827
2828/*----------------------------------------------------------------------------
2829| Returns 1 if the single-precision floating-point value `a' is less than
f5a64251
AJ
2830| the corresponding value `b', and 0 otherwise. The invalid exception is
2831| raised if either operand is a NaN. The comparison is performed according
2832| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
2833*----------------------------------------------------------------------------*/
2834
750afe93 2835int float32_lt( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2836{
2837 flag aSign, bSign;
bb98fe42 2838 uint32_t av, bv;
37d18660
PM
2839 a = float32_squash_input_denormal(a STATUS_VAR);
2840 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2841
2842 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2843 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2844 ) {
2845 float_raise( float_flag_invalid STATUS_VAR);
2846 return 0;
2847 }
2848 aSign = extractFloat32Sign( a );
2849 bSign = extractFloat32Sign( b );
f090c9d4
PB
2850 av = float32_val(a);
2851 bv = float32_val(b);
bb98fe42 2852 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 2853 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
2854
2855}
2856
67b7861d
AJ
2857/*----------------------------------------------------------------------------
2858| Returns 1 if the single-precision floating-point values `a' and `b' cannot
f5a64251
AJ
2859| be compared, and 0 otherwise. The invalid exception is raised if either
2860| operand is a NaN. The comparison is performed according to the IEC/IEEE
2861| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
2862*----------------------------------------------------------------------------*/
2863
2864int float32_unordered( float32 a, float32 b STATUS_PARAM )
2865{
2866 a = float32_squash_input_denormal(a STATUS_VAR);
2867 b = float32_squash_input_denormal(b STATUS_VAR);
2868
2869 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2870 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2871 ) {
2872 float_raise( float_flag_invalid STATUS_VAR);
2873 return 1;
2874 }
2875 return 0;
2876}
b689362d 2877
158142c2
FB
2878/*----------------------------------------------------------------------------
2879| Returns 1 if the single-precision floating-point value `a' is equal to
f5a64251
AJ
2880| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2881| exception. The comparison is performed according to the IEC/IEEE Standard
2882| for Binary Floating-Point Arithmetic.
158142c2
FB
2883*----------------------------------------------------------------------------*/
2884
b689362d 2885int float32_eq_quiet( float32 a, float32 b STATUS_PARAM )
158142c2 2886{
37d18660
PM
2887 a = float32_squash_input_denormal(a STATUS_VAR);
2888 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2889
2890 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2891 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2892 ) {
b689362d
AJ
2893 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2894 float_raise( float_flag_invalid STATUS_VAR);
2895 }
158142c2
FB
2896 return 0;
2897 }
b689362d
AJ
2898 return ( float32_val(a) == float32_val(b) ) ||
2899 ( (uint32_t) ( ( float32_val(a) | float32_val(b) )<<1 ) == 0 );
158142c2
FB
2900}
2901
2902/*----------------------------------------------------------------------------
2903| Returns 1 if the single-precision floating-point value `a' is less than or
2904| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
2905| cause an exception. Otherwise, the comparison is performed according to the
2906| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
2907*----------------------------------------------------------------------------*/
2908
750afe93 2909int float32_le_quiet( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2910{
2911 flag aSign, bSign;
bb98fe42 2912 uint32_t av, bv;
37d18660
PM
2913 a = float32_squash_input_denormal(a STATUS_VAR);
2914 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2915
2916 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2917 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2918 ) {
2919 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2920 float_raise( float_flag_invalid STATUS_VAR);
2921 }
2922 return 0;
2923 }
2924 aSign = extractFloat32Sign( a );
2925 bSign = extractFloat32Sign( b );
f090c9d4
PB
2926 av = float32_val(a);
2927 bv = float32_val(b);
bb98fe42 2928 if ( aSign != bSign ) return aSign || ( (uint32_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 2929 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
2930
2931}
2932
2933/*----------------------------------------------------------------------------
2934| Returns 1 if the single-precision floating-point value `a' is less than
2935| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
2936| exception. Otherwise, the comparison is performed according to the IEC/IEEE
2937| Standard for Binary Floating-Point Arithmetic.
2938*----------------------------------------------------------------------------*/
2939
750afe93 2940int float32_lt_quiet( float32 a, float32 b STATUS_PARAM )
158142c2
FB
2941{
2942 flag aSign, bSign;
bb98fe42 2943 uint32_t av, bv;
37d18660
PM
2944 a = float32_squash_input_denormal(a STATUS_VAR);
2945 b = float32_squash_input_denormal(b STATUS_VAR);
158142c2
FB
2946
2947 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2948 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2949 ) {
2950 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2951 float_raise( float_flag_invalid STATUS_VAR);
2952 }
2953 return 0;
2954 }
2955 aSign = extractFloat32Sign( a );
2956 bSign = extractFloat32Sign( b );
f090c9d4
PB
2957 av = float32_val(a);
2958 bv = float32_val(b);
bb98fe42 2959 if ( aSign != bSign ) return aSign && ( (uint32_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 2960 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
2961
2962}
2963
67b7861d
AJ
2964/*----------------------------------------------------------------------------
2965| Returns 1 if the single-precision floating-point values `a' and `b' cannot
2966| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
2967| comparison is performed according to the IEC/IEEE Standard for Binary
2968| Floating-Point Arithmetic.
2969*----------------------------------------------------------------------------*/
2970
2971int float32_unordered_quiet( float32 a, float32 b STATUS_PARAM )
2972{
2973 a = float32_squash_input_denormal(a STATUS_VAR);
2974 b = float32_squash_input_denormal(b STATUS_VAR);
2975
2976 if ( ( ( extractFloat32Exp( a ) == 0xFF ) && extractFloat32Frac( a ) )
2977 || ( ( extractFloat32Exp( b ) == 0xFF ) && extractFloat32Frac( b ) )
2978 ) {
2979 if ( float32_is_signaling_nan( a ) || float32_is_signaling_nan( b ) ) {
2980 float_raise( float_flag_invalid STATUS_VAR);
2981 }
2982 return 1;
2983 }
2984 return 0;
2985}
2986
158142c2
FB
2987/*----------------------------------------------------------------------------
2988| Returns the result of converting the double-precision floating-point value
2989| `a' to the 32-bit two's complement integer format. The conversion is
2990| performed according to the IEC/IEEE Standard for Binary Floating-Point
2991| Arithmetic---which means in particular that the conversion is rounded
2992| according to the current rounding mode. If `a' is a NaN, the largest
2993| positive integer is returned. Otherwise, if the conversion overflows, the
2994| largest integer with the same sign as `a' is returned.
2995*----------------------------------------------------------------------------*/
2996
2997int32 float64_to_int32( float64 a STATUS_PARAM )
2998{
2999 flag aSign;
94a49d86 3000 int_fast16_t aExp, shiftCount;
bb98fe42 3001 uint64_t aSig;
37d18660 3002 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3003
3004 aSig = extractFloat64Frac( a );
3005 aExp = extractFloat64Exp( a );
3006 aSign = extractFloat64Sign( a );
3007 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3008 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3009 shiftCount = 0x42C - aExp;
3010 if ( 0 < shiftCount ) shift64RightJamming( aSig, shiftCount, &aSig );
3011 return roundAndPackInt32( aSign, aSig STATUS_VAR );
3012
3013}
3014
3015/*----------------------------------------------------------------------------
3016| Returns the result of converting the double-precision floating-point value
3017| `a' to the 32-bit two's complement integer format. The conversion is
3018| performed according to the IEC/IEEE Standard for Binary Floating-Point
3019| Arithmetic, except that the conversion is always rounded toward zero.
3020| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3021| the conversion overflows, the largest integer with the same sign as `a' is
3022| returned.
3023*----------------------------------------------------------------------------*/
3024
3025int32 float64_to_int32_round_to_zero( float64 a STATUS_PARAM )
3026{
3027 flag aSign;
94a49d86 3028 int_fast16_t aExp, shiftCount;
bb98fe42 3029 uint64_t aSig, savedASig;
b3a6a2e0 3030 int32_t z;
37d18660 3031 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3032
3033 aSig = extractFloat64Frac( a );
3034 aExp = extractFloat64Exp( a );
3035 aSign = extractFloat64Sign( a );
3036 if ( 0x41E < aExp ) {
3037 if ( ( aExp == 0x7FF ) && aSig ) aSign = 0;
3038 goto invalid;
3039 }
3040 else if ( aExp < 0x3FF ) {
3041 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3042 return 0;
3043 }
3044 aSig |= LIT64( 0x0010000000000000 );
3045 shiftCount = 0x433 - aExp;
3046 savedASig = aSig;
3047 aSig >>= shiftCount;
3048 z = aSig;
3049 if ( aSign ) z = - z;
3050 if ( ( z < 0 ) ^ aSign ) {
3051 invalid:
3052 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 3053 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
3054 }
3055 if ( ( aSig<<shiftCount ) != savedASig ) {
3056 STATUS(float_exception_flags) |= float_flag_inexact;
3057 }
3058 return z;
3059
3060}
3061
cbcef455
PM
3062/*----------------------------------------------------------------------------
3063| Returns the result of converting the double-precision floating-point value
3064| `a' to the 16-bit two's complement integer format. The conversion is
3065| performed according to the IEC/IEEE Standard for Binary Floating-Point
3066| Arithmetic, except that the conversion is always rounded toward zero.
3067| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3068| the conversion overflows, the largest integer with the same sign as `a' is
3069| returned.
3070*----------------------------------------------------------------------------*/
3071
94a49d86 3072int_fast16_t float64_to_int16_round_to_zero(float64 a STATUS_PARAM)
cbcef455
PM
3073{
3074 flag aSign;
94a49d86 3075 int_fast16_t aExp, shiftCount;
bb98fe42 3076 uint64_t aSig, savedASig;
cbcef455
PM
3077 int32 z;
3078
3079 aSig = extractFloat64Frac( a );
3080 aExp = extractFloat64Exp( a );
3081 aSign = extractFloat64Sign( a );
3082 if ( 0x40E < aExp ) {
3083 if ( ( aExp == 0x7FF ) && aSig ) {
3084 aSign = 0;
3085 }
3086 goto invalid;
3087 }
3088 else if ( aExp < 0x3FF ) {
3089 if ( aExp || aSig ) {
3090 STATUS(float_exception_flags) |= float_flag_inexact;
3091 }
3092 return 0;
3093 }
3094 aSig |= LIT64( 0x0010000000000000 );
3095 shiftCount = 0x433 - aExp;
3096 savedASig = aSig;
3097 aSig >>= shiftCount;
3098 z = aSig;
3099 if ( aSign ) {
3100 z = - z;
3101 }
3102 if ( ( (int16_t)z < 0 ) ^ aSign ) {
3103 invalid:
3104 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 3105 return aSign ? (int32_t) 0xffff8000 : 0x7FFF;
cbcef455
PM
3106 }
3107 if ( ( aSig<<shiftCount ) != savedASig ) {
3108 STATUS(float_exception_flags) |= float_flag_inexact;
3109 }
3110 return z;
3111}
3112
158142c2
FB
3113/*----------------------------------------------------------------------------
3114| Returns the result of converting the double-precision floating-point value
3115| `a' to the 64-bit two's complement integer format. The conversion is
3116| performed according to the IEC/IEEE Standard for Binary Floating-Point
3117| Arithmetic---which means in particular that the conversion is rounded
3118| according to the current rounding mode. If `a' is a NaN, the largest
3119| positive integer is returned. Otherwise, if the conversion overflows, the
3120| largest integer with the same sign as `a' is returned.
3121*----------------------------------------------------------------------------*/
3122
3123int64 float64_to_int64( float64 a STATUS_PARAM )
3124{
3125 flag aSign;
94a49d86 3126 int_fast16_t aExp, shiftCount;
bb98fe42 3127 uint64_t aSig, aSigExtra;
37d18660 3128 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3129
3130 aSig = extractFloat64Frac( a );
3131 aExp = extractFloat64Exp( a );
3132 aSign = extractFloat64Sign( a );
3133 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3134 shiftCount = 0x433 - aExp;
3135 if ( shiftCount <= 0 ) {
3136 if ( 0x43E < aExp ) {
3137 float_raise( float_flag_invalid STATUS_VAR);
3138 if ( ! aSign
3139 || ( ( aExp == 0x7FF )
3140 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3141 ) {
3142 return LIT64( 0x7FFFFFFFFFFFFFFF );
3143 }
bb98fe42 3144 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
3145 }
3146 aSigExtra = 0;
3147 aSig <<= - shiftCount;
3148 }
3149 else {
3150 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
3151 }
3152 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
3153
3154}
3155
3156/*----------------------------------------------------------------------------
3157| Returns the result of converting the double-precision floating-point value
3158| `a' to the 64-bit two's complement integer format. The conversion is
3159| performed according to the IEC/IEEE Standard for Binary Floating-Point
3160| Arithmetic, except that the conversion is always rounded toward zero.
3161| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
3162| the conversion overflows, the largest integer with the same sign as `a' is
3163| returned.
3164*----------------------------------------------------------------------------*/
3165
3166int64 float64_to_int64_round_to_zero( float64 a STATUS_PARAM )
3167{
3168 flag aSign;
94a49d86 3169 int_fast16_t aExp, shiftCount;
bb98fe42 3170 uint64_t aSig;
158142c2 3171 int64 z;
37d18660 3172 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3173
3174 aSig = extractFloat64Frac( a );
3175 aExp = extractFloat64Exp( a );
3176 aSign = extractFloat64Sign( a );
3177 if ( aExp ) aSig |= LIT64( 0x0010000000000000 );
3178 shiftCount = aExp - 0x433;
3179 if ( 0 <= shiftCount ) {
3180 if ( 0x43E <= aExp ) {
f090c9d4 3181 if ( float64_val(a) != LIT64( 0xC3E0000000000000 ) ) {
158142c2
FB
3182 float_raise( float_flag_invalid STATUS_VAR);
3183 if ( ! aSign
3184 || ( ( aExp == 0x7FF )
3185 && ( aSig != LIT64( 0x0010000000000000 ) ) )
3186 ) {
3187 return LIT64( 0x7FFFFFFFFFFFFFFF );
3188 }
3189 }
bb98fe42 3190 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
3191 }
3192 z = aSig<<shiftCount;
3193 }
3194 else {
3195 if ( aExp < 0x3FE ) {
3196 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
3197 return 0;
3198 }
3199 z = aSig>>( - shiftCount );
bb98fe42 3200 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
158142c2
FB
3201 STATUS(float_exception_flags) |= float_flag_inexact;
3202 }
3203 }
3204 if ( aSign ) z = - z;
3205 return z;
3206
3207}
3208
3209/*----------------------------------------------------------------------------
3210| Returns the result of converting the double-precision floating-point value
3211| `a' to the single-precision floating-point format. The conversion is
3212| performed according to the IEC/IEEE Standard for Binary Floating-Point
3213| Arithmetic.
3214*----------------------------------------------------------------------------*/
3215
3216float32 float64_to_float32( float64 a STATUS_PARAM )
3217{
3218 flag aSign;
94a49d86 3219 int_fast16_t aExp;
bb98fe42
AF
3220 uint64_t aSig;
3221 uint32_t zSig;
37d18660 3222 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3223
3224 aSig = extractFloat64Frac( a );
3225 aExp = extractFloat64Exp( a );
3226 aSign = extractFloat64Sign( a );
3227 if ( aExp == 0x7FF ) {
bcd4d9af 3228 if ( aSig ) return commonNaNToFloat32( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
3229 return packFloat32( aSign, 0xFF, 0 );
3230 }
3231 shift64RightJamming( aSig, 22, &aSig );
3232 zSig = aSig;
3233 if ( aExp || zSig ) {
3234 zSig |= 0x40000000;
3235 aExp -= 0x381;
3236 }
3237 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
3238
3239}
3240
60011498
PB
3241
3242/*----------------------------------------------------------------------------
3243| Packs the sign `zSign', exponent `zExp', and significand `zSig' into a
3244| half-precision floating-point value, returning the result. After being
3245| shifted into the proper positions, the three fields are simply added
3246| together to form the result. This means that any integer portion of `zSig'
3247| will be added into the exponent. Since a properly normalized significand
3248| will have an integer portion equal to 1, the `zExp' input should be 1 less
3249| than the desired result exponent whenever `zSig' is a complete, normalized
3250| significand.
3251*----------------------------------------------------------------------------*/
94a49d86 3252static float16 packFloat16(flag zSign, int_fast16_t zExp, uint16_t zSig)
60011498 3253{
bb4d4bb3 3254 return make_float16(
bb98fe42 3255 (((uint32_t)zSign) << 15) + (((uint32_t)zExp) << 10) + zSig);
60011498
PB
3256}
3257
c4a1c5e7
PM
3258/*----------------------------------------------------------------------------
3259| Takes an abstract floating-point value having sign `zSign', exponent `zExp',
3260| and significand `zSig', and returns the proper half-precision floating-
3261| point value corresponding to the abstract input. Ordinarily, the abstract
3262| value is simply rounded and packed into the half-precision format, with
3263| the inexact exception raised if the abstract input cannot be represented
3264| exactly. However, if the abstract value is too large, the overflow and
3265| inexact exceptions are raised and an infinity or maximal finite value is
3266| returned. If the abstract value is too small, the input value is rounded to
3267| a subnormal number, and the underflow and inexact exceptions are raised if
3268| the abstract input cannot be represented exactly as a subnormal half-
3269| precision floating-point number.
3270| The `ieee' flag indicates whether to use IEEE standard half precision, or
3271| ARM-style "alternative representation", which omits the NaN and Inf
3272| encodings in order to raise the maximum representable exponent by one.
3273| The input significand `zSig' has its binary point between bits 22
3274| and 23, which is 13 bits to the left of the usual location. This shifted
3275| significand must be normalized or smaller. If `zSig' is not normalized,
3276| `zExp' must be 0; in that case, the result returned is a subnormal number,
3277| and it must not require rounding. In the usual case that `zSig' is
3278| normalized, `zExp' must be 1 less than the ``true'' floating-point exponent.
3279| Note the slightly odd position of the binary point in zSig compared with the
3280| other roundAndPackFloat functions. This should probably be fixed if we
3281| need to implement more float16 routines than just conversion.
3282| The handling of underflow and overflow follows the IEC/IEEE Standard for
3283| Binary Floating-Point Arithmetic.
3284*----------------------------------------------------------------------------*/
3285
3286static float32 roundAndPackFloat16(flag zSign, int_fast16_t zExp,
3287 uint32_t zSig, flag ieee STATUS_PARAM)
3288{
3289 int maxexp = ieee ? 29 : 30;
3290 uint32_t mask;
3291 uint32_t increment;
c4a1c5e7
PM
3292 bool rounding_bumps_exp;
3293 bool is_tiny = false;
3294
3295 /* Calculate the mask of bits of the mantissa which are not
3296 * representable in half-precision and will be lost.
3297 */
3298 if (zExp < 1) {
3299 /* Will be denormal in halfprec */
3300 mask = 0x00ffffff;
3301 if (zExp >= -11) {
3302 mask >>= 11 + zExp;
3303 }
3304 } else {
3305 /* Normal number in halfprec */
3306 mask = 0x00001fff;
3307 }
3308
dc355b76 3309 switch (STATUS(float_rounding_mode)) {
c4a1c5e7
PM
3310 case float_round_nearest_even:
3311 increment = (mask + 1) >> 1;
3312 if ((zSig & mask) == increment) {
3313 increment = zSig & (increment << 1);
3314 }
3315 break;
f9288a76
PM
3316 case float_round_ties_away:
3317 increment = (mask + 1) >> 1;
3318 break;
c4a1c5e7
PM
3319 case float_round_up:
3320 increment = zSign ? 0 : mask;
3321 break;
3322 case float_round_down:
3323 increment = zSign ? mask : 0;
3324 break;
3325 default: /* round_to_zero */
3326 increment = 0;
3327 break;
3328 }
3329
3330 rounding_bumps_exp = (zSig + increment >= 0x01000000);
3331
3332 if (zExp > maxexp || (zExp == maxexp && rounding_bumps_exp)) {
3333 if (ieee) {
3334 float_raise(float_flag_overflow | float_flag_inexact STATUS_VAR);
3335 return packFloat16(zSign, 0x1f, 0);
3336 } else {
3337 float_raise(float_flag_invalid STATUS_VAR);
3338 return packFloat16(zSign, 0x1f, 0x3ff);
3339 }
3340 }
3341
3342 if (zExp < 0) {
3343 /* Note that flush-to-zero does not affect half-precision results */
3344 is_tiny =
3345 (STATUS(float_detect_tininess) == float_tininess_before_rounding)
3346 || (zExp < -1)
3347 || (!rounding_bumps_exp);
3348 }
3349 if (zSig & mask) {
3350 float_raise(float_flag_inexact STATUS_VAR);
3351 if (is_tiny) {
3352 float_raise(float_flag_underflow STATUS_VAR);
3353 }
3354 }
3355
3356 zSig += increment;
3357 if (rounding_bumps_exp) {
3358 zSig >>= 1;
3359 zExp++;
3360 }
3361
3362 if (zExp < -10) {
3363 return packFloat16(zSign, 0, 0);
3364 }
3365 if (zExp < 0) {
3366 zSig >>= -zExp;
3367 zExp = 0;
3368 }
3369 return packFloat16(zSign, zExp, zSig >> 13);
3370}
3371
/* Normalizes the subnormal half-precision significand `aSig'.  The shifted
 * significand is stored through `zSigPtr' and the matching exponent
 * (1 minus the shift applied) through `zExpPtr'.
 */
static void normalizeFloat16Subnormal(uint32_t aSig, int_fast16_t *zExpPtr,
                                      uint32_t *zSigPtr)
{
    int8_t shift = countLeadingZeros32(aSig) - 21;

    *zExpPtr = 1 - shift;
    *zSigPtr = aSig << shift;
}
3379
60011498
PB
3380/* Half precision floats come in two formats: standard IEEE and "ARM" format.
3381 The latter gains extra exponent range by omitting the NaN/Inf encodings. */
bb4d4bb3
PM
3382
3383float32 float16_to_float32(float16 a, flag ieee STATUS_PARAM)
60011498
PB
3384{
3385 flag aSign;
94a49d86 3386 int_fast16_t aExp;
bb98fe42 3387 uint32_t aSig;
60011498 3388
bb4d4bb3
PM
3389 aSign = extractFloat16Sign(a);
3390 aExp = extractFloat16Exp(a);
3391 aSig = extractFloat16Frac(a);
60011498
PB
3392
3393 if (aExp == 0x1f && ieee) {
3394 if (aSig) {
f591e1be 3395 return commonNaNToFloat32(float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
60011498 3396 }
4be8eeac 3397 return packFloat32(aSign, 0xff, 0);
60011498
PB
3398 }
3399 if (aExp == 0) {
60011498
PB
3400 if (aSig == 0) {
3401 return packFloat32(aSign, 0, 0);
3402 }
3403
c4a1c5e7
PM
3404 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3405 aExp--;
60011498
PB
3406 }
3407 return packFloat32( aSign, aExp + 0x70, aSig << 13);
3408}
3409
bb4d4bb3 3410float16 float32_to_float16(float32 a, flag ieee STATUS_PARAM)
60011498
PB
3411{
3412 flag aSign;
94a49d86 3413 int_fast16_t aExp;
bb98fe42 3414 uint32_t aSig;
38970efa 3415
37d18660 3416 a = float32_squash_input_denormal(a STATUS_VAR);
60011498
PB
3417
3418 aSig = extractFloat32Frac( a );
3419 aExp = extractFloat32Exp( a );
3420 aSign = extractFloat32Sign( a );
3421 if ( aExp == 0xFF ) {
3422 if (aSig) {
600e30d2 3423 /* Input is a NaN */
600e30d2 3424 if (!ieee) {
38970efa 3425 float_raise(float_flag_invalid STATUS_VAR);
600e30d2
PM
3426 return packFloat16(aSign, 0, 0);
3427 }
38970efa
PM
3428 return commonNaNToFloat16(
3429 float32ToCommonNaN(a STATUS_VAR) STATUS_VAR);
60011498 3430 }
600e30d2
PM
3431 /* Infinity */
3432 if (!ieee) {
3433 float_raise(float_flag_invalid STATUS_VAR);
3434 return packFloat16(aSign, 0x1f, 0x3ff);
3435 }
3436 return packFloat16(aSign, 0x1f, 0);
60011498 3437 }
600e30d2 3438 if (aExp == 0 && aSig == 0) {
60011498
PB
3439 return packFloat16(aSign, 0, 0);
3440 }
38970efa
PM
3441 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3442 * even if the input is denormal; however this is harmless because
3443 * the largest possible single-precision denormal is still smaller
3444 * than the smallest representable half-precision denormal, and so we
3445 * will end up ignoring aSig and returning via the "always return zero"
3446 * codepath.
3447 */
60011498 3448 aSig |= 0x00800000;
c4a1c5e7 3449 aExp -= 0x71;
60011498 3450
c4a1c5e7 3451 return roundAndPackFloat16(aSign, aExp, aSig, ieee STATUS_VAR);
60011498
PB
3452}
3453
14c9a07e
PM
3454float64 float16_to_float64(float16 a, flag ieee STATUS_PARAM)
3455{
3456 flag aSign;
3457 int_fast16_t aExp;
3458 uint32_t aSig;
3459
3460 aSign = extractFloat16Sign(a);
3461 aExp = extractFloat16Exp(a);
3462 aSig = extractFloat16Frac(a);
3463
3464 if (aExp == 0x1f && ieee) {
3465 if (aSig) {
3466 return commonNaNToFloat64(
3467 float16ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3468 }
3469 return packFloat64(aSign, 0x7ff, 0);
3470 }
3471 if (aExp == 0) {
3472 if (aSig == 0) {
3473 return packFloat64(aSign, 0, 0);
3474 }
3475
3476 normalizeFloat16Subnormal(aSig, &aExp, &aSig);
3477 aExp--;
3478 }
3479 return packFloat64(aSign, aExp + 0x3f0, ((uint64_t)aSig) << 42);
3480}
3481
3482float16 float64_to_float16(float64 a, flag ieee STATUS_PARAM)
3483{
3484 flag aSign;
3485 int_fast16_t aExp;
3486 uint64_t aSig;
3487 uint32_t zSig;
3488
3489 a = float64_squash_input_denormal(a STATUS_VAR);
3490
3491 aSig = extractFloat64Frac(a);
3492 aExp = extractFloat64Exp(a);
3493 aSign = extractFloat64Sign(a);
3494 if (aExp == 0x7FF) {
3495 if (aSig) {
3496 /* Input is a NaN */
3497 if (!ieee) {
3498 float_raise(float_flag_invalid STATUS_VAR);
3499 return packFloat16(aSign, 0, 0);
3500 }
3501 return commonNaNToFloat16(
3502 float64ToCommonNaN(a STATUS_VAR) STATUS_VAR);
3503 }
3504 /* Infinity */
3505 if (!ieee) {
3506 float_raise(float_flag_invalid STATUS_VAR);
3507 return packFloat16(aSign, 0x1f, 0x3ff);
3508 }
3509 return packFloat16(aSign, 0x1f, 0);
3510 }
3511 shift64RightJamming(aSig, 29, &aSig);
3512 zSig = aSig;
3513 if (aExp == 0 && zSig == 0) {
3514 return packFloat16(aSign, 0, 0);
3515 }
3516 /* Decimal point between bits 22 and 23. Note that we add the 1 bit
3517 * even if the input is denormal; however this is harmless because
3518 * the largest possible single-precision denormal is still smaller
3519 * than the smallest representable half-precision denormal, and so we
3520 * will end up ignoring aSig and returning via the "always return zero"
3521 * codepath.
3522 */
3523 zSig |= 0x00800000;
3524 aExp -= 0x3F1;
3525
3526 return roundAndPackFloat16(aSign, aExp, zSig, ieee STATUS_VAR);
3527}
3528
158142c2
FB
3529/*----------------------------------------------------------------------------
3530| Returns the result of converting the double-precision floating-point value
3531| `a' to the extended double-precision floating-point format. The conversion
3532| is performed according to the IEC/IEEE Standard for Binary Floating-Point
3533| Arithmetic.
3534*----------------------------------------------------------------------------*/
3535
3536floatx80 float64_to_floatx80( float64 a STATUS_PARAM )
3537{
3538 flag aSign;
94a49d86 3539 int_fast16_t aExp;
bb98fe42 3540 uint64_t aSig;
158142c2 3541
37d18660 3542 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3543 aSig = extractFloat64Frac( a );
3544 aExp = extractFloat64Exp( a );
3545 aSign = extractFloat64Sign( a );
3546 if ( aExp == 0x7FF ) {
bcd4d9af 3547 if ( aSig ) return commonNaNToFloatx80( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
3548 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
3549 }
3550 if ( aExp == 0 ) {
3551 if ( aSig == 0 ) return packFloatx80( aSign, 0, 0 );
3552 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3553 }
3554 return
3555 packFloatx80(
3556 aSign, aExp + 0x3C00, ( aSig | LIT64( 0x0010000000000000 ) )<<11 );
3557
3558}
3559
158142c2
FB
3560/*----------------------------------------------------------------------------
3561| Returns the result of converting the double-precision floating-point value
3562| `a' to the quadruple-precision floating-point format. The conversion is
3563| performed according to the IEC/IEEE Standard for Binary Floating-Point
3564| Arithmetic.
3565*----------------------------------------------------------------------------*/
3566
3567float128 float64_to_float128( float64 a STATUS_PARAM )
3568{
3569 flag aSign;
94a49d86 3570 int_fast16_t aExp;
bb98fe42 3571 uint64_t aSig, zSig0, zSig1;
158142c2 3572
37d18660 3573 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3574 aSig = extractFloat64Frac( a );
3575 aExp = extractFloat64Exp( a );
3576 aSign = extractFloat64Sign( a );
3577 if ( aExp == 0x7FF ) {
bcd4d9af 3578 if ( aSig ) return commonNaNToFloat128( float64ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
3579 return packFloat128( aSign, 0x7FFF, 0, 0 );
3580 }
3581 if ( aExp == 0 ) {
3582 if ( aSig == 0 ) return packFloat128( aSign, 0, 0, 0 );
3583 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3584 --aExp;
3585 }
3586 shift128Right( aSig, 0, 4, &zSig0, &zSig1 );
3587 return packFloat128( aSign, aExp + 0x3C00, zSig0, zSig1 );
3588
3589}
3590
158142c2
FB
3591/*----------------------------------------------------------------------------
3592| Rounds the double-precision floating-point value `a' to an integer, and
3593| returns the result as a double-precision floating-point value. The
3594| operation is performed according to the IEC/IEEE Standard for Binary
3595| Floating-Point Arithmetic.
3596*----------------------------------------------------------------------------*/
3597
3598float64 float64_round_to_int( float64 a STATUS_PARAM )
3599{
3600 flag aSign;
94a49d86 3601 int_fast16_t aExp;
bb98fe42 3602 uint64_t lastBitMask, roundBitsMask;
bb98fe42 3603 uint64_t z;
37d18660 3604 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
3605
3606 aExp = extractFloat64Exp( a );
3607 if ( 0x433 <= aExp ) {
3608 if ( ( aExp == 0x7FF ) && extractFloat64Frac( a ) ) {
3609 return propagateFloat64NaN( a, a STATUS_VAR );
3610 }
3611 return a;
3612 }
3613 if ( aExp < 0x3FF ) {
bb98fe42 3614 if ( (uint64_t) ( float64_val(a)<<1 ) == 0 ) return a;
158142c2
FB
3615 STATUS(float_exception_flags) |= float_flag_inexact;
3616 aSign = extractFloat64Sign( a );
3617 switch ( STATUS(float_rounding_mode) ) {
3618 case float_round_nearest_even:
3619 if ( ( aExp == 0x3FE ) && extractFloat64Frac( a ) ) {
3620 return packFloat64( aSign, 0x3FF, 0 );
3621 }
3622 break;
f9288a76
PM
3623 case float_round_ties_away:
3624 if (aExp == 0x3FE) {
3625 return packFloat64(aSign, 0x3ff, 0);
3626 }
3627 break;
158142c2 3628 case float_round_down:
f090c9d4 3629 return make_float64(aSign ? LIT64( 0xBFF0000000000000 ) : 0);
158142c2 3630 case float_round_up:
f090c9d4
PB
3631 return make_float64(
3632 aSign ? LIT64( 0x8000000000000000 ) : LIT64( 0x3FF0000000000000 ));
158142c2
FB
3633 }
3634 return packFloat64( aSign, 0, 0 );
3635 }
3636 lastBitMask = 1;
3637 lastBitMask <<= 0x433 - aExp;
3638 roundBitsMask = lastBitMask - 1;
f090c9d4 3639 z = float64_val(a);
dc355b76
PM
3640 switch (STATUS(float_rounding_mode)) {
3641 case float_round_nearest_even:
3642 z += lastBitMask >> 1;
3643 if ((z & roundBitsMask) == 0) {
3644 z &= ~lastBitMask;
3645 }
3646 break;
f9288a76
PM
3647 case float_round_ties_away:
3648 z += lastBitMask >> 1;
3649 break;
dc355b76
PM
3650 case float_round_to_zero:
3651 break;
3652 case float_round_up:
3653 if (!extractFloat64Sign(make_float64(z))) {
3654 z += roundBitsMask;
3655 }
3656 break;
3657 case float_round_down:
3658 if (extractFloat64Sign(make_float64(z))) {
158142c2
FB
3659 z += roundBitsMask;
3660 }
dc355b76
PM
3661 break;
3662 default:
3663 abort();
158142c2
FB
3664 }
3665 z &= ~ roundBitsMask;
f090c9d4
PB
3666 if ( z != float64_val(a) )
3667 STATUS(float_exception_flags) |= float_flag_inexact;
3668 return make_float64(z);
158142c2
FB
3669
3670}
3671
e6e5906b
PB
3672float64 float64_trunc_to_int( float64 a STATUS_PARAM)
3673{
3674 int oldmode;
3675 float64 res;
3676 oldmode = STATUS(float_rounding_mode);
3677 STATUS(float_rounding_mode) = float_round_to_zero;
3678 res = float64_round_to_int(a STATUS_VAR);
3679 STATUS(float_rounding_mode) = oldmode;
3680 return res;
3681}
3682
158142c2
FB
3683/*----------------------------------------------------------------------------
3684| Returns the result of adding the absolute values of the double-precision
3685| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
3686| before being returned. `zSign' is ignored if the result is a NaN.
3687| The addition is performed according to the IEC/IEEE Standard for Binary
3688| Floating-Point Arithmetic.
3689*----------------------------------------------------------------------------*/
3690
3691static float64 addFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3692{
94a49d86 3693 int_fast16_t aExp, bExp, zExp;
bb98fe42 3694 uint64_t aSig, bSig, zSig;
94a49d86 3695 int_fast16_t expDiff;
158142c2
FB
3696
3697 aSig = extractFloat64Frac( a );
3698 aExp = extractFloat64Exp( a );
3699 bSig = extractFloat64Frac( b );
3700 bExp = extractFloat64Exp( b );
3701 expDiff = aExp - bExp;
3702 aSig <<= 9;
3703 bSig <<= 9;
3704 if ( 0 < expDiff ) {
3705 if ( aExp == 0x7FF ) {
3706 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3707 return a;
3708 }
3709 if ( bExp == 0 ) {
3710 --expDiff;
3711 }
3712 else {
3713 bSig |= LIT64( 0x2000000000000000 );
3714 }
3715 shift64RightJamming( bSig, expDiff, &bSig );
3716 zExp = aExp;
3717 }
3718 else if ( expDiff < 0 ) {
3719 if ( bExp == 0x7FF ) {
3720 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3721 return packFloat64( zSign, 0x7FF, 0 );
3722 }
3723 if ( aExp == 0 ) {
3724 ++expDiff;
3725 }
3726 else {
3727 aSig |= LIT64( 0x2000000000000000 );
3728 }
3729 shift64RightJamming( aSig, - expDiff, &aSig );
3730 zExp = bExp;
3731 }
3732 else {
3733 if ( aExp == 0x7FF ) {
3734 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3735 return a;
3736 }
fe76d976 3737 if ( aExp == 0 ) {
e6afc87f
PM
3738 if (STATUS(flush_to_zero)) {
3739 if (aSig | bSig) {
3740 float_raise(float_flag_output_denormal STATUS_VAR);
3741 }
3742 return packFloat64(zSign, 0, 0);
3743 }
fe76d976
PB
3744 return packFloat64( zSign, 0, ( aSig + bSig )>>9 );
3745 }
158142c2
FB
3746 zSig = LIT64( 0x4000000000000000 ) + aSig + bSig;
3747 zExp = aExp;
3748 goto roundAndPack;
3749 }
3750 aSig |= LIT64( 0x2000000000000000 );
3751 zSig = ( aSig + bSig )<<1;
3752 --zExp;
bb98fe42 3753 if ( (int64_t) zSig < 0 ) {
158142c2
FB
3754 zSig = aSig + bSig;
3755 ++zExp;
3756 }
3757 roundAndPack:
3758 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3759
3760}
3761
3762/*----------------------------------------------------------------------------
3763| Returns the result of subtracting the absolute values of the double-
3764| precision floating-point values `a' and `b'. If `zSign' is 1, the
3765| difference is negated before being returned. `zSign' is ignored if the
3766| result is a NaN. The subtraction is performed according to the IEC/IEEE
3767| Standard for Binary Floating-Point Arithmetic.
3768*----------------------------------------------------------------------------*/
3769
3770static float64 subFloat64Sigs( float64 a, float64 b, flag zSign STATUS_PARAM )
3771{
94a49d86 3772 int_fast16_t aExp, bExp, zExp;
bb98fe42 3773 uint64_t aSig, bSig, zSig;
94a49d86 3774 int_fast16_t expDiff;
158142c2
FB
3775
3776 aSig = extractFloat64Frac( a );
3777 aExp = extractFloat64Exp( a );
3778 bSig = extractFloat64Frac( b );
3779 bExp = extractFloat64Exp( b );
3780 expDiff = aExp - bExp;
3781 aSig <<= 10;
3782 bSig <<= 10;
3783 if ( 0 < expDiff ) goto aExpBigger;
3784 if ( expDiff < 0 ) goto bExpBigger;
3785 if ( aExp == 0x7FF ) {
3786 if ( aSig | bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3787 float_raise( float_flag_invalid STATUS_VAR);
3788 return float64_default_nan;
3789 }
3790 if ( aExp == 0 ) {
3791 aExp = 1;
3792 bExp = 1;
3793 }
3794 if ( bSig < aSig ) goto aBigger;
3795 if ( aSig < bSig ) goto bBigger;
3796 return packFloat64( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
3797 bExpBigger:
3798 if ( bExp == 0x7FF ) {
3799 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3800 return packFloat64( zSign ^ 1, 0x7FF, 0 );
3801 }
3802 if ( aExp == 0 ) {
3803 ++expDiff;
3804 }
3805 else {
3806 aSig |= LIT64( 0x4000000000000000 );
3807 }
3808 shift64RightJamming( aSig, - expDiff, &aSig );
3809 bSig |= LIT64( 0x4000000000000000 );
3810 bBigger:
3811 zSig = bSig - aSig;
3812 zExp = bExp;
3813 zSign ^= 1;
3814 goto normalizeRoundAndPack;
3815 aExpBigger:
3816 if ( aExp == 0x7FF ) {
3817 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3818 return a;
3819 }
3820 if ( bExp == 0 ) {
3821 --expDiff;
3822 }
3823 else {
3824 bSig |= LIT64( 0x4000000000000000 );
3825 }
3826 shift64RightJamming( bSig, expDiff, &bSig );
3827 aSig |= LIT64( 0x4000000000000000 );
3828 aBigger:
3829 zSig = aSig - bSig;
3830 zExp = aExp;
3831 normalizeRoundAndPack:
3832 --zExp;
3833 return normalizeRoundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
3834
3835}
3836
3837/*----------------------------------------------------------------------------
3838| Returns the result of adding the double-precision floating-point values `a'
3839| and `b'. The operation is performed according to the IEC/IEEE Standard for
3840| Binary Floating-Point Arithmetic.
3841*----------------------------------------------------------------------------*/
3842
3843float64 float64_add( float64 a, float64 b STATUS_PARAM )
3844{
3845 flag aSign, bSign;
37d18660
PM
3846 a = float64_squash_input_denormal(a STATUS_VAR);
3847 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3848
3849 aSign = extractFloat64Sign( a );
3850 bSign = extractFloat64Sign( b );
3851 if ( aSign == bSign ) {
3852 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3853 }
3854 else {
3855 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3856 }
3857
3858}
3859
3860/*----------------------------------------------------------------------------
3861| Returns the result of subtracting the double-precision floating-point values
3862| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3863| for Binary Floating-Point Arithmetic.
3864*----------------------------------------------------------------------------*/
3865
3866float64 float64_sub( float64 a, float64 b STATUS_PARAM )
3867{
3868 flag aSign, bSign;
37d18660
PM
3869 a = float64_squash_input_denormal(a STATUS_VAR);
3870 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3871
3872 aSign = extractFloat64Sign( a );
3873 bSign = extractFloat64Sign( b );
3874 if ( aSign == bSign ) {
3875 return subFloat64Sigs( a, b, aSign STATUS_VAR );
3876 }
3877 else {
3878 return addFloat64Sigs( a, b, aSign STATUS_VAR );
3879 }
3880
3881}
3882
3883/*----------------------------------------------------------------------------
3884| Returns the result of multiplying the double-precision floating-point values
3885| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
3886| for Binary Floating-Point Arithmetic.
3887*----------------------------------------------------------------------------*/
3888
3889float64 float64_mul( float64 a, float64 b STATUS_PARAM )
3890{
3891 flag aSign, bSign, zSign;
94a49d86 3892 int_fast16_t aExp, bExp, zExp;
bb98fe42 3893 uint64_t aSig, bSig, zSig0, zSig1;
158142c2 3894
37d18660
PM
3895 a = float64_squash_input_denormal(a STATUS_VAR);
3896 b = float64_squash_input_denormal(b STATUS_VAR);
3897
158142c2
FB
3898 aSig = extractFloat64Frac( a );
3899 aExp = extractFloat64Exp( a );
3900 aSign = extractFloat64Sign( a );
3901 bSig = extractFloat64Frac( b );
3902 bExp = extractFloat64Exp( b );
3903 bSign = extractFloat64Sign( b );
3904 zSign = aSign ^ bSign;
3905 if ( aExp == 0x7FF ) {
3906 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
3907 return propagateFloat64NaN( a, b STATUS_VAR );
3908 }
3909 if ( ( bExp | bSig ) == 0 ) {
3910 float_raise( float_flag_invalid STATUS_VAR);
3911 return float64_default_nan;
3912 }
3913 return packFloat64( zSign, 0x7FF, 0 );
3914 }
3915 if ( bExp == 0x7FF ) {
3916 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3917 if ( ( aExp | aSig ) == 0 ) {
3918 float_raise( float_flag_invalid STATUS_VAR);
3919 return float64_default_nan;
3920 }
3921 return packFloat64( zSign, 0x7FF, 0 );
3922 }
3923 if ( aExp == 0 ) {
3924 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3925 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3926 }
3927 if ( bExp == 0 ) {
3928 if ( bSig == 0 ) return packFloat64( zSign, 0, 0 );
3929 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3930 }
3931 zExp = aExp + bExp - 0x3FF;
3932 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3933 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3934 mul64To128( aSig, bSig, &zSig0, &zSig1 );
3935 zSig0 |= ( zSig1 != 0 );
bb98fe42 3936 if ( 0 <= (int64_t) ( zSig0<<1 ) ) {
158142c2
FB
3937 zSig0 <<= 1;
3938 --zExp;
3939 }
3940 return roundAndPackFloat64( zSign, zExp, zSig0 STATUS_VAR );
3941
3942}
3943
3944/*----------------------------------------------------------------------------
3945| Returns the result of dividing the double-precision floating-point value `a'
3946| by the corresponding value `b'. The operation is performed according to
3947| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
3948*----------------------------------------------------------------------------*/
3949
3950float64 float64_div( float64 a, float64 b STATUS_PARAM )
3951{
3952 flag aSign, bSign, zSign;
94a49d86 3953 int_fast16_t aExp, bExp, zExp;
bb98fe42
AF
3954 uint64_t aSig, bSig, zSig;
3955 uint64_t rem0, rem1;
3956 uint64_t term0, term1;
37d18660
PM
3957 a = float64_squash_input_denormal(a STATUS_VAR);
3958 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
3959
3960 aSig = extractFloat64Frac( a );
3961 aExp = extractFloat64Exp( a );
3962 aSign = extractFloat64Sign( a );
3963 bSig = extractFloat64Frac( b );
3964 bExp = extractFloat64Exp( b );
3965 bSign = extractFloat64Sign( b );
3966 zSign = aSign ^ bSign;
3967 if ( aExp == 0x7FF ) {
3968 if ( aSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3969 if ( bExp == 0x7FF ) {
3970 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3971 float_raise( float_flag_invalid STATUS_VAR);
3972 return float64_default_nan;
3973 }
3974 return packFloat64( zSign, 0x7FF, 0 );
3975 }
3976 if ( bExp == 0x7FF ) {
3977 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
3978 return packFloat64( zSign, 0, 0 );
3979 }
3980 if ( bExp == 0 ) {
3981 if ( bSig == 0 ) {
3982 if ( ( aExp | aSig ) == 0 ) {
3983 float_raise( float_flag_invalid STATUS_VAR);
3984 return float64_default_nan;
3985 }
3986 float_raise( float_flag_divbyzero STATUS_VAR);
3987 return packFloat64( zSign, 0x7FF, 0 );
3988 }
3989 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
3990 }
3991 if ( aExp == 0 ) {
3992 if ( aSig == 0 ) return packFloat64( zSign, 0, 0 );
3993 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
3994 }
3995 zExp = aExp - bExp + 0x3FD;
3996 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<10;
3997 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
3998 if ( bSig <= ( aSig + aSig ) ) {
3999 aSig >>= 1;
4000 ++zExp;
4001 }
4002 zSig = estimateDiv128To64( aSig, 0, bSig );
4003 if ( ( zSig & 0x1FF ) <= 2 ) {
4004 mul64To128( bSig, zSig, &term0, &term1 );
4005 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
bb98fe42 4006 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
4007 --zSig;
4008 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
4009 }
4010 zSig |= ( rem1 != 0 );
4011 }
4012 return roundAndPackFloat64( zSign, zExp, zSig STATUS_VAR );
4013
4014}
4015
4016/*----------------------------------------------------------------------------
4017| Returns the remainder of the double-precision floating-point value `a'
4018| with respect to the corresponding value `b'. The operation is performed
4019| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4020*----------------------------------------------------------------------------*/
4021
4022float64 float64_rem( float64 a, float64 b STATUS_PARAM )
4023{
ed086f3d 4024 flag aSign, zSign;
94a49d86 4025 int_fast16_t aExp, bExp, expDiff;
bb98fe42
AF
4026 uint64_t aSig, bSig;
4027 uint64_t q, alternateASig;
4028 int64_t sigMean;
158142c2 4029
37d18660
PM
4030 a = float64_squash_input_denormal(a STATUS_VAR);
4031 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4032 aSig = extractFloat64Frac( a );
4033 aExp = extractFloat64Exp( a );
4034 aSign = extractFloat64Sign( a );
4035 bSig = extractFloat64Frac( b );
4036 bExp = extractFloat64Exp( b );
158142c2
FB
4037 if ( aExp == 0x7FF ) {
4038 if ( aSig || ( ( bExp == 0x7FF ) && bSig ) ) {
4039 return propagateFloat64NaN( a, b STATUS_VAR );
4040 }
4041 float_raise( float_flag_invalid STATUS_VAR);
4042 return float64_default_nan;
4043 }
4044 if ( bExp == 0x7FF ) {
4045 if ( bSig ) return propagateFloat64NaN( a, b STATUS_VAR );
4046 return a;
4047 }
4048 if ( bExp == 0 ) {
4049 if ( bSig == 0 ) {
4050 float_raise( float_flag_invalid STATUS_VAR);
4051 return float64_default_nan;
4052 }
4053 normalizeFloat64Subnormal( bSig, &bExp, &bSig );
4054 }
4055 if ( aExp == 0 ) {
4056 if ( aSig == 0 ) return a;
4057 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4058 }
4059 expDiff = aExp - bExp;
4060 aSig = ( aSig | LIT64( 0x0010000000000000 ) )<<11;
4061 bSig = ( bSig | LIT64( 0x0010000000000000 ) )<<11;
4062 if ( expDiff < 0 ) {
4063 if ( expDiff < -1 ) return a;
4064 aSig >>= 1;
4065 }
4066 q = ( bSig <= aSig );
4067 if ( q ) aSig -= bSig;
4068 expDiff -= 64;
4069 while ( 0 < expDiff ) {
4070 q = estimateDiv128To64( aSig, 0, bSig );
4071 q = ( 2 < q ) ? q - 2 : 0;
4072 aSig = - ( ( bSig>>2 ) * q );
4073 expDiff -= 62;
4074 }
4075 expDiff += 64;
4076 if ( 0 < expDiff ) {
4077 q = estimateDiv128To64( aSig, 0, bSig );
4078 q = ( 2 < q ) ? q - 2 : 0;
4079 q >>= 64 - expDiff;
4080 bSig >>= 2;
4081 aSig = ( ( aSig>>1 )<<( expDiff - 1 ) ) - bSig * q;
4082 }
4083 else {
4084 aSig >>= 2;
4085 bSig >>= 2;
4086 }
4087 do {
4088 alternateASig = aSig;
4089 ++q;
4090 aSig -= bSig;
bb98fe42 4091 } while ( 0 <= (int64_t) aSig );
158142c2
FB
4092 sigMean = aSig + alternateASig;
4093 if ( ( sigMean < 0 ) || ( ( sigMean == 0 ) && ( q & 1 ) ) ) {
4094 aSig = alternateASig;
4095 }
bb98fe42 4096 zSign = ( (int64_t) aSig < 0 );
158142c2
FB
4097 if ( zSign ) aSig = - aSig;
4098 return normalizeRoundAndPackFloat64( aSign ^ zSign, bExp, aSig STATUS_VAR );
4099
4100}
4101
369be8f6
PM
4102/*----------------------------------------------------------------------------
4103| Returns the result of multiplying the double-precision floating-point values
4104| `a' and `b' then adding 'c', with no intermediate rounding step after the
4105| multiplication. The operation is performed according to the IEC/IEEE
4106| Standard for Binary Floating-Point Arithmetic 754-2008.
4107| The flags argument allows the caller to select negation of the
4108| addend, the intermediate product, or the final result. (The difference
4109| between this and having the caller do a separate negation is that negating
4110| externally will flip the sign bit on NaNs.)
4111*----------------------------------------------------------------------------*/
4112
4113float64 float64_muladd(float64 a, float64 b, float64 c, int flags STATUS_PARAM)
4114{
4115 flag aSign, bSign, cSign, zSign;
94a49d86 4116 int_fast16_t aExp, bExp, cExp, pExp, zExp, expDiff;
369be8f6
PM
4117 uint64_t aSig, bSig, cSig;
4118 flag pInf, pZero, pSign;
4119 uint64_t pSig0, pSig1, cSig0, cSig1, zSig0, zSig1;
4120 int shiftcount;
4121 flag signflip, infzero;
4122
4123 a = float64_squash_input_denormal(a STATUS_VAR);
4124 b = float64_squash_input_denormal(b STATUS_VAR);
4125 c = float64_squash_input_denormal(c STATUS_VAR);
4126 aSig = extractFloat64Frac(a);
4127 aExp = extractFloat64Exp(a);
4128 aSign = extractFloat64Sign(a);
4129 bSig = extractFloat64Frac(b);
4130 bExp = extractFloat64Exp(b);
4131 bSign = extractFloat64Sign(b);
4132 cSig = extractFloat64Frac(c);
4133 cExp = extractFloat64Exp(c);
4134 cSign = extractFloat64Sign(c);
4135
4136 infzero = ((aExp == 0 && aSig == 0 && bExp == 0x7ff && bSig == 0) ||
4137 (aExp == 0x7ff && aSig == 0 && bExp == 0 && bSig == 0));
4138
4139 /* It is implementation-defined whether the cases of (0,inf,qnan)
4140 * and (inf,0,qnan) raise InvalidOperation or not (and what QNaN
4141 * they return if they do), so we have to hand this information
4142 * off to the target-specific pick-a-NaN routine.
4143 */
4144 if (((aExp == 0x7ff) && aSig) ||
4145 ((bExp == 0x7ff) && bSig) ||
4146 ((cExp == 0x7ff) && cSig)) {
4147 return propagateFloat64MulAddNaN(a, b, c, infzero STATUS_VAR);
4148 }
4149
4150 if (infzero) {
4151 float_raise(float_flag_invalid STATUS_VAR);
4152 return float64_default_nan;
4153 }
4154
4155 if (flags & float_muladd_negate_c) {
4156 cSign ^= 1;
4157 }
4158
4159 signflip = (flags & float_muladd_negate_result) ? 1 : 0;
4160
4161 /* Work out the sign and type of the product */
4162 pSign = aSign ^ bSign;
4163 if (flags & float_muladd_negate_product) {
4164 pSign ^= 1;
4165 }
4166 pInf = (aExp == 0x7ff) || (bExp == 0x7ff);
4167 pZero = ((aExp | aSig) == 0) || ((bExp | bSig) == 0);
4168
4169 if (cExp == 0x7ff) {
4170 if (pInf && (pSign ^ cSign)) {
4171 /* addition of opposite-signed infinities => InvalidOperation */
4172 float_raise(float_flag_invalid STATUS_VAR);
4173 return float64_default_nan;
4174 }
4175 /* Otherwise generate an infinity of the same sign */
4176 return packFloat64(cSign ^ signflip, 0x7ff, 0);
4177 }
4178
4179 if (pInf) {
4180 return packFloat64(pSign ^ signflip, 0x7ff, 0);
4181 }
4182
4183 if (pZero) {
4184 if (cExp == 0) {
4185 if (cSig == 0) {
4186 /* Adding two exact zeroes */
4187 if (pSign == cSign) {
4188 zSign = pSign;
4189 } else if (STATUS(float_rounding_mode) == float_round_down) {
4190 zSign = 1;
4191 } else {
4192 zSign = 0;
4193 }
4194 return packFloat64(zSign ^ signflip, 0, 0);
4195 }
4196 /* Exact zero plus a denorm */
4197 if (STATUS(flush_to_zero)) {
4198 float_raise(float_flag_output_denormal STATUS_VAR);
4199 return packFloat64(cSign ^ signflip, 0, 0);
4200 }
4201 }
4202 /* Zero plus something non-zero : just return the something */
67d43538
PM
4203 if (flags & float_muladd_halve_result) {
4204 if (cExp == 0) {
4205 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4206 }
4207 /* Subtract one to halve, and one again because roundAndPackFloat64
4208 * wants one less than the true exponent.
4209 */
4210 cExp -= 2;
4211 cSig = (cSig | 0x0010000000000000ULL) << 10;
4212 return roundAndPackFloat64(cSign ^ signflip, cExp, cSig STATUS_VAR);
4213 }
a6e7c184 4214 return packFloat64(cSign ^ signflip, cExp, cSig);
369be8f6
PM
4215 }
4216
4217 if (aExp == 0) {
4218 normalizeFloat64Subnormal(aSig, &aExp, &aSig);
4219 }
4220 if (bExp == 0) {
4221 normalizeFloat64Subnormal(bSig, &bExp, &bSig);
4222 }
4223
4224 /* Calculate the actual result a * b + c */
4225
4226 /* Multiply first; this is easy. */
4227 /* NB: we subtract 0x3fe where float64_mul() subtracts 0x3ff
4228 * because we want the true exponent, not the "one-less-than"
4229 * flavour that roundAndPackFloat64() takes.
4230 */
4231 pExp = aExp + bExp - 0x3fe;
4232 aSig = (aSig | LIT64(0x0010000000000000))<<10;
4233 bSig = (bSig | LIT64(0x0010000000000000))<<11;
4234 mul64To128(aSig, bSig, &pSig0, &pSig1);
4235 if ((int64_t)(pSig0 << 1) >= 0) {
4236 shortShift128Left(pSig0, pSig1, 1, &pSig0, &pSig1);
4237 pExp--;
4238 }
4239
4240 zSign = pSign ^ signflip;
4241
4242 /* Now [pSig0:pSig1] is the significand of the multiply, with the explicit
4243 * bit in position 126.
4244 */
4245 if (cExp == 0) {
4246 if (!cSig) {
4247 /* Throw out the special case of c being an exact zero now */
4248 shift128RightJamming(pSig0, pSig1, 64, &pSig0, &pSig1);
67d43538
PM
4249 if (flags & float_muladd_halve_result) {
4250 pExp--;
4251 }
369be8f6
PM
4252 return roundAndPackFloat64(zSign, pExp - 1,
4253 pSig1 STATUS_VAR);
4254 }
4255 normalizeFloat64Subnormal(cSig, &cExp, &cSig);
4256 }
4257
4258 /* Shift cSig and add the explicit bit so [cSig0:cSig1] is the
4259 * significand of the addend, with the explicit bit in position 126.
4260 */
4261 cSig0 = cSig << (126 - 64 - 52);
4262 cSig1 = 0;
4263 cSig0 |= LIT64(0x4000000000000000);
4264 expDiff = pExp - cExp;
4265
4266 if (pSign == cSign) {
4267 /* Addition */
4268 if (expDiff > 0) {
4269 /* scale c to match p */
4270 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4271 zExp = pExp;
4272 } else if (expDiff < 0) {
4273 /* scale p to match c */
4274 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4275 zExp = cExp;
4276 } else {
4277 /* no scaling needed */
4278 zExp = cExp;
4279 }
4280 /* Add significands and make sure explicit bit ends up in posn 126 */
4281 add128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4282 if ((int64_t)zSig0 < 0) {
4283 shift128RightJamming(zSig0, zSig1, 1, &zSig0, &zSig1);
4284 } else {
4285 zExp--;
4286 }
4287 shift128RightJamming(zSig0, zSig1, 64, &zSig0, &zSig1);
67d43538
PM
4288 if (flags & float_muladd_halve_result) {
4289 zExp--;
4290 }
369be8f6
PM
4291 return roundAndPackFloat64(zSign, zExp, zSig1 STATUS_VAR);
4292 } else {
4293 /* Subtraction */
4294 if (expDiff > 0) {
4295 shift128RightJamming(cSig0, cSig1, expDiff, &cSig0, &cSig1);
4296 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4297 zExp = pExp;
4298 } else if (expDiff < 0) {
4299 shift128RightJamming(pSig0, pSig1, -expDiff, &pSig0, &pSig1);
4300 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4301 zExp = cExp;
4302 zSign ^= 1;
4303 } else {
4304 zExp = pExp;
4305 if (lt128(cSig0, cSig1, pSig0, pSig1)) {
4306 sub128(pSig0, pSig1, cSig0, cSig1, &zSig0, &zSig1);
4307 } else if (lt128(pSig0, pSig1, cSig0, cSig1)) {
4308 sub128(cSig0, cSig1, pSig0, pSig1, &zSig0, &zSig1);
4309 zSign ^= 1;
4310 } else {
4311 /* Exact zero */
4312 zSign = signflip;
4313 if (STATUS(float_rounding_mode) == float_round_down) {
4314 zSign ^= 1;
4315 }
4316 return packFloat64(zSign, 0, 0);
4317 }
4318 }
4319 --zExp;
4320 /* Do the equivalent of normalizeRoundAndPackFloat64() but
4321 * starting with the significand in a pair of uint64_t.
4322 */
4323 if (zSig0) {
4324 shiftcount = countLeadingZeros64(zSig0) - 1;
4325 shortShift128Left(zSig0, zSig1, shiftcount, &zSig0, &zSig1);
4326 if (zSig1) {
4327 zSig0 |= 1;
4328 }
4329 zExp -= shiftcount;
4330 } else {
e3d142d0
PM
4331 shiftcount = countLeadingZeros64(zSig1);
4332 if (shiftcount == 0) {
4333 zSig0 = (zSig1 >> 1) | (zSig1 & 1);
4334 zExp -= 63;
4335 } else {
4336 shiftcount--;
4337 zSig0 = zSig1 << shiftcount;
4338 zExp -= (shiftcount + 64);
4339 }
369be8f6 4340 }
67d43538
PM
4341 if (flags & float_muladd_halve_result) {
4342 zExp--;
4343 }
369be8f6
PM
4344 return roundAndPackFloat64(zSign, zExp, zSig0 STATUS_VAR);
4345 }
4346}
4347
158142c2
FB
4348/*----------------------------------------------------------------------------
4349| Returns the square root of the double-precision floating-point value `a'.
4350| The operation is performed according to the IEC/IEEE Standard for Binary
4351| Floating-Point Arithmetic.
4352*----------------------------------------------------------------------------*/
4353
4354float64 float64_sqrt( float64 a STATUS_PARAM )
4355{
4356 flag aSign;
94a49d86 4357 int_fast16_t aExp, zExp;
bb98fe42
AF
4358 uint64_t aSig, zSig, doubleZSig;
4359 uint64_t rem0, rem1, term0, term1;
37d18660 4360 a = float64_squash_input_denormal(a STATUS_VAR);
158142c2
FB
4361
4362 aSig = extractFloat64Frac( a );
4363 aExp = extractFloat64Exp( a );
4364 aSign = extractFloat64Sign( a );
4365 if ( aExp == 0x7FF ) {
4366 if ( aSig ) return propagateFloat64NaN( a, a STATUS_VAR );
4367 if ( ! aSign ) return a;
4368 float_raise( float_flag_invalid STATUS_VAR);
4369 return float64_default_nan;
4370 }
4371 if ( aSign ) {
4372 if ( ( aExp | aSig ) == 0 ) return a;
4373 float_raise( float_flag_invalid STATUS_VAR);
4374 return float64_default_nan;
4375 }
4376 if ( aExp == 0 ) {
f090c9d4 4377 if ( aSig == 0 ) return float64_zero;
158142c2
FB
4378 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4379 }
4380 zExp = ( ( aExp - 0x3FF )>>1 ) + 0x3FE;
4381 aSig |= LIT64( 0x0010000000000000 );
4382 zSig = estimateSqrt32( aExp, aSig>>21 );
4383 aSig <<= 9 - ( aExp & 1 );
4384 zSig = estimateDiv128To64( aSig, 0, zSig<<32 ) + ( zSig<<30 );
4385 if ( ( zSig & 0x1FF ) <= 5 ) {
4386 doubleZSig = zSig<<1;
4387 mul64To128( zSig, zSig, &term0, &term1 );
4388 sub128( aSig, 0, term0, term1, &rem0, &rem1 );
bb98fe42 4389 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
4390 --zSig;
4391 doubleZSig -= 2;
4392 add128( rem0, rem1, zSig>>63, doubleZSig | 1, &rem0, &rem1 );
4393 }
4394 zSig |= ( ( rem0 | rem1 ) != 0 );
4395 }
4396 return roundAndPackFloat64( 0, zExp, zSig STATUS_VAR );
4397
4398}
4399
374dfc33
AJ
4400/*----------------------------------------------------------------------------
4401| Returns the binary log of the double-precision floating-point value `a'.
4402| The operation is performed according to the IEC/IEEE Standard for Binary
4403| Floating-Point Arithmetic.
4404*----------------------------------------------------------------------------*/
4405float64 float64_log2( float64 a STATUS_PARAM )
4406{
4407 flag aSign, zSign;
94a49d86 4408 int_fast16_t aExp;
bb98fe42 4409 uint64_t aSig, aSig0, aSig1, zSig, i;
37d18660 4410 a = float64_squash_input_denormal(a STATUS_VAR);
374dfc33
AJ
4411
4412 aSig = extractFloat64Frac( a );
4413 aExp = extractFloat64Exp( a );
4414 aSign = extractFloat64Sign( a );
4415
4416 if ( aExp == 0 ) {
4417 if ( aSig == 0 ) return packFloat64( 1, 0x7FF, 0 );
4418 normalizeFloat64Subnormal( aSig, &aExp, &aSig );
4419 }
4420 if ( aSign ) {
4421 float_raise( float_flag_invalid STATUS_VAR);
4422 return float64_default_nan;
4423 }
4424 if ( aExp == 0x7FF ) {
4425 if ( aSig ) return propagateFloat64NaN( a, float64_zero STATUS_VAR );
4426 return a;
4427 }
4428
4429 aExp -= 0x3FF;
4430 aSig |= LIT64( 0x0010000000000000 );
4431 zSign = aExp < 0;
bb98fe42 4432 zSig = (uint64_t)aExp << 52;
374dfc33
AJ
4433 for (i = 1LL << 51; i > 0; i >>= 1) {
4434 mul64To128( aSig, aSig, &aSig0, &aSig1 );
4435 aSig = ( aSig0 << 12 ) | ( aSig1 >> 52 );
4436 if ( aSig & LIT64( 0x0020000000000000 ) ) {
4437 aSig >>= 1;
4438 zSig |= i;
4439 }
4440 }
4441
4442 if ( zSign )
4443 zSig = -zSig;
4444 return normalizeRoundAndPackFloat64( zSign, 0x408, zSig STATUS_VAR );
4445}
4446
158142c2
FB
4447/*----------------------------------------------------------------------------
4448| Returns 1 if the double-precision floating-point value `a' is equal to the
b689362d
AJ
4449| corresponding value `b', and 0 otherwise. The invalid exception is raised
4450| if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
4451| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4452*----------------------------------------------------------------------------*/
4453
b689362d 4454int float64_eq( float64 a, float64 b STATUS_PARAM )
158142c2 4455{
bb98fe42 4456 uint64_t av, bv;
37d18660
PM
4457 a = float64_squash_input_denormal(a STATUS_VAR);
4458 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4459
4460 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4461 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4462 ) {
b689362d 4463 float_raise( float_flag_invalid STATUS_VAR);
158142c2
FB
4464 return 0;
4465 }
f090c9d4 4466 av = float64_val(a);
a1b91bb4 4467 bv = float64_val(b);
bb98fe42 4468 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
4469
4470}
4471
4472/*----------------------------------------------------------------------------
4473| Returns 1 if the double-precision floating-point value `a' is less than or
f5a64251
AJ
4474| equal to the corresponding value `b', and 0 otherwise. The invalid
4475| exception is raised if either operand is a NaN. The comparison is performed
4476| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
4477*----------------------------------------------------------------------------*/
4478
750afe93 4479int float64_le( float64 a, float64 b STATUS_PARAM )
158142c2
FB
4480{
4481 flag aSign, bSign;
bb98fe42 4482 uint64_t av, bv;
37d18660
PM
4483 a = float64_squash_input_denormal(a STATUS_VAR);
4484 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4485
4486 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4487 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4488 ) {
4489 float_raise( float_flag_invalid STATUS_VAR);
4490 return 0;
4491 }
4492 aSign = extractFloat64Sign( a );
4493 bSign = extractFloat64Sign( b );
f090c9d4 4494 av = float64_val(a);
a1b91bb4 4495 bv = float64_val(b);
bb98fe42 4496 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 4497 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
4498
4499}
4500
4501/*----------------------------------------------------------------------------
4502| Returns 1 if the double-precision floating-point value `a' is less than
f5a64251
AJ
4503| the corresponding value `b', and 0 otherwise. The invalid exception is
4504| raised if either operand is a NaN. The comparison is performed according
4505| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
4506*----------------------------------------------------------------------------*/
4507
750afe93 4508int float64_lt( float64 a, float64 b STATUS_PARAM )
158142c2
FB
4509{
4510 flag aSign, bSign;
bb98fe42 4511 uint64_t av, bv;
158142c2 4512
37d18660
PM
4513 a = float64_squash_input_denormal(a STATUS_VAR);
4514 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4515 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4516 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4517 ) {
4518 float_raise( float_flag_invalid STATUS_VAR);
4519 return 0;
4520 }
4521 aSign = extractFloat64Sign( a );
4522 bSign = extractFloat64Sign( b );
f090c9d4 4523 av = float64_val(a);
a1b91bb4 4524 bv = float64_val(b);
bb98fe42 4525 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 4526 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
4527
4528}
4529
67b7861d
AJ
4530/*----------------------------------------------------------------------------
4531| Returns 1 if the double-precision floating-point values `a' and `b' cannot
f5a64251
AJ
4532| be compared, and 0 otherwise. The invalid exception is raised if either
4533| operand is a NaN. The comparison is performed according to the IEC/IEEE
4534| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
4535*----------------------------------------------------------------------------*/
4536
4537int float64_unordered( float64 a, float64 b STATUS_PARAM )
4538{
4539 a = float64_squash_input_denormal(a STATUS_VAR);
4540 b = float64_squash_input_denormal(b STATUS_VAR);
4541
4542 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4543 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4544 ) {
4545 float_raise( float_flag_invalid STATUS_VAR);
4546 return 1;
4547 }
4548 return 0;
4549}
4550
158142c2
FB
4551/*----------------------------------------------------------------------------
4552| Returns 1 if the double-precision floating-point value `a' is equal to the
f5a64251
AJ
4553| corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4554| exception.The comparison is performed according to the IEC/IEEE Standard
4555| for Binary Floating-Point Arithmetic.
158142c2
FB
4556*----------------------------------------------------------------------------*/
4557
b689362d 4558int float64_eq_quiet( float64 a, float64 b STATUS_PARAM )
158142c2 4559{
bb98fe42 4560 uint64_t av, bv;
37d18660
PM
4561 a = float64_squash_input_denormal(a STATUS_VAR);
4562 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4563
4564 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4565 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4566 ) {
b689362d
AJ
4567 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4568 float_raise( float_flag_invalid STATUS_VAR);
4569 }
158142c2
FB
4570 return 0;
4571 }
f090c9d4 4572 av = float64_val(a);
a1b91bb4 4573 bv = float64_val(b);
bb98fe42 4574 return ( av == bv ) || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
158142c2
FB
4575
4576}
4577
4578/*----------------------------------------------------------------------------
4579| Returns 1 if the double-precision floating-point value `a' is less than or
4580| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
4581| cause an exception. Otherwise, the comparison is performed according to the
4582| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
4583*----------------------------------------------------------------------------*/
4584
750afe93 4585int float64_le_quiet( float64 a, float64 b STATUS_PARAM )
158142c2
FB
4586{
4587 flag aSign, bSign;
bb98fe42 4588 uint64_t av, bv;
37d18660
PM
4589 a = float64_squash_input_denormal(a STATUS_VAR);
4590 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4591
4592 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4593 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4594 ) {
4595 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4596 float_raise( float_flag_invalid STATUS_VAR);
4597 }
4598 return 0;
4599 }
4600 aSign = extractFloat64Sign( a );
4601 bSign = extractFloat64Sign( b );
f090c9d4 4602 av = float64_val(a);
a1b91bb4 4603 bv = float64_val(b);
bb98fe42 4604 if ( aSign != bSign ) return aSign || ( (uint64_t) ( ( av | bv )<<1 ) == 0 );
f090c9d4 4605 return ( av == bv ) || ( aSign ^ ( av < bv ) );
158142c2
FB
4606
4607}
4608
4609/*----------------------------------------------------------------------------
4610| Returns 1 if the double-precision floating-point value `a' is less than
4611| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
4612| exception. Otherwise, the comparison is performed according to the IEC/IEEE
4613| Standard for Binary Floating-Point Arithmetic.
4614*----------------------------------------------------------------------------*/
4615
750afe93 4616int float64_lt_quiet( float64 a, float64 b STATUS_PARAM )
158142c2
FB
4617{
4618 flag aSign, bSign;
bb98fe42 4619 uint64_t av, bv;
37d18660
PM
4620 a = float64_squash_input_denormal(a STATUS_VAR);
4621 b = float64_squash_input_denormal(b STATUS_VAR);
158142c2
FB
4622
4623 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4624 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4625 ) {
4626 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4627 float_raise( float_flag_invalid STATUS_VAR);
4628 }
4629 return 0;
4630 }
4631 aSign = extractFloat64Sign( a );
4632 bSign = extractFloat64Sign( b );
f090c9d4 4633 av = float64_val(a);
a1b91bb4 4634 bv = float64_val(b);
bb98fe42 4635 if ( aSign != bSign ) return aSign && ( (uint64_t) ( ( av | bv )<<1 ) != 0 );
f090c9d4 4636 return ( av != bv ) && ( aSign ^ ( av < bv ) );
158142c2
FB
4637
4638}
4639
67b7861d
AJ
4640/*----------------------------------------------------------------------------
4641| Returns 1 if the double-precision floating-point values `a' and `b' cannot
4642| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
4643| comparison is performed according to the IEC/IEEE Standard for Binary
4644| Floating-Point Arithmetic.
4645*----------------------------------------------------------------------------*/
4646
4647int float64_unordered_quiet( float64 a, float64 b STATUS_PARAM )
4648{
4649 a = float64_squash_input_denormal(a STATUS_VAR);
4650 b = float64_squash_input_denormal(b STATUS_VAR);
4651
4652 if ( ( ( extractFloat64Exp( a ) == 0x7FF ) && extractFloat64Frac( a ) )
4653 || ( ( extractFloat64Exp( b ) == 0x7FF ) && extractFloat64Frac( b ) )
4654 ) {
4655 if ( float64_is_signaling_nan( a ) || float64_is_signaling_nan( b ) ) {
4656 float_raise( float_flag_invalid STATUS_VAR);
4657 }
4658 return 1;
4659 }
4660 return 0;
4661}
4662
158142c2
FB
4663/*----------------------------------------------------------------------------
4664| Returns the result of converting the extended double-precision floating-
4665| point value `a' to the 32-bit two's complement integer format. The
4666| conversion is performed according to the IEC/IEEE Standard for Binary
4667| Floating-Point Arithmetic---which means in particular that the conversion
4668| is rounded according to the current rounding mode. If `a' is a NaN, the
4669| largest positive integer is returned. Otherwise, if the conversion
4670| overflows, the largest integer with the same sign as `a' is returned.
4671*----------------------------------------------------------------------------*/
4672
4673int32 floatx80_to_int32( floatx80 a STATUS_PARAM )
4674{
4675 flag aSign;
4676 int32 aExp, shiftCount;
bb98fe42 4677 uint64_t aSig;
158142c2
FB
4678
4679 aSig = extractFloatx80Frac( a );
4680 aExp = extractFloatx80Exp( a );
4681 aSign = extractFloatx80Sign( a );
bb98fe42 4682 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
158142c2
FB
4683 shiftCount = 0x4037 - aExp;
4684 if ( shiftCount <= 0 ) shiftCount = 1;
4685 shift64RightJamming( aSig, shiftCount, &aSig );
4686 return roundAndPackInt32( aSign, aSig STATUS_VAR );
4687
4688}
4689
4690/*----------------------------------------------------------------------------
4691| Returns the result of converting the extended double-precision floating-
4692| point value `a' to the 32-bit two's complement integer format. The
4693| conversion is performed according to the IEC/IEEE Standard for Binary
4694| Floating-Point Arithmetic, except that the conversion is always rounded
4695| toward zero. If `a' is a NaN, the largest positive integer is returned.
4696| Otherwise, if the conversion overflows, the largest integer with the same
4697| sign as `a' is returned.
4698*----------------------------------------------------------------------------*/
4699
4700int32 floatx80_to_int32_round_to_zero( floatx80 a STATUS_PARAM )
4701{
4702 flag aSign;
4703 int32 aExp, shiftCount;
bb98fe42 4704 uint64_t aSig, savedASig;
b3a6a2e0 4705 int32_t z;
158142c2
FB
4706
4707 aSig = extractFloatx80Frac( a );
4708 aExp = extractFloatx80Exp( a );
4709 aSign = extractFloatx80Sign( a );
4710 if ( 0x401E < aExp ) {
bb98fe42 4711 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) aSign = 0;
158142c2
FB
4712 goto invalid;
4713 }
4714 else if ( aExp < 0x3FFF ) {
4715 if ( aExp || aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4716 return 0;
4717 }
4718 shiftCount = 0x403E - aExp;
4719 savedASig = aSig;
4720 aSig >>= shiftCount;
4721 z = aSig;
4722 if ( aSign ) z = - z;
4723 if ( ( z < 0 ) ^ aSign ) {
4724 invalid:
4725 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 4726 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
4727 }
4728 if ( ( aSig<<shiftCount ) != savedASig ) {
4729 STATUS(float_exception_flags) |= float_flag_inexact;
4730 }
4731 return z;
4732
4733}
4734
4735/*----------------------------------------------------------------------------
4736| Returns the result of converting the extended double-precision floating-
4737| point value `a' to the 64-bit two's complement integer format. The
4738| conversion is performed according to the IEC/IEEE Standard for Binary
4739| Floating-Point Arithmetic---which means in particular that the conversion
4740| is rounded according to the current rounding mode. If `a' is a NaN,
4741| the largest positive integer is returned. Otherwise, if the conversion
4742| overflows, the largest integer with the same sign as `a' is returned.
4743*----------------------------------------------------------------------------*/
4744
4745int64 floatx80_to_int64( floatx80 a STATUS_PARAM )
4746{
4747 flag aSign;
4748 int32 aExp, shiftCount;
bb98fe42 4749 uint64_t aSig, aSigExtra;
158142c2
FB
4750
4751 aSig = extractFloatx80Frac( a );
4752 aExp = extractFloatx80Exp( a );
4753 aSign = extractFloatx80Sign( a );
4754 shiftCount = 0x403E - aExp;
4755 if ( shiftCount <= 0 ) {
4756 if ( shiftCount ) {
4757 float_raise( float_flag_invalid STATUS_VAR);
4758 if ( ! aSign
4759 || ( ( aExp == 0x7FFF )
4760 && ( aSig != LIT64( 0x8000000000000000 ) ) )
4761 ) {
4762 return LIT64( 0x7FFFFFFFFFFFFFFF );
4763 }
bb98fe42 4764 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
4765 }
4766 aSigExtra = 0;
4767 }
4768 else {
4769 shift64ExtraRightJamming( aSig, 0, shiftCount, &aSig, &aSigExtra );
4770 }
4771 return roundAndPackInt64( aSign, aSig, aSigExtra STATUS_VAR );
4772
4773}
4774
4775/*----------------------------------------------------------------------------
4776| Returns the result of converting the extended double-precision floating-
4777| point value `a' to the 64-bit two's complement integer format. The
4778| conversion is performed according to the IEC/IEEE Standard for Binary
4779| Floating-Point Arithmetic, except that the conversion is always rounded
4780| toward zero. If `a' is a NaN, the largest positive integer is returned.
4781| Otherwise, if the conversion overflows, the largest integer with the same
4782| sign as `a' is returned.
4783*----------------------------------------------------------------------------*/
4784
4785int64 floatx80_to_int64_round_to_zero( floatx80 a STATUS_PARAM )
4786{
4787 flag aSign;
4788 int32 aExp, shiftCount;
bb98fe42 4789 uint64_t aSig;
158142c2
FB
4790 int64 z;
4791
4792 aSig = extractFloatx80Frac( a );
4793 aExp = extractFloatx80Exp( a );
4794 aSign = extractFloatx80Sign( a );
4795 shiftCount = aExp - 0x403E;
4796 if ( 0 <= shiftCount ) {
4797 aSig &= LIT64( 0x7FFFFFFFFFFFFFFF );
4798 if ( ( a.high != 0xC03E ) || aSig ) {
4799 float_raise( float_flag_invalid STATUS_VAR);
4800 if ( ! aSign || ( ( aExp == 0x7FFF ) && aSig ) ) {
4801 return LIT64( 0x7FFFFFFFFFFFFFFF );
4802 }
4803 }
bb98fe42 4804 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
4805 }
4806 else if ( aExp < 0x3FFF ) {
4807 if ( aExp | aSig ) STATUS(float_exception_flags) |= float_flag_inexact;
4808 return 0;
4809 }
4810 z = aSig>>( - shiftCount );
bb98fe42 4811 if ( (uint64_t) ( aSig<<( shiftCount & 63 ) ) ) {
158142c2
FB
4812 STATUS(float_exception_flags) |= float_flag_inexact;
4813 }
4814 if ( aSign ) z = - z;
4815 return z;
4816
4817}
4818
4819/*----------------------------------------------------------------------------
4820| Returns the result of converting the extended double-precision floating-
4821| point value `a' to the single-precision floating-point format. The
4822| conversion is performed according to the IEC/IEEE Standard for Binary
4823| Floating-Point Arithmetic.
4824*----------------------------------------------------------------------------*/
4825
4826float32 floatx80_to_float32( floatx80 a STATUS_PARAM )
4827{
4828 flag aSign;
4829 int32 aExp;
bb98fe42 4830 uint64_t aSig;
158142c2
FB
4831
4832 aSig = extractFloatx80Frac( a );
4833 aExp = extractFloatx80Exp( a );
4834 aSign = extractFloatx80Sign( a );
4835 if ( aExp == 0x7FFF ) {
bb98fe42 4836 if ( (uint64_t) ( aSig<<1 ) ) {
bcd4d9af 4837 return commonNaNToFloat32( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
4838 }
4839 return packFloat32( aSign, 0xFF, 0 );
4840 }
4841 shift64RightJamming( aSig, 33, &aSig );
4842 if ( aExp || aSig ) aExp -= 0x3F81;
4843 return roundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
4844
4845}
4846
4847/*----------------------------------------------------------------------------
4848| Returns the result of converting the extended double-precision floating-
4849| point value `a' to the double-precision floating-point format. The
4850| conversion is performed according to the IEC/IEEE Standard for Binary
4851| Floating-Point Arithmetic.
4852*----------------------------------------------------------------------------*/
4853
4854float64 floatx80_to_float64( floatx80 a STATUS_PARAM )
4855{
4856 flag aSign;
4857 int32 aExp;
bb98fe42 4858 uint64_t aSig, zSig;
158142c2
FB
4859
4860 aSig = extractFloatx80Frac( a );
4861 aExp = extractFloatx80Exp( a );
4862 aSign = extractFloatx80Sign( a );
4863 if ( aExp == 0x7FFF ) {
bb98fe42 4864 if ( (uint64_t) ( aSig<<1 ) ) {
bcd4d9af 4865 return commonNaNToFloat64( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
4866 }
4867 return packFloat64( aSign, 0x7FF, 0 );
4868 }
4869 shift64RightJamming( aSig, 1, &zSig );
4870 if ( aExp || aSig ) aExp -= 0x3C01;
4871 return roundAndPackFloat64( aSign, aExp, zSig STATUS_VAR );
4872
4873}
4874
158142c2
FB
4875/*----------------------------------------------------------------------------
4876| Returns the result of converting the extended double-precision floating-
4877| point value `a' to the quadruple-precision floating-point format. The
4878| conversion is performed according to the IEC/IEEE Standard for Binary
4879| Floating-Point Arithmetic.
4880*----------------------------------------------------------------------------*/
4881
4882float128 floatx80_to_float128( floatx80 a STATUS_PARAM )
4883{
4884 flag aSign;
94a49d86 4885 int_fast16_t aExp;
bb98fe42 4886 uint64_t aSig, zSig0, zSig1;
158142c2
FB
4887
4888 aSig = extractFloatx80Frac( a );
4889 aExp = extractFloatx80Exp( a );
4890 aSign = extractFloatx80Sign( a );
bb98fe42 4891 if ( ( aExp == 0x7FFF ) && (uint64_t) ( aSig<<1 ) ) {
bcd4d9af 4892 return commonNaNToFloat128( floatx80ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
4893 }
4894 shift128Right( aSig<<1, 0, 16, &zSig0, &zSig1 );
4895 return packFloat128( aSign, aExp, zSig0, zSig1 );
4896
4897}
4898
158142c2
FB
4899/*----------------------------------------------------------------------------
4900| Rounds the extended double-precision floating-point value `a' to an integer,
4901| and returns the result as an extended quadruple-precision floating-point
4902| value. The operation is performed according to the IEC/IEEE Standard for
4903| Binary Floating-Point Arithmetic.
4904*----------------------------------------------------------------------------*/
4905
4906floatx80 floatx80_round_to_int( floatx80 a STATUS_PARAM )
4907{
4908 flag aSign;
4909 int32 aExp;
bb98fe42 4910 uint64_t lastBitMask, roundBitsMask;
158142c2
FB
4911 floatx80 z;
4912
4913 aExp = extractFloatx80Exp( a );
4914 if ( 0x403E <= aExp ) {
bb98fe42 4915 if ( ( aExp == 0x7FFF ) && (uint64_t) ( extractFloatx80Frac( a )<<1 ) ) {
158142c2
FB
4916 return propagateFloatx80NaN( a, a STATUS_VAR );
4917 }
4918 return a;
4919 }
4920 if ( aExp < 0x3FFF ) {
4921 if ( ( aExp == 0 )
bb98fe42 4922 && ( (uint64_t) ( extractFloatx80Frac( a )<<1 ) == 0 ) ) {
158142c2
FB
4923 return a;
4924 }
4925 STATUS(float_exception_flags) |= float_flag_inexact;
4926 aSign = extractFloatx80Sign( a );
4927 switch ( STATUS(float_rounding_mode) ) {
4928 case float_round_nearest_even:
bb98fe42 4929 if ( ( aExp == 0x3FFE ) && (uint64_t) ( extractFloatx80Frac( a )<<1 )
158142c2
FB
4930 ) {
4931 return
4932 packFloatx80( aSign, 0x3FFF, LIT64( 0x8000000000000000 ) );
4933 }
4934 break;
f9288a76
PM
4935 case float_round_ties_away:
4936 if (aExp == 0x3FFE) {
4937 return packFloatx80(aSign, 0x3FFF, LIT64(0x8000000000000000));
4938 }
4939 break;
158142c2
FB
4940 case float_round_down:
4941 return
4942 aSign ?
4943 packFloatx80( 1, 0x3FFF, LIT64( 0x8000000000000000 ) )
4944 : packFloatx80( 0, 0, 0 );
4945 case float_round_up:
4946 return
4947 aSign ? packFloatx80( 1, 0, 0 )
4948 : packFloatx80( 0, 0x3FFF, LIT64( 0x8000000000000000 ) );
4949 }
4950 return packFloatx80( aSign, 0, 0 );
4951 }
4952 lastBitMask = 1;
4953 lastBitMask <<= 0x403E - aExp;
4954 roundBitsMask = lastBitMask - 1;
4955 z = a;
dc355b76
PM
4956 switch (STATUS(float_rounding_mode)) {
4957 case float_round_nearest_even:
158142c2 4958 z.low += lastBitMask>>1;
dc355b76
PM
4959 if ((z.low & roundBitsMask) == 0) {
4960 z.low &= ~lastBitMask;
4961 }
4962 break;
f9288a76
PM
4963 case float_round_ties_away:
4964 z.low += lastBitMask >> 1;
4965 break;
dc355b76
PM
4966 case float_round_to_zero:
4967 break;
4968 case float_round_up:
4969 if (!extractFloatx80Sign(z)) {
4970 z.low += roundBitsMask;
4971 }
4972 break;
4973 case float_round_down:
4974 if (extractFloatx80Sign(z)) {
158142c2
FB
4975 z.low += roundBitsMask;
4976 }
dc355b76
PM
4977 break;
4978 default:
4979 abort();
158142c2
FB
4980 }
4981 z.low &= ~ roundBitsMask;
4982 if ( z.low == 0 ) {
4983 ++z.high;
4984 z.low = LIT64( 0x8000000000000000 );
4985 }
4986 if ( z.low != a.low ) STATUS(float_exception_flags) |= float_flag_inexact;
4987 return z;
4988
4989}
4990
4991/*----------------------------------------------------------------------------
4992| Returns the result of adding the absolute values of the extended double-
4993| precision floating-point values `a' and `b'. If `zSign' is 1, the sum is
4994| negated before being returned. `zSign' is ignored if the result is a NaN.
4995| The addition is performed according to the IEC/IEEE Standard for Binary
4996| Floating-Point Arithmetic.
4997*----------------------------------------------------------------------------*/
4998
4999static floatx80 addFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM)
5000{
5001 int32 aExp, bExp, zExp;
bb98fe42 5002 uint64_t aSig, bSig, zSig0, zSig1;
158142c2
FB
5003 int32 expDiff;
5004
5005 aSig = extractFloatx80Frac( a );
5006 aExp = extractFloatx80Exp( a );
5007 bSig = extractFloatx80Frac( b );
5008 bExp = extractFloatx80Exp( b );
5009 expDiff = aExp - bExp;
5010 if ( 0 < expDiff ) {
5011 if ( aExp == 0x7FFF ) {
bb98fe42 5012 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
5013 return a;
5014 }
5015 if ( bExp == 0 ) --expDiff;
5016 shift64ExtraRightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5017 zExp = aExp;
5018 }
5019 else if ( expDiff < 0 ) {
5020 if ( bExp == 0x7FFF ) {
bb98fe42 5021 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
5022 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5023 }
5024 if ( aExp == 0 ) ++expDiff;
5025 shift64ExtraRightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5026 zExp = bExp;
5027 }
5028 else {
5029 if ( aExp == 0x7FFF ) {
bb98fe42 5030 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
158142c2
FB
5031 return propagateFloatx80NaN( a, b STATUS_VAR );
5032 }
5033 return a;
5034 }
5035 zSig1 = 0;
5036 zSig0 = aSig + bSig;
5037 if ( aExp == 0 ) {
5038 normalizeFloatx80Subnormal( zSig0, &zExp, &zSig0 );
5039 goto roundAndPack;
5040 }
5041 zExp = aExp;
5042 goto shiftRight1;
5043 }
5044 zSig0 = aSig + bSig;
bb98fe42 5045 if ( (int64_t) zSig0 < 0 ) goto roundAndPack;
158142c2
FB
5046 shiftRight1:
5047 shift64ExtraRightJamming( zSig0, zSig1, 1, &zSig0, &zSig1 );
5048 zSig0 |= LIT64( 0x8000000000000000 );
5049 ++zExp;
5050 roundAndPack:
5051 return
5052 roundAndPackFloatx80(
5053 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5054
5055}
5056
5057/*----------------------------------------------------------------------------
5058| Returns the result of subtracting the absolute values of the extended
5059| double-precision floating-point values `a' and `b'. If `zSign' is 1, the
5060| difference is negated before being returned. `zSign' is ignored if the
5061| result is a NaN. The subtraction is performed according to the IEC/IEEE
5062| Standard for Binary Floating-Point Arithmetic.
5063*----------------------------------------------------------------------------*/
5064
5065static floatx80 subFloatx80Sigs( floatx80 a, floatx80 b, flag zSign STATUS_PARAM )
5066{
5067 int32 aExp, bExp, zExp;
bb98fe42 5068 uint64_t aSig, bSig, zSig0, zSig1;
158142c2
FB
5069 int32 expDiff;
5070 floatx80 z;
5071
5072 aSig = extractFloatx80Frac( a );
5073 aExp = extractFloatx80Exp( a );
5074 bSig = extractFloatx80Frac( b );
5075 bExp = extractFloatx80Exp( b );
5076 expDiff = aExp - bExp;
5077 if ( 0 < expDiff ) goto aExpBigger;
5078 if ( expDiff < 0 ) goto bExpBigger;
5079 if ( aExp == 0x7FFF ) {
bb98fe42 5080 if ( (uint64_t) ( ( aSig | bSig )<<1 ) ) {
158142c2
FB
5081 return propagateFloatx80NaN( a, b STATUS_VAR );
5082 }
5083 float_raise( float_flag_invalid STATUS_VAR);
5084 z.low = floatx80_default_nan_low;
5085 z.high = floatx80_default_nan_high;
5086 return z;
5087 }
5088 if ( aExp == 0 ) {
5089 aExp = 1;
5090 bExp = 1;
5091 }
5092 zSig1 = 0;
5093 if ( bSig < aSig ) goto aBigger;
5094 if ( aSig < bSig ) goto bBigger;
5095 return packFloatx80( STATUS(float_rounding_mode) == float_round_down, 0, 0 );
5096 bExpBigger:
5097 if ( bExp == 0x7FFF ) {
bb98fe42 5098 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
5099 return packFloatx80( zSign ^ 1, 0x7FFF, LIT64( 0x8000000000000000 ) );
5100 }
5101 if ( aExp == 0 ) ++expDiff;
5102 shift128RightJamming( aSig, 0, - expDiff, &aSig, &zSig1 );
5103 bBigger:
5104 sub128( bSig, 0, aSig, zSig1, &zSig0, &zSig1 );
5105 zExp = bExp;
5106 zSign ^= 1;
5107 goto normalizeRoundAndPack;
5108 aExpBigger:
5109 if ( aExp == 0x7FFF ) {
bb98fe42 5110 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
5111 return a;
5112 }
5113 if ( bExp == 0 ) --expDiff;
5114 shift128RightJamming( bSig, 0, expDiff, &bSig, &zSig1 );
5115 aBigger:
5116 sub128( aSig, 0, bSig, zSig1, &zSig0, &zSig1 );
5117 zExp = aExp;
5118 normalizeRoundAndPack:
5119 return
5120 normalizeRoundAndPackFloatx80(
5121 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5122
5123}
5124
5125/*----------------------------------------------------------------------------
5126| Returns the result of adding the extended double-precision floating-point
5127| values `a' and `b'. The operation is performed according to the IEC/IEEE
5128| Standard for Binary Floating-Point Arithmetic.
5129*----------------------------------------------------------------------------*/
5130
5131floatx80 floatx80_add( floatx80 a, floatx80 b STATUS_PARAM )
5132{
5133 flag aSign, bSign;
5134
5135 aSign = extractFloatx80Sign( a );
5136 bSign = extractFloatx80Sign( b );
5137 if ( aSign == bSign ) {
5138 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
5139 }
5140 else {
5141 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
5142 }
5143
5144}
5145
5146/*----------------------------------------------------------------------------
5147| Returns the result of subtracting the extended double-precision floating-
5148| point values `a' and `b'. The operation is performed according to the
5149| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5150*----------------------------------------------------------------------------*/
5151
5152floatx80 floatx80_sub( floatx80 a, floatx80 b STATUS_PARAM )
5153{
5154 flag aSign, bSign;
5155
5156 aSign = extractFloatx80Sign( a );
5157 bSign = extractFloatx80Sign( b );
5158 if ( aSign == bSign ) {
5159 return subFloatx80Sigs( a, b, aSign STATUS_VAR );
5160 }
5161 else {
5162 return addFloatx80Sigs( a, b, aSign STATUS_VAR );
5163 }
5164
5165}
5166
5167/*----------------------------------------------------------------------------
5168| Returns the result of multiplying the extended double-precision floating-
5169| point values `a' and `b'. The operation is performed according to the
5170| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5171*----------------------------------------------------------------------------*/
5172
5173floatx80 floatx80_mul( floatx80 a, floatx80 b STATUS_PARAM )
5174{
5175 flag aSign, bSign, zSign;
5176 int32 aExp, bExp, zExp;
bb98fe42 5177 uint64_t aSig, bSig, zSig0, zSig1;
158142c2
FB
5178 floatx80 z;
5179
5180 aSig = extractFloatx80Frac( a );
5181 aExp = extractFloatx80Exp( a );
5182 aSign = extractFloatx80Sign( a );
5183 bSig = extractFloatx80Frac( b );
5184 bExp = extractFloatx80Exp( b );
5185 bSign = extractFloatx80Sign( b );
5186 zSign = aSign ^ bSign;
5187 if ( aExp == 0x7FFF ) {
bb98fe42
AF
5188 if ( (uint64_t) ( aSig<<1 )
5189 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
158142c2
FB
5190 return propagateFloatx80NaN( a, b STATUS_VAR );
5191 }
5192 if ( ( bExp | bSig ) == 0 ) goto invalid;
5193 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5194 }
5195 if ( bExp == 0x7FFF ) {
bb98fe42 5196 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
5197 if ( ( aExp | aSig ) == 0 ) {
5198 invalid:
5199 float_raise( float_flag_invalid STATUS_VAR);
5200 z.low = floatx80_default_nan_low;
5201 z.high = floatx80_default_nan_high;
5202 return z;
5203 }
5204 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5205 }
5206 if ( aExp == 0 ) {
5207 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5208 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5209 }
5210 if ( bExp == 0 ) {
5211 if ( bSig == 0 ) return packFloatx80( zSign, 0, 0 );
5212 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5213 }
5214 zExp = aExp + bExp - 0x3FFE;
5215 mul64To128( aSig, bSig, &zSig0, &zSig1 );
bb98fe42 5216 if ( 0 < (int64_t) zSig0 ) {
158142c2
FB
5217 shortShift128Left( zSig0, zSig1, 1, &zSig0, &zSig1 );
5218 --zExp;
5219 }
5220 return
5221 roundAndPackFloatx80(
5222 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5223
5224}
5225
5226/*----------------------------------------------------------------------------
5227| Returns the result of dividing the extended double-precision floating-point
5228| value `a' by the corresponding value `b'. The operation is performed
5229| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5230*----------------------------------------------------------------------------*/
5231
5232floatx80 floatx80_div( floatx80 a, floatx80 b STATUS_PARAM )
5233{
5234 flag aSign, bSign, zSign;
5235 int32 aExp, bExp, zExp;
bb98fe42
AF
5236 uint64_t aSig, bSig, zSig0, zSig1;
5237 uint64_t rem0, rem1, rem2, term0, term1, term2;
158142c2
FB
5238 floatx80 z;
5239
5240 aSig = extractFloatx80Frac( a );
5241 aExp = extractFloatx80Exp( a );
5242 aSign = extractFloatx80Sign( a );
5243 bSig = extractFloatx80Frac( b );
5244 bExp = extractFloatx80Exp( b );
5245 bSign = extractFloatx80Sign( b );
5246 zSign = aSign ^ bSign;
5247 if ( aExp == 0x7FFF ) {
bb98fe42 5248 if ( (uint64_t) ( aSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2 5249 if ( bExp == 0x7FFF ) {
bb98fe42 5250 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
5251 goto invalid;
5252 }
5253 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5254 }
5255 if ( bExp == 0x7FFF ) {
bb98fe42 5256 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
5257 return packFloatx80( zSign, 0, 0 );
5258 }
5259 if ( bExp == 0 ) {
5260 if ( bSig == 0 ) {
5261 if ( ( aExp | aSig ) == 0 ) {
5262 invalid:
5263 float_raise( float_flag_invalid STATUS_VAR);
5264 z.low = floatx80_default_nan_low;
5265 z.high = floatx80_default_nan_high;
5266 return z;
5267 }
5268 float_raise( float_flag_divbyzero STATUS_VAR);
5269 return packFloatx80( zSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5270 }
5271 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5272 }
5273 if ( aExp == 0 ) {
5274 if ( aSig == 0 ) return packFloatx80( zSign, 0, 0 );
5275 normalizeFloatx80Subnormal( aSig, &aExp, &aSig );
5276 }
5277 zExp = aExp - bExp + 0x3FFE;
5278 rem1 = 0;
5279 if ( bSig <= aSig ) {
5280 shift128Right( aSig, 0, 1, &aSig, &rem1 );
5281 ++zExp;
5282 }
5283 zSig0 = estimateDiv128To64( aSig, rem1, bSig );
5284 mul64To128( bSig, zSig0, &term0, &term1 );
5285 sub128( aSig, rem1, term0, term1, &rem0, &rem1 );
bb98fe42 5286 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
5287 --zSig0;
5288 add128( rem0, rem1, 0, bSig, &rem0, &rem1 );
5289 }
5290 zSig1 = estimateDiv128To64( rem1, 0, bSig );
bb98fe42 5291 if ( (uint64_t) ( zSig1<<1 ) <= 8 ) {
158142c2
FB
5292 mul64To128( bSig, zSig1, &term1, &term2 );
5293 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
bb98fe42 5294 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
5295 --zSig1;
5296 add128( rem1, rem2, 0, bSig, &rem1, &rem2 );
5297 }
5298 zSig1 |= ( ( rem1 | rem2 ) != 0 );
5299 }
5300 return
5301 roundAndPackFloatx80(
5302 STATUS(floatx80_rounding_precision), zSign, zExp, zSig0, zSig1 STATUS_VAR );
5303
5304}
5305
5306/*----------------------------------------------------------------------------
5307| Returns the remainder of the extended double-precision floating-point value
5308| `a' with respect to the corresponding value `b'. The operation is performed
5309| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5310*----------------------------------------------------------------------------*/
5311
5312floatx80 floatx80_rem( floatx80 a, floatx80 b STATUS_PARAM )
5313{
ed086f3d 5314 flag aSign, zSign;
158142c2 5315 int32 aExp, bExp, expDiff;
bb98fe42
AF
5316 uint64_t aSig0, aSig1, bSig;
5317 uint64_t q, term0, term1, alternateASig0, alternateASig1;
158142c2
FB
5318 floatx80 z;
5319
5320 aSig0 = extractFloatx80Frac( a );
5321 aExp = extractFloatx80Exp( a );
5322 aSign = extractFloatx80Sign( a );
5323 bSig = extractFloatx80Frac( b );
5324 bExp = extractFloatx80Exp( b );
158142c2 5325 if ( aExp == 0x7FFF ) {
bb98fe42
AF
5326 if ( (uint64_t) ( aSig0<<1 )
5327 || ( ( bExp == 0x7FFF ) && (uint64_t) ( bSig<<1 ) ) ) {
158142c2
FB
5328 return propagateFloatx80NaN( a, b STATUS_VAR );
5329 }
5330 goto invalid;
5331 }
5332 if ( bExp == 0x7FFF ) {
bb98fe42 5333 if ( (uint64_t) ( bSig<<1 ) ) return propagateFloatx80NaN( a, b STATUS_VAR );
158142c2
FB
5334 return a;
5335 }
5336 if ( bExp == 0 ) {
5337 if ( bSig == 0 ) {
5338 invalid:
5339 float_raise( float_flag_invalid STATUS_VAR);
5340 z.low = floatx80_default_nan_low;
5341 z.high = floatx80_default_nan_high;
5342 return z;
5343 }
5344 normalizeFloatx80Subnormal( bSig, &bExp, &bSig );
5345 }
5346 if ( aExp == 0 ) {
bb98fe42 5347 if ( (uint64_t) ( aSig0<<1 ) == 0 ) return a;
158142c2
FB
5348 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5349 }
5350 bSig |= LIT64( 0x8000000000000000 );
5351 zSign = aSign;
5352 expDiff = aExp - bExp;
5353 aSig1 = 0;
5354 if ( expDiff < 0 ) {
5355 if ( expDiff < -1 ) return a;
5356 shift128Right( aSig0, 0, 1, &aSig0, &aSig1 );
5357 expDiff = 0;
5358 }
5359 q = ( bSig <= aSig0 );
5360 if ( q ) aSig0 -= bSig;
5361 expDiff -= 64;
5362 while ( 0 < expDiff ) {
5363 q = estimateDiv128To64( aSig0, aSig1, bSig );
5364 q = ( 2 < q ) ? q - 2 : 0;
5365 mul64To128( bSig, q, &term0, &term1 );
5366 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5367 shortShift128Left( aSig0, aSig1, 62, &aSig0, &aSig1 );
5368 expDiff -= 62;
5369 }
5370 expDiff += 64;
5371 if ( 0 < expDiff ) {
5372 q = estimateDiv128To64( aSig0, aSig1, bSig );
5373 q = ( 2 < q ) ? q - 2 : 0;
5374 q >>= 64 - expDiff;
5375 mul64To128( bSig, q<<( 64 - expDiff ), &term0, &term1 );
5376 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5377 shortShift128Left( 0, bSig, 64 - expDiff, &term0, &term1 );
5378 while ( le128( term0, term1, aSig0, aSig1 ) ) {
5379 ++q;
5380 sub128( aSig0, aSig1, term0, term1, &aSig0, &aSig1 );
5381 }
5382 }
5383 else {
5384 term1 = 0;
5385 term0 = bSig;
5386 }
5387 sub128( term0, term1, aSig0, aSig1, &alternateASig0, &alternateASig1 );
5388 if ( lt128( alternateASig0, alternateASig1, aSig0, aSig1 )
5389 || ( eq128( alternateASig0, alternateASig1, aSig0, aSig1 )
5390 && ( q & 1 ) )
5391 ) {
5392 aSig0 = alternateASig0;
5393 aSig1 = alternateASig1;
5394 zSign = ! zSign;
5395 }
5396 return
5397 normalizeRoundAndPackFloatx80(
5398 80, zSign, bExp + expDiff, aSig0, aSig1 STATUS_VAR );
5399
5400}
5401
5402/*----------------------------------------------------------------------------
5403| Returns the square root of the extended double-precision floating-point
5404| value `a'. The operation is performed according to the IEC/IEEE Standard
5405| for Binary Floating-Point Arithmetic.
5406*----------------------------------------------------------------------------*/
5407
5408floatx80 floatx80_sqrt( floatx80 a STATUS_PARAM )
5409{
5410 flag aSign;
5411 int32 aExp, zExp;
bb98fe42
AF
5412 uint64_t aSig0, aSig1, zSig0, zSig1, doubleZSig0;
5413 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
5414 floatx80 z;
5415
5416 aSig0 = extractFloatx80Frac( a );
5417 aExp = extractFloatx80Exp( a );
5418 aSign = extractFloatx80Sign( a );
5419 if ( aExp == 0x7FFF ) {
bb98fe42 5420 if ( (uint64_t) ( aSig0<<1 ) ) return propagateFloatx80NaN( a, a STATUS_VAR );
158142c2
FB
5421 if ( ! aSign ) return a;
5422 goto invalid;
5423 }
5424 if ( aSign ) {
5425 if ( ( aExp | aSig0 ) == 0 ) return a;
5426 invalid:
5427 float_raise( float_flag_invalid STATUS_VAR);
5428 z.low = floatx80_default_nan_low;
5429 z.high = floatx80_default_nan_high;
5430 return z;
5431 }
5432 if ( aExp == 0 ) {
5433 if ( aSig0 == 0 ) return packFloatx80( 0, 0, 0 );
5434 normalizeFloatx80Subnormal( aSig0, &aExp, &aSig0 );
5435 }
5436 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFF;
5437 zSig0 = estimateSqrt32( aExp, aSig0>>32 );
5438 shift128Right( aSig0, 0, 2 + ( aExp & 1 ), &aSig0, &aSig1 );
5439 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
5440 doubleZSig0 = zSig0<<1;
5441 mul64To128( zSig0, zSig0, &term0, &term1 );
5442 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
bb98fe42 5443 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
5444 --zSig0;
5445 doubleZSig0 -= 2;
5446 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
5447 }
5448 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
5449 if ( ( zSig1 & LIT64( 0x3FFFFFFFFFFFFFFF ) ) <= 5 ) {
5450 if ( zSig1 == 0 ) zSig1 = 1;
5451 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
5452 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
5453 mul64To128( zSig1, zSig1, &term2, &term3 );
5454 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 5455 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
5456 --zSig1;
5457 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
5458 term3 |= 1;
5459 term2 |= doubleZSig0;
5460 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
5461 }
5462 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
5463 }
5464 shortShift128Left( 0, zSig1, 1, &zSig0, &zSig1 );
5465 zSig0 |= doubleZSig0;
5466 return
5467 roundAndPackFloatx80(
5468 STATUS(floatx80_rounding_precision), 0, zExp, zSig0, zSig1 STATUS_VAR );
5469
5470}
5471
5472/*----------------------------------------------------------------------------
b689362d
AJ
5473| Returns 1 if the extended double-precision floating-point value `a' is equal
5474| to the corresponding value `b', and 0 otherwise. The invalid exception is
5475| raised if either operand is a NaN. Otherwise, the comparison is performed
5476| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5477*----------------------------------------------------------------------------*/
5478
b689362d 5479int floatx80_eq( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5480{
5481
5482 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5483 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5484 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5485 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5486 ) {
b689362d 5487 float_raise( float_flag_invalid STATUS_VAR);
158142c2
FB
5488 return 0;
5489 }
5490 return
5491 ( a.low == b.low )
5492 && ( ( a.high == b.high )
5493 || ( ( a.low == 0 )
bb98fe42 5494 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
5495 );
5496
5497}
5498
5499/*----------------------------------------------------------------------------
5500| Returns 1 if the extended double-precision floating-point value `a' is
5501| less than or equal to the corresponding value `b', and 0 otherwise. The
f5a64251
AJ
5502| invalid exception is raised if either operand is a NaN. The comparison is
5503| performed according to the IEC/IEEE Standard for Binary Floating-Point
5504| Arithmetic.
158142c2
FB
5505*----------------------------------------------------------------------------*/
5506
750afe93 5507int floatx80_le( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5508{
5509 flag aSign, bSign;
5510
5511 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5512 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5513 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5514 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
5515 ) {
5516 float_raise( float_flag_invalid STATUS_VAR);
5517 return 0;
5518 }
5519 aSign = extractFloatx80Sign( a );
5520 bSign = extractFloatx80Sign( b );
5521 if ( aSign != bSign ) {
5522 return
5523 aSign
bb98fe42 5524 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5525 == 0 );
5526 }
5527 return
5528 aSign ? le128( b.high, b.low, a.high, a.low )
5529 : le128( a.high, a.low, b.high, b.low );
5530
5531}
5532
5533/*----------------------------------------------------------------------------
5534| Returns 1 if the extended double-precision floating-point value `a' is
f5a64251
AJ
5535| less than the corresponding value `b', and 0 otherwise. The invalid
5536| exception is raised if either operand is a NaN. The comparison is performed
5537| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5538*----------------------------------------------------------------------------*/
5539
750afe93 5540int floatx80_lt( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5541{
5542 flag aSign, bSign;
5543
5544 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5545 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5546 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5547 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
5548 ) {
5549 float_raise( float_flag_invalid STATUS_VAR);
5550 return 0;
5551 }
5552 aSign = extractFloatx80Sign( a );
5553 bSign = extractFloatx80Sign( b );
5554 if ( aSign != bSign ) {
5555 return
5556 aSign
bb98fe42 5557 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5558 != 0 );
5559 }
5560 return
5561 aSign ? lt128( b.high, b.low, a.high, a.low )
5562 : lt128( a.high, a.low, b.high, b.low );
5563
5564}
5565
67b7861d
AJ
5566/*----------------------------------------------------------------------------
5567| Returns 1 if the extended double-precision floating-point values `a' and `b'
f5a64251
AJ
5568| cannot be compared, and 0 otherwise. The invalid exception is raised if
5569| either operand is a NaN. The comparison is performed according to the
5570| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
5571*----------------------------------------------------------------------------*/
5572int floatx80_unordered( floatx80 a, floatx80 b STATUS_PARAM )
5573{
5574 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5575 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5576 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5577 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5578 ) {
5579 float_raise( float_flag_invalid STATUS_VAR);
5580 return 1;
5581 }
5582 return 0;
5583}
5584
158142c2 5585/*----------------------------------------------------------------------------
b689362d 5586| Returns 1 if the extended double-precision floating-point value `a' is
f5a64251
AJ
5587| equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
5588| cause an exception. The comparison is performed according to the IEC/IEEE
5589| Standard for Binary Floating-Point Arithmetic.
158142c2
FB
5590*----------------------------------------------------------------------------*/
5591
b689362d 5592int floatx80_eq_quiet( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5593{
5594
5595 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5596 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5597 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5598 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2 5599 ) {
b689362d
AJ
5600 if ( floatx80_is_signaling_nan( a )
5601 || floatx80_is_signaling_nan( b ) ) {
5602 float_raise( float_flag_invalid STATUS_VAR);
5603 }
158142c2
FB
5604 return 0;
5605 }
5606 return
5607 ( a.low == b.low )
5608 && ( ( a.high == b.high )
5609 || ( ( a.low == 0 )
bb98fe42 5610 && ( (uint16_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
5611 );
5612
5613}
5614
5615/*----------------------------------------------------------------------------
5616| Returns 1 if the extended double-precision floating-point value `a' is less
5617| than or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs
5618| do not cause an exception. Otherwise, the comparison is performed according
5619| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5620*----------------------------------------------------------------------------*/
5621
750afe93 5622int floatx80_le_quiet( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5623{
5624 flag aSign, bSign;
5625
5626 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5627 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5628 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5629 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
5630 ) {
5631 if ( floatx80_is_signaling_nan( a )
5632 || floatx80_is_signaling_nan( b ) ) {
5633 float_raise( float_flag_invalid STATUS_VAR);
5634 }
5635 return 0;
5636 }
5637 aSign = extractFloatx80Sign( a );
5638 bSign = extractFloatx80Sign( b );
5639 if ( aSign != bSign ) {
5640 return
5641 aSign
bb98fe42 5642 || ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5643 == 0 );
5644 }
5645 return
5646 aSign ? le128( b.high, b.low, a.high, a.low )
5647 : le128( a.high, a.low, b.high, b.low );
5648
5649}
5650
5651/*----------------------------------------------------------------------------
5652| Returns 1 if the extended double-precision floating-point value `a' is less
5653| than the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause
5654| an exception. Otherwise, the comparison is performed according to the
5655| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
5656*----------------------------------------------------------------------------*/
5657
750afe93 5658int floatx80_lt_quiet( floatx80 a, floatx80 b STATUS_PARAM )
158142c2
FB
5659{
5660 flag aSign, bSign;
5661
5662 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
bb98fe42 5663 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
158142c2 5664 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
bb98fe42 5665 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
158142c2
FB
5666 ) {
5667 if ( floatx80_is_signaling_nan( a )
5668 || floatx80_is_signaling_nan( b ) ) {
5669 float_raise( float_flag_invalid STATUS_VAR);
5670 }
5671 return 0;
5672 }
5673 aSign = extractFloatx80Sign( a );
5674 bSign = extractFloatx80Sign( b );
5675 if ( aSign != bSign ) {
5676 return
5677 aSign
bb98fe42 5678 && ( ( ( (uint16_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
5679 != 0 );
5680 }
5681 return
5682 aSign ? lt128( b.high, b.low, a.high, a.low )
5683 : lt128( a.high, a.low, b.high, b.low );
5684
5685}
5686
67b7861d
AJ
5687/*----------------------------------------------------------------------------
5688| Returns 1 if the extended double-precision floating-point values `a' and `b'
5689| cannot be compared, and 0 otherwise. Quiet NaNs do not cause an exception.
5690| The comparison is performed according to the IEC/IEEE Standard for Binary
5691| Floating-Point Arithmetic.
5692*----------------------------------------------------------------------------*/
5693int floatx80_unordered_quiet( floatx80 a, floatx80 b STATUS_PARAM )
5694{
5695 if ( ( ( extractFloatx80Exp( a ) == 0x7FFF )
5696 && (uint64_t) ( extractFloatx80Frac( a )<<1 ) )
5697 || ( ( extractFloatx80Exp( b ) == 0x7FFF )
5698 && (uint64_t) ( extractFloatx80Frac( b )<<1 ) )
5699 ) {
5700 if ( floatx80_is_signaling_nan( a )
5701 || floatx80_is_signaling_nan( b ) ) {
5702 float_raise( float_flag_invalid STATUS_VAR);
5703 }
5704 return 1;
5705 }
5706 return 0;
5707}
5708
158142c2
FB
5709/*----------------------------------------------------------------------------
5710| Returns the result of converting the quadruple-precision floating-point
5711| value `a' to the 32-bit two's complement integer format. The conversion
5712| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5713| Arithmetic---which means in particular that the conversion is rounded
5714| according to the current rounding mode. If `a' is a NaN, the largest
5715| positive integer is returned. Otherwise, if the conversion overflows, the
5716| largest integer with the same sign as `a' is returned.
5717*----------------------------------------------------------------------------*/
5718
5719int32 float128_to_int32( float128 a STATUS_PARAM )
5720{
5721 flag aSign;
5722 int32 aExp, shiftCount;
bb98fe42 5723 uint64_t aSig0, aSig1;
158142c2
FB
5724
5725 aSig1 = extractFloat128Frac1( a );
5726 aSig0 = extractFloat128Frac0( a );
5727 aExp = extractFloat128Exp( a );
5728 aSign = extractFloat128Sign( a );
5729 if ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) aSign = 0;
5730 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5731 aSig0 |= ( aSig1 != 0 );
5732 shiftCount = 0x4028 - aExp;
5733 if ( 0 < shiftCount ) shift64RightJamming( aSig0, shiftCount, &aSig0 );
5734 return roundAndPackInt32( aSign, aSig0 STATUS_VAR );
5735
5736}
5737
5738/*----------------------------------------------------------------------------
5739| Returns the result of converting the quadruple-precision floating-point
5740| value `a' to the 32-bit two's complement integer format. The conversion
5741| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5742| Arithmetic, except that the conversion is always rounded toward zero. If
5743| `a' is a NaN, the largest positive integer is returned. Otherwise, if the
5744| conversion overflows, the largest integer with the same sign as `a' is
5745| returned.
5746*----------------------------------------------------------------------------*/
5747
5748int32 float128_to_int32_round_to_zero( float128 a STATUS_PARAM )
5749{
5750 flag aSign;
5751 int32 aExp, shiftCount;
bb98fe42 5752 uint64_t aSig0, aSig1, savedASig;
b3a6a2e0 5753 int32_t z;
158142c2
FB
5754
5755 aSig1 = extractFloat128Frac1( a );
5756 aSig0 = extractFloat128Frac0( a );
5757 aExp = extractFloat128Exp( a );
5758 aSign = extractFloat128Sign( a );
5759 aSig0 |= ( aSig1 != 0 );
5760 if ( 0x401E < aExp ) {
5761 if ( ( aExp == 0x7FFF ) && aSig0 ) aSign = 0;
5762 goto invalid;
5763 }
5764 else if ( aExp < 0x3FFF ) {
5765 if ( aExp || aSig0 ) STATUS(float_exception_flags) |= float_flag_inexact;
5766 return 0;
5767 }
5768 aSig0 |= LIT64( 0x0001000000000000 );
5769 shiftCount = 0x402F - aExp;
5770 savedASig = aSig0;
5771 aSig0 >>= shiftCount;
5772 z = aSig0;
5773 if ( aSign ) z = - z;
5774 if ( ( z < 0 ) ^ aSign ) {
5775 invalid:
5776 float_raise( float_flag_invalid STATUS_VAR);
bb98fe42 5777 return aSign ? (int32_t) 0x80000000 : 0x7FFFFFFF;
158142c2
FB
5778 }
5779 if ( ( aSig0<<shiftCount ) != savedASig ) {
5780 STATUS(float_exception_flags) |= float_flag_inexact;
5781 }
5782 return z;
5783
5784}
5785
5786/*----------------------------------------------------------------------------
5787| Returns the result of converting the quadruple-precision floating-point
5788| value `a' to the 64-bit two's complement integer format. The conversion
5789| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5790| Arithmetic---which means in particular that the conversion is rounded
5791| according to the current rounding mode. If `a' is a NaN, the largest
5792| positive integer is returned. Otherwise, if the conversion overflows, the
5793| largest integer with the same sign as `a' is returned.
5794*----------------------------------------------------------------------------*/
5795
5796int64 float128_to_int64( float128 a STATUS_PARAM )
5797{
5798 flag aSign;
5799 int32 aExp, shiftCount;
bb98fe42 5800 uint64_t aSig0, aSig1;
158142c2
FB
5801
5802 aSig1 = extractFloat128Frac1( a );
5803 aSig0 = extractFloat128Frac0( a );
5804 aExp = extractFloat128Exp( a );
5805 aSign = extractFloat128Sign( a );
5806 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5807 shiftCount = 0x402F - aExp;
5808 if ( shiftCount <= 0 ) {
5809 if ( 0x403E < aExp ) {
5810 float_raise( float_flag_invalid STATUS_VAR);
5811 if ( ! aSign
5812 || ( ( aExp == 0x7FFF )
5813 && ( aSig1 || ( aSig0 != LIT64( 0x0001000000000000 ) ) )
5814 )
5815 ) {
5816 return LIT64( 0x7FFFFFFFFFFFFFFF );
5817 }
bb98fe42 5818 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
5819 }
5820 shortShift128Left( aSig0, aSig1, - shiftCount, &aSig0, &aSig1 );
5821 }
5822 else {
5823 shift64ExtraRightJamming( aSig0, aSig1, shiftCount, &aSig0, &aSig1 );
5824 }
5825 return roundAndPackInt64( aSign, aSig0, aSig1 STATUS_VAR );
5826
5827}
5828
5829/*----------------------------------------------------------------------------
5830| Returns the result of converting the quadruple-precision floating-point
5831| value `a' to the 64-bit two's complement integer format. The conversion
5832| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5833| Arithmetic, except that the conversion is always rounded toward zero.
5834| If `a' is a NaN, the largest positive integer is returned. Otherwise, if
5835| the conversion overflows, the largest integer with the same sign as `a' is
5836| returned.
5837*----------------------------------------------------------------------------*/
5838
5839int64 float128_to_int64_round_to_zero( float128 a STATUS_PARAM )
5840{
5841 flag aSign;
5842 int32 aExp, shiftCount;
bb98fe42 5843 uint64_t aSig0, aSig1;
158142c2
FB
5844 int64 z;
5845
5846 aSig1 = extractFloat128Frac1( a );
5847 aSig0 = extractFloat128Frac0( a );
5848 aExp = extractFloat128Exp( a );
5849 aSign = extractFloat128Sign( a );
5850 if ( aExp ) aSig0 |= LIT64( 0x0001000000000000 );
5851 shiftCount = aExp - 0x402F;
5852 if ( 0 < shiftCount ) {
5853 if ( 0x403E <= aExp ) {
5854 aSig0 &= LIT64( 0x0000FFFFFFFFFFFF );
5855 if ( ( a.high == LIT64( 0xC03E000000000000 ) )
5856 && ( aSig1 < LIT64( 0x0002000000000000 ) ) ) {
5857 if ( aSig1 ) STATUS(float_exception_flags) |= float_flag_inexact;
5858 }
5859 else {
5860 float_raise( float_flag_invalid STATUS_VAR);
5861 if ( ! aSign || ( ( aExp == 0x7FFF ) && ( aSig0 | aSig1 ) ) ) {
5862 return LIT64( 0x7FFFFFFFFFFFFFFF );
5863 }
5864 }
bb98fe42 5865 return (int64_t) LIT64( 0x8000000000000000 );
158142c2
FB
5866 }
5867 z = ( aSig0<<shiftCount ) | ( aSig1>>( ( - shiftCount ) & 63 ) );
bb98fe42 5868 if ( (uint64_t) ( aSig1<<shiftCount ) ) {
158142c2
FB
5869 STATUS(float_exception_flags) |= float_flag_inexact;
5870 }
5871 }
5872 else {
5873 if ( aExp < 0x3FFF ) {
5874 if ( aExp | aSig0 | aSig1 ) {
5875 STATUS(float_exception_flags) |= float_flag_inexact;
5876 }
5877 return 0;
5878 }
5879 z = aSig0>>( - shiftCount );
5880 if ( aSig1
bb98fe42 5881 || ( shiftCount && (uint64_t) ( aSig0<<( shiftCount & 63 ) ) ) ) {
158142c2
FB
5882 STATUS(float_exception_flags) |= float_flag_inexact;
5883 }
5884 }
5885 if ( aSign ) z = - z;
5886 return z;
5887
5888}
5889
5890/*----------------------------------------------------------------------------
5891| Returns the result of converting the quadruple-precision floating-point
5892| value `a' to the single-precision floating-point format. The conversion
5893| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5894| Arithmetic.
5895*----------------------------------------------------------------------------*/
5896
5897float32 float128_to_float32( float128 a STATUS_PARAM )
5898{
5899 flag aSign;
5900 int32 aExp;
bb98fe42
AF
5901 uint64_t aSig0, aSig1;
5902 uint32_t zSig;
158142c2
FB
5903
5904 aSig1 = extractFloat128Frac1( a );
5905 aSig0 = extractFloat128Frac0( a );
5906 aExp = extractFloat128Exp( a );
5907 aSign = extractFloat128Sign( a );
5908 if ( aExp == 0x7FFF ) {
5909 if ( aSig0 | aSig1 ) {
bcd4d9af 5910 return commonNaNToFloat32( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
5911 }
5912 return packFloat32( aSign, 0xFF, 0 );
5913 }
5914 aSig0 |= ( aSig1 != 0 );
5915 shift64RightJamming( aSig0, 18, &aSig0 );
5916 zSig = aSig0;
5917 if ( aExp || zSig ) {
5918 zSig |= 0x40000000;
5919 aExp -= 0x3F81;
5920 }
5921 return roundAndPackFloat32( aSign, aExp, zSig STATUS_VAR );
5922
5923}
5924
5925/*----------------------------------------------------------------------------
5926| Returns the result of converting the quadruple-precision floating-point
5927| value `a' to the double-precision floating-point format. The conversion
5928| is performed according to the IEC/IEEE Standard for Binary Floating-Point
5929| Arithmetic.
5930*----------------------------------------------------------------------------*/
5931
5932float64 float128_to_float64( float128 a STATUS_PARAM )
5933{
5934 flag aSign;
5935 int32 aExp;
bb98fe42 5936 uint64_t aSig0, aSig1;
158142c2
FB
5937
5938 aSig1 = extractFloat128Frac1( a );
5939 aSig0 = extractFloat128Frac0( a );
5940 aExp = extractFloat128Exp( a );
5941 aSign = extractFloat128Sign( a );
5942 if ( aExp == 0x7FFF ) {
5943 if ( aSig0 | aSig1 ) {
bcd4d9af 5944 return commonNaNToFloat64( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
5945 }
5946 return packFloat64( aSign, 0x7FF, 0 );
5947 }
5948 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
5949 aSig0 |= ( aSig1 != 0 );
5950 if ( aExp || aSig0 ) {
5951 aSig0 |= LIT64( 0x4000000000000000 );
5952 aExp -= 0x3C01;
5953 }
5954 return roundAndPackFloat64( aSign, aExp, aSig0 STATUS_VAR );
5955
5956}
5957
158142c2
FB
5958/*----------------------------------------------------------------------------
5959| Returns the result of converting the quadruple-precision floating-point
5960| value `a' to the extended double-precision floating-point format. The
5961| conversion is performed according to the IEC/IEEE Standard for Binary
5962| Floating-Point Arithmetic.
5963*----------------------------------------------------------------------------*/
5964
5965floatx80 float128_to_floatx80( float128 a STATUS_PARAM )
5966{
5967 flag aSign;
5968 int32 aExp;
bb98fe42 5969 uint64_t aSig0, aSig1;
158142c2
FB
5970
5971 aSig1 = extractFloat128Frac1( a );
5972 aSig0 = extractFloat128Frac0( a );
5973 aExp = extractFloat128Exp( a );
5974 aSign = extractFloat128Sign( a );
5975 if ( aExp == 0x7FFF ) {
5976 if ( aSig0 | aSig1 ) {
bcd4d9af 5977 return commonNaNToFloatx80( float128ToCommonNaN( a STATUS_VAR ) STATUS_VAR );
158142c2
FB
5978 }
5979 return packFloatx80( aSign, 0x7FFF, LIT64( 0x8000000000000000 ) );
5980 }
5981 if ( aExp == 0 ) {
5982 if ( ( aSig0 | aSig1 ) == 0 ) return packFloatx80( aSign, 0, 0 );
5983 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
5984 }
5985 else {
5986 aSig0 |= LIT64( 0x0001000000000000 );
5987 }
5988 shortShift128Left( aSig0, aSig1, 15, &aSig0, &aSig1 );
5989 return roundAndPackFloatx80( 80, aSign, aExp, aSig0, aSig1 STATUS_VAR );
5990
5991}
5992
158142c2
FB
5993/*----------------------------------------------------------------------------
5994| Rounds the quadruple-precision floating-point value `a' to an integer, and
5995| returns the result as a quadruple-precision floating-point value. The
5996| operation is performed according to the IEC/IEEE Standard for Binary
5997| Floating-Point Arithmetic.
5998*----------------------------------------------------------------------------*/
5999
6000float128 float128_round_to_int( float128 a STATUS_PARAM )
6001{
6002 flag aSign;
6003 int32 aExp;
bb98fe42 6004 uint64_t lastBitMask, roundBitsMask;
158142c2
FB
6005 float128 z;
6006
6007 aExp = extractFloat128Exp( a );
6008 if ( 0x402F <= aExp ) {
6009 if ( 0x406F <= aExp ) {
6010 if ( ( aExp == 0x7FFF )
6011 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) )
6012 ) {
6013 return propagateFloat128NaN( a, a STATUS_VAR );
6014 }
6015 return a;
6016 }
6017 lastBitMask = 1;
6018 lastBitMask = ( lastBitMask<<( 0x406E - aExp ) )<<1;
6019 roundBitsMask = lastBitMask - 1;
6020 z = a;
dc355b76
PM
6021 switch (STATUS(float_rounding_mode)) {
6022 case float_round_nearest_even:
158142c2
FB
6023 if ( lastBitMask ) {
6024 add128( z.high, z.low, 0, lastBitMask>>1, &z.high, &z.low );
6025 if ( ( z.low & roundBitsMask ) == 0 ) z.low &= ~ lastBitMask;
6026 }
6027 else {
bb98fe42 6028 if ( (int64_t) z.low < 0 ) {
158142c2 6029 ++z.high;
bb98fe42 6030 if ( (uint64_t) ( z.low<<1 ) == 0 ) z.high &= ~1;
158142c2
FB
6031 }
6032 }
dc355b76 6033 break;
f9288a76
PM
6034 case float_round_ties_away:
6035 if (lastBitMask) {
6036 add128(z.high, z.low, 0, lastBitMask >> 1, &z.high, &z.low);
6037 } else {
6038 if ((int64_t) z.low < 0) {
6039 ++z.high;
6040 }
6041 }
6042 break;
dc355b76
PM
6043 case float_round_to_zero:
6044 break;
6045 case float_round_up:
6046 if (!extractFloat128Sign(z)) {
6047 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
6048 }
6049 break;
6050 case float_round_down:
6051 if (extractFloat128Sign(z)) {
6052 add128(z.high, z.low, 0, roundBitsMask, &z.high, &z.low);
158142c2 6053 }
dc355b76
PM
6054 break;
6055 default:
6056 abort();
158142c2
FB
6057 }
6058 z.low &= ~ roundBitsMask;
6059 }
6060 else {
6061 if ( aExp < 0x3FFF ) {
bb98fe42 6062 if ( ( ( (uint64_t) ( a.high<<1 ) ) | a.low ) == 0 ) return a;
158142c2
FB
6063 STATUS(float_exception_flags) |= float_flag_inexact;
6064 aSign = extractFloat128Sign( a );
6065 switch ( STATUS(float_rounding_mode) ) {
6066 case float_round_nearest_even:
6067 if ( ( aExp == 0x3FFE )
6068 && ( extractFloat128Frac0( a )
6069 | extractFloat128Frac1( a ) )
6070 ) {
6071 return packFloat128( aSign, 0x3FFF, 0, 0 );
6072 }
6073 break;
f9288a76
PM
6074 case float_round_ties_away:
6075 if (aExp == 0x3FFE) {
6076 return packFloat128(aSign, 0x3FFF, 0, 0);
6077 }
6078 break;
158142c2
FB
6079 case float_round_down:
6080 return
6081 aSign ? packFloat128( 1, 0x3FFF, 0, 0 )
6082 : packFloat128( 0, 0, 0, 0 );
6083 case float_round_up:
6084 return
6085 aSign ? packFloat128( 1, 0, 0, 0 )
6086 : packFloat128( 0, 0x3FFF, 0, 0 );
6087 }
6088 return packFloat128( aSign, 0, 0, 0 );
6089 }
6090 lastBitMask = 1;
6091 lastBitMask <<= 0x402F - aExp;
6092 roundBitsMask = lastBitMask - 1;
6093 z.low = 0;
6094 z.high = a.high;
dc355b76
PM
6095 switch (STATUS(float_rounding_mode)) {
6096 case float_round_nearest_even:
158142c2
FB
6097 z.high += lastBitMask>>1;
6098 if ( ( ( z.high & roundBitsMask ) | a.low ) == 0 ) {
6099 z.high &= ~ lastBitMask;
6100 }
dc355b76 6101 break;
f9288a76
PM
6102 case float_round_ties_away:
6103 z.high += lastBitMask>>1;
6104 break;
dc355b76
PM
6105 case float_round_to_zero:
6106 break;
6107 case float_round_up:
6108 if (!extractFloat128Sign(z)) {
158142c2
FB
6109 z.high |= ( a.low != 0 );
6110 z.high += roundBitsMask;
6111 }
dc355b76
PM
6112 break;
6113 case float_round_down:
6114 if (extractFloat128Sign(z)) {
6115 z.high |= (a.low != 0);
6116 z.high += roundBitsMask;
6117 }
6118 break;
6119 default:
6120 abort();
158142c2
FB
6121 }
6122 z.high &= ~ roundBitsMask;
6123 }
6124 if ( ( z.low != a.low ) || ( z.high != a.high ) ) {
6125 STATUS(float_exception_flags) |= float_flag_inexact;
6126 }
6127 return z;
6128
6129}
6130
6131/*----------------------------------------------------------------------------
6132| Returns the result of adding the absolute values of the quadruple-precision
6133| floating-point values `a' and `b'. If `zSign' is 1, the sum is negated
6134| before being returned. `zSign' is ignored if the result is a NaN.
6135| The addition is performed according to the IEC/IEEE Standard for Binary
6136| Floating-Point Arithmetic.
6137*----------------------------------------------------------------------------*/
6138
6139static float128 addFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
6140{
6141 int32 aExp, bExp, zExp;
bb98fe42 6142 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
158142c2
FB
6143 int32 expDiff;
6144
6145 aSig1 = extractFloat128Frac1( a );
6146 aSig0 = extractFloat128Frac0( a );
6147 aExp = extractFloat128Exp( a );
6148 bSig1 = extractFloat128Frac1( b );
6149 bSig0 = extractFloat128Frac0( b );
6150 bExp = extractFloat128Exp( b );
6151 expDiff = aExp - bExp;
6152 if ( 0 < expDiff ) {
6153 if ( aExp == 0x7FFF ) {
6154 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6155 return a;
6156 }
6157 if ( bExp == 0 ) {
6158 --expDiff;
6159 }
6160 else {
6161 bSig0 |= LIT64( 0x0001000000000000 );
6162 }
6163 shift128ExtraRightJamming(
6164 bSig0, bSig1, 0, expDiff, &bSig0, &bSig1, &zSig2 );
6165 zExp = aExp;
6166 }
6167 else if ( expDiff < 0 ) {
6168 if ( bExp == 0x7FFF ) {
6169 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6170 return packFloat128( zSign, 0x7FFF, 0, 0 );
6171 }
6172 if ( aExp == 0 ) {
6173 ++expDiff;
6174 }
6175 else {
6176 aSig0 |= LIT64( 0x0001000000000000 );
6177 }
6178 shift128ExtraRightJamming(
6179 aSig0, aSig1, 0, - expDiff, &aSig0, &aSig1, &zSig2 );
6180 zExp = bExp;
6181 }
6182 else {
6183 if ( aExp == 0x7FFF ) {
6184 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6185 return propagateFloat128NaN( a, b STATUS_VAR );
6186 }
6187 return a;
6188 }
6189 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
fe76d976 6190 if ( aExp == 0 ) {
e6afc87f
PM
6191 if (STATUS(flush_to_zero)) {
6192 if (zSig0 | zSig1) {
6193 float_raise(float_flag_output_denormal STATUS_VAR);
6194 }
6195 return packFloat128(zSign, 0, 0, 0);
6196 }
fe76d976
PB
6197 return packFloat128( zSign, 0, zSig0, zSig1 );
6198 }
158142c2
FB
6199 zSig2 = 0;
6200 zSig0 |= LIT64( 0x0002000000000000 );
6201 zExp = aExp;
6202 goto shiftRight1;
6203 }
6204 aSig0 |= LIT64( 0x0001000000000000 );
6205 add128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6206 --zExp;
6207 if ( zSig0 < LIT64( 0x0002000000000000 ) ) goto roundAndPack;
6208 ++zExp;
6209 shiftRight1:
6210 shift128ExtraRightJamming(
6211 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6212 roundAndPack:
6213 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6214
6215}
6216
6217/*----------------------------------------------------------------------------
6218| Returns the result of subtracting the absolute values of the quadruple-
6219| precision floating-point values `a' and `b'. If `zSign' is 1, the
6220| difference is negated before being returned. `zSign' is ignored if the
6221| result is a NaN. The subtraction is performed according to the IEC/IEEE
6222| Standard for Binary Floating-Point Arithmetic.
6223*----------------------------------------------------------------------------*/
6224
6225static float128 subFloat128Sigs( float128 a, float128 b, flag zSign STATUS_PARAM)
6226{
6227 int32 aExp, bExp, zExp;
bb98fe42 6228 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1;
158142c2
FB
6229 int32 expDiff;
6230 float128 z;
6231
6232 aSig1 = extractFloat128Frac1( a );
6233 aSig0 = extractFloat128Frac0( a );
6234 aExp = extractFloat128Exp( a );
6235 bSig1 = extractFloat128Frac1( b );
6236 bSig0 = extractFloat128Frac0( b );
6237 bExp = extractFloat128Exp( b );
6238 expDiff = aExp - bExp;
6239 shortShift128Left( aSig0, aSig1, 14, &aSig0, &aSig1 );
6240 shortShift128Left( bSig0, bSig1, 14, &bSig0, &bSig1 );
6241 if ( 0 < expDiff ) goto aExpBigger;
6242 if ( expDiff < 0 ) goto bExpBigger;
6243 if ( aExp == 0x7FFF ) {
6244 if ( aSig0 | aSig1 | bSig0 | bSig1 ) {
6245 return propagateFloat128NaN( a, b STATUS_VAR );
6246 }
6247 float_raise( float_flag_invalid STATUS_VAR);
6248 z.low = float128_default_nan_low;
6249 z.high = float128_default_nan_high;
6250 return z;
6251 }
6252 if ( aExp == 0 ) {
6253 aExp = 1;
6254 bExp = 1;
6255 }
6256 if ( bSig0 < aSig0 ) goto aBigger;
6257 if ( aSig0 < bSig0 ) goto bBigger;
6258 if ( bSig1 < aSig1 ) goto aBigger;
6259 if ( aSig1 < bSig1 ) goto bBigger;
6260 return packFloat128( STATUS(float_rounding_mode) == float_round_down, 0, 0, 0 );
6261 bExpBigger:
6262 if ( bExp == 0x7FFF ) {
6263 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6264 return packFloat128( zSign ^ 1, 0x7FFF, 0, 0 );
6265 }
6266 if ( aExp == 0 ) {
6267 ++expDiff;
6268 }
6269 else {
6270 aSig0 |= LIT64( 0x4000000000000000 );
6271 }
6272 shift128RightJamming( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6273 bSig0 |= LIT64( 0x4000000000000000 );
6274 bBigger:
6275 sub128( bSig0, bSig1, aSig0, aSig1, &zSig0, &zSig1 );
6276 zExp = bExp;
6277 zSign ^= 1;
6278 goto normalizeRoundAndPack;
6279 aExpBigger:
6280 if ( aExp == 0x7FFF ) {
6281 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6282 return a;
6283 }
6284 if ( bExp == 0 ) {
6285 --expDiff;
6286 }
6287 else {
6288 bSig0 |= LIT64( 0x4000000000000000 );
6289 }
6290 shift128RightJamming( bSig0, bSig1, expDiff, &bSig0, &bSig1 );
6291 aSig0 |= LIT64( 0x4000000000000000 );
6292 aBigger:
6293 sub128( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1 );
6294 zExp = aExp;
6295 normalizeRoundAndPack:
6296 --zExp;
6297 return normalizeRoundAndPackFloat128( zSign, zExp - 14, zSig0, zSig1 STATUS_VAR );
6298
6299}
6300
6301/*----------------------------------------------------------------------------
6302| Returns the result of adding the quadruple-precision floating-point values
6303| `a' and `b'. The operation is performed according to the IEC/IEEE Standard
6304| for Binary Floating-Point Arithmetic.
6305*----------------------------------------------------------------------------*/
6306
6307float128 float128_add( float128 a, float128 b STATUS_PARAM )
6308{
6309 flag aSign, bSign;
6310
6311 aSign = extractFloat128Sign( a );
6312 bSign = extractFloat128Sign( b );
6313 if ( aSign == bSign ) {
6314 return addFloat128Sigs( a, b, aSign STATUS_VAR );
6315 }
6316 else {
6317 return subFloat128Sigs( a, b, aSign STATUS_VAR );
6318 }
6319
6320}
6321
6322/*----------------------------------------------------------------------------
6323| Returns the result of subtracting the quadruple-precision floating-point
6324| values `a' and `b'. The operation is performed according to the IEC/IEEE
6325| Standard for Binary Floating-Point Arithmetic.
6326*----------------------------------------------------------------------------*/
6327
6328float128 float128_sub( float128 a, float128 b STATUS_PARAM )
6329{
6330 flag aSign, bSign;
6331
6332 aSign = extractFloat128Sign( a );
6333 bSign = extractFloat128Sign( b );
6334 if ( aSign == bSign ) {
6335 return subFloat128Sigs( a, b, aSign STATUS_VAR );
6336 }
6337 else {
6338 return addFloat128Sigs( a, b, aSign STATUS_VAR );
6339 }
6340
6341}
6342
6343/*----------------------------------------------------------------------------
6344| Returns the result of multiplying the quadruple-precision floating-point
6345| values `a' and `b'. The operation is performed according to the IEC/IEEE
6346| Standard for Binary Floating-Point Arithmetic.
6347*----------------------------------------------------------------------------*/
6348
6349float128 float128_mul( float128 a, float128 b STATUS_PARAM )
6350{
6351 flag aSign, bSign, zSign;
6352 int32 aExp, bExp, zExp;
bb98fe42 6353 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2, zSig3;
158142c2
FB
6354 float128 z;
6355
6356 aSig1 = extractFloat128Frac1( a );
6357 aSig0 = extractFloat128Frac0( a );
6358 aExp = extractFloat128Exp( a );
6359 aSign = extractFloat128Sign( a );
6360 bSig1 = extractFloat128Frac1( b );
6361 bSig0 = extractFloat128Frac0( b );
6362 bExp = extractFloat128Exp( b );
6363 bSign = extractFloat128Sign( b );
6364 zSign = aSign ^ bSign;
6365 if ( aExp == 0x7FFF ) {
6366 if ( ( aSig0 | aSig1 )
6367 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6368 return propagateFloat128NaN( a, b STATUS_VAR );
6369 }
6370 if ( ( bExp | bSig0 | bSig1 ) == 0 ) goto invalid;
6371 return packFloat128( zSign, 0x7FFF, 0, 0 );
6372 }
6373 if ( bExp == 0x7FFF ) {
6374 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6375 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6376 invalid:
6377 float_raise( float_flag_invalid STATUS_VAR);
6378 z.low = float128_default_nan_low;
6379 z.high = float128_default_nan_high;
6380 return z;
6381 }
6382 return packFloat128( zSign, 0x7FFF, 0, 0 );
6383 }
6384 if ( aExp == 0 ) {
6385 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6386 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6387 }
6388 if ( bExp == 0 ) {
6389 if ( ( bSig0 | bSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6390 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6391 }
6392 zExp = aExp + bExp - 0x4000;
6393 aSig0 |= LIT64( 0x0001000000000000 );
6394 shortShift128Left( bSig0, bSig1, 16, &bSig0, &bSig1 );
6395 mul128To256( aSig0, aSig1, bSig0, bSig1, &zSig0, &zSig1, &zSig2, &zSig3 );
6396 add128( zSig0, zSig1, aSig0, aSig1, &zSig0, &zSig1 );
6397 zSig2 |= ( zSig3 != 0 );
6398 if ( LIT64( 0x0002000000000000 ) <= zSig0 ) {
6399 shift128ExtraRightJamming(
6400 zSig0, zSig1, zSig2, 1, &zSig0, &zSig1, &zSig2 );
6401 ++zExp;
6402 }
6403 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6404
6405}
6406
6407/*----------------------------------------------------------------------------
6408| Returns the result of dividing the quadruple-precision floating-point value
6409| `a' by the corresponding value `b'. The operation is performed according to
6410| the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6411*----------------------------------------------------------------------------*/
6412
6413float128 float128_div( float128 a, float128 b STATUS_PARAM )
6414{
6415 flag aSign, bSign, zSign;
6416 int32 aExp, bExp, zExp;
bb98fe42
AF
6417 uint64_t aSig0, aSig1, bSig0, bSig1, zSig0, zSig1, zSig2;
6418 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
6419 float128 z;
6420
6421 aSig1 = extractFloat128Frac1( a );
6422 aSig0 = extractFloat128Frac0( a );
6423 aExp = extractFloat128Exp( a );
6424 aSign = extractFloat128Sign( a );
6425 bSig1 = extractFloat128Frac1( b );
6426 bSig0 = extractFloat128Frac0( b );
6427 bExp = extractFloat128Exp( b );
6428 bSign = extractFloat128Sign( b );
6429 zSign = aSign ^ bSign;
6430 if ( aExp == 0x7FFF ) {
6431 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6432 if ( bExp == 0x7FFF ) {
6433 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6434 goto invalid;
6435 }
6436 return packFloat128( zSign, 0x7FFF, 0, 0 );
6437 }
6438 if ( bExp == 0x7FFF ) {
6439 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6440 return packFloat128( zSign, 0, 0, 0 );
6441 }
6442 if ( bExp == 0 ) {
6443 if ( ( bSig0 | bSig1 ) == 0 ) {
6444 if ( ( aExp | aSig0 | aSig1 ) == 0 ) {
6445 invalid:
6446 float_raise( float_flag_invalid STATUS_VAR);
6447 z.low = float128_default_nan_low;
6448 z.high = float128_default_nan_high;
6449 return z;
6450 }
6451 float_raise( float_flag_divbyzero STATUS_VAR);
6452 return packFloat128( zSign, 0x7FFF, 0, 0 );
6453 }
6454 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6455 }
6456 if ( aExp == 0 ) {
6457 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( zSign, 0, 0, 0 );
6458 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6459 }
6460 zExp = aExp - bExp + 0x3FFD;
6461 shortShift128Left(
6462 aSig0 | LIT64( 0x0001000000000000 ), aSig1, 15, &aSig0, &aSig1 );
6463 shortShift128Left(
6464 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6465 if ( le128( bSig0, bSig1, aSig0, aSig1 ) ) {
6466 shift128Right( aSig0, aSig1, 1, &aSig0, &aSig1 );
6467 ++zExp;
6468 }
6469 zSig0 = estimateDiv128To64( aSig0, aSig1, bSig0 );
6470 mul128By64To192( bSig0, bSig1, zSig0, &term0, &term1, &term2 );
6471 sub192( aSig0, aSig1, 0, term0, term1, term2, &rem0, &rem1, &rem2 );
bb98fe42 6472 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
6473 --zSig0;
6474 add192( rem0, rem1, rem2, 0, bSig0, bSig1, &rem0, &rem1, &rem2 );
6475 }
6476 zSig1 = estimateDiv128To64( rem1, rem2, bSig0 );
6477 if ( ( zSig1 & 0x3FFF ) <= 4 ) {
6478 mul128By64To192( bSig0, bSig1, zSig1, &term1, &term2, &term3 );
6479 sub192( rem1, rem2, 0, term1, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 6480 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
6481 --zSig1;
6482 add192( rem1, rem2, rem3, 0, bSig0, bSig1, &rem1, &rem2, &rem3 );
6483 }
6484 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6485 }
6486 shift128ExtraRightJamming( zSig0, zSig1, 0, 15, &zSig0, &zSig1, &zSig2 );
6487 return roundAndPackFloat128( zSign, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6488
6489}
6490
6491/*----------------------------------------------------------------------------
6492| Returns the remainder of the quadruple-precision floating-point value `a'
6493| with respect to the corresponding value `b'. The operation is performed
6494| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6495*----------------------------------------------------------------------------*/
6496
6497float128 float128_rem( float128 a, float128 b STATUS_PARAM )
6498{
ed086f3d 6499 flag aSign, zSign;
158142c2 6500 int32 aExp, bExp, expDiff;
bb98fe42
AF
6501 uint64_t aSig0, aSig1, bSig0, bSig1, q, term0, term1, term2;
6502 uint64_t allZero, alternateASig0, alternateASig1, sigMean1;
6503 int64_t sigMean0;
158142c2
FB
6504 float128 z;
6505
6506 aSig1 = extractFloat128Frac1( a );
6507 aSig0 = extractFloat128Frac0( a );
6508 aExp = extractFloat128Exp( a );
6509 aSign = extractFloat128Sign( a );
6510 bSig1 = extractFloat128Frac1( b );
6511 bSig0 = extractFloat128Frac0( b );
6512 bExp = extractFloat128Exp( b );
158142c2
FB
6513 if ( aExp == 0x7FFF ) {
6514 if ( ( aSig0 | aSig1 )
6515 || ( ( bExp == 0x7FFF ) && ( bSig0 | bSig1 ) ) ) {
6516 return propagateFloat128NaN( a, b STATUS_VAR );
6517 }
6518 goto invalid;
6519 }
6520 if ( bExp == 0x7FFF ) {
6521 if ( bSig0 | bSig1 ) return propagateFloat128NaN( a, b STATUS_VAR );
6522 return a;
6523 }
6524 if ( bExp == 0 ) {
6525 if ( ( bSig0 | bSig1 ) == 0 ) {
6526 invalid:
6527 float_raise( float_flag_invalid STATUS_VAR);
6528 z.low = float128_default_nan_low;
6529 z.high = float128_default_nan_high;
6530 return z;
6531 }
6532 normalizeFloat128Subnormal( bSig0, bSig1, &bExp, &bSig0, &bSig1 );
6533 }
6534 if ( aExp == 0 ) {
6535 if ( ( aSig0 | aSig1 ) == 0 ) return a;
6536 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6537 }
6538 expDiff = aExp - bExp;
6539 if ( expDiff < -1 ) return a;
6540 shortShift128Left(
6541 aSig0 | LIT64( 0x0001000000000000 ),
6542 aSig1,
6543 15 - ( expDiff < 0 ),
6544 &aSig0,
6545 &aSig1
6546 );
6547 shortShift128Left(
6548 bSig0 | LIT64( 0x0001000000000000 ), bSig1, 15, &bSig0, &bSig1 );
6549 q = le128( bSig0, bSig1, aSig0, aSig1 );
6550 if ( q ) sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
6551 expDiff -= 64;
6552 while ( 0 < expDiff ) {
6553 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6554 q = ( 4 < q ) ? q - 4 : 0;
6555 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6556 shortShift192Left( term0, term1, term2, 61, &term1, &term2, &allZero );
6557 shortShift128Left( aSig0, aSig1, 61, &aSig0, &allZero );
6558 sub128( aSig0, 0, term1, term2, &aSig0, &aSig1 );
6559 expDiff -= 61;
6560 }
6561 if ( -64 < expDiff ) {
6562 q = estimateDiv128To64( aSig0, aSig1, bSig0 );
6563 q = ( 4 < q ) ? q - 4 : 0;
6564 q >>= - expDiff;
6565 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6566 expDiff += 52;
6567 if ( expDiff < 0 ) {
6568 shift128Right( aSig0, aSig1, - expDiff, &aSig0, &aSig1 );
6569 }
6570 else {
6571 shortShift128Left( aSig0, aSig1, expDiff, &aSig0, &aSig1 );
6572 }
6573 mul128By64To192( bSig0, bSig1, q, &term0, &term1, &term2 );
6574 sub128( aSig0, aSig1, term1, term2, &aSig0, &aSig1 );
6575 }
6576 else {
6577 shift128Right( aSig0, aSig1, 12, &aSig0, &aSig1 );
6578 shift128Right( bSig0, bSig1, 12, &bSig0, &bSig1 );
6579 }
6580 do {
6581 alternateASig0 = aSig0;
6582 alternateASig1 = aSig1;
6583 ++q;
6584 sub128( aSig0, aSig1, bSig0, bSig1, &aSig0, &aSig1 );
bb98fe42 6585 } while ( 0 <= (int64_t) aSig0 );
158142c2 6586 add128(
bb98fe42 6587 aSig0, aSig1, alternateASig0, alternateASig1, (uint64_t *)&sigMean0, &sigMean1 );
158142c2
FB
6588 if ( ( sigMean0 < 0 )
6589 || ( ( ( sigMean0 | sigMean1 ) == 0 ) && ( q & 1 ) ) ) {
6590 aSig0 = alternateASig0;
6591 aSig1 = alternateASig1;
6592 }
bb98fe42 6593 zSign = ( (int64_t) aSig0 < 0 );
158142c2
FB
6594 if ( zSign ) sub128( 0, 0, aSig0, aSig1, &aSig0, &aSig1 );
6595 return
6596 normalizeRoundAndPackFloat128( aSign ^ zSign, bExp - 4, aSig0, aSig1 STATUS_VAR );
6597
6598}
6599
6600/*----------------------------------------------------------------------------
6601| Returns the square root of the quadruple-precision floating-point value `a'.
6602| The operation is performed according to the IEC/IEEE Standard for Binary
6603| Floating-Point Arithmetic.
6604*----------------------------------------------------------------------------*/
6605
6606float128 float128_sqrt( float128 a STATUS_PARAM )
6607{
6608 flag aSign;
6609 int32 aExp, zExp;
bb98fe42
AF
6610 uint64_t aSig0, aSig1, zSig0, zSig1, zSig2, doubleZSig0;
6611 uint64_t rem0, rem1, rem2, rem3, term0, term1, term2, term3;
158142c2
FB
6612 float128 z;
6613
6614 aSig1 = extractFloat128Frac1( a );
6615 aSig0 = extractFloat128Frac0( a );
6616 aExp = extractFloat128Exp( a );
6617 aSign = extractFloat128Sign( a );
6618 if ( aExp == 0x7FFF ) {
6619 if ( aSig0 | aSig1 ) return propagateFloat128NaN( a, a STATUS_VAR );
6620 if ( ! aSign ) return a;
6621 goto invalid;
6622 }
6623 if ( aSign ) {
6624 if ( ( aExp | aSig0 | aSig1 ) == 0 ) return a;
6625 invalid:
6626 float_raise( float_flag_invalid STATUS_VAR);
6627 z.low = float128_default_nan_low;
6628 z.high = float128_default_nan_high;
6629 return z;
6630 }
6631 if ( aExp == 0 ) {
6632 if ( ( aSig0 | aSig1 ) == 0 ) return packFloat128( 0, 0, 0, 0 );
6633 normalizeFloat128Subnormal( aSig0, aSig1, &aExp, &aSig0, &aSig1 );
6634 }
6635 zExp = ( ( aExp - 0x3FFF )>>1 ) + 0x3FFE;
6636 aSig0 |= LIT64( 0x0001000000000000 );
6637 zSig0 = estimateSqrt32( aExp, aSig0>>17 );
6638 shortShift128Left( aSig0, aSig1, 13 - ( aExp & 1 ), &aSig0, &aSig1 );
6639 zSig0 = estimateDiv128To64( aSig0, aSig1, zSig0<<32 ) + ( zSig0<<30 );
6640 doubleZSig0 = zSig0<<1;
6641 mul64To128( zSig0, zSig0, &term0, &term1 );
6642 sub128( aSig0, aSig1, term0, term1, &rem0, &rem1 );
bb98fe42 6643 while ( (int64_t) rem0 < 0 ) {
158142c2
FB
6644 --zSig0;
6645 doubleZSig0 -= 2;
6646 add128( rem0, rem1, zSig0>>63, doubleZSig0 | 1, &rem0, &rem1 );
6647 }
6648 zSig1 = estimateDiv128To64( rem1, 0, doubleZSig0 );
6649 if ( ( zSig1 & 0x1FFF ) <= 5 ) {
6650 if ( zSig1 == 0 ) zSig1 = 1;
6651 mul64To128( doubleZSig0, zSig1, &term1, &term2 );
6652 sub128( rem1, 0, term1, term2, &rem1, &rem2 );
6653 mul64To128( zSig1, zSig1, &term2, &term3 );
6654 sub192( rem1, rem2, 0, 0, term2, term3, &rem1, &rem2, &rem3 );
bb98fe42 6655 while ( (int64_t) rem1 < 0 ) {
158142c2
FB
6656 --zSig1;
6657 shortShift128Left( 0, zSig1, 1, &term2, &term3 );
6658 term3 |= 1;
6659 term2 |= doubleZSig0;
6660 add192( rem1, rem2, rem3, 0, term2, term3, &rem1, &rem2, &rem3 );
6661 }
6662 zSig1 |= ( ( rem1 | rem2 | rem3 ) != 0 );
6663 }
6664 shift128ExtraRightJamming( zSig0, zSig1, 0, 14, &zSig0, &zSig1, &zSig2 );
6665 return roundAndPackFloat128( 0, zExp, zSig0, zSig1, zSig2 STATUS_VAR );
6666
6667}
6668
6669/*----------------------------------------------------------------------------
6670| Returns 1 if the quadruple-precision floating-point value `a' is equal to
b689362d
AJ
6671| the corresponding value `b', and 0 otherwise. The invalid exception is
6672| raised if either operand is a NaN. Otherwise, the comparison is performed
158142c2
FB
6673| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6674*----------------------------------------------------------------------------*/
6675
b689362d 6676int float128_eq( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6677{
6678
6679 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6680 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6681 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6682 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6683 ) {
b689362d 6684 float_raise( float_flag_invalid STATUS_VAR);
158142c2
FB
6685 return 0;
6686 }
6687 return
6688 ( a.low == b.low )
6689 && ( ( a.high == b.high )
6690 || ( ( a.low == 0 )
bb98fe42 6691 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
6692 );
6693
6694}
6695
6696/*----------------------------------------------------------------------------
6697| Returns 1 if the quadruple-precision floating-point value `a' is less than
f5a64251
AJ
6698| or equal to the corresponding value `b', and 0 otherwise. The invalid
6699| exception is raised if either operand is a NaN. The comparison is performed
6700| according to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
6701*----------------------------------------------------------------------------*/
6702
750afe93 6703int float128_le( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6704{
6705 flag aSign, bSign;
6706
6707 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6708 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6709 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6710 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6711 ) {
6712 float_raise( float_flag_invalid STATUS_VAR);
6713 return 0;
6714 }
6715 aSign = extractFloat128Sign( a );
6716 bSign = extractFloat128Sign( b );
6717 if ( aSign != bSign ) {
6718 return
6719 aSign
bb98fe42 6720 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6721 == 0 );
6722 }
6723 return
6724 aSign ? le128( b.high, b.low, a.high, a.low )
6725 : le128( a.high, a.low, b.high, b.low );
6726
6727}
6728
6729/*----------------------------------------------------------------------------
6730| Returns 1 if the quadruple-precision floating-point value `a' is less than
f5a64251
AJ
6731| the corresponding value `b', and 0 otherwise. The invalid exception is
6732| raised if either operand is a NaN. The comparison is performed according
6733| to the IEC/IEEE Standard for Binary Floating-Point Arithmetic.
158142c2
FB
6734*----------------------------------------------------------------------------*/
6735
750afe93 6736int float128_lt( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6737{
6738 flag aSign, bSign;
6739
6740 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6741 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6742 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6743 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6744 ) {
6745 float_raise( float_flag_invalid STATUS_VAR);
6746 return 0;
6747 }
6748 aSign = extractFloat128Sign( a );
6749 bSign = extractFloat128Sign( b );
6750 if ( aSign != bSign ) {
6751 return
6752 aSign
bb98fe42 6753 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6754 != 0 );
6755 }
6756 return
6757 aSign ? lt128( b.high, b.low, a.high, a.low )
6758 : lt128( a.high, a.low, b.high, b.low );
6759
6760}
6761
67b7861d
AJ
6762/*----------------------------------------------------------------------------
6763| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
f5a64251
AJ
6764| be compared, and 0 otherwise. The invalid exception is raised if either
6765| operand is a NaN. The comparison is performed according to the IEC/IEEE
6766| Standard for Binary Floating-Point Arithmetic.
67b7861d
AJ
6767*----------------------------------------------------------------------------*/
6768
6769int float128_unordered( float128 a, float128 b STATUS_PARAM )
6770{
6771 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6772 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6773 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6774 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6775 ) {
6776 float_raise( float_flag_invalid STATUS_VAR);
6777 return 1;
6778 }
6779 return 0;
6780}
6781
158142c2
FB
6782/*----------------------------------------------------------------------------
6783| Returns 1 if the quadruple-precision floating-point value `a' is equal to
f5a64251
AJ
6784| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6785| exception. The comparison is performed according to the IEC/IEEE Standard
6786| for Binary Floating-Point Arithmetic.
158142c2
FB
6787*----------------------------------------------------------------------------*/
6788
b689362d 6789int float128_eq_quiet( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6790{
6791
6792 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6793 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6794 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6795 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6796 ) {
b689362d
AJ
6797 if ( float128_is_signaling_nan( a )
6798 || float128_is_signaling_nan( b ) ) {
6799 float_raise( float_flag_invalid STATUS_VAR);
6800 }
158142c2
FB
6801 return 0;
6802 }
6803 return
6804 ( a.low == b.low )
6805 && ( ( a.high == b.high )
6806 || ( ( a.low == 0 )
bb98fe42 6807 && ( (uint64_t) ( ( a.high | b.high )<<1 ) == 0 ) )
158142c2
FB
6808 );
6809
6810}
6811
6812/*----------------------------------------------------------------------------
6813| Returns 1 if the quadruple-precision floating-point value `a' is less than
6814| or equal to the corresponding value `b', and 0 otherwise. Quiet NaNs do not
6815| cause an exception. Otherwise, the comparison is performed according to the
6816| IEC/IEEE Standard for Binary Floating-Point Arithmetic.
6817*----------------------------------------------------------------------------*/
6818
750afe93 6819int float128_le_quiet( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6820{
6821 flag aSign, bSign;
6822
6823 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6824 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6825 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6826 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6827 ) {
6828 if ( float128_is_signaling_nan( a )
6829 || float128_is_signaling_nan( b ) ) {
6830 float_raise( float_flag_invalid STATUS_VAR);
6831 }
6832 return 0;
6833 }
6834 aSign = extractFloat128Sign( a );
6835 bSign = extractFloat128Sign( b );
6836 if ( aSign != bSign ) {
6837 return
6838 aSign
bb98fe42 6839 || ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6840 == 0 );
6841 }
6842 return
6843 aSign ? le128( b.high, b.low, a.high, a.low )
6844 : le128( a.high, a.low, b.high, b.low );
6845
6846}
6847
6848/*----------------------------------------------------------------------------
6849| Returns 1 if the quadruple-precision floating-point value `a' is less than
6850| the corresponding value `b', and 0 otherwise. Quiet NaNs do not cause an
6851| exception. Otherwise, the comparison is performed according to the IEC/IEEE
6852| Standard for Binary Floating-Point Arithmetic.
6853*----------------------------------------------------------------------------*/
6854
750afe93 6855int float128_lt_quiet( float128 a, float128 b STATUS_PARAM )
158142c2
FB
6856{
6857 flag aSign, bSign;
6858
6859 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6860 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6861 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6862 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6863 ) {
6864 if ( float128_is_signaling_nan( a )
6865 || float128_is_signaling_nan( b ) ) {
6866 float_raise( float_flag_invalid STATUS_VAR);
6867 }
6868 return 0;
6869 }
6870 aSign = extractFloat128Sign( a );
6871 bSign = extractFloat128Sign( b );
6872 if ( aSign != bSign ) {
6873 return
6874 aSign
bb98fe42 6875 && ( ( ( (uint64_t) ( ( a.high | b.high )<<1 ) ) | a.low | b.low )
158142c2
FB
6876 != 0 );
6877 }
6878 return
6879 aSign ? lt128( b.high, b.low, a.high, a.low )
6880 : lt128( a.high, a.low, b.high, b.low );
6881
6882}
6883
67b7861d
AJ
6884/*----------------------------------------------------------------------------
6885| Returns 1 if the quadruple-precision floating-point values `a' and `b' cannot
6886| be compared, and 0 otherwise. Quiet NaNs do not cause an exception. The
6887| comparison is performed according to the IEC/IEEE Standard for Binary
6888| Floating-Point Arithmetic.
6889*----------------------------------------------------------------------------*/
6890
6891int float128_unordered_quiet( float128 a, float128 b STATUS_PARAM )
6892{
6893 if ( ( ( extractFloat128Exp( a ) == 0x7FFF )
6894 && ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) )
6895 || ( ( extractFloat128Exp( b ) == 0x7FFF )
6896 && ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )
6897 ) {
6898 if ( float128_is_signaling_nan( a )
6899 || float128_is_signaling_nan( b ) ) {
6900 float_raise( float_flag_invalid STATUS_VAR);
6901 }
6902 return 1;
6903 }
6904 return 0;
6905}
6906
1d6bda35 6907/* misc functions */
c4850f9e 6908float32 uint32_to_float32(uint32_t a STATUS_PARAM)
1d6bda35
FB
6909{
6910 return int64_to_float32(a STATUS_VAR);
6911}
6912
c4850f9e 6913float64 uint32_to_float64(uint32_t a STATUS_PARAM)
1d6bda35
FB
6914{
6915 return int64_to_float64(a STATUS_VAR);
6916}
6917
9f8d2a09 6918uint32 float32_to_uint32( float32 a STATUS_PARAM )
1d6bda35
FB
6919{
6920 int64_t v;
9f8d2a09 6921 uint32 res;
34e1c27b 6922 int old_exc_flags = get_float_exception_flags(status);
1d6bda35
FB
6923
6924 v = float32_to_int64(a STATUS_VAR);
6925 if (v < 0) {
6926 res = 0;
1d6bda35
FB
6927 } else if (v > 0xffffffff) {
6928 res = 0xffffffff;
1d6bda35 6929 } else {
34e1c27b 6930 return v;
1d6bda35 6931 }
34e1c27b
PM
6932 set_float_exception_flags(old_exc_flags, status);
6933 float_raise(float_flag_invalid STATUS_VAR);
1d6bda35
FB
6934 return res;
6935}
6936
9f8d2a09 6937uint32 float32_to_uint32_round_to_zero( float32 a STATUS_PARAM )
1d6bda35
FB
6938{
6939 int64_t v;
9f8d2a09 6940 uint32 res;
34e1c27b 6941 int old_exc_flags = get_float_exception_flags(status);
1d6bda35
FB
6942
6943 v = float32_to_int64_round_to_zero(a STATUS_VAR);
6944 if (v < 0) {
6945 res = 0;
1d6bda35
FB
6946 } else if (v > 0xffffffff) {
6947 res = 0xffffffff;
1d6bda35 6948 } else {
34e1c27b 6949 return v;
1d6bda35 6950 }
34e1c27b
PM
6951 set_float_exception_flags(old_exc_flags, status);
6952 float_raise(float_flag_invalid STATUS_VAR);
1d6bda35
FB
6953 return res;
6954}
6955
f581bf54
WN
6956int_fast16_t float32_to_int16(float32 a STATUS_PARAM)
6957{
6958 int32_t v;
6959 int_fast16_t res;
6960 int old_exc_flags = get_float_exception_flags(status);
6961
6962 v = float32_to_int32(a STATUS_VAR);
6963 if (v < -0x8000) {
6964 res = -0x8000;
6965 } else if (v > 0x7fff) {
6966 res = 0x7fff;
6967 } else {
6968 return v;
6969 }
6970
6971 set_float_exception_flags(old_exc_flags, status);
6972 float_raise(float_flag_invalid STATUS_VAR);
6973 return res;
6974}
6975
6976uint_fast16_t float32_to_uint16(float32 a STATUS_PARAM)
6977{
6978 int32_t v;
6979 uint_fast16_t res;
6980 int old_exc_flags = get_float_exception_flags(status);
6981
6982 v = float32_to_int32(a STATUS_VAR);
6983 if (v < 0) {
6984 res = 0;
6985 } else if (v > 0xffff) {
6986 res = 0xffff;
6987 } else {
6988 return v;
6989 }
6990
6991 set_float_exception_flags(old_exc_flags, status);
6992 float_raise(float_flag_invalid STATUS_VAR);
6993 return res;
6994}
6995
5aea4c58 6996uint_fast16_t float32_to_uint16_round_to_zero(float32 a STATUS_PARAM)
cbcef455
PM
6997{
6998 int64_t v;
5aea4c58 6999 uint_fast16_t res;
34e1c27b 7000 int old_exc_flags = get_float_exception_flags(status);
cbcef455
PM
7001
7002 v = float32_to_int64_round_to_zero(a STATUS_VAR);
7003 if (v < 0) {
7004 res = 0;
cbcef455
PM
7005 } else if (v > 0xffff) {
7006 res = 0xffff;
cbcef455 7007 } else {
34e1c27b 7008 return v;
cbcef455 7009 }
34e1c27b
PM
7010 set_float_exception_flags(old_exc_flags, status);
7011 float_raise(float_flag_invalid STATUS_VAR);
cbcef455
PM
7012 return res;
7013}
7014
9f8d2a09 7015uint32 float64_to_uint32( float64 a STATUS_PARAM )
1d6bda35 7016{
5e7f654f 7017 uint64_t v;
9f8d2a09 7018 uint32 res;
5e7f654f 7019 int old_exc_flags = get_float_exception_flags(status);
1d6bda35 7020
5e7f654f
TM
7021 v = float64_to_uint64(a STATUS_VAR);
7022 if (v > 0xffffffff) {
1d6bda35 7023 res = 0xffffffff;
1d6bda35 7024 } else {
5e7f654f 7025 return v;
1d6bda35 7026 }
5e7f654f
TM
7027 set_float_exception_flags(old_exc_flags, status);
7028 float_raise(float_flag_invalid STATUS_VAR);
1d6bda35
FB
7029 return res;
7030}
7031
9f8d2a09 7032uint32 float64_to_uint32_round_to_zero( float64 a STATUS_PARAM )
1d6bda35 7033{
fd728f2f 7034 uint64_t v;
9f8d2a09 7035 uint32 res;
fd728f2f 7036 int old_exc_flags = get_float_exception_flags(status);
1d6bda35 7037
fd728f2f
TM
7038 v = float64_to_uint64_round_to_zero(a STATUS_VAR);
7039 if (v > 0xffffffff) {
1d6bda35 7040 res = 0xffffffff;
1d6bda35 7041 } else {
fd728f2f 7042 return v;
1d6bda35 7043 }
fd728f2f
TM
7044 set_float_exception_flags(old_exc_flags, status);
7045 float_raise(float_flag_invalid STATUS_VAR);
1d6bda35
FB
7046 return res;
7047}
7048
f581bf54
WN
7049int_fast16_t float64_to_int16(float64 a STATUS_PARAM)
7050{
7051 int64_t v;
7052 int_fast16_t res;
7053 int old_exc_flags = get_float_exception_flags(status);
7054
7055 v = float64_to_int32(a STATUS_VAR);
7056 if (v < -0x8000) {
7057 res = -0x8000;
7058 } else if (v > 0x7fff) {
7059 res = 0x7fff;
7060 } else {
7061 return v;
7062 }
7063
7064 set_float_exception_flags(old_exc_flags, status);
7065 float_raise(float_flag_invalid STATUS_VAR);
7066 return res;
7067}
7068
7069uint_fast16_t float64_to_uint16(float64 a STATUS_PARAM)
7070{
7071 int64_t v;
7072 uint_fast16_t res;
7073 int old_exc_flags = get_float_exception_flags(status);
7074
7075 v = float64_to_int32(a STATUS_VAR);
7076 if (v < 0) {
7077 res = 0;
7078 } else if (v > 0xffff) {
7079 res = 0xffff;
7080 } else {
7081 return v;
7082 }
7083
7084 set_float_exception_flags(old_exc_flags, status);
7085 float_raise(float_flag_invalid STATUS_VAR);
7086 return res;
7087}
7088
5aea4c58 7089uint_fast16_t float64_to_uint16_round_to_zero(float64 a STATUS_PARAM)
cbcef455
PM
7090{
7091 int64_t v;
5aea4c58 7092 uint_fast16_t res;
34e1c27b 7093 int old_exc_flags = get_float_exception_flags(status);
cbcef455
PM
7094
7095 v = float64_to_int64_round_to_zero(a STATUS_VAR);
7096 if (v < 0) {
7097 res = 0;
cbcef455
PM
7098 } else if (v > 0xffff) {
7099 res = 0xffff;
cbcef455 7100 } else {
34e1c27b 7101 return v;
cbcef455 7102 }
34e1c27b
PM
7103 set_float_exception_flags(old_exc_flags, status);
7104 float_raise(float_flag_invalid STATUS_VAR);
cbcef455
PM
7105 return res;
7106}
7107
fb3ea83a
TM
7108/*----------------------------------------------------------------------------
7109| Returns the result of converting the double-precision floating-point value
7110| `a' to the 64-bit unsigned integer format. The conversion is
7111| performed according to the IEC/IEEE Standard for Binary Floating-Point
7112| Arithmetic---which means in particular that the conversion is rounded
7113| according to the current rounding mode. If `a' is a NaN, the largest
7114| positive integer is returned. If the conversion overflows, the
7115| largest unsigned integer is returned. If 'a' is negative, the value is
7116| rounded and zero is returned; negative values that do not round to zero
7117| will raise the inexact exception.
7118*----------------------------------------------------------------------------*/
75d62a58 7119
fb3ea83a
TM
7120uint64_t float64_to_uint64(float64 a STATUS_PARAM)
7121{
7122 flag aSign;
7123 int_fast16_t aExp, shiftCount;
7124 uint64_t aSig, aSigExtra;
7125 a = float64_squash_input_denormal(a STATUS_VAR);
75d62a58 7126
fb3ea83a
TM
7127 aSig = extractFloat64Frac(a);
7128 aExp = extractFloat64Exp(a);
7129 aSign = extractFloat64Sign(a);
7130 if (aSign && (aExp > 1022)) {
7131 float_raise(float_flag_invalid STATUS_VAR);
7132 if (float64_is_any_nan(a)) {
7133 return LIT64(0xFFFFFFFFFFFFFFFF);
7134 } else {
7135 return 0;
7136 }
7137 }
7138 if (aExp) {
7139 aSig |= LIT64(0x0010000000000000);
7140 }
7141 shiftCount = 0x433 - aExp;
7142 if (shiftCount <= 0) {
7143 if (0x43E < aExp) {
7144 float_raise(float_flag_invalid STATUS_VAR);
7145 return LIT64(0xFFFFFFFFFFFFFFFF);
7146 }
7147 aSigExtra = 0;
7148 aSig <<= -shiftCount;
7149 } else {
7150 shift64ExtraRightJamming(aSig, 0, shiftCount, &aSig, &aSigExtra);
7151 }
7152 return roundAndPackUint64(aSign, aSig, aSigExtra STATUS_VAR);
75d62a58
JM
7153}
7154
7155uint64_t float64_to_uint64_round_to_zero (float64 a STATUS_PARAM)
7156{
0a87a310
TM
7157 signed char current_rounding_mode = STATUS(float_rounding_mode);
7158 set_float_rounding_mode(float_round_to_zero STATUS_VAR);
7159 int64_t v = float64_to_uint64(a STATUS_VAR);
7160 set_float_rounding_mode(current_rounding_mode STATUS_VAR);
7161 return v;
75d62a58
JM
7162}
7163
1d6bda35 7164#define COMPARE(s, nan_exp) \
a49db98d 7165static inline int float ## s ## _compare_internal( float ## s a, float ## s b, \
1d6bda35
FB
7166 int is_quiet STATUS_PARAM ) \
7167{ \
7168 flag aSign, bSign; \
bb98fe42 7169 uint ## s ## _t av, bv; \
37d18660
PM
7170 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
7171 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
1d6bda35
FB
7172 \
7173 if (( ( extractFloat ## s ## Exp( a ) == nan_exp ) && \
7174 extractFloat ## s ## Frac( a ) ) || \
7175 ( ( extractFloat ## s ## Exp( b ) == nan_exp ) && \
7176 extractFloat ## s ## Frac( b ) )) { \
7177 if (!is_quiet || \
7178 float ## s ## _is_signaling_nan( a ) || \
7179 float ## s ## _is_signaling_nan( b ) ) { \
7180 float_raise( float_flag_invalid STATUS_VAR); \
7181 } \
7182 return float_relation_unordered; \
7183 } \
7184 aSign = extractFloat ## s ## Sign( a ); \
7185 bSign = extractFloat ## s ## Sign( b ); \
f090c9d4 7186 av = float ## s ## _val(a); \
cd8a2533 7187 bv = float ## s ## _val(b); \
1d6bda35 7188 if ( aSign != bSign ) { \
bb98fe42 7189 if ( (uint ## s ## _t) ( ( av | bv )<<1 ) == 0 ) { \
1d6bda35
FB
7190 /* zero case */ \
7191 return float_relation_equal; \
7192 } else { \
7193 return 1 - (2 * aSign); \
7194 } \
7195 } else { \
f090c9d4 7196 if (av == bv) { \
1d6bda35
FB
7197 return float_relation_equal; \
7198 } else { \
f090c9d4 7199 return 1 - 2 * (aSign ^ ( av < bv )); \
1d6bda35
FB
7200 } \
7201 } \
7202} \
7203 \
750afe93 7204int float ## s ## _compare( float ## s a, float ## s b STATUS_PARAM ) \
1d6bda35
FB
7205{ \
7206 return float ## s ## _compare_internal(a, b, 0 STATUS_VAR); \
7207} \
7208 \
750afe93 7209int float ## s ## _compare_quiet( float ## s a, float ## s b STATUS_PARAM ) \
1d6bda35
FB
7210{ \
7211 return float ## s ## _compare_internal(a, b, 1 STATUS_VAR); \
7212}
7213
7214COMPARE(32, 0xff)
7215COMPARE(64, 0x7ff)
9ee6e8bb 7216
a49db98d 7217static inline int floatx80_compare_internal( floatx80 a, floatx80 b,
f6714d36
AJ
7218 int is_quiet STATUS_PARAM )
7219{
7220 flag aSign, bSign;
7221
7222 if (( ( extractFloatx80Exp( a ) == 0x7fff ) &&
7223 ( extractFloatx80Frac( a )<<1 ) ) ||
7224 ( ( extractFloatx80Exp( b ) == 0x7fff ) &&
7225 ( extractFloatx80Frac( b )<<1 ) )) {
7226 if (!is_quiet ||
7227 floatx80_is_signaling_nan( a ) ||
7228 floatx80_is_signaling_nan( b ) ) {
7229 float_raise( float_flag_invalid STATUS_VAR);
7230 }
7231 return float_relation_unordered;
7232 }
7233 aSign = extractFloatx80Sign( a );
7234 bSign = extractFloatx80Sign( b );
7235 if ( aSign != bSign ) {
7236
7237 if ( ( ( (uint16_t) ( ( a.high | b.high ) << 1 ) ) == 0) &&
7238 ( ( a.low | b.low ) == 0 ) ) {
7239 /* zero case */
7240 return float_relation_equal;
7241 } else {
7242 return 1 - (2 * aSign);
7243 }
7244 } else {
7245 if (a.low == b.low && a.high == b.high) {
7246 return float_relation_equal;
7247 } else {
7248 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7249 }
7250 }
7251}
7252
7253int floatx80_compare( floatx80 a, floatx80 b STATUS_PARAM )
7254{
7255 return floatx80_compare_internal(a, b, 0 STATUS_VAR);
7256}
7257
7258int floatx80_compare_quiet( floatx80 a, floatx80 b STATUS_PARAM )
7259{
7260 return floatx80_compare_internal(a, b, 1 STATUS_VAR);
7261}
7262
a49db98d 7263static inline int float128_compare_internal( float128 a, float128 b,
1f587329
BS
7264 int is_quiet STATUS_PARAM )
7265{
7266 flag aSign, bSign;
7267
7268 if (( ( extractFloat128Exp( a ) == 0x7fff ) &&
7269 ( extractFloat128Frac0( a ) | extractFloat128Frac1( a ) ) ) ||
7270 ( ( extractFloat128Exp( b ) == 0x7fff ) &&
7271 ( extractFloat128Frac0( b ) | extractFloat128Frac1( b ) ) )) {
7272 if (!is_quiet ||
7273 float128_is_signaling_nan( a ) ||
7274 float128_is_signaling_nan( b ) ) {
7275 float_raise( float_flag_invalid STATUS_VAR);
7276 }
7277 return float_relation_unordered;
7278 }
7279 aSign = extractFloat128Sign( a );
7280 bSign = extractFloat128Sign( b );
7281 if ( aSign != bSign ) {
7282 if ( ( ( ( a.high | b.high )<<1 ) | a.low | b.low ) == 0 ) {
7283 /* zero case */
7284 return float_relation_equal;
7285 } else {
7286 return 1 - (2 * aSign);
7287 }
7288 } else {
7289 if (a.low == b.low && a.high == b.high) {
7290 return float_relation_equal;
7291 } else {
7292 return 1 - 2 * (aSign ^ ( lt128( a.high, a.low, b.high, b.low ) ));
7293 }
7294 }
7295}
7296
7297int float128_compare( float128 a, float128 b STATUS_PARAM )
7298{
7299 return float128_compare_internal(a, b, 0 STATUS_VAR);
7300}
7301
7302int float128_compare_quiet( float128 a, float128 b STATUS_PARAM )
7303{
7304 return float128_compare_internal(a, b, 1 STATUS_VAR);
7305}
7306
274f1b04
PM
7307/* min() and max() functions. These can't be implemented as
7308 * 'compare and pick one input' because that would mishandle
7309 * NaNs and +0 vs -0.
e17ab310
WN
7310 *
7311 * minnum() and maxnum() functions. These are similar to the min()
7312 * and max() functions but if one of the arguments is a QNaN and
7313 * the other is numerical then the numerical argument is returned.
7314 * minnum() and maxnum() correspond to the IEEE 754-2008 minNum()
7315 * and maxNum() operations. min() and max() are the typical min/max
7316 * semantics provided by many CPUs which predate that specification.
2d31e060
LA
7317 *
7318 * minnummag() and maxnummag() functions correspond to minNumMag()
7319 * and maxNumMag() from the IEEE-754 2008.
274f1b04 7320 */
e70614ea 7321#define MINMAX(s) \
a49db98d 7322static inline float ## s float ## s ## _minmax(float ## s a, float ## s b, \
2d31e060
LA
7323 int ismin, int isieee, \
7324 int ismag STATUS_PARAM) \
274f1b04
PM
7325{ \
7326 flag aSign, bSign; \
2d31e060 7327 uint ## s ## _t av, bv, aav, abv; \
274f1b04
PM
7328 a = float ## s ## _squash_input_denormal(a STATUS_VAR); \
7329 b = float ## s ## _squash_input_denormal(b STATUS_VAR); \
7330 if (float ## s ## _is_any_nan(a) || \
7331 float ## s ## _is_any_nan(b)) { \
e17ab310
WN
7332 if (isieee) { \
7333 if (float ## s ## _is_quiet_nan(a) && \
7334 !float ## s ##_is_any_nan(b)) { \
7335 return b; \
7336 } else if (float ## s ## _is_quiet_nan(b) && \
7337 !float ## s ## _is_any_nan(a)) { \
7338 return a; \
7339 } \
7340 } \
274f1b04
PM
7341 return propagateFloat ## s ## NaN(a, b STATUS_VAR); \
7342 } \
7343 aSign = extractFloat ## s ## Sign(a); \
7344 bSign = extractFloat ## s ## Sign(b); \
7345 av = float ## s ## _val(a); \
7346 bv = float ## s ## _val(b); \
2d31e060
LA
7347 if (ismag) { \
7348 aav = float ## s ## _abs(av); \
7349 abv = float ## s ## _abs(bv); \
7350 if (aav != abv) { \
7351 if (ismin) { \
7352 return (aav < abv) ? a : b; \
7353 } else { \
7354 return (aav < abv) ? b : a; \
7355 } \
7356 } \
7357 } \
274f1b04
PM
7358 if (aSign != bSign) { \
7359 if (ismin) { \
7360 return aSign ? a : b; \
7361 } else { \
7362 return aSign ? b : a; \
7363 } \
7364 } else { \
7365 if (ismin) { \
7366 return (aSign ^ (av < bv)) ? a : b; \
7367 } else { \
7368 return (aSign ^ (av < bv)) ? b : a; \
7369 } \
7370 } \
7371} \
7372 \
7373float ## s float ## s ## _min(float ## s a, float ## s b STATUS_PARAM) \
7374{ \
2d31e060 7375 return float ## s ## _minmax(a, b, 1, 0, 0 STATUS_VAR); \
274f1b04
PM
7376} \
7377 \
7378float ## s float ## s ## _max(float ## s a, float ## s b STATUS_PARAM) \
7379{ \
2d31e060 7380 return float ## s ## _minmax(a, b, 0, 0, 0 STATUS_VAR); \
e17ab310
WN
7381} \
7382 \
7383float ## s float ## s ## _minnum(float ## s a, float ## s b STATUS_PARAM) \
7384{ \
2d31e060 7385 return float ## s ## _minmax(a, b, 1, 1, 0 STATUS_VAR); \
e17ab310
WN
7386} \
7387 \
7388float ## s float ## s ## _maxnum(float ## s a, float ## s b STATUS_PARAM) \
7389{ \
2d31e060
LA
7390 return float ## s ## _minmax(a, b, 0, 1, 0 STATUS_VAR); \
7391} \
7392 \
7393float ## s float ## s ## _minnummag(float ## s a, float ## s b STATUS_PARAM) \
7394{ \
7395 return float ## s ## _minmax(a, b, 1, 1, 1 STATUS_VAR); \
7396} \
7397 \
7398float ## s float ## s ## _maxnummag(float ## s a, float ## s b STATUS_PARAM) \
7399{ \
7400 return float ## s ## _minmax(a, b, 0, 1, 1 STATUS_VAR); \
274f1b04
PM
7401}
7402
e70614ea
WN
7403MINMAX(32)
7404MINMAX(64)
274f1b04
PM
7405
7406
9ee6e8bb
PB
7407/* Multiply A by 2 raised to the power N. */
7408float32 float32_scalbn( float32 a, int n STATUS_PARAM )
7409{
7410 flag aSign;
326b9e98 7411 int16_t aExp;
bb98fe42 7412 uint32_t aSig;
9ee6e8bb 7413
37d18660 7414 a = float32_squash_input_denormal(a STATUS_VAR);
9ee6e8bb
PB
7415 aSig = extractFloat32Frac( a );
7416 aExp = extractFloat32Exp( a );
7417 aSign = extractFloat32Sign( a );
7418
7419 if ( aExp == 0xFF ) {
326b9e98
AJ
7420 if ( aSig ) {
7421 return propagateFloat32NaN( a, a STATUS_VAR );
7422 }
9ee6e8bb
PB
7423 return a;
7424 }
3c85c37f 7425 if (aExp != 0) {
69397542 7426 aSig |= 0x00800000;
3c85c37f 7427 } else if (aSig == 0) {
69397542 7428 return a;
3c85c37f
PM
7429 } else {
7430 aExp++;
7431 }
69397542 7432
326b9e98
AJ
7433 if (n > 0x200) {
7434 n = 0x200;
7435 } else if (n < -0x200) {
7436 n = -0x200;
7437 }
7438
69397542
PB
7439 aExp += n - 1;
7440 aSig <<= 7;
7441 return normalizeRoundAndPackFloat32( aSign, aExp, aSig STATUS_VAR );
9ee6e8bb
PB
7442}
7443
7444float64 float64_scalbn( float64 a, int n STATUS_PARAM )
7445{
7446 flag aSign;
326b9e98 7447 int16_t aExp;
bb98fe42 7448 uint64_t aSig;
9ee6e8bb 7449
37d18660 7450 a = float64_squash_input_denormal(a STATUS_VAR);
9ee6e8bb
PB
7451 aSig = extractFloat64Frac( a );
7452 aExp = extractFloat64Exp( a );
7453 aSign = extractFloat64Sign( a );
7454
7455 if ( aExp == 0x7FF ) {
326b9e98
AJ
7456 if ( aSig ) {
7457 return propagateFloat64NaN( a, a STATUS_VAR );
7458 }
9ee6e8bb
PB
7459 return a;
7460 }
3c85c37f 7461 if (aExp != 0) {
69397542 7462 aSig |= LIT64( 0x0010000000000000 );
3c85c37f 7463 } else if (aSig == 0) {
69397542 7464 return a;
3c85c37f
PM
7465 } else {
7466 aExp++;
7467 }
69397542 7468
326b9e98
AJ
7469 if (n > 0x1000) {
7470 n = 0x1000;
7471 } else if (n < -0x1000) {
7472 n = -0x1000;
7473 }
7474
69397542
PB
7475 aExp += n - 1;
7476 aSig <<= 10;
7477 return normalizeRoundAndPackFloat64( aSign, aExp, aSig STATUS_VAR );
9ee6e8bb
PB
7478}
7479
9ee6e8bb
PB
7480floatx80 floatx80_scalbn( floatx80 a, int n STATUS_PARAM )
7481{
7482 flag aSign;
326b9e98 7483 int32_t aExp;
bb98fe42 7484 uint64_t aSig;
9ee6e8bb
PB
7485
7486 aSig = extractFloatx80Frac( a );
7487 aExp = extractFloatx80Exp( a );
7488 aSign = extractFloatx80Sign( a );
7489
326b9e98
AJ
7490 if ( aExp == 0x7FFF ) {
7491 if ( aSig<<1 ) {
7492 return propagateFloatx80NaN( a, a STATUS_VAR );
7493 }
9ee6e8bb
PB
7494 return a;
7495 }
326b9e98 7496
3c85c37f
PM
7497 if (aExp == 0) {
7498 if (aSig == 0) {
7499 return a;
7500 }
7501 aExp++;
7502 }
69397542 7503
326b9e98
AJ
7504 if (n > 0x10000) {
7505 n = 0x10000;
7506 } else if (n < -0x10000) {
7507 n = -0x10000;
7508 }
7509
9ee6e8bb 7510 aExp += n;
69397542
PB
7511 return normalizeRoundAndPackFloatx80( STATUS(floatx80_rounding_precision),
7512 aSign, aExp, aSig, 0 STATUS_VAR );
9ee6e8bb 7513}
9ee6e8bb 7514
9ee6e8bb
PB
7515float128 float128_scalbn( float128 a, int n STATUS_PARAM )
7516{
7517 flag aSign;
326b9e98 7518 int32_t aExp;
bb98fe42 7519 uint64_t aSig0, aSig1;
9ee6e8bb
PB
7520
7521 aSig1 = extractFloat128Frac1( a );
7522 aSig0 = extractFloat128Frac0( a );
7523 aExp = extractFloat128Exp( a );
7524 aSign = extractFloat128Sign( a );
7525 if ( aExp == 0x7FFF ) {
326b9e98
AJ
7526 if ( aSig0 | aSig1 ) {
7527 return propagateFloat128NaN( a, a STATUS_VAR );
7528 }
9ee6e8bb
PB
7529 return a;
7530 }
3c85c37f 7531 if (aExp != 0) {
69397542 7532 aSig0 |= LIT64( 0x0001000000000000 );
3c85c37f 7533 } else if (aSig0 == 0 && aSig1 == 0) {
69397542 7534 return a;
3c85c37f
PM
7535 } else {
7536 aExp++;
7537 }
69397542 7538
326b9e98
AJ
7539 if (n > 0x10000) {
7540 n = 0x10000;
7541 } else if (n < -0x10000) {
7542 n = -0x10000;
7543 }
7544
69397542
PB
7545 aExp += n - 1;
7546 return normalizeRoundAndPackFloat128( aSign, aExp, aSig0, aSig1
7547 STATUS_VAR );
9ee6e8bb
PB
7548
7549}