[mirror_edk2.git] / StdLib / LibC / Math / e_sqrt.c

/** @file\r
  Compute the square root of x.\r
\r
  Copyright (c) 2010 - 2011, Intel Corporation. All rights reserved.<BR>\r
  This program and the accompanying materials are licensed and made available under\r
  the terms and conditions of the BSD License that accompanies this distribution.\r
  The full text of the license may be found at\r
  http://opensource.org/licenses/bsd-license.\r
\r
  THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS,\r
  WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED.\r
\r
 * ====================================================\r
 * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.\r
 *\r
 * Developed at SunPro, a Sun Microsystems, Inc. business.\r
 * Permission to use, copy, modify, and distribute this\r
 * software is freely granted, provided that this notice\r
 * is preserved.\r
 * ====================================================\r
\r
  e_sqrt.c 5.1 93/09/24\r
  NetBSD: e_sqrt.c,v 1.12 2002/05/26 22:01:52 wiz Exp\r
**/\r
#include  <LibConfig.h>\r
#include  <sys/EfiCdefs.h>\r
\r
#include  <errno.h>\r
#include "math.h"\r
#include "math_private.h"\r
\r
#if defined(_MSC_VER)           /* Handle Microsoft VC++ compiler specifics. */\r
// potential divide by 0 -- (x-x)/(x-x) in __ieee754_sqrt is on purpose\r
#pragma warning ( disable : 4723 )\r
#endif\r
\r
/* __ieee754_sqrt(x)\r
 * Return correctly rounded sqrt.\r
 *           ------------------------------------------\r
 *       |  Use the hardware sqrt if you have one |\r
 *           ------------------------------------------\r
 * Method:\r
 *   Bit by bit method using integer arithmetic. (Slow, but portable)\r
 *   1. Normalization\r
 *  Scale x to y in [1,4) with even powers of 2:\r
 *  find an integer k such that  1 <= (y=x*2^(2k)) < 4, then\r
 *    sqrt(x) = 2^k * sqrt(y)\r
 *   2. Bit by bit computation\r
 *  Let q  = sqrt(y) truncated to i bit after binary point (q = 1),\r
 *       i               0\r
 *                                     i+1         2\r
 *      s  = 2*q , and  y  =  2   * ( y - q  ).   (1)\r
 *       i      i            i                 i\r
 *\r
 *  To compute q    from q , one checks whether\r
 *        i+1       i\r
 *\r
 *            -(i+1) 2\r
 *      (q + 2      ) <= y.     (2)\r
 *            i\r
 *                    -(i+1)\r
 *  If (2) is false, then q   = q ; otherwise q   = q  + 2      .\r
 *             i+1   i             i+1   i\r
 *\r
 *  With some algebraic manipulation, it is not difficult to see\r
 *  that (2) is equivalent to\r
 *                             -(i+1)\r
 *      s  +  2       <= y      (3)\r
 *       i                i\r
 *\r
 *  The advantage of (3) is that s  and y  can be computed by\r
 *              i      i\r
 *  the following recurrence formula:\r
 *      if (3) is false\r
 *\r
 *      s     =  s  , y    = y   ;      (4)\r
 *       i+1      i    i+1    i\r
 *\r
 *      otherwise,\r
 *                         -i                     -(i+1)\r
 *      s   =  s  + 2  ,  y    = y  -  s  - 2     (5)\r
 *           i+1      i          i+1    i     i\r
 *\r
 *  One may easily use induction to prove (4) and (5).\r
 *  Note. Since the left hand side of (3) contains only i+2 bits,\r
 *        it is not necessary to do a full (53-bit) comparison\r
 *        in (3).\r
 *   3. Final rounding\r
 *  After generating the 53 bits result, we compute one more bit.\r
 *  Together with the remainder, we can decide whether the\r
 *  result is exact, bigger than 1/2ulp, or less than 1/2ulp\r
 *  (it will never be equal to 1/2ulp).\r
 *  The rounding mode can be detected by checking whether\r
 *  huge + tiny is equal to huge, and whether huge - tiny is\r
 *  equal to huge for some floating point number "huge" and "tiny".\r
 *\r
 * Special cases:\r
 *  sqrt(+-0) = +-0   ... exact\r
 *  sqrt(inf) = inf\r
 *  sqrt(-ve) = NaN   ... with invalid signal\r
 *  sqrt(NaN) = NaN   ... with invalid signal for signaling NaN\r
 *\r
 * Other methods : see the appended file at the end of the program below.\r
 *---------------\r
 */\r
\r
/* one  : exact constant 1.0.
   tiny : a value so small that one-tiny and one+tiny are inexact; how those
          sums round is used below to probe the current rounding direction. */
static  const double  one = 1.0, tiny=1.0e-300;

/** Compute the square root of x with integer (bit-by-bit) arithmetic.

  The double is split into two 32-bit words with EXTRACT_WORDS (declared in
  math_private.h, not shown here); this routine treats ix0 as the word that
  carries the sign, exponent and upper fraction bits, and ix1 as the lower
  fraction bits.  The result is reassembled with INSERT_WORDS.

  @param[in]  x   The value whose square root is wanted.

  @return   x*x+x when the exponent field of x is all ones (Inf or NaN):
            NaN for NaN, +Inf for +Inf, and Inf-Inf (a NaN) for -Inf.
  @return   x itself when x is +0 or -0.
  @return   (x-x)/(x-x), i.e. 0/0, with errno set to EDOM, for any other
            negative x.
  @return   Otherwise the square root assembled from the generated bits and
            adjusted in the last place according to the rounding probe.
**/
double
__ieee754_sqrt(double x)
{
  double z;
  int32_t sign = (int)0x80000000;
  int32_t ix0,s0,q,m,t,i;
  u_int32_t r,t1,s1,ix1,q1;

  EXTRACT_WORDS(ix0,ix1,x);

    /* take care of Inf and NaN (exponent field all ones) */
  if((ix0&0x7ff00000)==0x7ff00000) {
      return x*x+x;   /* sqrt(NaN)=NaN, sqrt(+inf)=+inf,
                         sqrt(-inf)=inf-inf=NaN */
  }
    /* take care of zero and of negative arguments */
  if(ix0<=0) {
      if(((ix0&(~sign))|ix1)==0) return x;/* sqrt(+-0) = +-0 */
      else if(ix0<0) {
        errno = EDOM;
        return (x-x)/(x-x);   /* sqrt(-ve) = 0/0 = NaN, on purpose */
      }
  }
    /* normalize x */
  m = (ix0>>20);
  if(m==0) {        /* subnormal x */
      /* Move fraction bits up from ix1, 21 bits at a time, until the
         upper word is non-zero. */
      while(ix0==0) {
        m -= 21;
        ix0 |= (ix1>>11); ix1 <<= 21;
      }
      /* Shift left until bit 20 of ix0 is set; i counts the shifts. */
      for(i=0;(ix0&0x00100000)==0;i++) ix0<<=1;
      m -= i-1;
      /* i is 0 when the loop above already left bit 20 set (e.g. the
         original ix1 had bit 31 set).  ix1>>(32-0) would then be a shift
         by the full width of the type, which is undefined behaviour, and
         there is nothing to carry over anyway. */
      if(i!=0) {
        ix0 |= (ix1>>(32-i));
        ix1 <<= i;
      }
  }
  m -= 1023;  /* unbias exponent */
  ix0 = (ix0&0x000fffff)|0x00100000;   /* keep fraction, set bit 20 */
  if(m&1){  /* odd m, double x to make it even */
      ix0 += ix0 + ((ix1&sign)>>31);
      ix1 += ix1;
  }
  m >>= 1;  /* m = [m/2] */

    /* generate sqrt(x) bit by bit */
  ix0 += ix0 + ((ix1&sign)>>31);
  ix1 += ix1;
  q = q1 = s0 = s1 = 0; /* [q,q1] = sqrt(x) */
  r = 0x00200000;   /* r = moving bit from right to left */

    /* bits of the result that land in the upper word q */
  while(r!=0) {
      t = s0+r;
      if(t<=ix0) {
        s0   = t+r;
        ix0 -= t;
        q   += r;
      }
      ix0 += ix0 + ((ix1&sign)>>31);   /* [ix0,ix1] <<= 1 */
      ix1 += ix1;
      r>>=1;
  }

    /* bits of the result that land in the lower word q1; the comparison
       and the subtraction are done on the two-word values [t,t1] and
       [ix0,ix1] */
  r = sign;
  while(r!=0) {
      t1 = s1+r;
      t  = s0;
      if((t<ix0)||((t==ix0)&&(t1<=ix1))) {
        s1  = t1+r;
        /* carry from s1 into s0 */
        if(((t1&sign)==(u_int32_t)sign)&&(s1&sign)==0) s0 += 1;
        ix0 -= t;
        if (ix1 < t1) ix0 -= 1;   /* borrow from the upper word */
        ix1 -= t1;
        q1  += r;
      }
      ix0 += ix0 + ((ix1&sign)>>31);
      ix1 += ix1;
      r>>=1;
  }

    /* use floating add to find out rounding direction */
  if((ix0|ix1)!=0) {    /* non-zero remainder: result is inexact */
      z = one-tiny; /* trigger inexact flag */
      if (z>=one) {     /* one-tiny did not round below one */
          z = one+tiny;
          if (q1==(u_int32_t)0xffffffff) { q1=0; q += 1;}
          else if (z>one) {   /* one+tiny rounded above one */
              if (q1==(u_int32_t)0xfffffffe) q+=1;   /* carry into q */
              q1+=2;
          } else
              q1 += (q1&1);
      }
  }
    /* drop the extra low bit of [q,q1] and build the result words */
  ix0 = (q>>1)+0x3fe00000;
  ix1 =  q1>>1;
  if ((q&1)==1) ix1 |= sign;   /* low bit of q becomes top bit of ix1 */
  /* Add m to the exponent field.  m is negative whenever the unbiased
     exponent was negative, and left-shifting a negative value is
     undefined behaviour, so multiply by 2^20 instead of using m<<20. */
  ix0 += m*0x00100000;
  INSERT_WORDS(z,ix0,ix1);
  return z;
}
\r
/*\r
Other methods  (use floating-point arithmetic)\r
-------------\r
(This is a copy of a drafted paper by Prof W. Kahan\r
and K.C. Ng, written in May, 1986)\r
\r
  Two algorithms are given here to implement sqrt(x)\r
  (IEEE double precision arithmetic) in software.\r
  Both supply sqrt(x) correctly rounded. The first algorithm (in\r
  Section A) uses newton iterations and involves four divisions.\r
  The second one uses reciproot iterations to avoid division, but\r
  requires more multiplications. Both algorithms need the ability\r
  to chop results of arithmetic operations instead of rounding them,\r
  and the INEXACT flag to indicate when an arithmetic operation\r
  is executed exactly with no roundoff error, all part of the\r
  standard (IEEE 754-1985). The ability to perform shift, add,\r
  subtract and logical AND operations upon 32-bit words is needed\r
  too, though not part of the standard.\r
\r
A.  sqrt(x) by Newton Iteration\r
\r
   (1)  Initial approximation\r
\r
  Let x0 and x1 be the leading and the trailing 32-bit words of\r
  a floating point number x (in IEEE double format) respectively\r
\r
      1    11        52         ...widths\r
     ------------------------------------------------------\r
  x: |s|    e     |       f       |\r
     ------------------------------------------------------\r
        msb    lsb  msb             lsb ...order\r
\r
\r
       ------------------------        ------------------------\r
  x0:  |s|   e    |    f1     |  x1: |          f2           |\r
       ------------------------        ------------------------\r
\r
  By performing shifts and subtracts on x0 and x1 (both regarded\r
  as integers), we obtain an 8-bit approximation of sqrt(x) as\r
  follows.\r
\r
    k  := (x0>>1) + 0x1ff80000;\r
    y0 := k - T1[31&(k>>15)]. ... y ~ sqrt(x) to 8 bits\r
  Here k is a 32-bit integer and T1[] is an integer array containing\r
  correction terms. Now magically the floating value of y (y's\r
  leading 32-bit word is y0, the value of its trailing word is 0)\r
  approximates sqrt(x) to almost 8-bit.\r
\r
  Value of T1:\r
  static int T1[32]= {\r
  0,  1024, 3062, 5746, 9193, 13348,  18162,  23592,\r
  29598,  36145,  43202,  50740,  58733,  67158,  75992,  85215,\r
  83599,  71378,  60428,  50647,  41945,  34246,  27478,  21581,\r
  16499,  12183,  8588, 5674, 3403, 1742, 661,  130,};\r
\r
    (2) Iterative refinement\r
\r
  Apply Heron's rule three times to y, we have y approximates\r
  sqrt(x) to within 1 ulp (Unit in the Last Place):\r
\r
    y := (y+x/y)/2    ... almost 17 sig. bits\r
    y := (y+x/y)/2    ... almost 35 sig. bits\r
    y := y-(y-x/y)/2  ... within 1 ulp\r
\r
\r
  Remark 1.\r
      Another way to improve y to within 1 ulp is:\r
\r
    y := (y+x/y)    ... almost 17 sig. bits to 2*sqrt(x)\r
    y := y - 0x00100006 ... almost 18 sig. bits to sqrt(x)\r
\r
        2\r
          (x-y )*y\r
    y := y + 2* ----------  ...within 1 ulp\r
             2\r
           3y  + x\r
\r
\r
  This formula has one division fewer than the one above; however,\r
  it requires more multiplications and additions. Also x must be\r
  scaled in advance to avoid spurious overflow in evaluating the\r
  expression 3y*y+x. Hence it is not recommended unless division\r
  is slow. If division is very slow, then one should use the\r
  reciproot algorithm given in section B.\r
\r
    (3) Final adjustment\r
\r
  By twiddling y's last bit it is possible to force y to be\r
  correctly rounded according to the prevailing rounding mode\r
  as follows. Let r and i be copies of the rounding mode and\r
  inexact flag before entering the square root program. Also we\r
  use the expression y+-ulp for the next representable floating\r
  numbers (up and down) of y. Note that y+-ulp = either fixed\r
  point y+-1, or multiply y by nextafter(1,+-inf) in chopped\r
  mode.\r
\r
    I := FALSE; ... reset INEXACT flag I\r
    R := RZ;  ... set rounding mode to round-toward-zero\r
    z := x/y; ... chopped quotient, possibly inexact\r
    If(not I) then {  ... if the quotient is exact\r
        if(z=y) {\r
            I := i;  ... restore inexact flag\r
            R := r;  ... restore rounded mode\r
            return sqrt(x):=y.\r
        } else {\r
      z := z - ulp; ... special rounding\r
        }\r
    }\r
    i := TRUE;    ... sqrt(x) is inexact\r
    If (r=RN) then z=z+ulp  ... rounded-to-nearest\r
    If (r=RP) then {  ... round-toward-+inf\r
        y = y+ulp; z=z+ulp;\r
    }\r
    y := y+z;   ... chopped sum\r
    y0:=y0-0x00100000;  ... y := y/2 is correctly rounded.\r
          I := i;     ... restore inexact flag\r
          R := r;     ... restore rounded mode\r
          return sqrt(x):=y.\r
\r
    (4) Special cases\r
\r
  Square root of +inf, +-0, or NaN is itself;\r
  Square root of a negative number is NaN with invalid signal.\r
\r
\r
B.  sqrt(x) by Reciproot Iteration\r
\r
   (1)  Initial approximation\r
\r
  Let x0 and x1 be the leading and the trailing 32-bit words of\r
  a floating point number x (in IEEE double format) respectively\r
  (see section A). By performing shifts and subtracts on x0 and y0,\r
  we obtain a 7.8-bit approximation of 1/sqrt(x) as follows.\r
\r
      k := 0x5fe80000 - (x0>>1);\r
      y0:= k - T2[63&(k>>14)].  ... y ~ 1/sqrt(x) to 7.8 bits\r
\r
  Here k is a 32-bit integer and T2[] is an integer array\r
  containing correction terms. Now magically the floating\r
  value of y (y's leading 32-bit word is y0, the value of\r
  its trailing word y1 is set to zero) approximates 1/sqrt(x)\r
  to almost 7.8-bit.\r
\r
  Value of T2:\r
  static int T2[64]= {\r
  0x1500, 0x2ef8, 0x4d67, 0x6b02, 0x87be, 0xa395, 0xbe7a, 0xd866,\r
  0xf14a, 0x1091b,0x11fcd,0x13552,0x14999,0x15c98,0x16e34,0x17e5f,\r
  0x18d03,0x19a01,0x1a545,0x1ae8a,0x1b5c4,0x1bb01,0x1bfde,0x1c28d,\r
  0x1c2de,0x1c0db,0x1ba73,0x1b11c,0x1a4b5,0x1953d,0x18266,0x16be0,\r
  0x1683e,0x179d8,0x18a4d,0x19992,0x1a789,0x1b445,0x1bf61,0x1c989,\r
  0x1d16d,0x1d77b,0x1dddf,0x1e2ad,0x1e5bf,0x1e6e8,0x1e654,0x1e3cd,\r
  0x1df2a,0x1d635,0x1cb16,0x1be2c,0x1ae4e,0x19bde,0x1868e,0x16e2e,\r
  0x1527f,0x1334a,0x11051,0xe951, 0xbe01, 0x8e0d, 0x5924, 0x1edd,};\r
\r
    (2) Iterative refinement\r
\r
  Apply Reciproot iteration three times to y and multiply the\r
  result by x to get an approximation z that matches sqrt(x)\r
  to about 1 ulp. To be exact, we will have\r
    -1ulp < sqrt(x)-z<1.0625ulp.\r
\r
  ... set rounding mode to Round-to-nearest\r
     y := y*(1.5-0.5*x*y*y) ... almost 15 sig. bits to 1/sqrt(x)\r
     y := y*((1.5-2^-30)+0.5*x*y*y)... about 29 sig. bits to 1/sqrt(x)\r
  ... special arrangement for better accuracy\r
     z := x*y     ... 29 bits to sqrt(x), with z*y<1\r
     z := z + 0.5*z*(1-z*y) ... about 1 ulp to sqrt(x)\r
\r
  Remark 2. The constant 1.5-2^-30 is chosen to bias the error so that\r
  (a) the term z*y in the final iteration is always less than 1;\r
  (b) the error in the final result is biased upward so that\r
    -1 ulp < sqrt(x) - z < 1.0625 ulp\r
      instead of |sqrt(x)-z|<1.03125ulp.\r
\r
    (3) Final adjustment\r
\r
  By twiddling y's last bit it is possible to force y to be\r
  correctly rounded according to the prevailing rounding mode\r
  as follows. Let r and i be copies of the rounding mode and\r
  inexact flag before entering the square root program. Also we\r
  use the expression y+-ulp for the next representable floating\r
  numbers (up and down) of y. Note that y+-ulp = either fixed\r
  point y+-1, or multiply y by nextafter(1,+-inf) in chopped\r
  mode.\r
\r
  R := RZ;    ... set rounding mode to round-toward-zero\r
  switch(r) {\r
      case RN:    ... round-to-nearest\r
         if(x<= z*(z-ulp)...chopped) z = z - ulp; else\r
         if(x<= z*(z+ulp)...chopped) z = z; else z = z+ulp;\r
         break;\r
      case RZ:case RM:  ... round-to-zero or round-to--inf\r
         R:=RP;   ... reset rounding mode to round-to-+inf\r
         if(x<z*z ... rounded up) z = z - ulp; else\r
         if(x>=(z+ulp)*(z+ulp) ...rounded up) z = z+ulp;\r
         break;\r
      case RP:    ... round-to-+inf\r
         if(x>(z+ulp)*(z+ulp)...chopped) z = z+2*ulp; else\r
         if(x>z*z ...chopped) z = z+ulp;\r
         break;\r
  }\r
\r
  Remark 3. The above comparisons can be done in fixed point. For\r
  example, to compare x and w=z*z chopped, it suffices to compare\r
  x1 and w1 (the trailing parts of x and w), regarding them as\r
  two's complement integers.\r
\r
  ...Is z an exact square root?\r
  To determine whether z is an exact square root of x, let z1 be the\r
  trailing part of z, and also let x0 and x1 be the leading and\r
  trailing parts of x.\r
\r
  If ((z1&0x03ffffff)!=0) ... not exact if trailing 26 bits of z!=0\r
      I := 1;   ... Raise Inexact flag: z is not exact\r
  else {\r
      j := 1 - [(x0>>20)&1] ... j = logb(x) mod 2\r
      k := z1 >> 26;    ... get z's 25-th and 26-th\r
              fraction bits\r
      I := i or (k&j) or ((k&(j+j+1))!=(x1&3));\r
  }\r
  R:= r   ... restore rounded mode\r
  return sqrt(x):=z.\r
\r
  If multiplication is cheaper than the foregoing red tape, the\r
  Inexact flag can be evaluated by\r
\r
      I := i;\r
      I := (z*z!=x) or I.\r
\r
  Note that z*z can overwrite I; this value must be sensed if it is\r
  True.\r
\r
  Remark 4. If z*z = x exactly, then bit 25 to bit 0 of z1 must be\r
  zero.\r
\r
        --------------------\r
    z1: |        f2        |\r
        --------------------\r
    bit 31       bit 0\r
\r
  Furthermore, bit 27 and 26 of z1, bit 0 and 1 of x1, and the odd\r
  or even of logb(x) have the following relations:\r
\r
  -------------------------------------------------\r
  bit 27,26 of z1   bit 1,0 of x1 logb(x)\r
  -------------------------------------------------\r
  00      00    odd and even\r
  01      01    even\r
  10      10    odd\r
  10      00    even\r
  11      01    even\r
  -------------------------------------------------\r
\r
    (4) Special cases (see (4) of Section A).\r
\r
 */\r
\r
Commit	Line	Data
2aa62f2b	1	/** @file\r
	2	Compute the logrithm of x.\r
	3	\r
	4	Copyright (c) 2010 - 2011, Intel Corporation. All rights reserved.<BR>\r
	5	This program and the accompanying materials are licensed and made available under\r
	6	the terms and conditions of the BSD License that accompanies this distribution.\r
	7	The full text of the license may be found at\r
	8	http://opensource.org/licenses/bsd-license.\r
	9	\r
	10	THE PROGRAM IS DISTRIBUTED UNDER THE BSD LICENSE ON AN "AS IS" BASIS,\r
	11	WITHOUT WARRANTIES OR REPRESENTATIONS OF ANY KIND, EITHER EXPRESS OR IMPLIED.\r
	12	\r
	13	* ====================================================\r
	14	* Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.\r
	15	*\r
	16	* Developed at SunPro, a Sun Microsystems, Inc. business.\r
	17	* Permission to use, copy, modify, and distribute this\r
	18	* software is freely granted, provided that this notice\r
	19	* is preserved.\r
	20	* ====================================================\r
	21	\r
	22	e_sqrt.c 5.1 93/09/24\r
	23	NetBSD: e_sqrt.c,v 1.12 2002/05/26 22:01:52 wiz Exp\r
	24	**/\r
	25	#include <LibConfig.h>\r
	26	#include <sys/EfiCdefs.h>\r
	27	\r
	28	#include <errno.h>\r
	29	#include "math.h"\r
	30	#include "math_private.h"\r
	31	\r
	32	#if defined(_MSC_VER) /* Handle Microsoft VC++ compiler specifics. */\r
	33	// potential divide by 0 -- near line 129, (x-x)/(x-x) is on purpose\r
	34	#pragma warning ( disable : 4723 )\r
	35	#endif\r
	36	\r
	37	/* __ieee754_sqrt(x)\r
	38	* Return correctly rounded sqrt.\r
	39	* ------------------------------------------\r
	40	* \| Use the hardware sqrt if you have one \|\r
	41	* ------------------------------------------\r
	42	* Method:\r
	43	* Bit by bit method using integer arithmetic. (Slow, but portable)\r
	44	* 1. Normalization\r
	45	* Scale x to y in [1,4) with even powers of 2:\r
	46	* find an integer k such that 1 <= (y=x*2^(2k)) < 4, then\r
	47	* sqrt(x) = 2^k * sqrt(y)\r
	48	* 2. Bit by bit computation\r
	49	* Let q = sqrt(y) truncated to i bit after binary point (q = 1),\r
	50	* i 0\r
	51	* i+1 2\r
	52	* s = 2q , and y = 2 ( y - q ). (1)\r
	53	* i i i i\r
	54	*\r
	55	* To compute q from q , one checks whether\r
	56	* i+1 i\r
	57	*\r
	58	* -(i+1) 2\r
	59	* (q + 2 ) <= y. (2)\r
	60	* i\r
	61	* -(i+1)\r
	62	* If (2) is false, then q = q ; otherwise q = q + 2 .\r
	63	* i+1 i i+1 i\r
	64	*\r
65	* With some algebric manipulation, it is not difficult to see\r
66	* that (2) is equivalent to\r
67	* -(i+1)\r
68	* s + 2 <= y (3)\r
69	* i i\r
70	*\r
71	* The advantage of (3) is that s and y can be computed by\r
72	* i i\r
73	* the following recurrence formula:\r
74	* if (3) is false\r
75	*\r
76	* s = s , y = y ; (4)\r
77	* i+1 i i+1 i\r
78	*\r
79	* otherwise,\r
80	* -i -(i+1)\r
81	* s = s + 2 , y = y - s - 2 (5)\r
82	* i+1 i i+1 i i\r
83	*\r
84	* One may easily use induction to prove (4) and (5).\r
85	* Note. Since the left hand side of (3) contain only i+2 bits,\r
86	* it does not necessary to do a full (53-bit) comparison\r
87	* in (3).\r
88	* 3. Final rounding\r
89	* After generating the 53 bits result, we compute one more bit.\r
90	* Together with the remainder, we can decide whether the\r
91	* result is exact, bigger than 1/2ulp, or less than 1/2ulp\r
92	* (it will never equal to 1/2ulp).\r
93	* The rounding mode can be detected by checking whether\r
94	* huge + tiny is equal to huge, and whether huge - tiny is\r
95	* equal to huge for some floating point number "huge" and "tiny".\r
96	*\r
97	* Special cases:\r
98	* sqrt(+-0) = +-0 ... exact\r
99	* sqrt(inf) = inf\r
100	* sqrt(-ve) = NaN ... with invalid signal\r
101	* sqrt(NaN) = NaN ... with invalid signal for signaling NaN\r
102	*\r
103	* Other methods : see the appended file at the end of the program below.\r
104	*---------------\r
105	*/\r
106	\r
107	static const double one = 1.0, tiny=1.0e-300;\r
108	\r
109	double\r
110	__ieee754_sqrt(double x)\r
111	{\r
112	double z;\r
113	int32_t sign = (int)0x80000000;\r
114	int32_t ix0,s0,q,m,t,i;\r
115	u_int32_t r,t1,s1,ix1,q1;\r
116	\r
117	EXTRACT_WORDS(ix0,ix1,x);\r
118	\r
119	/* take care of Inf and NaN */\r
120	if((ix0&0x7ff00000)==0x7ff00000) {\r
121	return xx+x; / sqrt(NaN)=NaN, sqrt(+inf)=+inf\r
122	sqrt(-inf)=sNaN */\r
123	}\r
124	/* take care of zero */\r
125	if(ix0<=0) {\r
126	if(((ix0&(~sign))\|ix1)==0) return x;/* sqrt(+-0) = +-0 */\r
127	else if(ix0<0) {\r
128	errno = EDOM;\r
129	return (x-x)/(x-x); /* sqrt(-ve) = sNaN */\r
130	}\r
131	}\r
132	/* normalize x */\r
133	m = (ix0>>20);\r
134	if(m==0) { /* subnormal x */\r
135	while(ix0==0) {\r
136	m -= 21;\r
137	ix0 \|= (ix1>>11); ix1 <<= 21;\r
138	}\r
139	for(i=0;(ix0&0x00100000)==0;i++) ix0<<=1;\r
140	m -= i-1;\r
141	ix0 \|= (ix1>>(32-i));\r
142	ix1 <<= i;\r
143	}\r
144	m -= 1023; /* unbias exponent */\r
145	ix0 = (ix0&0x000fffff)\|0x00100000;\r
146	if(m&1){ /* odd m, double x to make it even */\r
147	ix0 += ix0 + ((ix1&sign)>>31);\r
148	ix1 += ix1;\r
149	}\r
150	m >>= 1; /* m = [m/2] */\r
151	\r
152	/* generate sqrt(x) bit by bit */\r
153	ix0 += ix0 + ((ix1&sign)>>31);\r
154	ix1 += ix1;\r
155	q = q1 = s0 = s1 = 0; /* [q,q1] = sqrt(x) */\r
156	r = 0x00200000; /* r = moving bit from right to left */\r
157	\r
158	while(r!=0) {\r
159	t = s0+r;\r
160	if(t<=ix0) {\r
161	s0 = t+r;\r
162	ix0 -= t;\r
163	q += r;\r
164	}\r
165	ix0 += ix0 + ((ix1&sign)>>31);\r
166	ix1 += ix1;\r
167	r>>=1;\r
168	}\r
169	\r
170	r = sign;\r
171	while(r!=0) {\r
172	t1 = s1+r;\r
173	t = s0;\r
174	if((t<ix0)\|\|((t==ix0)&&(t1<=ix1))) {\r
175	s1 = t1+r;\r
176	if(((t1&sign)==(u_int32_t)sign)&&(s1&sign)==0) s0 += 1;\r
177	ix0 -= t;\r
178	if (ix1 < t1) ix0 -= 1;\r
179	ix1 -= t1;\r
180	q1 += r;\r
181	}\r
182	ix0 += ix0 + ((ix1&sign)>>31);\r
183	ix1 += ix1;\r
184	r>>=1;\r
185	}\r
186	\r
187	/* use floating add to find out rounding direction */\r
188	if((ix0\|ix1)!=0) {\r
189	z = one-tiny; /* trigger inexact flag */\r
190	if (z>=one) {\r
191	z = one+tiny;\r
192	if (q1==(u_int32_t)0xffffffff) { q1=0; q += 1;}\r
193	else if (z>one) {\r
194	if (q1==(u_int32_t)0xfffffffe) q+=1;\r
195	q1+=2;\r
196	} else\r
197	q1 += (q1&1);\r
198	}\r
199	}\r
200	ix0 = (q>>1)+0x3fe00000;\r
201	ix1 = q1>>1;\r
202	if ((q&1)==1) ix1 \|= sign;\r
203	ix0 += (m <<20);\r
204	INSERT_WORDS(z,ix0,ix1);\r
205	return z;\r
206	}\r
207	\r
208	/*\r
209	Other methods (use floating-point arithmetic)\r
210	-------------\r
211	(This is a copy of a drafted paper by Prof W. Kahan\r
212	and K.C. Ng, written in May, 1986)\r
213	\r
214	Two algorithms are given here to implement sqrt(x)\r
215	(IEEE double precision arithmetic) in software.\r
216	Both supply sqrt(x) correctly rounded. The first algorithm (in\r
217	Section A) uses newton iterations and involves four divisions.\r
218	The second one uses reciproot iterations to avoid division, but\r
219	requires more multiplications. Both algorithms need the ability\r
220	to chop results of arithmetic operations instead of round them,\r
221	and the INEXACT flag to indicate when an arithmetic operation\r
222	is executed exactly with no roundoff error, all part of the\r
223	standard (IEEE 754-1985). The ability to perform shift, add,\r
224	subtract and logical AND operations upon 32-bit words is needed\r
225	too, though not part of the standard.\r
226	\r
227	A. sqrt(x) by Newton Iteration\r
228	\r
229	(1) Initial approximation\r
230	\r
231	Let x0 and x1 be the leading and the trailing 32-bit words of\r
232	a floating point number x (in IEEE double format) respectively\r
233	\r
234	1 11 52 ...widths\r
235	------------------------------------------------------\r
236	x: \|s\| e \| f \|\r
237	------------------------------------------------------\r
238	msb lsb msb lsb ...order\r
239	\r
240	\r
241	------------------------ ------------------------\r
242	x0: \|s\| e \| f1 \| x1: \| f2 \|\r
243	------------------------ ------------------------\r
244	\r
245	By performing shifts and subtracts on x0 and x1 (both regarded\r
246	as integers), we obtain an 8-bit approximation of sqrt(x) as\r
247	follows.\r
248	\r
249	k := (x0>>1) + 0x1ff80000;\r
250	y0 := k - T1[31&(k>>15)]. ... y ~ sqrt(x) to 8 bits\r
251	Here k is a 32-bit integer and T1[] is an integer array containing\r
252	correction terms. Now magically the floating value of y (y's\r
253	leading 32-bit word is y0, the value of its trailing word is 0)\r
254	approximates sqrt(x) to almost 8-bit.\r
255	\r
256	Value of T1:\r
257	static int T1[32]= {\r
258	0, 1024, 3062, 5746, 9193, 13348, 18162, 23592,\r
259	29598, 36145, 43202, 50740, 58733, 67158, 75992, 85215,\r
260	83599, 71378, 60428, 50647, 41945, 34246, 27478, 21581,\r
261	16499, 12183, 8588, 5674, 3403, 1742, 661, 130,};\r
262	\r
263	(2) Iterative refinement\r
264	\r
265	Apply Heron's rule three times to y, we have y approximates\r
266	sqrt(x) to within 1 ulp (Unit in the Last Place):\r
267	\r
268	y := (y+x/y)/2 ... almost 17 sig. bits\r
269	y := (y+x/y)/2 ... almost 35 sig. bits\r
270	y := y-(y-x/y)/2 ... within 1 ulp\r
271	\r
272	\r
273	Remark 1.\r
274	Another way to improve y to within 1 ulp is:\r
275	\r
276	y := (y+x/y) ... almost 17 sig. bits to 2*sqrt(x)\r
277	y := y - 0x00100006 ... almost 18 sig. bits to sqrt(x)\r
278	\r
279	2\r
280	(x-y )*y\r
281	y := y + 2* ---------- ...within 1 ulp\r
282	2\r
283	3y + x\r
284	\r
285	\r
286	This formula has one division fewer than the one above; however,\r
287	it requires more multiplications and additions. Also x must be\r
288	scaled in advance to avoid spurious overflow in evaluating the\r
289	expression 3y*y+x. Hence it is not recommended uless division\r
290	is slow. If division is very slow, then one should use the\r
291	reciproot algorithm given in section B.\r
292	\r
293	(3) Final adjustment\r
294	\r
295	By twiddling y's last bit it is possible to force y to be\r
296	correctly rounded according to the prevailing rounding mode\r
297	as follows. Let r and i be copies of the rounding mode and\r
298	inexact flag before entering the square root program. Also we\r
299	use the expression y+-ulp for the next representable floating\r
300	numbers (up and down) of y. Note that y+-ulp = either fixed\r
301	point y+-1, or multiply y by nextafter(1,+-inf) in chopped\r
302	mode.\r
303	\r
304	I := FALSE; ... reset INEXACT flag I\r
305	R := RZ; ... set rounding mode to round-toward-zero\r
306	z := x/y; ... chopped quotient, possibly inexact\r
307	If(not I) then { ... if the quotient is exact\r
308	if(z=y) {\r
309	I := i; ... restore inexact flag\r
310	R := r; ... restore rounded mode\r
311	return sqrt(x):=y.\r
312	} else {\r
313	z := z - ulp; ... special rounding\r
314	}\r
315	}\r
316	i := TRUE; ... sqrt(x) is inexact\r
317	If (r=RN) then z=z+ulp ... rounded-to-nearest\r
318	If (r=RP) then { ... round-toward-+inf\r
319	y = y+ulp; z=z+ulp;\r
320	}\r
321	y := y+z; ... chopped sum\r
322	y0:=y0-0x00100000; ... y := y/2 is correctly rounded.\r
323	I := i; ... restore inexact flag\r
324	R := r; ... restore rounded mode\r
325	return sqrt(x):=y.\r
326	\r
327	(4) Special cases\r
328	\r
329	Square root of +inf, +-0, or NaN is itself;\r
330	Square root of a negative number is NaN with invalid signal.\r
331	\r
332	\r
333	B. sqrt(x) by Reciproot Iteration\r
334	\r
335	(1) Initial approximation\r
336	\r
337	Let x0 and x1 be the leading and the trailing 32-bit words of\r
338	a floating point number x (in IEEE double format) respectively\r
339	(see section A). By performing shifs and subtracts on x0 and y0,\r
340	we obtain a 7.8-bit approximation of 1/sqrt(x) as follows.\r
341	\r
342	k := 0x5fe80000 - (x0>>1);\r
343	y0:= k - T2[63&(k>>14)]. ... y ~ 1/sqrt(x) to 7.8 bits\r
344	\r
345	Here k is a 32-bit integer and T2[] is an integer array\r
346	containing correction terms. Now magically the floating\r
347	value of y (y's leading 32-bit word is y0, the value of\r
348	its trailing word y1 is set to zero) approximates 1/sqrt(x)\r
349	to almost 7.8-bit.\r
350	\r
351	Value of T2:\r
352	static int T2[64]= {\r
353	0x1500, 0x2ef8, 0x4d67, 0x6b02, 0x87be, 0xa395, 0xbe7a, 0xd866,\r
354	0xf14a, 0x1091b,0x11fcd,0x13552,0x14999,0x15c98,0x16e34,0x17e5f,\r
355	0x18d03,0x19a01,0x1a545,0x1ae8a,0x1b5c4,0x1bb01,0x1bfde,0x1c28d,\r
356	0x1c2de,0x1c0db,0x1ba73,0x1b11c,0x1a4b5,0x1953d,0x18266,0x16be0,\r
357	0x1683e,0x179d8,0x18a4d,0x19992,0x1a789,0x1b445,0x1bf61,0x1c989,\r
358	0x1d16d,0x1d77b,0x1dddf,0x1e2ad,0x1e5bf,0x1e6e8,0x1e654,0x1e3cd,\r
359	0x1df2a,0x1d635,0x1cb16,0x1be2c,0x1ae4e,0x19bde,0x1868e,0x16e2e,\r
360	0x1527f,0x1334a,0x11051,0xe951, 0xbe01, 0x8e0d, 0x5924, 0x1edd,};\r
361	\r
362	(2) Iterative refinement\r
363	\r
364	Apply Reciproot iteration three times to y and multiply the\r
365	result by x to get an approximation z that matches sqrt(x)\r
366	to about 1 ulp. To be exact, we will have\r
367	-1ulp < sqrt(x)-z<1.0625ulp.\r
368	\r
369	... set rounding mode to Round-to-nearest\r
370	y := y(1.5-0.5xyy) ... almost 15 sig. bits to 1/sqrt(x)\r
371	y := y((1.5-2^-30)+0.5xyy)... about 29 sig. bits to 1/sqrt(x)\r
372	... special arrangement for better accuracy\r
373	z := xy ... 29 bits to sqrt(x), with zy<1\r
374	z := z + 0.5z(1-z*y) ... about 1 ulp to sqrt(x)\r
375	\r
376	Remark 2. The constant 1.5-2^-30 is chosen to bias the error so that\r
377	(a) the term z*y in the final iteration is always less than 1;\r
378	(b) the error in the final result is biased upward so that\r
379	-1 ulp < sqrt(x) - z < 1.0625 ulp\r
380	instead of \|sqrt(x)-z\|<1.03125ulp.\r
381	\r
382	(3) Final adjustment\r
383	\r
384	By twiddling y's last bit it is possible to force y to be\r
385	correctly rounded according to the prevailing rounding mode\r
386	as follows. Let r and i be copies of the rounding mode and\r
387	inexact flag before entering the square root program. Also we\r
388	use the expression y+-ulp for the next representable floating\r
389	numbers (up and down) of y. Note that y+-ulp = either fixed\r
390	point y+-1, or multiply y by nextafter(1,+-inf) in chopped\r
391	mode.\r
392	\r
393	R := RZ; ... set rounding mode to round-toward-zero\r
394	switch(r) {\r
395	case RN: ... round-to-nearest\r
396	if(x<= z*(z-ulp)...chopped) z = z - ulp; else\r
397	if(x<= z*(z+ulp)...chopped) z = z; else z = z+ulp;\r
398	break;\r
399	case RZ:case RM: ... round-to-zero or round-to--inf\r
400	R:=RP; ... reset rounding mod to round-to-+inf\r
401	if(x<z*z ... rounded up) z = z - ulp; else\r
402	if(x>=(z+ulp)*(z+ulp) ...rounded up) z = z+ulp;\r
403	break;\r
404	case RP: ... round-to-+inf\r
405	if(x>(z+ulp)(z+ulp)...chopped) z = z+2ulp; else\r
406	if(x>z*z ...chopped) z = z+ulp;\r
407	break;\r
408	}\r
409	\r
410	Remark 3. The above comparisons can be done in fixed point. For\r
411	example, to compare x and w=z*z chopped, it suffices to compare\r
412	x1 and w1 (the trailing parts of x and w), regarding them as\r
413	two's complement integers.\r
414	\r
415	...Is z an exact square root?\r
416	To determine whether z is an exact square root of x, let z1 be the\r
417	trailing part of z, and also let x0 and x1 be the leading and\r
418	trailing parts of x.\r
419	\r
420	If ((z1&0x03ffffff)!=0) ... not exact if trailing 26 bits of z!=0\r
421	I := 1; ... Raise Inexact flag: z is not exact\r
422	else {\r
423	j := 1 - [(x0>>20)&1] ... j = logb(x) mod 2\r
424	k := z1 >> 26; ... get z's 25-th and 26-th\r
425	fraction bits\r
426	I := i or (k&j) or ((k&(j+j+1))!=(x1&3));\r
427	}\r
428	R:= r ... restore rounded mode\r
429	return sqrt(x):=z.\r
430	\r
431	If multiplication is cheaper than the foregoing red tape, the\r
432	Inexact flag can be evaluated by\r
433	\r
434	I := i;\r
435	I := (z*z!=x) or I.\r
436	\r
437	Note that z*z can overwrite I; this value must be sensed if it is\r
438	True.\r
439	\r
440	Remark 4. If z*z = x exactly, then bit 25 to bit 0 of z1 must be\r
441	zero.\r
442	\r
443	--------------------\r
444	z1: \| f2 \|\r
445	--------------------\r
446	bit 31 bit 0\r
447	\r
448	Further more, bit 27 and 26 of z1, bit 0 and 1 of x1, and the odd\r
449	or even of logb(x) have the following relations:\r
450	\r
451	-------------------------------------------------\r
452	bit 27,26 of z1 bit 1,0 of x1 logb(x)\r
453	-------------------------------------------------\r
454	00 00 odd and even\r
455	01 01 even\r
456	10 10 odd\r
457	10 00 even\r
458	11 01 even\r
459	-------------------------------------------------\r
460	\r
461	(4) Special cases (see (4) of Section A).\r
462	\r
463	*/\r
464	\r