ceph/src/boost/boost/math/distributions/kolmogorov_smirnov.hpp

   1 // Kolmogorov-Smirnov 1st order asymptotic distribution
   2 // Copyright Evan Miller 2020
   3 //
   4 // Use, modification and distribution are subject to the
   5 // Boost Software License, Version 1.0. (See accompanying file
   6 // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
   7 //
   8 // The Kolmogorov-Smirnov test in statistics compares two empirical distributions,
   9 // or an empirical distribution against any theoretical distribution. It makes
  10 // use of a specific distribution which doesn't have a formal name, but which
  11 // is often called the Kolmogorv-Smirnov distribution for lack of anything
  12 // better. This file implements the limiting form of this distribution, first
  13 // identified by Andrey Kolmogorov in
  14 //
  15 // Kolmogorov, A. (1933) “Sulla Determinazione Empirica di una Legge di
  16 // Distribuzione.” Giornale dell’ Istituto Italiano degli Attuari
  17 //
  18 // This limiting form of the CDF is a first-order Taylor expansion that is
  19 // easily implemented by the fourth Jacobi Theta function (setting z=0). The
  20 // PDF is then implemented here as a derivative of the Theta function. Note
  21 // that this derivative is with respect to x, which enters into \tau, and not
  22 // with respect to the z argument, which is always zero, and so the derivative
  23 // identities in DLMF 20.4 do not apply here.
  24 //
  25 // A higher order order expansion is possible, and was first outlined by
  26 //
  27 // Pelz W, Good IJ (1976). “Approximating the Lower Tail-Areas of the
  28 // Kolmogorov-Smirnov One-sample Statistic.” Journal of the Royal Statistical
  29 // Society B.
  30 //
  31 // The terms in this expansion get fairly complicated, and as far as I know the
  32 // Pelz-Good expansion is not used in any statistics software. Someone could
  33 // consider updating this implementation to use the Pelz-Good expansion in the
  34 // future, but the math gets considerably hairier with each additional term.
  35 //
  36 // A formula for an exact version of the Kolmogorov-Smirnov test is laid out in
  37 // Equation 2.4.4 of
  38 //
  39 // Durbin J (1973). “Distribution Theory for Tests Based on the Sample
  40 // Distribution Func- tion.” In SIAM CBMS-NSF Regional Conference Series in
  41 // Applied Mathematics. SIAM, Philadelphia, PA.
  42 //
  43 // which is available in book form from Amazon and others. This exact version
  44 // involves taking powers of large matrices. To do that right you need to
  45 // compute eigenvalues and eigenvectors, which are beyond the scope of Boost.
  46 // (Some recent work indicates the exact form can also be computed via FFT, see
  47 // https://cran.r-project.org/web/packages/KSgeneral/KSgeneral.pdf).
  48 //
  49 // Even if the CDF of the exact distribution could be computed using Boost
  50 // libraries (which would be cumbersome), the PDF would present another
  51 // difficulty. Therefore I am limiting this implementation to the asymptotic
  52 // form, even though the exact form has trivial values for certain specific
  53 // values of x and n. For more on trivial values see
  54 //
  55 // Ruben H, Gambino J (1982). “The Exact Distribution of Kolmogorov’s Statistic
  56 // Dn for n ≤ 10.” Annals of the Institute of Statistical Mathematics.
  57 //
  58 // For a good bibliography and overview of the various algorithms, including
  59 // both exact and asymptotic forms, see
  60 // https://www.jstatsoft.org/article/view/v039i11
  61 //
  62 // As for this implementation: the distribution is parameterized by n (number
  63 // of observations) in the spirit of chi-squared's degrees of freedom. It then
  64 // takes a single argument x. In terms of the Kolmogorov-Smirnov statistical
  65 // test, x represents the distribution of D_n, where D_n is the maximum
  66 // difference between the CDFs being compared, that is,
  67 //
  68 //   D_n = sup|F_n(x) - G(x)|
  69 //
  70 // In the exact distribution, x is confined to the support [0, 1], but in this
  71 // limiting approximation, we allow x to exceed unity (similar to how a normal
  72 // approximation always spills over any boundaries).
  73 //
  74 // As mentioned previously, the CDF is implemented using the \tau
  75 // parameterization of the fourth Jacobi Theta function as
  76 //
  77 // CDF=θ₄(0|2*x*x*n/pi)
  78 //
  79 // The PDF is a hand-coded derivative of that function. Actually, there are two
  80 // (independent) derivatives, as separate code paths are used for "small x"
  81 // (2*x*x*n < pi) and "large x", mirroring the separate code paths in the
  82 // Jacobi Theta implementation to achieve fast convergence. Quantiles are
  83 // computed using a Newton-Raphson iteration from an initial guess that I
  84 // arrived at by trial and error.
  85 //
  86 // The mean and variance are implemented using simple closed-form expressions.
  87 // Skewness and kurtosis use slightly more complicated closed-form expressions
  88 // that involve the zeta function. The mode is calculated at run-time by
  89 // maximizing the PDF. If you have an analytical solution for the mode, feel
  90 // free to plop it in.
  91 //
  92 // The CDF and PDF could almost certainly be re-implemented and sped up using a
  93 // polynomial or rational approximation, since the only meaningful argument is
  94 // x * sqrt(n). But that is left as an exercise for the next maintainer.
  95 //
  96 // In the future, the Pelz-Good approximation could be added. I suggest adding
  97 // a second parameter representing the order, e.g.
  98 //
  99 // kolmogorov_smirnov_dist<>(100) // N=100, order=1
 100 // kolmogorov_smirnov_dist<>(100, 1) // N=100, order=1, i.e. Kolmogorov's formula
 101 // kolmogorov_smirnov_dist<>(100, 4) // N=100, order=4, i.e. Pelz-Good formula
 102 //
 103 // The exact distribution could be added to the API with a special order
 104 // parameter (e.g. 0 or infinity), or a separate distribution type altogether
 105 // (e.g. kolmogorov_smirnov_exact_distribution).
 106 //
 107 #ifndef BOOST_MATH_DISTRIBUTIONS_KOLMOGOROV_SMIRNOV_HPP
 108 #define BOOST_MATH_DISTRIBUTIONS_KOLMOGOROV_SMIRNOV_HPP
 109
 110 #include <boost/math/distributions/fwd.hpp>
 111 #include <boost/math/distributions/complement.hpp>
 112 #include <boost/math/distributions/detail/common_error_handling.hpp>
 113 #include <boost/math/special_functions/jacobi_theta.hpp>
 114 #include <boost/math/tools/tuple.hpp>
 115 #include <boost/math/tools/roots.hpp> // Newton-Raphson
 116 #include <boost/math/tools/minima.hpp> // For the mode
 117
 118 namespace boost { namespace math {
 119
 120 namespace detail {
 121 template <class RealType>
 122 inline RealType kolmogorov_smirnov_quantile_guess(RealType p) {
 123     // Choose a starting point for the Newton-Raphson iteration
 124     if (p > 0.9)
 125         return RealType(1.8) - 5 * (1 - p);
 126     if (p < 0.3)
 127         return p + RealType(0.45);
 128     return p + RealType(0.3);
 129 }
 130
 131 // d/dk (theta2(0, 1/(2*k*k/M_PI))/sqrt(2*k*k*M_PI))
 132 template <class RealType, class Policy>
 133 RealType kolmogorov_smirnov_pdf_small_x(RealType x, RealType n, const Policy&) {
 134     BOOST_MATH_STD_USING
 135     RealType value = RealType(0), delta = RealType(0), last_delta = RealType(0);
 136     RealType eps = policies::get_epsilon<RealType, Policy>();
 137     int i = 0;
 138     RealType pi2 = constants::pi_sqr<RealType>();
 139     RealType x2n = x*x*n;
 140     if (x2n*x2n == 0.0) {
 141         return static_cast<RealType>(0);
 142     }
 143     while (1) {
 144         delta = exp(-RealType(i+0.5)*RealType(i+0.5)*pi2/(2*x2n)) * (RealType(i+0.5)*RealType(i+0.5)*pi2 - x2n);
 145
 146         if (delta == 0.0)
 147             break;
 148
 149         if (last_delta != 0.0 && fabs(delta/last_delta) < eps)
 150             break;
 151
 152         value += delta + delta;
 153         last_delta = delta;
 154         i++;
 155     }
 156
 157     return value * sqrt(n) * constants::root_half_pi<RealType>() / (x2n*x2n);
 158 }
 159
 160 // d/dx (theta4(0, 2*x*x*n/M_PI))
 161 template <class RealType, class Policy>
 162 inline RealType kolmogorov_smirnov_pdf_large_x(RealType x, RealType n, const Policy&) {
 163     BOOST_MATH_STD_USING
 164     RealType value = RealType(0), delta = RealType(0), last_delta = RealType(0);
 165     RealType eps = policies::get_epsilon<RealType, Policy>();
 166     int i = 1;
 167     while (1) {
 168         delta = 8*x*i*i*exp(-2*i*i*x*x*n);
 169
 170         if (delta == 0.0)
 171             break;
 172
 173         if (last_delta != 0.0 && fabs(delta / last_delta) < eps)
 174             break;
 175
 176         if (i%2 == 0)
 177             delta = -delta;
 178
 179         value += delta;
 180         last_delta = delta;
 181         i++;
 182     }
 183
 184     return value * n;
 185 }
 186
 187 }; // detail
 188
 189 template <class RealType = double, class Policy = policies::policy<> >
 190     class kolmogorov_smirnov_distribution
 191 {
 192     public:
 193         typedef RealType value_type;
 194         typedef Policy policy_type;
 195
 196         // Constructor
 197     kolmogorov_smirnov_distribution( RealType n ) : n_obs_(n)
 198     {
 199         RealType result;
 200         detail::check_df(
 201                 "boost::math::kolmogorov_smirnov_distribution<%1%>::kolmogorov_smirnov_distribution", n_obs_, &result, Policy());
 202     }
 203
 204     RealType number_of_observations()const
 205     {
 206         return n_obs_;
 207     }
 208
 209     private:
 210
 211     RealType n_obs_; // positive integer
 212 };
 213
 214 typedef kolmogorov_smirnov_distribution<double> kolmogorov_k; // Convenience typedef for double version.
 215
 216 namespace detail {
 217 template <class RealType, class Policy>
 218 struct kolmogorov_smirnov_quantile_functor
 219 {
 220   kolmogorov_smirnov_quantile_functor(const boost::math::kolmogorov_smirnov_distribution<RealType, Policy> dist, RealType const& p)
 221     : distribution(dist), prob(p)
 222   {
 223   }
 224
 225   boost::math::tuple<RealType, RealType> operator()(RealType const& x)
 226   {
 227     RealType fx = cdf(distribution, x) - prob;  // Difference cdf - value - to minimize.
 228     RealType dx = pdf(distribution, x); // pdf is 1st derivative.
 229     // return both function evaluation difference f(x) and 1st derivative f'(x).
 230     return boost::math::make_tuple(fx, dx);
 231   }
 232 private:
 233   const boost::math::kolmogorov_smirnov_distribution<RealType, Policy> distribution;
 234   RealType prob;
 235 };
 236
 237 template <class RealType, class Policy>
 238 struct kolmogorov_smirnov_complementary_quantile_functor
 239 {
 240   kolmogorov_smirnov_complementary_quantile_functor(const boost::math::kolmogorov_smirnov_distribution<RealType, Policy> dist, RealType const& p)
 241     : distribution(dist), prob(p)
 242   {
 243   }
 244
 245   boost::math::tuple<RealType, RealType> operator()(RealType const& x)
 246   {
 247     RealType fx = cdf(complement(distribution, x)) - prob;  // Difference cdf - value - to minimize.
 248     RealType dx = -pdf(distribution, x); // pdf is the negative of the derivative of (1-CDF)
 249     // return both function evaluation difference f(x) and 1st derivative f'(x).
 250     return boost::math::make_tuple(fx, dx);
 251   }
 252 private:
 253   const boost::math::kolmogorov_smirnov_distribution<RealType, Policy> distribution;
 254   RealType prob;
 255 };
 256
 257 template <class RealType, class Policy>
 258 struct kolmogorov_smirnov_negative_pdf_functor
 259 {
 260     RealType operator()(RealType const& x) {
 261         if (2*x*x < constants::pi<RealType>()) {
 262             return -kolmogorov_smirnov_pdf_small_x(x, static_cast<RealType>(1), Policy());
 263         }
 264         return -kolmogorov_smirnov_pdf_large_x(x, static_cast<RealType>(1), Policy());
 265     }
 266 };
 267 } // namespace detail
 268
 269 template <class RealType, class Policy>
 270 inline const std::pair<RealType, RealType> range(const kolmogorov_smirnov_distribution<RealType, Policy>& /*dist*/)
 271 { // Range of permissible values for random variable x.
 272    using boost::math::tools::max_value;
 273    return std::pair<RealType, RealType>(static_cast<RealType>(0), max_value<RealType>());
 274 }
 275
 276 template <class RealType, class Policy>
 277 inline const std::pair<RealType, RealType> support(const kolmogorov_smirnov_distribution<RealType, Policy>& /*dist*/)
 278 { // Range of supported values for random variable x.
 279    // This is range where cdf rises from 0 to 1, and outside it, the pdf is zero.
 280    // In the exact distribution, the upper limit would be 1.
 281    using boost::math::tools::max_value;
 282    return std::pair<RealType, RealType>(static_cast<RealType>(0), max_value<RealType>());
 283 }
 284
 285 template <class RealType, class Policy>
 286 inline RealType pdf(const kolmogorov_smirnov_distribution<RealType, Policy>& dist, const RealType& x)
 287 {
 288    BOOST_FPU_EXCEPTION_GUARD
 289    BOOST_MATH_STD_USING  // for ADL of std functions.
 290
 291    RealType n = dist.number_of_observations();
 292    RealType error_result;
 293    static const char* function = "boost::math::pdf(const kolmogorov_smirnov_distribution<%1%>&, %1%)";
 294    if(false == detail::check_x_not_NaN(function, x, &error_result, Policy()))
 295       return error_result;
 296
 297    if(false == detail::check_df(function, n, &error_result, Policy()))
 298       return error_result;
 299
 300    if (x < 0 || !(boost::math::isfinite)(x))
 301    {
 302       return policies::raise_domain_error<RealType>(
 303          function, "Kolmogorov-Smirnov parameter was %1%, but must be > 0 !", x, Policy());
 304    }
 305
 306    if (2*x*x*n < constants::pi<RealType>()) {
 307        return detail::kolmogorov_smirnov_pdf_small_x(x, n, Policy());
 308    }
 309
 310    return detail::kolmogorov_smirnov_pdf_large_x(x, n, Policy());
 311 } // pdf
 312
 313 template <class RealType, class Policy>
 314 inline RealType cdf(const kolmogorov_smirnov_distribution<RealType, Policy>& dist, const RealType& x)
 315 {
 316     BOOST_MATH_STD_USING // for ADL of std function exp.
 317    static const char* function = "boost::math::cdf(const kolmogorov_smirnov_distribution<%1%>&, %1%)";
 318    RealType error_result;
 319    RealType n = dist.number_of_observations();
 320    if(false == detail::check_x_not_NaN(function, x, &error_result, Policy()))
 321       return error_result;
 322    if(false == detail::check_df(function, n, &error_result, Policy()))
 323       return error_result;
 324    if((x < 0) || !(boost::math::isfinite)(x)) {
 325       return policies::raise_domain_error<RealType>(
 326          function, "Random variable parameter was %1%, but must be between > 0 !", x, Policy());
 327    }
 328
 329    if (x*x*n == 0)
 330        return 0;
 331
 332    return jacobi_theta4tau(RealType(0), 2*x*x*n/constants::pi<RealType>(), Policy());
 333 } // cdf
 334
 335 template <class RealType, class Policy>
 336 inline RealType cdf(const complemented2_type<kolmogorov_smirnov_distribution<RealType, Policy>, RealType>& c) {
 337     BOOST_MATH_STD_USING // for ADL of std function exp.
 338     RealType x = c.param;
 339    static const char* function = "boost::math::cdf(const complemented2_type<const kolmogorov_smirnov_distribution<%1%>&, %1%>)";
 340    RealType error_result;
 341    kolmogorov_smirnov_distribution<RealType, Policy> const& dist = c.dist;
 342    RealType n = dist.number_of_observations();
 343
 344    if(false == detail::check_x_not_NaN(function, x, &error_result, Policy()))
 345       return error_result;
 346    if(false == detail::check_df(function, n, &error_result, Policy()))
 347       return error_result;
 348
 349    if((x < 0) || !(boost::math::isfinite)(x))
 350       return policies::raise_domain_error<RealType>(
 351          function, "Random variable parameter was %1%, but must be between > 0 !", x, Policy());
 352
 353    if (x*x*n == 0)
 354        return 1;
 355
 356    if (2*x*x*n > constants::pi<RealType>())
 357        return -jacobi_theta4m1tau(RealType(0), 2*x*x*n/constants::pi<RealType>(), Policy());
 358
 359    return RealType(1) - jacobi_theta4tau(RealType(0), 2*x*x*n/constants::pi<RealType>(), Policy());
 360 } // cdf (complemented)
 361
 362 template <class RealType, class Policy>
 363 inline RealType quantile(const kolmogorov_smirnov_distribution<RealType, Policy>& dist, const RealType& p)
 364 {
 365     BOOST_MATH_STD_USING
 366    static const char* function = "boost::math::quantile(const kolmogorov_smirnov_distribution<%1%>&, %1%)";
 367    // Error check:
 368    RealType error_result;
 369    RealType n = dist.number_of_observations();
 370    if(false == detail::check_probability(function, p, &error_result, Policy()))
 371       return error_result;
 372    if(false == detail::check_df(function, n, &error_result, Policy()))
 373       return error_result;
 374
 375    RealType k = detail::kolmogorov_smirnov_quantile_guess(p) / sqrt(n);
 376    const int get_digits = policies::digits<RealType, Policy>();// get digits from policy,
 377    boost::uintmax_t m = policies::get_max_root_iterations<Policy>(); // and max iterations.
 378
 379    return tools::newton_raphson_iterate(detail::kolmogorov_smirnov_quantile_functor<RealType, Policy>(dist, p),
 380            k, RealType(0), boost::math::tools::max_value<RealType>(), get_digits, m);
 381 } // quantile
 382
 383 template <class RealType, class Policy>
 384 inline RealType quantile(const complemented2_type<kolmogorov_smirnov_distribution<RealType, Policy>, RealType>& c) {
 385     BOOST_MATH_STD_USING
 386    static const char* function = "boost::math::quantile(const kolmogorov_smirnov_distribution<%1%>&, %1%)";
 387    kolmogorov_smirnov_distribution<RealType, Policy> const& dist = c.dist;
 388    RealType n = dist.number_of_observations();
 389    // Error check:
 390    RealType error_result;
 391    RealType p = c.param;
 392
 393    if(false == detail::check_probability(function, p, &error_result, Policy()))
 394       return error_result;
 395    if(false == detail::check_df(function, n, &error_result, Policy()))
 396       return error_result;
 397
 398    RealType k = detail::kolmogorov_smirnov_quantile_guess(RealType(1-p)) / sqrt(n);
 399
 400    const int get_digits = policies::digits<RealType, Policy>();// get digits from policy,
 401    boost::uintmax_t m = policies::get_max_root_iterations<Policy>(); // and max iterations.
 402
 403    return tools::newton_raphson_iterate(
 404            detail::kolmogorov_smirnov_complementary_quantile_functor<RealType, Policy>(dist, p),
 405            k, RealType(0), boost::math::tools::max_value<RealType>(), get_digits, m);
 406 } // quantile (complemented)
 407
 408 template <class RealType, class Policy>
 409 inline RealType mode(const kolmogorov_smirnov_distribution<RealType, Policy>& dist)
 410 {
 411     BOOST_MATH_STD_USING
 412    static const char* function = "boost::math::mode(const kolmogorov_smirnov_distribution<%1%>&)";
 413    RealType n = dist.number_of_observations();
 414    RealType error_result;
 415    if(false == detail::check_df(function, n, &error_result, Policy()))
 416       return error_result;
 417
 418     std::pair<RealType, RealType> r = boost::math::tools::brent_find_minima(
 419             detail::kolmogorov_smirnov_negative_pdf_functor<RealType, Policy>(),
 420             static_cast<RealType>(0), static_cast<RealType>(1), policies::digits<RealType, Policy>());
 421     return r.first / sqrt(n);
 422 }
 423
 424 // Mean and variance come directly from
 425 // https://www.jstatsoft.org/article/view/v008i18 Section 3
 426 template <class RealType, class Policy>
 427 inline RealType mean(const kolmogorov_smirnov_distribution<RealType, Policy>& dist)
 428 {
 429     BOOST_MATH_STD_USING
 430    static const char* function = "boost::math::mean(const kolmogorov_smirnov_distribution<%1%>&)";
 431     RealType n = dist.number_of_observations();
 432     RealType error_result;
 433     if(false == detail::check_df(function, n, &error_result, Policy()))
 434         return error_result;
 435     return constants::root_half_pi<RealType>() * constants::ln_two<RealType>() / sqrt(n);
 436 }
 437
 438 template <class RealType, class Policy>
 439 inline RealType variance(const kolmogorov_smirnov_distribution<RealType, Policy>& dist)
 440 {
 441    static const char* function = "boost::math::variance(const kolmogorov_smirnov_distribution<%1%>&)";
 442     RealType n = dist.number_of_observations();
 443     RealType error_result;
 444     if(false == detail::check_df(function, n, &error_result, Policy()))
 445         return error_result;
 446     return (constants::pi_sqr_div_six<RealType>()
 447             - constants::pi<RealType>() * constants::ln_two<RealType>() * constants::ln_two<RealType>()) / (2*n);
 448 }
 449
 450 // Skewness and kurtosis come from integrating the PDF
 451 // The alternating series pops out a Dirichlet eta function which is related to the zeta function
 452 template <class RealType, class Policy>
 453 inline RealType skewness(const kolmogorov_smirnov_distribution<RealType, Policy>& dist)
 454 {
 455     BOOST_MATH_STD_USING
 456    static const char* function = "boost::math::skewness(const kolmogorov_smirnov_distribution<%1%>&)";
 457     RealType n = dist.number_of_observations();
 458     RealType error_result;
 459     if(false == detail::check_df(function, n, &error_result, Policy()))
 460         return error_result;
 461     RealType ex3 = RealType(0.5625) * constants::root_half_pi<RealType>() * constants::zeta_three<RealType>() / n / sqrt(n);
 462     RealType mean = boost::math::mean(dist);
 463     RealType var = boost::math::variance(dist);
 464     return (ex3 - 3 * mean * var - mean * mean * mean) / var / sqrt(var);
 465 }
 466
 467 template <class RealType, class Policy>
 468 inline RealType kurtosis(const kolmogorov_smirnov_distribution<RealType, Policy>& dist)
 469 {
 470     BOOST_MATH_STD_USING
 471    static const char* function = "boost::math::kurtosis(const kolmogorov_smirnov_distribution<%1%>&)";
 472     RealType n = dist.number_of_observations();
 473     RealType error_result;
 474     if(false == detail::check_df(function, n, &error_result, Policy()))
 475         return error_result;
 476     RealType ex4 = 7 * constants::pi_sqr_div_six<RealType>() * constants::pi_sqr_div_six<RealType>() / 20 / n / n;
 477     RealType mean = boost::math::mean(dist);
 478     RealType var = boost::math::variance(dist);
 479     RealType skew = boost::math::skewness(dist);
 480     return (ex4 - 4 * mean * skew * var * sqrt(var) - 6 * mean * mean * var - mean * mean * mean * mean) / var / var;
 481 }
 482
 483 template <class RealType, class Policy>
 484 inline RealType kurtosis_excess(const kolmogorov_smirnov_distribution<RealType, Policy>& dist)
 485 {
 486    static const char* function = "boost::math::kurtosis_excess(const kolmogorov_smirnov_distribution<%1%>&)";
 487     RealType n = dist.number_of_observations();
 488     RealType error_result;
 489     if(false == detail::check_df(function, n, &error_result, Policy()))
 490         return error_result;
 491     return kurtosis(dist) - 3;
 492 }
 493 }}
 494 #endif