// ceph/src/boost/libs/multiprecision/example/exercise_threading_log_agm.cpp
// (Boost.Multiprecision example as carried in the ceph source tree;
//  imported with the "update ceph source to reef 18.1.2" change.)
1 ///////////////////////////////////////////////////////////////////////////////
2 // Copyright Christopher Kormanyos 2020 - 2021.
3 // Distributed under the Boost Software License, Version 1.0.
4 // (See accompanying file LICENSE_1_0.txt or copy at
5 // http://www.boost.org/LICENSE_1_0.txt)
6 //
7
8 // This example exercises Boost.Multiprecision in concurrent
9 // multi-threaded environments. To do so a loop involving
10 // non-trivial calculations of numerous function values
11 // has been set up within both concurrent as well as
12 // sequential running environments. In particular,
13 // this example uses an AGM method to do a "from the ground up"
14 // calculation of logarithms. The logarithm functions values
15 // are compared with the values from Boost.Multiprecision's
16 // specific log functions for the relevant backends.
17 // The log GM here is not optimized or intended for
18 // high-performance work, but can be taken as an
19 // interesting example of an AGM iteration if helpful.
20
21 // This example has been initially motivated in part
22 // by discussions in:
23 // https://github.com/boostorg/multiprecision/pull/211
24
25 // We find the following performance data here:
26 // https://github.com/boostorg/multiprecision/pull/213
27 //
28 // cpp_dec_float:
29 // result_is_ok_concurrent: true, calculation_time_concurrent: 18.1s
30 // result_is_ok_sequential: true, calculation_time_sequential: 48.5s
31 //
32 // cpp_bin_float:
33 // result_is_ok_concurrent: true, calculation_time_concurrent: 18.7s
34 // result_is_ok_sequential: true, calculation_time_sequential: 50.4s
35 //
36 // gmp_float:
37 // result_is_ok_concurrent: true, calculation_time_concurrent: 3.3s
38 // result_is_ok_sequential: true, calculation_time_sequential: 12.4s
39 //
40 // mpfr_float:
41 // result_is_ok_concurrent: true, calculation_time_concurrent: 0.6s
42 // result_is_ok_sequential: true, calculation_time_sequential: 1.9s
43
#include <algorithm>
#include <array>
#include <atomic>
#include <chrono>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <limits>
#include <thread>
#include <vector>

#include <boost/math/constants/constants.hpp>
#include <boost/math/special_functions/prime.hpp>
56
57 #define BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_CPP_DEC_FLOAT 101
58 #define BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_GMP_FLOAT 102
59 #define BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_CPP_BIN_FLOAT 103
60 #define BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_MPFR_FLOAT 104
61
62 #if !defined(BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_TYPE)
63 #define BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_TYPE BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_CPP_DEC_FLOAT
64 //#define BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_TYPE BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_CPP_BIN_FLOAT
65 //#define BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_TYPE BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_GMP_FLOAT
66 //#define BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_TYPE BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_MPFR_FLOAT
67 #endif
68
69 #if (BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_TYPE == BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_CPP_DEC_FLOAT)
70 #include <boost/multiprecision/cpp_dec_float.hpp>
71
72 using big_float_type = boost::multiprecision::number<boost::multiprecision::cpp_dec_float<501>,
73 boost::multiprecision::et_off>;
74
75 #elif (BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_TYPE == BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_CPP_BIN_FLOAT)
76 #include <boost/multiprecision/cpp_bin_float.hpp>
77
78 using big_float_type = boost::multiprecision::number<boost::multiprecision::cpp_bin_float<501>,
79 boost::multiprecision::et_off>;
80
81 #elif (BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_TYPE == BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_GMP_FLOAT)
82 #include <boost/multiprecision/gmp.hpp>
83
84 using big_float_type = boost::multiprecision::number<boost::multiprecision::gmp_float<501>,
85 boost::multiprecision::et_off>;
86
87 #elif (BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_TYPE == BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_MPFR_FLOAT)
88 #include <boost/multiprecision/mpfr.hpp>
89
90 using big_float_type = boost::multiprecision::number<boost::multiprecision::mpfr_float_backend<501>,
91 boost::multiprecision::et_off>;
92
93 #else
94 #error BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_TYPE is undefined.
95 #endif
96
97 namespace boost { namespace multiprecision { namespace exercise_threading {
98
99 namespace detail {
100
namespace my_concurrency {
template<typename index_type,
         typename callable_function_type>
void parallel_for(index_type start,
                  index_type end,
                  callable_function_type parallel_function)
{
  // Run parallel_function(i) for every i in the half-open
  // range [start, end), distributing the iterations over a
  // small pool of threads and joining them all before return.

  // Estimate the number of threads available.
  static const unsigned int number_of_threads_hint =
    std::thread::hardware_concurrency();

  static const unsigned int number_of_threads_total =
    ((number_of_threads_hint == 0U) ? 4U : number_of_threads_hint);

  // Use roughly 7/8 of the available cores, leaving a little
  // headroom for the rest of the system.
  static const unsigned int number_of_threads = number_of_threads_total - (number_of_threads_total / 8U);

  std::cout << "Executing with " << number_of_threads << " threads" << std::endl;

  // Set the size of a slice for the range functions.
  // The range is half-open, so it contains exactly (end - start) indices.
  index_type n = index_type(end - start);

  index_type slice =
    static_cast<index_type>(std::round(n / static_cast<float>(number_of_threads)));

  slice = (std::max)(slice, index_type(1));

  // Inner loop: process the half-open sub-range [index_lo, index_hi).
  auto launch_range =
    [&parallel_function](index_type index_lo, index_type index_hi)
    {
      for(index_type i = index_lo; i < index_hi; ++i)
      {
        parallel_function(i);
      }
    };

  // Create the thread pool and launch the jobs.
  std::vector<std::thread> pool;

  pool.reserve(number_of_threads);

  index_type i1 = start;
  index_type i2 = (std::min)(index_type(start + slice), end);

  for(index_type i = 0U; ((index_type(i + index_type(1U)) < number_of_threads) && (i1 < end)); ++i)
  {
    pool.emplace_back(launch_range, i1, i2);

    i1 = i2;

    i2 = (std::min)(index_type(i2 + slice), end);
  }

  // The final thread (if any work remains) takes the tail of the range,
  // so every index in [start, end) is covered exactly once.
  if(i1 < end)
  {
    pool.emplace_back(launch_range, i1, end);
  }

  // Wait for the jobs to finish.
  for(std::thread& thread_in_pool : pool)
  {
    if(thread_in_pool.joinable())
    {
      thread_in_pool.join();
    }
  }
}
} // namespace my_concurrency
170
template<typename FloatingPointType,
         typename UnsignedIntegralType>
FloatingPointType pown(const FloatingPointType& b, const UnsignedIntegralType& p)
{
  // Compute (b ^ p) via right-to-left binary exponentiation
  // (square-and-multiply over the bits of the exponent).

  using floating_type = FloatingPointType;
  using exponent_type = UnsignedIntegralType;

  // Handle the small exponents 0, 1 and 2 directly.
  if(p == exponent_type(0U))
  {
    return floating_type(1U);
  }

  if(p == exponent_type(1U))
  {
    return b;
  }

  if(p == exponent_type(2U))
  {
    floating_type b_squared(b);
    b_squared *= b;
    return b_squared;
  }

  // General case: accumulate b^(2^k) for each set bit k of p.
  floating_type accumulator(1U);
  floating_type power_of_b(b);

  exponent_type remaining_bits(p);

  while(remaining_bits != exponent_type(0U))
  {
    // Only the lowest bit matters here, so narrowing is safe.
    const bool low_bit_is_set = ((static_cast<unsigned>(remaining_bits) & 1U) != 0U);

    if(low_bit_is_set)
    {
      accumulator *= power_of_b;
    }

    power_of_b *= power_of_b;

    remaining_bits >>= 1U;
  }

  return accumulator;
}
204
205 const std::vector<std::uint32_t>& primes()
206 {
207 static std::vector<std::uint32_t> my_primes;
208
209 if(my_primes.empty())
210 {
211 my_primes.resize(10000U);
212
213 // Get exactly 10,000 primes.
214 for(std::size_t i = 0U; i < my_primes.size(); ++i)
215 {
216 my_primes[i] = boost::math::prime((unsigned int) i);
217 }
218 }
219
220 return my_primes;
221 }
222
223 } // namespace detail
224
template<typename FloatingPointType>
FloatingPointType log(const FloatingPointType& x)
{
  // Use an AGM method to compute the logarithm of x.
  //
  // The identity used below is
  //   log(x) ~= {pi / [2 * AGM(1, 4 / (x * 2^m))]} - (m * ln2),
  // which becomes accurate when (x * 2^m) is sufficiently large.
  // This routine is illustrative, not performance-optimized.

  // For values less than 1 invert the argument and
  // remember (in this case) to negate the result below,
  // since log(1/x) = -log(x).
  const bool b_negate = (x < 1);

  const FloatingPointType xx = ((b_negate == false) ? x : 1 / x);

  // Set a0 = 1
  // Set b0 = 4 / (x * 2^m) = 1 / (x * 2^(m - 2))

  FloatingPointType ak(1U);

  // Choose the shift m from the target precision (the 1.67 factor
  // converts digits10 towards a bit count; presumably ~log2(10)/2 —
  // TODO(review): confirm the intended scaling), reduced by the
  // number of binary digits already present in xx.
  const float n_times_factor = static_cast<float>(static_cast<float>(std::numeric_limits<FloatingPointType>::digits10) * 1.67F);
  const float lgx_over_lg2 = std::log(static_cast<float>(xx)) / std::log(2.0F);

  std::int32_t m = static_cast<std::int32_t>(n_times_factor - lgx_over_lg2);

  // Ensure that the resulting power is non-negative.
  // Also enforce that m >= 8.
  m = (std::max)(m, static_cast<std::int32_t>(8));

  // bk = 4 / (xx * 2^m), formed in steps to stay in the big type.
  FloatingPointType bk = detail::pown(FloatingPointType(2), static_cast<std::uint32_t>(m));

  bk *= xx;
  bk = 4 / bk;

  FloatingPointType ak_tmp(0U);

  using std::sqrt;

  // Determine the requested precision of the upcoming iteration in units of digits10.
  // Convergence is quadratic, so agreement to half the digits on one pass
  // implies (nearly) full agreement after one more pass.
  const FloatingPointType target_tolerance = sqrt(std::numeric_limits<FloatingPointType>::epsilon()) / 100;

  // AGM loop: a_{k+1} = (a_k + b_k)/2, b_{k+1} = sqrt(a_k * b_k).
  // The iteration cap of 64 is a safety bound; the break below fires first.
  for(std::int32_t k = static_cast<std::int32_t>(0); k < static_cast<std::int32_t>(64); ++k)
  {
    using std::fabs;

    // Check for the number of significant digits to be
    // at least half of the requested digits. If at least
    // half of the requested digits have been achieved,
    // then break after the upcoming iteration.
    const bool break_after_this_iteration = (   (k > static_cast<std::int32_t>(4))
                                             && (fabs(1 - fabs(ak / bk)) < target_tolerance));

    ak_tmp = ak;
    ak += bk;
    ak /= 2;

    // Break *after* updating ak so that ak holds the final mean,
    // but before touching bk (bk is no longer needed).
    if(break_after_this_iteration)
    {
      break;
    }

    bk *= ak_tmp;
    bk = sqrt(bk);
  }

  // We are now finished with the AGM iteration for log(x).

  // Compute log(x) = {pi / [2 * AGM(1, 4 / 2^m)]} - (m * ln2)
  // Note at this time that (ak = bk) = AGM(...)

  // Retrieve the value of pi, divide by (2 * a) and subtract (m * ln2).
  const FloatingPointType result =
    boost::math::constants::pi<FloatingPointType>() / (ak * 2)
    - (boost::math::constants::ln_two<FloatingPointType>() * m);

  // Undo the initial inversion for arguments below 1.
  return ((b_negate == true) ? -result : result);
}
298
299 } } } // namespace boost::multiprecision::exercise_threading
300
301 template<typename FloatingPointType>
302 bool log_agm_concurrent(float& calculation_time)
303 {
304 const std::size_t count = boost::multiprecision::exercise_threading::detail::primes().size();
305
306 std::vector<FloatingPointType> log_results(count);
307 std::vector<FloatingPointType> log_control(count);
308
309 std::atomic_flag log_agm_lock = ATOMIC_FLAG_INIT;
310
311 std::size_t concurrent_log_agm_count = 0U;
312
313 const std::clock_t start = std::clock();
314
315 boost::multiprecision::exercise_threading::detail::my_concurrency::parallel_for
316 (
317 std::size_t(0U),
318 log_results.size(),
319 [&log_results, &log_control, &concurrent_log_agm_count, &log_agm_lock](std::size_t i)
320 {
321 while(log_agm_lock.test_and_set()) { ; }
322 const FloatingPointType dx = (FloatingPointType(1U) / (boost::multiprecision::exercise_threading::detail::primes()[i]));
323 log_agm_lock.clear();
324
325 const FloatingPointType x = boost::math::constants::catalan<FloatingPointType>() + dx;
326
327 const FloatingPointType lr = boost::multiprecision::exercise_threading::log(x);
328 const FloatingPointType lc = boost::multiprecision::log(x);
329
330 while(log_agm_lock.test_and_set()) { ; }
331
332 log_results[i] = lr;
333 log_control[i] = lc;
334
335 ++concurrent_log_agm_count;
336
337 if((concurrent_log_agm_count % 100U) == 0U)
338 {
339 std::cout << "log agm concurrent at index "
340 << concurrent_log_agm_count
341 << " of "
342 << log_results.size()
343 << ". Total processed so far: "
344 << std::fixed
345 << std::setprecision(1)
346 << (100.0F * float(concurrent_log_agm_count)) / float(log_results.size())
347 << "%."
348 << "\r";
349 }
350
351 log_agm_lock.clear();
352 }
353 );
354
355 calculation_time = static_cast<float>(std::clock() - start) / static_cast<float>(CLOCKS_PER_SEC);
356
357 std::cout << std::endl;
358
359 std::cout << "Checking results concurrent: ";
360
361 bool result_is_ok = true;
362
363 const FloatingPointType tol = std::numeric_limits<FloatingPointType>::epsilon() * 1000000U;
364
365 for(std::size_t i = 0U; i < log_results.size(); ++i)
366 {
367 using std::fabs;
368
369 const FloatingPointType close_fraction = fabs(1 - (log_results[i] / log_control[i]));
370
371 result_is_ok &= (close_fraction < tol);
372 }
373
374 std::cout << std::boolalpha << result_is_ok << std::endl;
375
376 return result_is_ok;
377 }
378
379 template<typename FloatingPointType>
380 bool log_agm_sequential(float& calculation_time)
381 {
382 const std::size_t count = boost::multiprecision::exercise_threading::detail::primes().size();
383
384 std::vector<FloatingPointType> log_results(count);
385 std::vector<FloatingPointType> log_control(count);
386
387 const std::clock_t start = std::clock();
388
389 for(std::size_t i = 0U; i < log_results.size(); ++i)
390 {
391 const std::size_t sequential_log_agm_count = i + 1U;
392
393 const FloatingPointType dx = (FloatingPointType(1U) / (boost::multiprecision::exercise_threading::detail::primes()[i]));
394 const FloatingPointType x = boost::math::constants::catalan<FloatingPointType>() + dx;
395
396 log_results[i] = boost::multiprecision::exercise_threading::log(x);
397 log_control[i] = boost::multiprecision::log(x);
398
399 if((sequential_log_agm_count % 100U) == 0U)
400 {
401 std::cout << "log agm sequential at index "
402 << sequential_log_agm_count
403 << " of "
404 << log_results.size()
405 << ". Total processed so far: "
406 << std::fixed
407 << std::setprecision(1)
408 << (100.0F * float(sequential_log_agm_count)) / float(log_results.size())
409 << "%."
410 << "\r";
411 }
412 }
413
414 calculation_time = static_cast<float>(std::clock() - start) / static_cast<float>(CLOCKS_PER_SEC);
415
416 std::cout << std::endl;
417
418 std::cout << "Checking results sequential: ";
419
420 bool result_is_ok = true;
421
422 const FloatingPointType tol = std::numeric_limits<FloatingPointType>::epsilon() * 1000000U;
423
424 for(std::size_t i = 0U; i < log_results.size(); ++i)
425 {
426 using std::fabs;
427
428 const FloatingPointType close_fraction = fabs(1 - (log_results[i] / log_control[i]));
429
430 result_is_ok &= (close_fraction < tol);
431 }
432
433 std::cout << std::boolalpha << result_is_ok << std::endl;
434
435 return result_is_ok;
436 }
437
438 int main()
439 {
440 std::cout << "Calculating "
441 << boost::multiprecision::exercise_threading::detail::primes().size()
442 << " primes"
443 << std::endl;
444
445 float calculation_time_concurrent;
446 const bool result_is_ok_concurrent = log_agm_concurrent<big_float_type>(calculation_time_concurrent);
447
448 float calculation_time_sequential;
449 const bool result_is_ok_sequential = log_agm_sequential<big_float_type>(calculation_time_sequential);
450
451 std::cout << std::endl;
452
453 std::cout << "result_is_ok_concurrent: "
454 << std::boolalpha
455 << result_is_ok_concurrent
456 << ", calculation_time_concurrent: "
457 << std::fixed
458 << std::setprecision(1)
459 << calculation_time_concurrent
460 << "s"
461 << std::endl;
462
463 std::cout << "result_is_ok_sequential: "
464 << std::boolalpha
465 << result_is_ok_sequential
466 << ", calculation_time_sequential: "
467 << std::fixed
468 << std::setprecision(1)
469 << calculation_time_sequential
470 << "s"
471 << std::endl;
472 }