// git.proxmox.com Git - ceph.git/blob - ceph/src/boost/libs/multiprecision/example/exercise_threading_log_agm.cpp
// import quincy beta 17.1.0
// [ceph.git] / ceph / src / boost / libs / multiprecision / example / exercise_threading_log_agm.cpp
1 ///////////////////////////////////////////////////////////////////////////////
2 // Copyright Christopher Kormanyos 2020.
3 // Distributed under the Boost Software License, Version 1.0.
4 // (See accompanying file LICENSE_1_0.txt or copy at
5 // http://www.boost.org/LICENSE_1_0.txt)
6 //
7
8 // This example exercises Boost.Multiprecision in concurrent
9 // multi-threaded environments. To do so a loop involving
10 // non-trivial calculations of numerous function values
11 // has been set up within both concurrent as well as
12 // sequential running environments. In particular,
13 // this example uses an AGM method to do a "from the ground up"
14 // calculation of logarithms. The logarithm functions values
15 // are compared with the values from Boost.Multiprecision's
16 // specific log functions for the relevant backends.
17 // The log GM here is not optimized or intended for
18 // high-performance work, but can be taken as an
19 // interesting example of an AGM iteration if helpful.
20
21 // This example has been initially motivated in part
22 // by discussions in:
23 // https://github.com/boostorg/multiprecision/pull/211
24
25 // We find the following performance data here:
26 // https://github.com/boostorg/multiprecision/pull/213
27 //
28 // cpp_dec_float:
29 // result_is_ok_concurrent: true, calculation_time_concurrent: 18.1s
30 // result_is_ok_sequential: true, calculation_time_sequential: 48.5s
31 //
32 // cpp_bin_float:
33 // result_is_ok_concurrent: true, calculation_time_concurrent: 18.7s
34 // result_is_ok_sequential: true, calculation_time_sequential: 50.4s
35 //
36 // gmp_float:
37 // result_is_ok_concurrent: true, calculation_time_concurrent: 3.3s
38 // result_is_ok_sequential: true, calculation_time_sequential: 12.4s
39 //
40 // mpfr_float:
41 // result_is_ok_concurrent: true, calculation_time_concurrent: 0.6s
42 // result_is_ok_sequential: true, calculation_time_sequential: 1.9s
43
#include <algorithm>
#include <array>
#include <atomic>
#include <cmath>
#include <cstddef>
#include <cstdint>
#include <ctime>
#include <iomanip>
#include <iostream>
#include <limits>
#include <thread>
#include <vector>

#include <boost/math/constants/constants.hpp>
#include <boost/math/special_functions/prime.hpp>
56
57 #define BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_CPP_DEC_FLOAT 101
58 #define BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_GMP_FLOAT 102
59 #define BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_CPP_BIN_FLOAT 103
60 #define BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_MPFR_FLOAT 104
61
62 #if !defined(BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_TYPE)
63 #define BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_TYPE BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_CPP_DEC_FLOAT
64 //#define BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_TYPE BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_CPP_BIN_FLOAT
65 //#define BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_TYPE BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_GMP_FLOAT
66 //#define BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_TYPE BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_MPFR_FLOAT
67 #endif
68
69 #if (BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_TYPE == BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_CPP_DEC_FLOAT)
70 #include <boost/multiprecision/cpp_dec_float.hpp>
71
72 using big_float_type = boost::multiprecision::number<boost::multiprecision::cpp_dec_float<501>,
73 boost::multiprecision::et_off>;
74
75 #elif (BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_TYPE == BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_CPP_BIN_FLOAT)
76 #include <boost/multiprecision/cpp_bin_float.hpp>
77
78 using big_float_type = boost::multiprecision::number<boost::multiprecision::cpp_bin_float<501>,
79 boost::multiprecision::et_off>;
80
81 #elif (BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_TYPE == BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_GMP_FLOAT)
82 #include <boost/multiprecision/gmp.hpp>
83
84 using big_float_type = boost::multiprecision::number<boost::multiprecision::gmp_float<501>,
85 boost::multiprecision::et_off>;
86
87 #elif (BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_TYPE == BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_MPFR_FLOAT)
88 #include <boost/multiprecision/mpfr.hpp>
89
90 using big_float_type = boost::multiprecision::number<boost::multiprecision::mpfr_float_backend<501>,
91 boost::multiprecision::et_off>;
92
93 #else
94 #error BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_TYPE is undefined.
95 #endif
96
97 namespace boost { namespace multiprecision { namespace exercise_threading {
98
99 namespace detail {
100
namespace my_concurrency {
// Distribute the half-open index range [start, end) over a small pool
// of std::thread workers, invoking parallel_function(i) exactly once
// for every index. The call blocks until all workers have joined.
//
// parallel_function must be safe to invoke concurrently from multiple
// threads (any shared state it touches needs its own synchronization).
template<typename index_type,
         typename callable_function_type>
void parallel_for(index_type start,
                  index_type end,
                  callable_function_type parallel_function)
{
  // Estimate the number of threads available.
  static const unsigned int number_of_threads_hint =
    std::thread::hardware_concurrency();

  // hardware_concurrency() may legitimately return 0; fall back to 4.
  static const unsigned int number_of_threads_total =
    ((number_of_threads_hint == 0U) ? 4U : number_of_threads_hint);

  // Use 7/8 of the available cores, keeping roughly one eighth in reserve.
  static const unsigned int number_of_threads = number_of_threads_total - (number_of_threads_total / 8U);

  std::cout << "Executing with " << number_of_threads << " threads" << std::endl;

  // Set the size of a slice for the range functions.
  // The range is half-open, so it contains exactly (end - start) indices.
  index_type n = index_type(end - start);

  index_type slice =
    static_cast<index_type>(std::round(n / static_cast<float>(number_of_threads)));

  slice = (std::max)(slice, index_type(1));

  // Inner loop: each worker runs the callable over its own sub-range.
  auto launch_range =
    [&parallel_function](index_type index_lo, index_type index_hi)
    {
      for(index_type i = index_lo; i < index_hi; ++i)
      {
        parallel_function(i);
      }
    };

  // Create the thread pool and launch the jobs.
  std::vector<std::thread> pool;

  pool.reserve(number_of_threads);

  index_type i1 = start;
  index_type i2 = (std::min)(index_type(start + slice), end);

  // Launch at most (number_of_threads - 1) sliced jobs here; the final
  // remainder slice (if any) is launched below so that rounding in the
  // slice size can never drop indices.
  for(index_type i = 0U; ((index_type(i + index_type(1U)) < number_of_threads) && (i1 < end)); ++i)
  {
    pool.emplace_back(launch_range, i1, i2);

    i1 = i2;

    i2 = (std::min)(index_type(i2 + slice), end);
  }

  // Pick up any remaining tail of the range.
  if(i1 < end)
  {
    pool.emplace_back(launch_range, i1, end);
  }

  // Wait for the jobs to finish.
  for(std::thread& thread_in_pool : pool)
  {
    if(thread_in_pool.joinable())
    {
      thread_in_pool.join();
    }
  }
}
} // namespace my_concurrency
170
// Compute (b ^ p) for a non-negative integral power p using binary
// exponentiation (square-and-multiply). Small powers (0, 1, 2) are
// handled directly without entering the bit loop.
template<typename FloatingPointType,
         typename UnsignedIntegralType>
FloatingPointType pown(const FloatingPointType& b, const UnsignedIntegralType& p)
{
  using floating_type = FloatingPointType;
  using exponent_type = UnsignedIntegralType;

  // Trivial exponents: b^0 = 1 and b^1 = b.
  if(p == exponent_type(0U))
  {
    return floating_type(1U);
  }

  if(p == exponent_type(1U))
  {
    return b;
  }

  // b^2 as a single in-place multiply.
  if(p == exponent_type(2U))
  {
    floating_type b_squared(b);
    b_squared *= b;
    return b_squared;
  }

  // General case: scan the bits of p from least to most significant,
  // squaring the running term each step and multiplying it into the
  // accumulator whenever the current bit is set.
  floating_type accumulator(1U);
  floating_type square_term(b);

  for(exponent_type remaining_bits(p); remaining_bits != exponent_type(0U); remaining_bits >>= 1U)
  {
    if((static_cast<unsigned>(remaining_bits) & 1U) != 0U)
    {
      accumulator *= square_term;
    }

    square_term *= square_term;
  }

  return accumulator;
}
204
205 const std::vector<std::uint32_t>& primes()
206 {
207 static const std::vector<std::uint32_t> my_primes =
208 []() -> std::vector<std::uint32_t>
209 {
210 std::vector<std::uint32_t> local_primes(10000U);
211
212 // Get exactly 10,000 primes.
213 for(auto i = 0U; i < local_primes.size(); ++i)
214 {
215 local_primes[i] = boost::math::prime(i);
216 }
217
218 return local_primes;
219 }();
220
221 return my_primes;
222 }
223
224 } // namespace detail
225
// Compute log(x) "from the ground up" with an AGM (arithmetic-geometric
// mean) iteration, using the identity
//   log(x) = pi / [2 * AGM(1, 4 / (x * 2^m))] - m * ln(2),
// where m is chosen large enough for the target precision.
// Not optimized for high performance; intended as a worked AGM example.
template<typename FloatingPointType>
FloatingPointType log(const FloatingPointType& x)
{
  // Use an AGM method to compute the logarithm of x.

  // For values less than 1 invert the argument and
  // remember (in this case) to negate the result below,
  // since log(1/x) = -log(x).
  const bool b_negate = (x < 1);

  const FloatingPointType xx = ((b_negate == false) ? x : 1 / x);

  // Set a0 = 1
  // Set b0 = 4 / (x * 2^m) = 1 / (x * 2^(m - 2))

  FloatingPointType ak(1U);

  // Choose m from the requested decimal precision (scaled by ~1.67 to
  // convert digits10 into the needed binary magnitude), reduced by the
  // base-2 size of the argument so that (x * 2^m) stays large enough.
  const float n_times_factor = static_cast<float>(static_cast<float>(std::numeric_limits<FloatingPointType>::digits10) * 1.67F);
  const float lgx_over_lg2 = std::log(static_cast<float>(xx)) / std::log(2.0F);

  std::int32_t m = static_cast<std::int32_t>(n_times_factor - lgx_over_lg2);

  // Ensure that the resulting power is non-negative.
  // Also enforce that m >= 8.
  m = (std::max)(m, static_cast<std::int32_t>(8));

  FloatingPointType bk = detail::pown(FloatingPointType(2), static_cast<std::uint32_t>(m));

  bk *= xx;
  bk = 4 / bk;

  FloatingPointType ak_tmp(0U);

  using std::sqrt;

  // Determine the requested precision of the upcoming iteration in units of digits10.
  // sqrt(epsilon)/100 targets slightly more than half of the full digits;
  // AGM convergence is quadratic, so one further iteration then suffices.
  const FloatingPointType target_tolerance = sqrt(std::numeric_limits<FloatingPointType>::epsilon()) / 100;

  // Hard iteration cap of 64 guards against non-convergence.
  for(std::int32_t k = static_cast<std::int32_t>(0); k < static_cast<std::int32_t>(64); ++k)
  {
    using std::fabs;

    // Check for the number of significant digits to be
    // at least half of the requested digits. If at least
    // half of the requested digits have been achieved,
    // then break after the upcoming iteration.
    const bool break_after_this_iteration = ( (k > static_cast<std::int32_t>(4))
                                             && (fabs(1 - fabs(ak / bk)) < target_tolerance));

    // AGM step: a_{k+1} = (a_k + b_k) / 2, b_{k+1} = sqrt(a_k * b_k).
    ak_tmp = ak;
    ak += bk;
    ak /= 2;

    if(break_after_this_iteration)
    {
      break;
    }

    bk *= ak_tmp;
    bk = sqrt(bk);
  }

  // We are now finished with the AGM iteration for log(x).

  // Compute log(x) = {pi / [2 * AGM(1, 4 / 2^m)]} - (m * ln2)
  // Note at this time that (ak = bk) = AGM(...)

  // Retrieve the value of pi, divide by (2 * a) and subtract (m * ln2).
  const FloatingPointType result =
    boost::math::constants::pi<FloatingPointType>() / (ak * 2)
    - (boost::math::constants::ln_two<FloatingPointType>() * m);

  // Undo the inversion performed for arguments below 1.
  return ((b_negate == true) ? -result : result);
}
299
300 } } } // namespace boost::multiprecision::exercise_threading
301
// Exercise the AGM logarithm concurrently: compute log(catalan + 1/p)
// for each of the first 10,000 primes p via parallel_for, compare each
// result against the backend's own log(), and report the agreement.
// On return, calculation_time holds the measured duration in seconds.
template<typename FloatingPointType>
bool log_agm_concurrent(float& calculation_time)
{
  const std::size_t count = boost::multiprecision::exercise_threading::detail::primes().size();

  std::vector<FloatingPointType> log_results(count);
  std::vector<FloatingPointType> log_control(count);

  // Spin-lock protecting the shared progress counter and std::cout below.
  std::atomic_flag log_agm_lock = ATOMIC_FLAG_INIT;

  std::size_t concurrent_log_agm_count = 0U;

  // NOTE(review): std::clock() measures CPU time on POSIX (summed over
  // threads) but wall time on some platforms -- confirm which is intended.
  const std::clock_t start = std::clock();

  boost::multiprecision::exercise_threading::detail::my_concurrency::parallel_for
  (
    std::size_t(0U),
    log_results.size(),
    [&log_results, &log_control, &concurrent_log_agm_count, &log_agm_lock](std::size_t i)
    {
      // Busy-wait until the lock is acquired, then update/report progress.
      while(log_agm_lock.test_and_set()) { ; }
      ++concurrent_log_agm_count;
      if((concurrent_log_agm_count % 100U) == 0U)
      {
        std::cout << "log agm concurrent at index "
                  << concurrent_log_agm_count
                  << " of "
                  << log_results.size()
                  << ". Total processed so far: "
                  << std::fixed
                  << std::setprecision(1)
                  << (100.0F * float(concurrent_log_agm_count)) / float(log_results.size())
                  << "%."
                  << "\r";
      }
      log_agm_lock.clear();

      // Argument: Catalan's constant perturbed by the reciprocal prime,
      // giving a distinct non-trivial input per index.
      const FloatingPointType dx = (FloatingPointType(1U) / (boost::multiprecision::exercise_threading::detail::primes()[i]));
      const FloatingPointType x = boost::math::constants::catalan<FloatingPointType>() + dx;

      // AGM result versus the backend's built-in log as the control value.
      // Writes to distinct indices i, so no locking is needed here.
      const FloatingPointType lr = boost::multiprecision::exercise_threading::log(x);
      const FloatingPointType lc = boost::multiprecision::log(x);

      log_results[i] = lr;
      log_control[i] = lc;
    }
  );

  calculation_time = static_cast<float>(std::clock() - start) / static_cast<float>(CLOCKS_PER_SEC);

  std::cout << std::endl;

  std::cout << "Checking results concurrent: ";

  bool result_is_ok = true;

  // Accept results whose relative deviation from the control value is
  // below 10^6 * epsilon (i.e., agreement to within ~6 lost digits).
  for(std::size_t i = 0U; i < log_results.size(); ++i)
  {
    using std::fabs;

    const FloatingPointType close_fraction = fabs(1 - (log_results[i] / log_control[i]));

    result_is_ok &= (close_fraction < std::numeric_limits<FloatingPointType>::epsilon() * 1000000U);
  }

  std::cout << std::boolalpha << result_is_ok << std::endl;

  return result_is_ok;
}
371
372 template<typename FloatingPointType>
373 bool log_agm_sequential(float& calculation_time)
374 {
375 const std::size_t count = boost::multiprecision::exercise_threading::detail::primes().size();
376
377 std::vector<FloatingPointType> log_results(count);
378 std::vector<FloatingPointType> log_control(count);
379
380 std::atomic_flag log_agm_lock = ATOMIC_FLAG_INIT;
381
382 const std::clock_t start = std::clock();
383
384 for(std::size_t i = 0U; i < log_results.size(); ++i)
385 {
386 const std::size_t sequential_log_agm_count = i + 1U;
387
388 if((sequential_log_agm_count % 100U) == 0U)
389 {
390 std::cout << "log agm sequential at index "
391 << sequential_log_agm_count
392 << " of "
393 << log_results.size()
394 << ". Total processed so far: "
395 << std::fixed
396 << std::setprecision(1)
397 << (100.0F * float(sequential_log_agm_count)) / float(log_results.size())
398 << "%."
399 << "\r";
400 }
401
402 const FloatingPointType dx = (FloatingPointType(1U) / (boost::multiprecision::exercise_threading::detail::primes()[i]));
403 const FloatingPointType x = boost::math::constants::catalan<FloatingPointType>() + dx;
404
405 log_results[i] = boost::multiprecision::exercise_threading::log(x);
406 log_control[i] = boost::multiprecision::log(x);
407 }
408
409 calculation_time = static_cast<float>(std::clock() - start) / static_cast<float>(CLOCKS_PER_SEC);
410
411 std::cout << std::endl;
412
413 std::cout << "Checking results sequential: ";
414
415 bool result_is_ok = true;
416
417 for(std::size_t i = 0U; i < log_results.size(); ++i)
418 {
419 using std::fabs;
420
421 const FloatingPointType close_fraction = fabs(1 - (log_results[i] / log_control[i]));
422
423 result_is_ok &= (close_fraction < std::numeric_limits<FloatingPointType>::epsilon() * 1000000U);
424 }
425
426 std::cout << std::boolalpha << result_is_ok << std::endl;
427
428 return result_is_ok;
429 }
430
431 int main()
432 {
433 std::cout << "Calculating "
434 << boost::multiprecision::exercise_threading::detail::primes().size()
435 << " primes"
436 << std::endl;
437
438 float calculation_time_concurrent;
439 const bool result_is_ok_concurrent = log_agm_concurrent<big_float_type>(calculation_time_concurrent);
440
441 float calculation_time_sequential;
442 const bool result_is_ok_sequential = log_agm_sequential<big_float_type>(calculation_time_sequential);
443
444 std::cout << std::endl;
445
446 std::cout << "result_is_ok_concurrent: "
447 << std::boolalpha
448 << result_is_ok_concurrent
449 << ", calculation_time_concurrent: "
450 << std::fixed
451 << std::setprecision(1)
452 << calculation_time_concurrent
453 << "s"
454 << std::endl;
455
456 std::cout << "result_is_ok_sequential: "
457 << std::boolalpha
458 << result_is_ok_sequential
459 << ", calculation_time_sequential: "
460 << std::fixed
461 << std::setprecision(1)
462 << calculation_time_sequential
463 << "s"
464 << std::endl;
465 }