]>
Commit | Line | Data |
---|---|---|
20effc67 | 1 | /////////////////////////////////////////////////////////////////////////////// |
1e59de90 | 2 | // Copyright Christopher Kormanyos 2020 - 2021. |
20effc67 TL |
3 | // Distributed under the Boost Software License, Version 1.0. |
4 | // (See accompanying file LICENSE_1_0.txt or copy at | |
5 | // http://www.boost.org/LICENSE_1_0.txt) | |
6 | // | |
7 | ||
8 | // This example exercises Boost.Multiprecision in concurrent | |
9 | // multi-threaded environments. To do so a loop involving | |
10 | // non-trivial calculations of numerous function values | |
11 | // has been set up within both concurrent as well as | |
12 | // sequential running environments. In particular, | |
13 | // this example uses an AGM method to do a "from the ground up" | |
14 | // calculation of logarithms. The logarithm functions values | |
15 | // are compared with the values from Boost.Multiprecision's | |
16 | // specific log functions for the relevant backends. | |
17 | // The log GM here is not optimized or intended for | |
18 | // high-performance work, but can be taken as an | |
19 | // interesting example of an AGM iteration if helpful. | |
20 | ||
21 | // This example has been initially motivated in part | |
22 | // by discussions in: | |
23 | // https://github.com/boostorg/multiprecision/pull/211 | |
24 | ||
25 | // We find the following performance data here: | |
26 | // https://github.com/boostorg/multiprecision/pull/213 | |
27 | // | |
28 | // cpp_dec_float: | |
29 | // result_is_ok_concurrent: true, calculation_time_concurrent: 18.1s | |
30 | // result_is_ok_sequential: true, calculation_time_sequential: 48.5s | |
31 | // | |
32 | // cpp_bin_float: | |
33 | // result_is_ok_concurrent: true, calculation_time_concurrent: 18.7s | |
34 | // result_is_ok_sequential: true, calculation_time_sequential: 50.4s | |
35 | // | |
36 | // gmp_float: | |
37 | // result_is_ok_concurrent: true, calculation_time_concurrent: 3.3s | |
38 | // result_is_ok_sequential: true, calculation_time_sequential: 12.4s | |
39 | // | |
40 | // mpfr_float: | |
41 | // result_is_ok_concurrent: true, calculation_time_concurrent: 0.6s | |
42 | // result_is_ok_sequential: true, calculation_time_sequential: 1.9s | |
43 | ||
44 | #include <array> | |
45 | #include <atomic> | |
46 | #include <cstddef> | |
47 | #include <cstdint> | |
48 | #include <iomanip> | |
49 | #include <iostream> | |
50 | #include <limits> | |
51 | #include <thread> | |
52 | #include <vector> | |
53 | ||
54 | #include <boost/math/constants/constants.hpp> | |
55 | #include <boost/math/special_functions/prime.hpp> | |
56 | ||
57 | #define BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_CPP_DEC_FLOAT 101 | |
58 | #define BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_GMP_FLOAT 102 | |
59 | #define BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_CPP_BIN_FLOAT 103 | |
60 | #define BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_MPFR_FLOAT 104 | |
61 | ||
62 | #if !defined(BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_TYPE) | |
63 | #define BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_TYPE BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_CPP_DEC_FLOAT | |
64 | //#define BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_TYPE BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_CPP_BIN_FLOAT | |
65 | //#define BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_TYPE BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_GMP_FLOAT | |
66 | //#define BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_TYPE BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_MPFR_FLOAT | |
67 | #endif | |
68 | ||
69 | #if (BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_TYPE == BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_CPP_DEC_FLOAT) | |
70 | #include <boost/multiprecision/cpp_dec_float.hpp> | |
71 | ||
72 | using big_float_type = boost::multiprecision::number<boost::multiprecision::cpp_dec_float<501>, | |
73 | boost::multiprecision::et_off>; | |
74 | ||
75 | #elif (BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_TYPE == BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_CPP_BIN_FLOAT) | |
76 | #include <boost/multiprecision/cpp_bin_float.hpp> | |
77 | ||
78 | using big_float_type = boost::multiprecision::number<boost::multiprecision::cpp_bin_float<501>, | |
79 | boost::multiprecision::et_off>; | |
80 | ||
81 | #elif (BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_TYPE == BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_GMP_FLOAT) | |
82 | #include <boost/multiprecision/gmp.hpp> | |
83 | ||
84 | using big_float_type = boost::multiprecision::number<boost::multiprecision::gmp_float<501>, | |
85 | boost::multiprecision::et_off>; | |
86 | ||
87 | #elif (BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_TYPE == BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_MPFR_FLOAT) | |
88 | #include <boost/multiprecision/mpfr.hpp> | |
89 | ||
90 | using big_float_type = boost::multiprecision::number<boost::multiprecision::mpfr_float_backend<501>, | |
91 | boost::multiprecision::et_off>; | |
92 | ||
93 | #else | |
94 | #error BOOST_MULTIPRECISION_EXERCISE_THREADING_BACKEND_TYPE is undefined. | |
95 | #endif | |
96 | ||
97 | namespace boost { namespace multiprecision { namespace exercise_threading { | |
98 | ||
99 | namespace detail { | |
100 | ||
101 | namespace my_concurrency { | |
102 | template<typename index_type, | |
103 | typename callable_function_type> | |
104 | void parallel_for(index_type start, | |
105 | index_type end, | |
106 | callable_function_type parallel_function) | |
107 | { | |
108 | // Estimate the number of threads available. | |
109 | static const unsigned int number_of_threads_hint = | |
110 | std::thread::hardware_concurrency(); | |
111 | ||
112 | static const unsigned int number_of_threads_total = | |
113 | ((number_of_threads_hint == 0U) ? 4U : number_of_threads_hint); | |
114 | ||
115 | // Use only 3/4 of the available cores. | |
116 | static const unsigned int number_of_threads = number_of_threads_total - (number_of_threads_total / 8U); | |
117 | ||
118 | std::cout << "Executing with " << number_of_threads << " threads" << std::endl; | |
119 | ||
120 | // Set the size of a slice for the range functions. | |
121 | index_type n = index_type(end - start) + index_type(1); | |
122 | ||
123 | index_type slice = | |
124 | static_cast<index_type>(std::round(n / static_cast<float>(number_of_threads))); | |
125 | ||
126 | slice = (std::max)(slice, index_type(1)); | |
127 | ||
128 | // Inner loop. | |
129 | auto launch_range = | |
130 | [¶llel_function](index_type index_lo, index_type index_hi) | |
131 | { | |
132 | for(index_type i = index_lo; i < index_hi; ++i) | |
133 | { | |
134 | parallel_function(i); | |
135 | } | |
136 | }; | |
137 | ||
138 | // Create the thread pool and launch the jobs. | |
139 | std::vector<std::thread> pool; | |
140 | ||
141 | pool.reserve(number_of_threads); | |
142 | ||
143 | index_type i1 = start; | |
144 | index_type i2 = (std::min)(index_type(start + slice), end); | |
145 | ||
146 | for(index_type i = 0U; ((index_type(i + index_type(1U)) < number_of_threads) && (i1 < end)); ++i) | |
147 | { | |
148 | pool.emplace_back(launch_range, i1, i2); | |
149 | ||
150 | i1 = i2; | |
151 | ||
152 | i2 = (std::min)(index_type(i2 + slice), end); | |
153 | } | |
154 | ||
155 | if(i1 < end) | |
156 | { | |
157 | pool.emplace_back(launch_range, i1, end); | |
158 | } | |
159 | ||
160 | // Wait for the jobs to finish. | |
161 | for(std::thread& thread_in_pool : pool) | |
162 | { | |
163 | if(thread_in_pool.joinable()) | |
164 | { | |
165 | thread_in_pool.join(); | |
166 | } | |
167 | } | |
168 | } | |
169 | } // namespace my_concurrency | |
170 | ||
171 | template<typename FloatingPointType, | |
172 | typename UnsignedIntegralType> | |
173 | FloatingPointType pown(const FloatingPointType& b, const UnsignedIntegralType& p) | |
174 | { | |
175 | // Calculate (b ^ p). | |
176 | ||
177 | using local_floating_point_type = FloatingPointType; | |
178 | using local_unsigned_integral_type = UnsignedIntegralType; | |
179 | ||
180 | local_floating_point_type result; | |
181 | ||
182 | if (p == local_unsigned_integral_type(0U)) { result = local_floating_point_type(1U); } | |
183 | else if(p == local_unsigned_integral_type(1U)) { result = b; } | |
184 | else if(p == local_unsigned_integral_type(2U)) { result = b; result *= b; } | |
185 | else | |
186 | { | |
187 | result = local_floating_point_type(1U); | |
188 | ||
189 | local_floating_point_type y(b); | |
190 | ||
191 | for(local_unsigned_integral_type p_local(p); p_local != local_unsigned_integral_type(0U); p_local >>= 1U) | |
192 | { | |
193 | if((static_cast<unsigned>(p_local) & 1U) != 0U) | |
194 | { | |
195 | result *= y; | |
196 | } | |
197 | ||
198 | y *= y; | |
199 | } | |
200 | } | |
201 | ||
202 | return result; | |
203 | } | |
204 | ||
205 | const std::vector<std::uint32_t>& primes() | |
206 | { | |
1e59de90 TL |
207 | static std::vector<std::uint32_t> my_primes; |
208 | ||
209 | if(my_primes.empty()) | |
20effc67 | 210 | { |
1e59de90 | 211 | my_primes.resize(10000U); |
20effc67 TL |
212 | |
213 | // Get exactly 10,000 primes. | |
1e59de90 | 214 | for(std::size_t i = 0U; i < my_primes.size(); ++i) |
20effc67 | 215 | { |
1e59de90 | 216 | my_primes[i] = boost::math::prime((unsigned int) i); |
20effc67 | 217 | } |
1e59de90 | 218 | } |
20effc67 TL |
219 | |
220 | return my_primes; | |
221 | } | |
222 | ||
223 | } // namespace detail | |
224 | ||
225 | template<typename FloatingPointType> | |
226 | FloatingPointType log(const FloatingPointType& x) | |
227 | { | |
228 | // Use an AGM method to compute the logarithm of x. | |
229 | ||
230 | // For values less than 1 invert the argument and | |
231 | // remember (in this case) to negate the result below. | |
232 | const bool b_negate = (x < 1); | |
233 | ||
234 | const FloatingPointType xx = ((b_negate == false) ? x : 1 / x); | |
235 | ||
236 | // Set a0 = 1 | |
237 | // Set b0 = 4 / (x * 2^m) = 1 / (x * 2^(m - 2)) | |
238 | ||
239 | FloatingPointType ak(1U); | |
240 | ||
241 | const float n_times_factor = static_cast<float>(static_cast<float>(std::numeric_limits<FloatingPointType>::digits10) * 1.67F); | |
242 | const float lgx_over_lg2 = std::log(static_cast<float>(xx)) / std::log(2.0F); | |
243 | ||
244 | std::int32_t m = static_cast<std::int32_t>(n_times_factor - lgx_over_lg2); | |
245 | ||
246 | // Ensure that the resulting power is non-negative. | |
247 | // Also enforce that m >= 8. | |
248 | m = (std::max)(m, static_cast<std::int32_t>(8)); | |
249 | ||
250 | FloatingPointType bk = detail::pown(FloatingPointType(2), static_cast<std::uint32_t>(m)); | |
251 | ||
252 | bk *= xx; | |
253 | bk = 4 / bk; | |
254 | ||
255 | FloatingPointType ak_tmp(0U); | |
256 | ||
257 | using std::sqrt; | |
258 | ||
259 | // Determine the requested precision of the upcoming iteration in units of digits10. | |
260 | const FloatingPointType target_tolerance = sqrt(std::numeric_limits<FloatingPointType>::epsilon()) / 100; | |
261 | ||
262 | for(std::int32_t k = static_cast<std::int32_t>(0); k < static_cast<std::int32_t>(64); ++k) | |
263 | { | |
264 | using std::fabs; | |
265 | ||
266 | // Check for the number of significant digits to be | |
267 | // at least half of the requested digits. If at least | |
268 | // half of the requested digits have been achieved, | |
269 | // then break after the upcoming iteration. | |
270 | const bool break_after_this_iteration = ( (k > static_cast<std::int32_t>(4)) | |
271 | && (fabs(1 - fabs(ak / bk)) < target_tolerance)); | |
272 | ||
273 | ak_tmp = ak; | |
274 | ak += bk; | |
275 | ak /= 2; | |
276 | ||
277 | if(break_after_this_iteration) | |
278 | { | |
279 | break; | |
280 | } | |
281 | ||
282 | bk *= ak_tmp; | |
283 | bk = sqrt(bk); | |
284 | } | |
285 | ||
286 | // We are now finished with the AGM iteration for log(x). | |
287 | ||
288 | // Compute log(x) = {pi / [2 * AGM(1, 4 / 2^m)]} - (m * ln2) | |
289 | // Note at this time that (ak = bk) = AGM(...) | |
290 | ||
291 | // Retrieve the value of pi, divide by (2 * a) and subtract (m * ln2). | |
292 | const FloatingPointType result = | |
293 | boost::math::constants::pi<FloatingPointType>() / (ak * 2) | |
294 | - (boost::math::constants::ln_two<FloatingPointType>() * m); | |
295 | ||
296 | return ((b_negate == true) ? -result : result); | |
297 | } | |
298 | ||
299 | } } } // namespace boost::multiprecision::exercise_threading | |
300 | ||
301 | template<typename FloatingPointType> | |
302 | bool log_agm_concurrent(float& calculation_time) | |
303 | { | |
304 | const std::size_t count = boost::multiprecision::exercise_threading::detail::primes().size(); | |
305 | ||
306 | std::vector<FloatingPointType> log_results(count); | |
307 | std::vector<FloatingPointType> log_control(count); | |
308 | ||
309 | std::atomic_flag log_agm_lock = ATOMIC_FLAG_INIT; | |
310 | ||
311 | std::size_t concurrent_log_agm_count = 0U; | |
312 | ||
313 | const std::clock_t start = std::clock(); | |
314 | ||
315 | boost::multiprecision::exercise_threading::detail::my_concurrency::parallel_for | |
316 | ( | |
317 | std::size_t(0U), | |
318 | log_results.size(), | |
319 | [&log_results, &log_control, &concurrent_log_agm_count, &log_agm_lock](std::size_t i) | |
320 | { | |
321 | while(log_agm_lock.test_and_set()) { ; } | |
1e59de90 TL |
322 | const FloatingPointType dx = (FloatingPointType(1U) / (boost::multiprecision::exercise_threading::detail::primes()[i])); |
323 | log_agm_lock.clear(); | |
324 | ||
325 | const FloatingPointType x = boost::math::constants::catalan<FloatingPointType>() + dx; | |
326 | ||
327 | const FloatingPointType lr = boost::multiprecision::exercise_threading::log(x); | |
328 | const FloatingPointType lc = boost::multiprecision::log(x); | |
329 | ||
330 | while(log_agm_lock.test_and_set()) { ; } | |
331 | ||
332 | log_results[i] = lr; | |
333 | log_control[i] = lc; | |
334 | ||
20effc67 | 335 | ++concurrent_log_agm_count; |
1e59de90 | 336 | |
20effc67 TL |
337 | if((concurrent_log_agm_count % 100U) == 0U) |
338 | { | |
339 | std::cout << "log agm concurrent at index " | |
340 | << concurrent_log_agm_count | |
341 | << " of " | |
342 | << log_results.size() | |
343 | << ". Total processed so far: " | |
344 | << std::fixed | |
345 | << std::setprecision(1) | |
346 | << (100.0F * float(concurrent_log_agm_count)) / float(log_results.size()) | |
347 | << "%." | |
348 | << "\r"; | |
349 | } | |
20effc67 | 350 | |
1e59de90 | 351 | log_agm_lock.clear(); |
20effc67 TL |
352 | } |
353 | ); | |
354 | ||
355 | calculation_time = static_cast<float>(std::clock() - start) / static_cast<float>(CLOCKS_PER_SEC); | |
356 | ||
357 | std::cout << std::endl; | |
358 | ||
359 | std::cout << "Checking results concurrent: "; | |
360 | ||
361 | bool result_is_ok = true; | |
362 | ||
1e59de90 TL |
363 | const FloatingPointType tol = std::numeric_limits<FloatingPointType>::epsilon() * 1000000U; |
364 | ||
20effc67 TL |
365 | for(std::size_t i = 0U; i < log_results.size(); ++i) |
366 | { | |
367 | using std::fabs; | |
368 | ||
369 | const FloatingPointType close_fraction = fabs(1 - (log_results[i] / log_control[i])); | |
370 | ||
1e59de90 | 371 | result_is_ok &= (close_fraction < tol); |
20effc67 TL |
372 | } |
373 | ||
374 | std::cout << std::boolalpha << result_is_ok << std::endl; | |
375 | ||
376 | return result_is_ok; | |
377 | } | |
378 | ||
379 | template<typename FloatingPointType> | |
380 | bool log_agm_sequential(float& calculation_time) | |
381 | { | |
382 | const std::size_t count = boost::multiprecision::exercise_threading::detail::primes().size(); | |
383 | ||
384 | std::vector<FloatingPointType> log_results(count); | |
385 | std::vector<FloatingPointType> log_control(count); | |
386 | ||
20effc67 TL |
387 | const std::clock_t start = std::clock(); |
388 | ||
389 | for(std::size_t i = 0U; i < log_results.size(); ++i) | |
390 | { | |
391 | const std::size_t sequential_log_agm_count = i + 1U; | |
392 | ||
1e59de90 TL |
393 | const FloatingPointType dx = (FloatingPointType(1U) / (boost::multiprecision::exercise_threading::detail::primes()[i])); |
394 | const FloatingPointType x = boost::math::constants::catalan<FloatingPointType>() + dx; | |
395 | ||
396 | log_results[i] = boost::multiprecision::exercise_threading::log(x); | |
397 | log_control[i] = boost::multiprecision::log(x); | |
398 | ||
20effc67 TL |
399 | if((sequential_log_agm_count % 100U) == 0U) |
400 | { | |
401 | std::cout << "log agm sequential at index " | |
402 | << sequential_log_agm_count | |
403 | << " of " | |
404 | << log_results.size() | |
405 | << ". Total processed so far: " | |
406 | << std::fixed | |
407 | << std::setprecision(1) | |
408 | << (100.0F * float(sequential_log_agm_count)) / float(log_results.size()) | |
409 | << "%." | |
410 | << "\r"; | |
411 | } | |
20effc67 TL |
412 | } |
413 | ||
414 | calculation_time = static_cast<float>(std::clock() - start) / static_cast<float>(CLOCKS_PER_SEC); | |
415 | ||
416 | std::cout << std::endl; | |
417 | ||
418 | std::cout << "Checking results sequential: "; | |
419 | ||
420 | bool result_is_ok = true; | |
421 | ||
1e59de90 TL |
422 | const FloatingPointType tol = std::numeric_limits<FloatingPointType>::epsilon() * 1000000U; |
423 | ||
20effc67 TL |
424 | for(std::size_t i = 0U; i < log_results.size(); ++i) |
425 | { | |
426 | using std::fabs; | |
427 | ||
428 | const FloatingPointType close_fraction = fabs(1 - (log_results[i] / log_control[i])); | |
429 | ||
1e59de90 | 430 | result_is_ok &= (close_fraction < tol); |
20effc67 TL |
431 | } |
432 | ||
433 | std::cout << std::boolalpha << result_is_ok << std::endl; | |
434 | ||
435 | return result_is_ok; | |
436 | } | |
437 | ||
438 | int main() | |
439 | { | |
440 | std::cout << "Calculating " | |
441 | << boost::multiprecision::exercise_threading::detail::primes().size() | |
442 | << " primes" | |
443 | << std::endl; | |
444 | ||
445 | float calculation_time_concurrent; | |
446 | const bool result_is_ok_concurrent = log_agm_concurrent<big_float_type>(calculation_time_concurrent); | |
447 | ||
448 | float calculation_time_sequential; | |
449 | const bool result_is_ok_sequential = log_agm_sequential<big_float_type>(calculation_time_sequential); | |
450 | ||
451 | std::cout << std::endl; | |
452 | ||
453 | std::cout << "result_is_ok_concurrent: " | |
454 | << std::boolalpha | |
455 | << result_is_ok_concurrent | |
456 | << ", calculation_time_concurrent: " | |
457 | << std::fixed | |
458 | << std::setprecision(1) | |
459 | << calculation_time_concurrent | |
460 | << "s" | |
461 | << std::endl; | |
462 | ||
463 | std::cout << "result_is_ok_sequential: " | |
464 | << std::boolalpha | |
465 | << result_is_ok_sequential | |
466 | << ", calculation_time_sequential: " | |
467 | << std::fixed | |
468 | << std::setprecision(1) | |
469 | << calculation_time_sequential | |
470 | << "s" | |
471 | << std::endl; | |
472 | } |