]> git.proxmox.com Git - ceph.git/blame - ceph/src/boost/boost/compute/algorithm/reduce.hpp
import new upstream nautilus stable release 14.2.8
[ceph.git] / ceph / src / boost / boost / compute / algorithm / reduce.hpp
CommitLineData
7c673cae
FG
1//---------------------------------------------------------------------------//
2// Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
3//
4// Distributed under the Boost Software License, Version 1.0
5// See accompanying file LICENSE_1_0.txt or copy at
6// http://www.boost.org/LICENSE_1_0.txt
7//
8// See http://boostorg.github.com/compute for more information.
9//---------------------------------------------------------------------------//
10
11#ifndef BOOST_COMPUTE_ALGORITHM_REDUCE_HPP
12#define BOOST_COMPUTE_ALGORITHM_REDUCE_HPP
13
14#include <iterator>
15
92f5a8d4
TL
16#include <boost/static_assert.hpp>
17
7c673cae
FG
18#include <boost/compute/system.hpp>
19#include <boost/compute/functional.hpp>
20#include <boost/compute/detail/meta_kernel.hpp>
21#include <boost/compute/command_queue.hpp>
22#include <boost/compute/container/array.hpp>
23#include <boost/compute/container/vector.hpp>
24#include <boost/compute/algorithm/copy_n.hpp>
25#include <boost/compute/algorithm/detail/inplace_reduce.hpp>
26#include <boost/compute/algorithm/detail/reduce_on_gpu.hpp>
27#include <boost/compute/algorithm/detail/reduce_on_cpu.hpp>
28#include <boost/compute/detail/iterator_range_size.hpp>
29#include <boost/compute/memory/local_buffer.hpp>
30#include <boost/compute/type_traits/result_of.hpp>
92f5a8d4 31#include <boost/compute/type_traits/is_device_iterator.hpp>
7c673cae
FG
32
33namespace boost {
34namespace compute {
35namespace detail {
36
37template<class InputIterator, class OutputIterator, class BinaryFunction>
38size_t reduce(InputIterator first,
39 size_t count,
40 OutputIterator result,
41 size_t block_size,
42 BinaryFunction function,
43 command_queue &queue)
44{
45 typedef typename
46 std::iterator_traits<InputIterator>::value_type
47 input_type;
48 typedef typename
49 boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
50 result_type;
51
52 const context &context = queue.get_context();
53 size_t block_count = count / 2 / block_size;
54 size_t total_block_count =
55 static_cast<size_t>(std::ceil(float(count) / 2.f / float(block_size)));
56
57 if(block_count != 0){
58 meta_kernel k("block_reduce");
59 size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output");
60 size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block");
61
62 k <<
63 "const uint gid = get_global_id(0);\n" <<
64 "const uint lid = get_local_id(0);\n" <<
65
66 // copy values to local memory
67 "block[lid] = " <<
68 function(first[k.make_var<uint_>("gid*2+0")],
69 first[k.make_var<uint_>("gid*2+1")]) << ";\n" <<
70
71 // perform reduction
72 "for(uint i = 1; i < " << uint_(block_size) << "; i <<= 1){\n" <<
73 " barrier(CLK_LOCAL_MEM_FENCE);\n" <<
74 " uint mask = (i << 1) - 1;\n" <<
75 " if((lid & mask) == 0){\n" <<
76 " block[lid] = " <<
77 function(k.expr<input_type>("block[lid]"),
78 k.expr<input_type>("block[lid+i]")) << ";\n" <<
79 " }\n" <<
80 "}\n" <<
81
82 // write block result to global output
83 "if(lid == 0)\n" <<
84 " output[get_group_id(0)] = block[0];\n";
85
86 kernel kernel = k.compile(context);
87 kernel.set_arg(output_arg, result.get_buffer());
88 kernel.set_arg(block_arg, local_buffer<input_type>(block_size));
89
90 queue.enqueue_1d_range_kernel(kernel,
91 0,
92 block_count * block_size,
93 block_size);
94 }
95
96 // serially reduce any leftovers
97 if(block_count * block_size * 2 < count){
98 size_t last_block_start = block_count * block_size * 2;
99
100 meta_kernel k("extra_serial_reduce");
101 size_t count_arg = k.add_arg<uint_>("count");
102 size_t offset_arg = k.add_arg<uint_>("offset");
103 size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output");
104 size_t output_offset_arg = k.add_arg<uint_>("output_offset");
105
106 k <<
107 k.decl<result_type>("result") << " = \n" <<
108 first[k.expr<uint_>("offset")] << ";\n" <<
109 "for(uint i = offset + 1; i < count; i++)\n" <<
110 " result = " <<
111 function(k.var<result_type>("result"),
112 first[k.var<uint_>("i")]) << ";\n" <<
113 "output[output_offset] = result;\n";
114
115 kernel kernel = k.compile(context);
116 kernel.set_arg(count_arg, static_cast<uint_>(count));
117 kernel.set_arg(offset_arg, static_cast<uint_>(last_block_start));
118 kernel.set_arg(output_arg, result.get_buffer());
119 kernel.set_arg(output_offset_arg, static_cast<uint_>(block_count));
120
121 queue.enqueue_task(kernel);
122 }
123
124 return total_block_count;
125}
126
127template<class InputIterator, class BinaryFunction>
128inline vector<
129 typename boost::compute::result_of<
130 BinaryFunction(
131 typename std::iterator_traits<InputIterator>::value_type,
132 typename std::iterator_traits<InputIterator>::value_type
133 )
134 >::type
135>
136block_reduce(InputIterator first,
137 size_t count,
138 size_t block_size,
139 BinaryFunction function,
140 command_queue &queue)
141{
142 typedef typename
143 std::iterator_traits<InputIterator>::value_type
144 input_type;
145 typedef typename
146 boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
147 result_type;
148
149 const context &context = queue.get_context();
150 size_t total_block_count =
151 static_cast<size_t>(std::ceil(float(count) / 2.f / float(block_size)));
152 vector<result_type> result_vector(total_block_count, context);
153
154 reduce(first, count, result_vector.begin(), block_size, function, queue);
155
156 return result_vector;
157}
158
b32b8144 159// Space complexity: O( ceil(n / 2 / 256) )
7c673cae
FG
160template<class InputIterator, class OutputIterator, class BinaryFunction>
161inline void generic_reduce(InputIterator first,
162 InputIterator last,
163 OutputIterator result,
164 BinaryFunction function,
165 command_queue &queue)
166{
167 typedef typename
168 std::iterator_traits<InputIterator>::value_type
169 input_type;
170 typedef typename
171 boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
172 result_type;
173
174 const device &device = queue.get_device();
175 const context &context = queue.get_context();
176
177 size_t count = detail::iterator_range_size(first, last);
178
179 if(device.type() & device::cpu){
180 array<result_type, 1> value(context);
181 detail::reduce_on_cpu(first, last, value.begin(), function, queue);
182 boost::compute::copy_n(value.begin(), 1, result, queue);
183 }
184 else {
185 size_t block_size = 256;
186
187 // first pass
188 vector<result_type> results = detail::block_reduce(first,
189 count,
190 block_size,
191 function,
192 queue);
193
194 if(results.size() > 1){
195 detail::inplace_reduce(results.begin(),
196 results.end(),
197 function,
198 queue);
199 }
200
201 boost::compute::copy_n(results.begin(), 1, result, queue);
202 }
203}
204
205template<class InputIterator, class OutputIterator, class T>
206inline void dispatch_reduce(InputIterator first,
207 InputIterator last,
208 OutputIterator result,
209 const plus<T> &function,
210 command_queue &queue)
211{
212 const context &context = queue.get_context();
213 const device &device = queue.get_device();
214
215 // reduce to temporary buffer on device
216 array<T, 1> value(context);
217 if(device.type() & device::cpu){
218 detail::reduce_on_cpu(first, last, value.begin(), function, queue);
219 }
220 else {
221 reduce_on_gpu(first, last, value.begin(), function, queue);
222 }
223
224 // copy to result iterator
225 copy_n(value.begin(), 1, result, queue);
226}
227
228template<class InputIterator, class OutputIterator, class BinaryFunction>
229inline void dispatch_reduce(InputIterator first,
230 InputIterator last,
231 OutputIterator result,
232 BinaryFunction function,
233 command_queue &queue)
234{
235 generic_reduce(first, last, result, function, queue);
236}
237
238} // end detail namespace
239
240/// Returns the result of applying \p function to the elements in the
241/// range [\p first, \p last).
242///
243/// If no function is specified, \c plus will be used.
244///
245/// \param first first element in the input range
246/// \param last last element in the input range
247/// \param result iterator pointing to the output
248/// \param function binary reduction function
249/// \param queue command queue to perform the operation
250///
251/// The \c reduce() algorithm assumes that the binary reduction function is
252/// associative. When used with non-associative functions the result may
253/// be non-deterministic and vary in precision. Notably this affects the
254/// \c plus<float>() function as floating-point addition is not associative
255/// and may produce slightly different results than a serial algorithm.
256///
257/// This algorithm supports both host and device iterators for the
258/// result argument. This allows for values to be reduced and copied
259/// to the host all with a single function call.
260///
261/// For example, to calculate the sum of the values in a device vector and
262/// copy the result to a value on the host:
263///
264/// \snippet test/test_reduce.cpp sum_int
265///
266/// Note that while the the \c reduce() algorithm is conceptually identical to
267/// the \c accumulate() algorithm, its implementation is substantially more
268/// efficient on parallel hardware. For more information, see the documentation
269/// on the \c accumulate() algorithm.
270///
b32b8144
FG
271/// Space complexity on GPUs: \Omega(n)<br>
272/// Space complexity on CPUs: \Omega(1)
273///
7c673cae
FG
274/// \see accumulate()
275template<class InputIterator, class OutputIterator, class BinaryFunction>
276inline void reduce(InputIterator first,
277 InputIterator last,
278 OutputIterator result,
279 BinaryFunction function,
280 command_queue &queue = system::default_queue())
281{
92f5a8d4 282 BOOST_STATIC_ASSERT(is_device_iterator<InputIterator>::value);
7c673cae
FG
283 if(first == last){
284 return;
285 }
286
287 detail::dispatch_reduce(first, last, result, function, queue);
288}
289
290/// \overload
291template<class InputIterator, class OutputIterator>
292inline void reduce(InputIterator first,
293 InputIterator last,
294 OutputIterator result,
295 command_queue &queue = system::default_queue())
296{
92f5a8d4 297 BOOST_STATIC_ASSERT(is_device_iterator<InputIterator>::value);
7c673cae
FG
298 typedef typename std::iterator_traits<InputIterator>::value_type T;
299
300 if(first == last){
301 return;
302 }
303
304 detail::dispatch_reduce(first, last, result, plus<T>(), queue);
305}
306
307} // end compute namespace
308} // end boost namespace
309
310#endif // BOOST_COMPUTE_ALGORITHM_REDUCE_HPP