]> git.proxmox.com Git - ceph.git/blob - ceph/src/boost/libs/compute/include/boost/compute/algorithm/reduce.hpp
bump version to 12.2.2-pve1
[ceph.git] / ceph / src / boost / libs / compute / include / boost / compute / algorithm / reduce.hpp
1 //---------------------------------------------------------------------------//
2 // Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com>
3 //
4 // Distributed under the Boost Software License, Version 1.0
5 // See accompanying file LICENSE_1_0.txt or copy at
6 // http://www.boost.org/LICENSE_1_0.txt
7 //
8 // See http://boostorg.github.com/compute for more information.
9 //---------------------------------------------------------------------------//
10
11 #ifndef BOOST_COMPUTE_ALGORITHM_REDUCE_HPP
12 #define BOOST_COMPUTE_ALGORITHM_REDUCE_HPP
13
14 #include <iterator>
15
16 #include <boost/compute/system.hpp>
17 #include <boost/compute/functional.hpp>
18 #include <boost/compute/detail/meta_kernel.hpp>
19 #include <boost/compute/command_queue.hpp>
20 #include <boost/compute/container/array.hpp>
21 #include <boost/compute/container/vector.hpp>
22 #include <boost/compute/algorithm/copy_n.hpp>
23 #include <boost/compute/algorithm/detail/inplace_reduce.hpp>
24 #include <boost/compute/algorithm/detail/reduce_on_gpu.hpp>
25 #include <boost/compute/algorithm/detail/reduce_on_cpu.hpp>
26 #include <boost/compute/detail/iterator_range_size.hpp>
27 #include <boost/compute/memory/local_buffer.hpp>
28 #include <boost/compute/type_traits/result_of.hpp>
29
30 namespace boost {
31 namespace compute {
32 namespace detail {
33
/// \internal_
/// Performs one reduction pass over [first, first + count).
///
/// Work is split in two parts:
///  - Each work-group of \p block_size threads reduces 2 * block_size
///    consecutive input elements down to a single partial result
///    ("block_reduce" kernel). Only full blocks are processed this way.
///  - Any leftover tail elements (a trailing, partially-filled block) are
///    reduced serially by a single-task kernel ("extra_serial_reduce"),
///    whose result is appended after the full-block results.
///
/// \param first       input iterator to the start of the range
/// \param count       number of input elements to reduce
/// \param result      output iterator receiving the partial results
/// \param block_size  work-group size (threads per block)
/// \param function    binary reduction function (assumed associative)
/// \param queue       command queue to enqueue the kernels on
///
/// \return the total number of partial results written to \p result
///         (full blocks plus one extra slot if a tail exists)
template<class InputIterator, class OutputIterator, class BinaryFunction>
size_t reduce(InputIterator first,
              size_t count,
              OutputIterator result,
              size_t block_size,
              BinaryFunction function,
              command_queue &queue)
{
    typedef typename
        std::iterator_traits<InputIterator>::value_type
        input_type;
    typedef typename
        boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
        result_type;

    const context &context = queue.get_context();
    // number of completely filled blocks (each consumes 2 * block_size inputs)
    size_t block_count = count / 2 / block_size;
    // number of partial results including the tail block, i.e.
    // ceil(count / (2 * block_size))
    // NOTE(review): float-based ceil may lose precision for very large
    // counts — confirm acceptable for the supported range
    size_t total_block_count =
        static_cast<size_t>(std::ceil(float(count) / 2.f / float(block_size)));

    if(block_count != 0){
        meta_kernel k("block_reduce");
        size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output");
        // scratch space in local memory, one input_type slot per thread
        size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block");

        k <<
            "const uint gid = get_global_id(0);\n" <<
            "const uint lid = get_local_id(0);\n" <<

            // copy values to local memory
            // (each thread pre-reduces its own pair of elements on the way in)
            "block[lid] = " <<
                function(first[k.make_var<uint_>("gid*2+0")],
                         first[k.make_var<uint_>("gid*2+1")]) << ";\n" <<

            // perform reduction
            // tree reduction: each round, threads whose id has the low
            // `mask` bits clear fold in the partner `i` slots away; the
            // barrier makes the previous round's writes visible first
            "for(uint i = 1; i < " << uint_(block_size) << "; i <<= 1){\n" <<
            "    barrier(CLK_LOCAL_MEM_FENCE);\n" <<
            "    uint mask = (i << 1) - 1;\n" <<
            "    if((lid & mask) == 0){\n" <<
            "        block[lid] = " <<
                        function(k.expr<input_type>("block[lid]"),
                                 k.expr<input_type>("block[lid+i]")) << ";\n" <<
            "    }\n" <<
            "}\n" <<

            // write block result to global output
            "if(lid == 0)\n" <<
            "    output[get_group_id(0)] = block[0];\n";

        kernel kernel = k.compile(context);
        kernel.set_arg(output_arg, result.get_buffer());
        kernel.set_arg(block_arg, local_buffer<input_type>(block_size));

        queue.enqueue_1d_range_kernel(kernel,
                                      0,
                                      block_count * block_size,
                                      block_size);
    }

    // serially reduce any leftovers
    if(block_count * block_size * 2 < count){
        size_t last_block_start = block_count * block_size * 2;

        meta_kernel k("extra_serial_reduce");
        size_t count_arg = k.add_arg<uint_>("count");
        size_t offset_arg = k.add_arg<uint_>("offset");
        size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output");
        size_t output_offset_arg = k.add_arg<uint_>("output_offset");

        // single-thread loop over [offset, count), folding into `result`
        k <<
            k.decl<result_type>("result") << " = \n" <<
                first[k.expr<uint_>("offset")] << ";\n" <<
            "for(uint i = offset + 1; i < count; i++)\n" <<
            "    result = " <<
                    function(k.var<result_type>("result"),
                             first[k.var<uint_>("i")]) << ";\n" <<
            "output[output_offset] = result;\n";

        kernel kernel = k.compile(context);
        kernel.set_arg(count_arg, static_cast<uint_>(count));
        kernel.set_arg(offset_arg, static_cast<uint_>(last_block_start));
        kernel.set_arg(output_arg, result.get_buffer());
        // tail result lands directly after the full-block results
        kernel.set_arg(output_offset_arg, static_cast<uint_>(block_count));

        queue.enqueue_task(kernel);
    }

    return total_block_count;
}
123
124 template<class InputIterator, class BinaryFunction>
125 inline vector<
126 typename boost::compute::result_of<
127 BinaryFunction(
128 typename std::iterator_traits<InputIterator>::value_type,
129 typename std::iterator_traits<InputIterator>::value_type
130 )
131 >::type
132 >
133 block_reduce(InputIterator first,
134 size_t count,
135 size_t block_size,
136 BinaryFunction function,
137 command_queue &queue)
138 {
139 typedef typename
140 std::iterator_traits<InputIterator>::value_type
141 input_type;
142 typedef typename
143 boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
144 result_type;
145
146 const context &context = queue.get_context();
147 size_t total_block_count =
148 static_cast<size_t>(std::ceil(float(count) / 2.f / float(block_size)));
149 vector<result_type> result_vector(total_block_count, context);
150
151 reduce(first, count, result_vector.begin(), block_size, function, queue);
152
153 return result_vector;
154 }
155
156 template<class InputIterator, class OutputIterator, class BinaryFunction>
157 inline void generic_reduce(InputIterator first,
158 InputIterator last,
159 OutputIterator result,
160 BinaryFunction function,
161 command_queue &queue)
162 {
163 typedef typename
164 std::iterator_traits<InputIterator>::value_type
165 input_type;
166 typedef typename
167 boost::compute::result_of<BinaryFunction(input_type, input_type)>::type
168 result_type;
169
170 const device &device = queue.get_device();
171 const context &context = queue.get_context();
172
173 size_t count = detail::iterator_range_size(first, last);
174
175 if(device.type() & device::cpu){
176 array<result_type, 1> value(context);
177 detail::reduce_on_cpu(first, last, value.begin(), function, queue);
178 boost::compute::copy_n(value.begin(), 1, result, queue);
179 }
180 else {
181 size_t block_size = 256;
182
183 // first pass
184 vector<result_type> results = detail::block_reduce(first,
185 count,
186 block_size,
187 function,
188 queue);
189
190 if(results.size() > 1){
191 detail::inplace_reduce(results.begin(),
192 results.end(),
193 function,
194 queue);
195 }
196
197 boost::compute::copy_n(results.begin(), 1, result, queue);
198 }
199 }
200
201 template<class InputIterator, class OutputIterator, class T>
202 inline void dispatch_reduce(InputIterator first,
203 InputIterator last,
204 OutputIterator result,
205 const plus<T> &function,
206 command_queue &queue)
207 {
208 const context &context = queue.get_context();
209 const device &device = queue.get_device();
210
211 // reduce to temporary buffer on device
212 array<T, 1> value(context);
213 if(device.type() & device::cpu){
214 detail::reduce_on_cpu(first, last, value.begin(), function, queue);
215 }
216 else {
217 reduce_on_gpu(first, last, value.begin(), function, queue);
218 }
219
220 // copy to result iterator
221 copy_n(value.begin(), 1, result, queue);
222 }
223
224 template<class InputIterator, class OutputIterator, class BinaryFunction>
225 inline void dispatch_reduce(InputIterator first,
226 InputIterator last,
227 OutputIterator result,
228 BinaryFunction function,
229 command_queue &queue)
230 {
231 generic_reduce(first, last, result, function, queue);
232 }
233
234 } // end detail namespace
235
236 /// Returns the result of applying \p function to the elements in the
237 /// range [\p first, \p last).
238 ///
239 /// If no function is specified, \c plus will be used.
240 ///
241 /// \param first first element in the input range
242 /// \param last last element in the input range
243 /// \param result iterator pointing to the output
244 /// \param function binary reduction function
245 /// \param queue command queue to perform the operation
246 ///
247 /// The \c reduce() algorithm assumes that the binary reduction function is
248 /// associative. When used with non-associative functions the result may
249 /// be non-deterministic and vary in precision. Notably this affects the
250 /// \c plus<float>() function as floating-point addition is not associative
251 /// and may produce slightly different results than a serial algorithm.
252 ///
253 /// This algorithm supports both host and device iterators for the
254 /// result argument. This allows for values to be reduced and copied
255 /// to the host all with a single function call.
256 ///
257 /// For example, to calculate the sum of the values in a device vector and
258 /// copy the result to a value on the host:
259 ///
260 /// \snippet test/test_reduce.cpp sum_int
261 ///
262 /// Note that while the \c reduce() algorithm is conceptually identical to
263 /// the \c accumulate() algorithm, its implementation is substantially more
264 /// efficient on parallel hardware. For more information, see the documentation
265 /// on the \c accumulate() algorithm.
266 ///
267 /// \see accumulate()
268 template<class InputIterator, class OutputIterator, class BinaryFunction>
269 inline void reduce(InputIterator first,
270 InputIterator last,
271 OutputIterator result,
272 BinaryFunction function,
273 command_queue &queue = system::default_queue())
274 {
275 if(first == last){
276 return;
277 }
278
279 detail::dispatch_reduce(first, last, result, function, queue);
280 }
281
282 /// \overload
283 template<class InputIterator, class OutputIterator>
284 inline void reduce(InputIterator first,
285 InputIterator last,
286 OutputIterator result,
287 command_queue &queue = system::default_queue())
288 {
289 typedef typename std::iterator_traits<InputIterator>::value_type T;
290
291 if(first == last){
292 return;
293 }
294
295 detail::dispatch_reduce(first, last, result, plus<T>(), queue);
296 }
297
298 } // end compute namespace
299 } // end boost namespace
300
301 #endif // BOOST_COMPUTE_ALGORITHM_REDUCE_HPP