]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | //---------------------------------------------------------------------------// |
2 | // Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> | |
3 | // | |
4 | // Distributed under the Boost Software License, Version 1.0 | |
5 | // See accompanying file LICENSE_1_0.txt or copy at | |
6 | // http://www.boost.org/LICENSE_1_0.txt | |
7 | // | |
8 | // See http://boostorg.github.com/compute for more information. | |
9 | //---------------------------------------------------------------------------// | |
10 | ||
11 | #ifndef BOOST_COMPUTE_ALGORITHM_REDUCE_HPP | |
12 | #define BOOST_COMPUTE_ALGORITHM_REDUCE_HPP | |
13 | ||
14 | #include <iterator> | |
15 | ||
92f5a8d4 TL |
16 | #include <boost/static_assert.hpp> |
17 | ||
7c673cae FG |
18 | #include <boost/compute/system.hpp> |
19 | #include <boost/compute/functional.hpp> | |
20 | #include <boost/compute/detail/meta_kernel.hpp> | |
21 | #include <boost/compute/command_queue.hpp> | |
22 | #include <boost/compute/container/array.hpp> | |
23 | #include <boost/compute/container/vector.hpp> | |
24 | #include <boost/compute/algorithm/copy_n.hpp> | |
25 | #include <boost/compute/algorithm/detail/inplace_reduce.hpp> | |
26 | #include <boost/compute/algorithm/detail/reduce_on_gpu.hpp> | |
27 | #include <boost/compute/algorithm/detail/reduce_on_cpu.hpp> | |
28 | #include <boost/compute/detail/iterator_range_size.hpp> | |
29 | #include <boost/compute/memory/local_buffer.hpp> | |
30 | #include <boost/compute/type_traits/result_of.hpp> | |
92f5a8d4 | 31 | #include <boost/compute/type_traits/is_device_iterator.hpp> |
7c673cae FG |
32 | |
33 | namespace boost { | |
34 | namespace compute { | |
35 | namespace detail { | |
36 | ||
37 | template<class InputIterator, class OutputIterator, class BinaryFunction> | |
38 | size_t reduce(InputIterator first, | |
39 | size_t count, | |
40 | OutputIterator result, | |
41 | size_t block_size, | |
42 | BinaryFunction function, | |
43 | command_queue &queue) | |
44 | { | |
45 | typedef typename | |
46 | std::iterator_traits<InputIterator>::value_type | |
47 | input_type; | |
48 | typedef typename | |
49 | boost::compute::result_of<BinaryFunction(input_type, input_type)>::type | |
50 | result_type; | |
51 | ||
52 | const context &context = queue.get_context(); | |
53 | size_t block_count = count / 2 / block_size; | |
54 | size_t total_block_count = | |
55 | static_cast<size_t>(std::ceil(float(count) / 2.f / float(block_size))); | |
56 | ||
57 | if(block_count != 0){ | |
58 | meta_kernel k("block_reduce"); | |
59 | size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output"); | |
60 | size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block"); | |
61 | ||
62 | k << | |
63 | "const uint gid = get_global_id(0);\n" << | |
64 | "const uint lid = get_local_id(0);\n" << | |
65 | ||
66 | // copy values to local memory | |
67 | "block[lid] = " << | |
68 | function(first[k.make_var<uint_>("gid*2+0")], | |
69 | first[k.make_var<uint_>("gid*2+1")]) << ";\n" << | |
70 | ||
71 | // perform reduction | |
72 | "for(uint i = 1; i < " << uint_(block_size) << "; i <<= 1){\n" << | |
73 | " barrier(CLK_LOCAL_MEM_FENCE);\n" << | |
74 | " uint mask = (i << 1) - 1;\n" << | |
75 | " if((lid & mask) == 0){\n" << | |
76 | " block[lid] = " << | |
77 | function(k.expr<input_type>("block[lid]"), | |
78 | k.expr<input_type>("block[lid+i]")) << ";\n" << | |
79 | " }\n" << | |
80 | "}\n" << | |
81 | ||
82 | // write block result to global output | |
83 | "if(lid == 0)\n" << | |
84 | " output[get_group_id(0)] = block[0];\n"; | |
85 | ||
86 | kernel kernel = k.compile(context); | |
87 | kernel.set_arg(output_arg, result.get_buffer()); | |
88 | kernel.set_arg(block_arg, local_buffer<input_type>(block_size)); | |
89 | ||
90 | queue.enqueue_1d_range_kernel(kernel, | |
91 | 0, | |
92 | block_count * block_size, | |
93 | block_size); | |
94 | } | |
95 | ||
96 | // serially reduce any leftovers | |
97 | if(block_count * block_size * 2 < count){ | |
98 | size_t last_block_start = block_count * block_size * 2; | |
99 | ||
100 | meta_kernel k("extra_serial_reduce"); | |
101 | size_t count_arg = k.add_arg<uint_>("count"); | |
102 | size_t offset_arg = k.add_arg<uint_>("offset"); | |
103 | size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output"); | |
104 | size_t output_offset_arg = k.add_arg<uint_>("output_offset"); | |
105 | ||
106 | k << | |
107 | k.decl<result_type>("result") << " = \n" << | |
108 | first[k.expr<uint_>("offset")] << ";\n" << | |
109 | "for(uint i = offset + 1; i < count; i++)\n" << | |
110 | " result = " << | |
111 | function(k.var<result_type>("result"), | |
112 | first[k.var<uint_>("i")]) << ";\n" << | |
113 | "output[output_offset] = result;\n"; | |
114 | ||
115 | kernel kernel = k.compile(context); | |
116 | kernel.set_arg(count_arg, static_cast<uint_>(count)); | |
117 | kernel.set_arg(offset_arg, static_cast<uint_>(last_block_start)); | |
118 | kernel.set_arg(output_arg, result.get_buffer()); | |
119 | kernel.set_arg(output_offset_arg, static_cast<uint_>(block_count)); | |
120 | ||
121 | queue.enqueue_task(kernel); | |
122 | } | |
123 | ||
124 | return total_block_count; | |
125 | } | |
126 | ||
127 | template<class InputIterator, class BinaryFunction> | |
128 | inline vector< | |
129 | typename boost::compute::result_of< | |
130 | BinaryFunction( | |
131 | typename std::iterator_traits<InputIterator>::value_type, | |
132 | typename std::iterator_traits<InputIterator>::value_type | |
133 | ) | |
134 | >::type | |
135 | > | |
136 | block_reduce(InputIterator first, | |
137 | size_t count, | |
138 | size_t block_size, | |
139 | BinaryFunction function, | |
140 | command_queue &queue) | |
141 | { | |
142 | typedef typename | |
143 | std::iterator_traits<InputIterator>::value_type | |
144 | input_type; | |
145 | typedef typename | |
146 | boost::compute::result_of<BinaryFunction(input_type, input_type)>::type | |
147 | result_type; | |
148 | ||
149 | const context &context = queue.get_context(); | |
150 | size_t total_block_count = | |
151 | static_cast<size_t>(std::ceil(float(count) / 2.f / float(block_size))); | |
152 | vector<result_type> result_vector(total_block_count, context); | |
153 | ||
154 | reduce(first, count, result_vector.begin(), block_size, function, queue); | |
155 | ||
156 | return result_vector; | |
157 | } | |
158 | ||
b32b8144 | 159 | // Space complexity: O( ceil(n / 2 / 256) ) |
7c673cae FG |
160 | template<class InputIterator, class OutputIterator, class BinaryFunction> |
161 | inline void generic_reduce(InputIterator first, | |
162 | InputIterator last, | |
163 | OutputIterator result, | |
164 | BinaryFunction function, | |
165 | command_queue &queue) | |
166 | { | |
167 | typedef typename | |
168 | std::iterator_traits<InputIterator>::value_type | |
169 | input_type; | |
170 | typedef typename | |
171 | boost::compute::result_of<BinaryFunction(input_type, input_type)>::type | |
172 | result_type; | |
173 | ||
174 | const device &device = queue.get_device(); | |
175 | const context &context = queue.get_context(); | |
176 | ||
177 | size_t count = detail::iterator_range_size(first, last); | |
178 | ||
179 | if(device.type() & device::cpu){ | |
180 | array<result_type, 1> value(context); | |
181 | detail::reduce_on_cpu(first, last, value.begin(), function, queue); | |
182 | boost::compute::copy_n(value.begin(), 1, result, queue); | |
183 | } | |
184 | else { | |
185 | size_t block_size = 256; | |
186 | ||
187 | // first pass | |
188 | vector<result_type> results = detail::block_reduce(first, | |
189 | count, | |
190 | block_size, | |
191 | function, | |
192 | queue); | |
193 | ||
194 | if(results.size() > 1){ | |
195 | detail::inplace_reduce(results.begin(), | |
196 | results.end(), | |
197 | function, | |
198 | queue); | |
199 | } | |
200 | ||
201 | boost::compute::copy_n(results.begin(), 1, result, queue); | |
202 | } | |
203 | } | |
204 | ||
205 | template<class InputIterator, class OutputIterator, class T> | |
206 | inline void dispatch_reduce(InputIterator first, | |
207 | InputIterator last, | |
208 | OutputIterator result, | |
209 | const plus<T> &function, | |
210 | command_queue &queue) | |
211 | { | |
212 | const context &context = queue.get_context(); | |
213 | const device &device = queue.get_device(); | |
214 | ||
215 | // reduce to temporary buffer on device | |
216 | array<T, 1> value(context); | |
217 | if(device.type() & device::cpu){ | |
218 | detail::reduce_on_cpu(first, last, value.begin(), function, queue); | |
219 | } | |
220 | else { | |
221 | reduce_on_gpu(first, last, value.begin(), function, queue); | |
222 | } | |
223 | ||
224 | // copy to result iterator | |
225 | copy_n(value.begin(), 1, result, queue); | |
226 | } | |
227 | ||
228 | template<class InputIterator, class OutputIterator, class BinaryFunction> | |
229 | inline void dispatch_reduce(InputIterator first, | |
230 | InputIterator last, | |
231 | OutputIterator result, | |
232 | BinaryFunction function, | |
233 | command_queue &queue) | |
234 | { | |
235 | generic_reduce(first, last, result, function, queue); | |
236 | } | |
237 | ||
238 | } // end detail namespace | |
239 | ||
240 | /// Returns the result of applying \p function to the elements in the | |
241 | /// range [\p first, \p last). | |
242 | /// | |
243 | /// If no function is specified, \c plus will be used. | |
244 | /// | |
245 | /// \param first first element in the input range | |
246 | /// \param last last element in the input range | |
247 | /// \param result iterator pointing to the output | |
248 | /// \param function binary reduction function | |
249 | /// \param queue command queue to perform the operation | |
250 | /// | |
251 | /// The \c reduce() algorithm assumes that the binary reduction function is | |
252 | /// associative. When used with non-associative functions the result may | |
253 | /// be non-deterministic and vary in precision. Notably this affects the | |
254 | /// \c plus<float>() function as floating-point addition is not associative | |
255 | /// and may produce slightly different results than a serial algorithm. | |
256 | /// | |
257 | /// This algorithm supports both host and device iterators for the | |
258 | /// result argument. This allows for values to be reduced and copied | |
259 | /// to the host all with a single function call. | |
260 | /// | |
261 | /// For example, to calculate the sum of the values in a device vector and | |
262 | /// copy the result to a value on the host: | |
263 | /// | |
264 | /// \snippet test/test_reduce.cpp sum_int | |
265 | /// | |
266 | /// Note that while the the \c reduce() algorithm is conceptually identical to | |
267 | /// the \c accumulate() algorithm, its implementation is substantially more | |
268 | /// efficient on parallel hardware. For more information, see the documentation | |
269 | /// on the \c accumulate() algorithm. | |
270 | /// | |
b32b8144 FG |
271 | /// Space complexity on GPUs: \Omega(n)<br> |
272 | /// Space complexity on CPUs: \Omega(1) | |
273 | /// | |
7c673cae FG |
274 | /// \see accumulate() |
275 | template<class InputIterator, class OutputIterator, class BinaryFunction> | |
276 | inline void reduce(InputIterator first, | |
277 | InputIterator last, | |
278 | OutputIterator result, | |
279 | BinaryFunction function, | |
280 | command_queue &queue = system::default_queue()) | |
281 | { | |
92f5a8d4 | 282 | BOOST_STATIC_ASSERT(is_device_iterator<InputIterator>::value); |
7c673cae FG |
283 | if(first == last){ |
284 | return; | |
285 | } | |
286 | ||
287 | detail::dispatch_reduce(first, last, result, function, queue); | |
288 | } | |
289 | ||
290 | /// \overload | |
291 | template<class InputIterator, class OutputIterator> | |
292 | inline void reduce(InputIterator first, | |
293 | InputIterator last, | |
294 | OutputIterator result, | |
295 | command_queue &queue = system::default_queue()) | |
296 | { | |
92f5a8d4 | 297 | BOOST_STATIC_ASSERT(is_device_iterator<InputIterator>::value); |
7c673cae FG |
298 | typedef typename std::iterator_traits<InputIterator>::value_type T; |
299 | ||
300 | if(first == last){ | |
301 | return; | |
302 | } | |
303 | ||
304 | detail::dispatch_reduce(first, last, result, plus<T>(), queue); | |
305 | } | |
306 | ||
307 | } // end compute namespace | |
308 | } // end boost namespace | |
309 | ||
310 | #endif // BOOST_COMPUTE_ALGORITHM_REDUCE_HPP |