]>
Commit | Line | Data |
---|---|---|
7c673cae FG |
1 | //---------------------------------------------------------------------------// |
2 | // Copyright (c) 2013 Kyle Lutz <kyle.r.lutz@gmail.com> | |
3 | // | |
4 | // Distributed under the Boost Software License, Version 1.0 | |
5 | // See accompanying file LICENSE_1_0.txt or copy at | |
6 | // http://www.boost.org/LICENSE_1_0.txt | |
7 | // | |
8 | // See http://boostorg.github.com/compute for more information. | |
9 | //---------------------------------------------------------------------------// | |
10 | ||
11 | #ifndef BOOST_COMPUTE_ALGORITHM_REDUCE_HPP | |
12 | #define BOOST_COMPUTE_ALGORITHM_REDUCE_HPP | |
13 | ||
14 | #include <iterator> | |
15 | ||
16 | #include <boost/compute/system.hpp> | |
17 | #include <boost/compute/functional.hpp> | |
18 | #include <boost/compute/detail/meta_kernel.hpp> | |
19 | #include <boost/compute/command_queue.hpp> | |
20 | #include <boost/compute/container/array.hpp> | |
21 | #include <boost/compute/container/vector.hpp> | |
22 | #include <boost/compute/algorithm/copy_n.hpp> | |
23 | #include <boost/compute/algorithm/detail/inplace_reduce.hpp> | |
24 | #include <boost/compute/algorithm/detail/reduce_on_gpu.hpp> | |
25 | #include <boost/compute/algorithm/detail/reduce_on_cpu.hpp> | |
26 | #include <boost/compute/detail/iterator_range_size.hpp> | |
27 | #include <boost/compute/memory/local_buffer.hpp> | |
28 | #include <boost/compute/type_traits/result_of.hpp> | |
29 | ||
30 | namespace boost { | |
31 | namespace compute { | |
32 | namespace detail { | |
33 | ||
34 | template<class InputIterator, class OutputIterator, class BinaryFunction> | |
35 | size_t reduce(InputIterator first, | |
36 | size_t count, | |
37 | OutputIterator result, | |
38 | size_t block_size, | |
39 | BinaryFunction function, | |
40 | command_queue &queue) | |
41 | { | |
42 | typedef typename | |
43 | std::iterator_traits<InputIterator>::value_type | |
44 | input_type; | |
45 | typedef typename | |
46 | boost::compute::result_of<BinaryFunction(input_type, input_type)>::type | |
47 | result_type; | |
48 | ||
49 | const context &context = queue.get_context(); | |
50 | size_t block_count = count / 2 / block_size; | |
51 | size_t total_block_count = | |
52 | static_cast<size_t>(std::ceil(float(count) / 2.f / float(block_size))); | |
53 | ||
54 | if(block_count != 0){ | |
55 | meta_kernel k("block_reduce"); | |
56 | size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output"); | |
57 | size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block"); | |
58 | ||
59 | k << | |
60 | "const uint gid = get_global_id(0);\n" << | |
61 | "const uint lid = get_local_id(0);\n" << | |
62 | ||
63 | // copy values to local memory | |
64 | "block[lid] = " << | |
65 | function(first[k.make_var<uint_>("gid*2+0")], | |
66 | first[k.make_var<uint_>("gid*2+1")]) << ";\n" << | |
67 | ||
68 | // perform reduction | |
69 | "for(uint i = 1; i < " << uint_(block_size) << "; i <<= 1){\n" << | |
70 | " barrier(CLK_LOCAL_MEM_FENCE);\n" << | |
71 | " uint mask = (i << 1) - 1;\n" << | |
72 | " if((lid & mask) == 0){\n" << | |
73 | " block[lid] = " << | |
74 | function(k.expr<input_type>("block[lid]"), | |
75 | k.expr<input_type>("block[lid+i]")) << ";\n" << | |
76 | " }\n" << | |
77 | "}\n" << | |
78 | ||
79 | // write block result to global output | |
80 | "if(lid == 0)\n" << | |
81 | " output[get_group_id(0)] = block[0];\n"; | |
82 | ||
83 | kernel kernel = k.compile(context); | |
84 | kernel.set_arg(output_arg, result.get_buffer()); | |
85 | kernel.set_arg(block_arg, local_buffer<input_type>(block_size)); | |
86 | ||
87 | queue.enqueue_1d_range_kernel(kernel, | |
88 | 0, | |
89 | block_count * block_size, | |
90 | block_size); | |
91 | } | |
92 | ||
93 | // serially reduce any leftovers | |
94 | if(block_count * block_size * 2 < count){ | |
95 | size_t last_block_start = block_count * block_size * 2; | |
96 | ||
97 | meta_kernel k("extra_serial_reduce"); | |
98 | size_t count_arg = k.add_arg<uint_>("count"); | |
99 | size_t offset_arg = k.add_arg<uint_>("offset"); | |
100 | size_t output_arg = k.add_arg<result_type *>(memory_object::global_memory, "output"); | |
101 | size_t output_offset_arg = k.add_arg<uint_>("output_offset"); | |
102 | ||
103 | k << | |
104 | k.decl<result_type>("result") << " = \n" << | |
105 | first[k.expr<uint_>("offset")] << ";\n" << | |
106 | "for(uint i = offset + 1; i < count; i++)\n" << | |
107 | " result = " << | |
108 | function(k.var<result_type>("result"), | |
109 | first[k.var<uint_>("i")]) << ";\n" << | |
110 | "output[output_offset] = result;\n"; | |
111 | ||
112 | kernel kernel = k.compile(context); | |
113 | kernel.set_arg(count_arg, static_cast<uint_>(count)); | |
114 | kernel.set_arg(offset_arg, static_cast<uint_>(last_block_start)); | |
115 | kernel.set_arg(output_arg, result.get_buffer()); | |
116 | kernel.set_arg(output_offset_arg, static_cast<uint_>(block_count)); | |
117 | ||
118 | queue.enqueue_task(kernel); | |
119 | } | |
120 | ||
121 | return total_block_count; | |
122 | } | |
123 | ||
124 | template<class InputIterator, class BinaryFunction> | |
125 | inline vector< | |
126 | typename boost::compute::result_of< | |
127 | BinaryFunction( | |
128 | typename std::iterator_traits<InputIterator>::value_type, | |
129 | typename std::iterator_traits<InputIterator>::value_type | |
130 | ) | |
131 | >::type | |
132 | > | |
133 | block_reduce(InputIterator first, | |
134 | size_t count, | |
135 | size_t block_size, | |
136 | BinaryFunction function, | |
137 | command_queue &queue) | |
138 | { | |
139 | typedef typename | |
140 | std::iterator_traits<InputIterator>::value_type | |
141 | input_type; | |
142 | typedef typename | |
143 | boost::compute::result_of<BinaryFunction(input_type, input_type)>::type | |
144 | result_type; | |
145 | ||
146 | const context &context = queue.get_context(); | |
147 | size_t total_block_count = | |
148 | static_cast<size_t>(std::ceil(float(count) / 2.f / float(block_size))); | |
149 | vector<result_type> result_vector(total_block_count, context); | |
150 | ||
151 | reduce(first, count, result_vector.begin(), block_size, function, queue); | |
152 | ||
153 | return result_vector; | |
154 | } | |
155 | ||
156 | template<class InputIterator, class OutputIterator, class BinaryFunction> | |
157 | inline void generic_reduce(InputIterator first, | |
158 | InputIterator last, | |
159 | OutputIterator result, | |
160 | BinaryFunction function, | |
161 | command_queue &queue) | |
162 | { | |
163 | typedef typename | |
164 | std::iterator_traits<InputIterator>::value_type | |
165 | input_type; | |
166 | typedef typename | |
167 | boost::compute::result_of<BinaryFunction(input_type, input_type)>::type | |
168 | result_type; | |
169 | ||
170 | const device &device = queue.get_device(); | |
171 | const context &context = queue.get_context(); | |
172 | ||
173 | size_t count = detail::iterator_range_size(first, last); | |
174 | ||
175 | if(device.type() & device::cpu){ | |
176 | array<result_type, 1> value(context); | |
177 | detail::reduce_on_cpu(first, last, value.begin(), function, queue); | |
178 | boost::compute::copy_n(value.begin(), 1, result, queue); | |
179 | } | |
180 | else { | |
181 | size_t block_size = 256; | |
182 | ||
183 | // first pass | |
184 | vector<result_type> results = detail::block_reduce(first, | |
185 | count, | |
186 | block_size, | |
187 | function, | |
188 | queue); | |
189 | ||
190 | if(results.size() > 1){ | |
191 | detail::inplace_reduce(results.begin(), | |
192 | results.end(), | |
193 | function, | |
194 | queue); | |
195 | } | |
196 | ||
197 | boost::compute::copy_n(results.begin(), 1, result, queue); | |
198 | } | |
199 | } | |
200 | ||
201 | template<class InputIterator, class OutputIterator, class T> | |
202 | inline void dispatch_reduce(InputIterator first, | |
203 | InputIterator last, | |
204 | OutputIterator result, | |
205 | const plus<T> &function, | |
206 | command_queue &queue) | |
207 | { | |
208 | const context &context = queue.get_context(); | |
209 | const device &device = queue.get_device(); | |
210 | ||
211 | // reduce to temporary buffer on device | |
212 | array<T, 1> value(context); | |
213 | if(device.type() & device::cpu){ | |
214 | detail::reduce_on_cpu(first, last, value.begin(), function, queue); | |
215 | } | |
216 | else { | |
217 | reduce_on_gpu(first, last, value.begin(), function, queue); | |
218 | } | |
219 | ||
220 | // copy to result iterator | |
221 | copy_n(value.begin(), 1, result, queue); | |
222 | } | |
223 | ||
224 | template<class InputIterator, class OutputIterator, class BinaryFunction> | |
225 | inline void dispatch_reduce(InputIterator first, | |
226 | InputIterator last, | |
227 | OutputIterator result, | |
228 | BinaryFunction function, | |
229 | command_queue &queue) | |
230 | { | |
231 | generic_reduce(first, last, result, function, queue); | |
232 | } | |
233 | ||
234 | } // end detail namespace | |
235 | ||
236 | /// Returns the result of applying \p function to the elements in the | |
237 | /// range [\p first, \p last). | |
238 | /// | |
239 | /// If no function is specified, \c plus will be used. | |
240 | /// | |
241 | /// \param first first element in the input range | |
242 | /// \param last last element in the input range | |
243 | /// \param result iterator pointing to the output | |
244 | /// \param function binary reduction function | |
245 | /// \param queue command queue to perform the operation | |
246 | /// | |
247 | /// The \c reduce() algorithm assumes that the binary reduction function is | |
248 | /// associative. When used with non-associative functions the result may | |
249 | /// be non-deterministic and vary in precision. Notably this affects the | |
250 | /// \c plus<float>() function as floating-point addition is not associative | |
251 | /// and may produce slightly different results than a serial algorithm. | |
252 | /// | |
253 | /// This algorithm supports both host and device iterators for the | |
254 | /// result argument. This allows for values to be reduced and copied | |
255 | /// to the host all with a single function call. | |
256 | /// | |
257 | /// For example, to calculate the sum of the values in a device vector and | |
258 | /// copy the result to a value on the host: | |
259 | /// | |
260 | /// \snippet test/test_reduce.cpp sum_int | |
261 | /// | |
262 | /// Note that while the the \c reduce() algorithm is conceptually identical to | |
263 | /// the \c accumulate() algorithm, its implementation is substantially more | |
264 | /// efficient on parallel hardware. For more information, see the documentation | |
265 | /// on the \c accumulate() algorithm. | |
266 | /// | |
267 | /// \see accumulate() | |
268 | template<class InputIterator, class OutputIterator, class BinaryFunction> | |
269 | inline void reduce(InputIterator first, | |
270 | InputIterator last, | |
271 | OutputIterator result, | |
272 | BinaryFunction function, | |
273 | command_queue &queue = system::default_queue()) | |
274 | { | |
275 | if(first == last){ | |
276 | return; | |
277 | } | |
278 | ||
279 | detail::dispatch_reduce(first, last, result, function, queue); | |
280 | } | |
281 | ||
282 | /// \overload | |
283 | template<class InputIterator, class OutputIterator> | |
284 | inline void reduce(InputIterator first, | |
285 | InputIterator last, | |
286 | OutputIterator result, | |
287 | command_queue &queue = system::default_queue()) | |
288 | { | |
289 | typedef typename std::iterator_traits<InputIterator>::value_type T; | |
290 | ||
291 | if(first == last){ | |
292 | return; | |
293 | } | |
294 | ||
295 | detail::dispatch_reduce(first, last, result, plus<T>(), queue); | |
296 | } | |
297 | ||
298 | } // end compute namespace | |
299 | } // end boost namespace | |
300 | ||
301 | #endif // BOOST_COMPUTE_ALGORITHM_REDUCE_HPP |