]>
Commit | Line | Data |
---|---|---|
1 | //---------------------------------------------------------------------------// | |
2 | // Copyright (c) 2015 Jakub Szuppe <j.szuppe@gmail.com> | |
3 | // | |
4 | // Distributed under the Boost Software License, Version 1.0 | |
5 | // See accompanying file LICENSE_1_0.txt or copy at | |
6 | // http://www.boost.org/LICENSE_1_0.txt | |
7 | // | |
8 | // See http://boostorg.github.com/compute for more information. | |
9 | //---------------------------------------------------------------------------// | |
10 | ||
11 | #ifndef BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP | |
12 | #define BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP | |
13 | ||
14 | #include <algorithm> | |
15 | ||
16 | #include <boost/compute/types.hpp> | |
17 | #include <boost/compute/command_queue.hpp> | |
18 | #include <boost/compute/algorithm/copy.hpp> | |
19 | #include <boost/compute/allocator/pinned_allocator.hpp> | |
20 | #include <boost/compute/container/vector.hpp> | |
21 | #include <boost/compute/detail/meta_kernel.hpp> | |
22 | #include <boost/compute/detail/iterator_range_size.hpp> | |
23 | #include <boost/compute/detail/parameter_cache.hpp> | |
24 | #include <boost/compute/memory/local_buffer.hpp> | |
25 | #include <boost/compute/type_traits/type_name.hpp> | |
26 | #include <boost/compute/utility/program_cache.hpp> | |
27 | ||
28 | namespace boost { | |
29 | namespace compute { | |
30 | namespace detail { | |
31 | ||
32 | template<class InputIterator> | |
33 | bool find_extrema_with_reduce_requirements_met(InputIterator first, | |
34 | InputIterator last, | |
35 | command_queue &queue) | |
36 | { | |
37 | typedef typename std::iterator_traits<InputIterator>::value_type input_type; | |
38 | ||
39 | const device &device = queue.get_device(); | |
40 | ||
41 | // device must have dedicated local memory storage | |
42 | // otherwise reduction would be highly inefficient | |
43 | if(device.get_info<CL_DEVICE_LOCAL_MEM_TYPE>() != CL_LOCAL) | |
44 | { | |
45 | return false; | |
46 | } | |
47 | ||
48 | const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>(); | |
49 | // local memory size in bytes (per compute unit) | |
50 | const size_t local_mem_size = device.get_info<CL_DEVICE_LOCAL_MEM_SIZE>(); | |
51 | ||
52 | std::string cache_key = std::string("__boost_find_extrema_reduce_") | |
53 | + type_name<input_type>(); | |
54 | // load parameters | |
55 | boost::shared_ptr<parameter_cache> parameters = | |
56 | detail::parameter_cache::get_global_cache(device); | |
57 | ||
58 | // Get preferred work group size | |
59 | size_t work_group_size = parameters->get(cache_key, "wgsize", 256); | |
60 | ||
61 | work_group_size = (std::min)(max_work_group_size, work_group_size); | |
62 | ||
63 | // local memory size needed to perform parallel reduction | |
64 | size_t required_local_mem_size = 0; | |
65 | // indices size | |
66 | required_local_mem_size += sizeof(uint_) * work_group_size; | |
67 | // values size | |
68 | required_local_mem_size += sizeof(input_type) * work_group_size; | |
69 | ||
70 | // at least 4 work groups per compute unit otherwise reduction | |
71 | // would be highly inefficient | |
72 | return ((required_local_mem_size * 4) <= local_mem_size); | |
73 | } | |
74 | ||
75 | /// \internal_ | |
76 | /// Algorithm finds the first extremum in given range, i.e., with the lowest | |
77 | /// index. | |
78 | /// | |
79 | /// If \p use_input_idx is false, it's assumed that input data is ordered by | |
80 | /// increasing index and \p input_idx is not used in the algorithm. | |
81 | template<class InputIterator, class ResultIterator, class Compare> | |
82 | inline void find_extrema_with_reduce(InputIterator input, | |
83 | vector<uint_>::iterator input_idx, | |
84 | size_t count, | |
85 | ResultIterator result, | |
86 | vector<uint_>::iterator result_idx, | |
87 | size_t work_groups_no, | |
88 | size_t work_group_size, | |
89 | Compare compare, | |
90 | const bool find_minimum, | |
91 | const bool use_input_idx, | |
92 | command_queue &queue) | |
93 | { | |
94 | typedef typename std::iterator_traits<InputIterator>::value_type input_type; | |
95 | ||
96 | const context &context = queue.get_context(); | |
97 | ||
98 | meta_kernel k("find_extrema_reduce"); | |
99 | size_t count_arg = k.add_arg<uint_>("count"); | |
100 | size_t block_arg = k.add_arg<input_type *>(memory_object::local_memory, "block"); | |
101 | size_t block_idx_arg = k.add_arg<uint_ *>(memory_object::local_memory, "block_idx"); | |
102 | ||
103 | k << | |
104 | // Work item global id | |
105 | k.decl<const uint_>("gid") << " = get_global_id(0);\n" << | |
106 | ||
107 | // Index of element that will be read from input buffer | |
108 | k.decl<uint_>("idx") << " = gid;\n" << | |
109 | ||
110 | k.decl<input_type>("acc") << ";\n" << | |
111 | k.decl<uint_>("acc_idx") << ";\n" << | |
112 | "if(gid < count) {\n" << | |
113 | // Real index of currently best element | |
114 | "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" << | |
115 | k.var<uint_>("acc_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" << | |
116 | "#else\n" << | |
117 | k.var<uint_>("acc_idx") << " = idx;\n" << | |
118 | "#endif\n" << | |
119 | ||
120 | // Init accumulator with first[get_global_id(0)] | |
121 | "acc = " << input[k.var<uint_>("idx")] << ";\n" << | |
122 | "idx += get_global_size(0);\n" << | |
123 | "}\n" << | |
124 | ||
125 | k.decl<bool>("compare_result") << ";\n" << | |
126 | k.decl<bool>("equal") << ";\n\n" << | |
127 | "while( idx < count ){\n" << | |
128 | // Next element | |
129 | k.decl<input_type>("next") << " = " << input[k.var<uint_>("idx")] << ";\n" << | |
130 | "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" << | |
131 | k.decl<uint_>("next_idx") << " = " << input_idx[k.var<uint_>("idx")] << ";\n" << | |
132 | "#endif\n" << | |
133 | ||
134 | // Comparison between currently best element (acc) and next element | |
135 | "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" << | |
136 | "compare_result = " << compare(k.var<input_type>("next"), | |
137 | k.var<input_type>("acc")) << ";\n" << | |
138 | "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" << | |
139 | "equal = !compare_result && !" << | |
140 | compare(k.var<input_type>("acc"), | |
141 | k.var<input_type>("next")) << ";\n" << | |
142 | "# endif\n" << | |
143 | "#else\n" << | |
144 | "compare_result = " << compare(k.var<input_type>("acc"), | |
145 | k.var<input_type>("next")) << ";\n" << | |
146 | "# ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" << | |
147 | "equal = !compare_result && !" << | |
148 | compare(k.var<input_type>("next"), | |
149 | k.var<input_type>("acc")) << ";\n" << | |
150 | "# endif\n" << | |
151 | "#endif\n" << | |
152 | ||
153 | // save the winner | |
154 | "acc = compare_result ? acc : next;\n" << | |
155 | "#ifdef BOOST_COMPUTE_USE_INPUT_IDX\n" << | |
156 | "acc_idx = compare_result ? " << | |
157 | "acc_idx : " << | |
158 | "(equal ? min(acc_idx, next_idx) : next_idx);\n" << | |
159 | "#else\n" << | |
160 | "acc_idx = compare_result ? acc_idx : idx;\n" << | |
161 | "#endif\n" << | |
162 | "idx += get_global_size(0);\n" << | |
163 | "}\n\n" << | |
164 | ||
165 | // Work item local id | |
166 | k.decl<const uint_>("lid") << " = get_local_id(0);\n" << | |
167 | "block[lid] = acc;\n" << | |
168 | "block_idx[lid] = acc_idx;\n" << | |
169 | "barrier(CLK_LOCAL_MEM_FENCE);\n" << | |
170 | ||
171 | k.decl<uint_>("group_offset") << | |
172 | " = count - (get_local_size(0) * get_group_id(0));\n\n"; | |
173 | ||
174 | k << | |
175 | "#pragma unroll\n" | |
176 | "for(" << k.decl<uint_>("offset") << " = " << uint_(work_group_size) << " / 2; offset > 0; " << | |
177 | "offset = offset / 2) {\n" << | |
178 | "if((lid < offset) && ((lid + offset) < group_offset)) { \n" << | |
179 | k.decl<input_type>("mine") << " = block[lid];\n" << | |
180 | k.decl<input_type>("other") << " = block[lid+offset];\n" << | |
181 | "#ifdef BOOST_COMPUTE_FIND_MAXIMUM\n" << | |
182 | "compare_result = " << compare(k.var<input_type>("other"), | |
183 | k.var<input_type>("mine")) << ";\n" << | |
184 | "equal = !compare_result && !" << | |
185 | compare(k.var<input_type>("mine"), | |
186 | k.var<input_type>("other")) << ";\n" << | |
187 | "#else\n" << | |
188 | "compare_result = " << compare(k.var<input_type>("mine"), | |
189 | k.var<input_type>("other")) << ";\n" << | |
190 | "equal = !compare_result && !" << | |
191 | compare(k.var<input_type>("other"), | |
192 | k.var<input_type>("mine")) << ";\n" << | |
193 | "#endif\n" << | |
194 | "block[lid] = compare_result ? mine : other;\n" << | |
195 | k.decl<uint_>("mine_idx") << " = block_idx[lid];\n" << | |
196 | k.decl<uint_>("other_idx") << " = block_idx[lid+offset];\n" << | |
197 | "block_idx[lid] = compare_result ? " << | |
198 | "mine_idx : " << | |
199 | "(equal ? min(mine_idx, other_idx) : other_idx);\n" << | |
200 | "}\n" | |
201 | "barrier(CLK_LOCAL_MEM_FENCE);\n" << | |
202 | "}\n\n" << | |
203 | ||
204 | // write block result to global output | |
205 | "if(lid == 0){\n" << | |
206 | result[k.var<uint_>("get_group_id(0)")] << " = block[0];\n" << | |
207 | result_idx[k.var<uint_>("get_group_id(0)")] << " = block_idx[0];\n" << | |
208 | "}"; | |
209 | ||
210 | std::string options; | |
211 | if(!find_minimum){ | |
212 | options = "-DBOOST_COMPUTE_FIND_MAXIMUM"; | |
213 | } | |
214 | if(use_input_idx){ | |
215 | options += " -DBOOST_COMPUTE_USE_INPUT_IDX"; | |
216 | } | |
217 | ||
218 | kernel kernel = k.compile(context, options); | |
219 | ||
220 | kernel.set_arg(count_arg, static_cast<uint_>(count)); | |
221 | kernel.set_arg(block_arg, local_buffer<input_type>(work_group_size)); | |
222 | kernel.set_arg(block_idx_arg, local_buffer<uint_>(work_group_size)); | |
223 | ||
224 | queue.enqueue_1d_range_kernel(kernel, | |
225 | 0, | |
226 | work_groups_no * work_group_size, | |
227 | work_group_size); | |
228 | } | |
229 | ||
230 | template<class InputIterator, class ResultIterator, class Compare> | |
231 | inline void find_extrema_with_reduce(InputIterator input, | |
232 | size_t count, | |
233 | ResultIterator result, | |
234 | vector<uint_>::iterator result_idx, | |
235 | size_t work_groups_no, | |
236 | size_t work_group_size, | |
237 | Compare compare, | |
238 | const bool find_minimum, | |
239 | command_queue &queue) | |
240 | { | |
241 | // dummy will not be used | |
242 | buffer_iterator<uint_> dummy = result_idx; | |
243 | return find_extrema_with_reduce( | |
244 | input, dummy, count, result, result_idx, work_groups_no, | |
245 | work_group_size, compare, find_minimum, false, queue | |
246 | ); | |
247 | } | |
248 | ||
249 | template<class InputIterator, class Compare> | |
250 | InputIterator find_extrema_with_reduce(InputIterator first, | |
251 | InputIterator last, | |
252 | Compare compare, | |
253 | const bool find_minimum, | |
254 | command_queue &queue) | |
255 | { | |
256 | typedef typename std::iterator_traits<InputIterator>::difference_type difference_type; | |
257 | typedef typename std::iterator_traits<InputIterator>::value_type input_type; | |
258 | ||
259 | const context &context = queue.get_context(); | |
260 | const device &device = queue.get_device(); | |
261 | ||
262 | // Getting information about used queue and device | |
263 | const size_t compute_units_no = device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>(); | |
264 | const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>(); | |
265 | ||
266 | const size_t count = detail::iterator_range_size(first, last); | |
267 | ||
268 | std::string cache_key = std::string("__boost_find_extrema_with_reduce_") | |
269 | + type_name<input_type>(); | |
270 | ||
271 | // load parameters | |
272 | boost::shared_ptr<parameter_cache> parameters = | |
273 | detail::parameter_cache::get_global_cache(device); | |
274 | ||
275 | // get preferred work group size and preferred number | |
276 | // of work groups per compute unit | |
277 | size_t work_group_size = parameters->get(cache_key, "wgsize", 256); | |
278 | size_t work_groups_per_cu = parameters->get(cache_key, "wgpcu", 100); | |
279 | ||
280 | // calculate work group size and number of work groups | |
281 | work_group_size = (std::min)(max_work_group_size, work_group_size); | |
282 | size_t work_groups_no = compute_units_no * work_groups_per_cu; | |
283 | work_groups_no = (std::min)( | |
284 | work_groups_no, | |
285 | static_cast<size_t>(std::ceil(float(count) / work_group_size)) | |
286 | ); | |
287 | ||
288 | // phase I: finding candidates for extremum | |
289 | ||
290 | // device buffors for extremum candidates and their indices | |
291 | // each work-group computes its candidate | |
292 | vector<input_type> candidates(work_groups_no, context); | |
293 | vector<uint_> candidates_idx(work_groups_no, context); | |
294 | ||
295 | // finding candidates for first extremum and their indices | |
296 | find_extrema_with_reduce( | |
297 | first, count, candidates.begin(), candidates_idx.begin(), | |
298 | work_groups_no, work_group_size, compare, find_minimum, queue | |
299 | ); | |
300 | ||
301 | // phase II: finding extremum from among the candidates | |
302 | ||
303 | // zero-copy buffers for final result (value and index) | |
304 | vector<input_type, ::boost::compute::pinned_allocator<input_type> > | |
305 | result(1, context); | |
306 | vector<uint_, ::boost::compute::pinned_allocator<uint_> > | |
307 | result_idx(1, context); | |
308 | ||
309 | // get extremum from among the candidates | |
310 | find_extrema_with_reduce( | |
311 | candidates.begin(), candidates_idx.begin(), work_groups_no, result.begin(), | |
312 | result_idx.begin(), 1, work_group_size, compare, find_minimum, true, queue | |
313 | ); | |
314 | ||
315 | // mapping extremum index to host | |
316 | uint_* result_idx_host_ptr = | |
317 | static_cast<uint_*>( | |
318 | queue.enqueue_map_buffer( | |
319 | result_idx.get_buffer(), command_queue::map_read, | |
320 | 0, sizeof(uint_) | |
321 | ) | |
322 | ); | |
323 | ||
324 | return first + static_cast<difference_type>(*result_idx_host_ptr); | |
325 | } | |
326 | ||
327 | template<class InputIterator> | |
328 | InputIterator find_extrema_with_reduce(InputIterator first, | |
329 | InputIterator last, | |
330 | ::boost::compute::less< | |
331 | typename std::iterator_traits< | |
332 | InputIterator | |
333 | >::value_type | |
334 | > | |
335 | compare, | |
336 | const bool find_minimum, | |
337 | command_queue &queue) | |
338 | { | |
339 | typedef typename std::iterator_traits<InputIterator>::difference_type difference_type; | |
340 | typedef typename std::iterator_traits<InputIterator>::value_type input_type; | |
341 | ||
342 | const context &context = queue.get_context(); | |
343 | const device &device = queue.get_device(); | |
344 | ||
345 | // Getting information about used queue and device | |
346 | const size_t compute_units_no = device.get_info<CL_DEVICE_MAX_COMPUTE_UNITS>(); | |
347 | const size_t max_work_group_size = device.get_info<CL_DEVICE_MAX_WORK_GROUP_SIZE>(); | |
348 | ||
349 | const size_t count = detail::iterator_range_size(first, last); | |
350 | ||
351 | std::string cache_key = std::string("__boost_find_extrema_with_reduce_") | |
352 | + type_name<input_type>(); | |
353 | ||
354 | // load parameters | |
355 | boost::shared_ptr<parameter_cache> parameters = | |
356 | detail::parameter_cache::get_global_cache(device); | |
357 | ||
358 | // get preferred work group size and preferred number | |
359 | // of work groups per compute unit | |
360 | size_t work_group_size = parameters->get(cache_key, "wgsize", 256); | |
361 | size_t work_groups_per_cu = parameters->get(cache_key, "wgpcu", 64); | |
362 | ||
363 | // calculate work group size and number of work groups | |
364 | work_group_size = (std::min)(max_work_group_size, work_group_size); | |
365 | size_t work_groups_no = compute_units_no * work_groups_per_cu; | |
366 | work_groups_no = (std::min)( | |
367 | work_groups_no, | |
368 | static_cast<size_t>(std::ceil(float(count) / work_group_size)) | |
369 | ); | |
370 | ||
371 | // phase I: finding candidates for extremum | |
372 | ||
373 | // device buffors for extremum candidates and their indices | |
374 | // each work-group computes its candidate | |
375 | // zero-copy buffers are used to eliminate copying data back to host | |
376 | vector<input_type, ::boost::compute::pinned_allocator<input_type> > | |
377 | candidates(work_groups_no, context); | |
378 | vector<uint_, ::boost::compute::pinned_allocator <uint_> > | |
379 | candidates_idx(work_groups_no, context); | |
380 | ||
381 | // finding candidates for first extremum and their indices | |
382 | find_extrema_with_reduce( | |
383 | first, count, candidates.begin(), candidates_idx.begin(), | |
384 | work_groups_no, work_group_size, compare, find_minimum, queue | |
385 | ); | |
386 | ||
387 | // phase II: finding extremum from among the candidates | |
388 | ||
389 | // mapping candidates and their indices to host | |
390 | input_type* candidates_host_ptr = | |
391 | static_cast<input_type*>( | |
392 | queue.enqueue_map_buffer( | |
393 | candidates.get_buffer(), command_queue::map_read, | |
394 | 0, work_groups_no * sizeof(input_type) | |
395 | ) | |
396 | ); | |
397 | ||
398 | uint_* candidates_idx_host_ptr = | |
399 | static_cast<uint_*>( | |
400 | queue.enqueue_map_buffer( | |
401 | candidates_idx.get_buffer(), command_queue::map_read, | |
402 | 0, work_groups_no * sizeof(uint_) | |
403 | ) | |
404 | ); | |
405 | ||
406 | input_type* i = candidates_host_ptr; | |
407 | uint_* idx = candidates_idx_host_ptr; | |
408 | uint_* extremum_idx = idx; | |
409 | input_type extremum = *candidates_host_ptr; | |
410 | i++; idx++; | |
411 | ||
412 | // find extremum (serial) from among the candidates on host | |
413 | if(!find_minimum) { | |
414 | while(idx != (candidates_idx_host_ptr + work_groups_no)) { | |
415 | input_type next = *i; | |
416 | bool compare_result = next > extremum; | |
417 | bool equal = next == extremum; | |
418 | extremum = compare_result ? next : extremum; | |
419 | extremum_idx = compare_result ? idx : extremum_idx; | |
420 | extremum_idx = equal ? ((*extremum_idx < *idx) ? extremum_idx : idx) : extremum_idx; | |
421 | idx++, i++; | |
422 | } | |
423 | } | |
424 | else { | |
425 | while(idx != (candidates_idx_host_ptr + work_groups_no)) { | |
426 | input_type next = *i; | |
427 | bool compare_result = next < extremum; | |
428 | bool equal = next == extremum; | |
429 | extremum = compare_result ? next : extremum; | |
430 | extremum_idx = compare_result ? idx : extremum_idx; | |
431 | extremum_idx = equal ? ((*extremum_idx < *idx) ? extremum_idx : idx) : extremum_idx; | |
432 | idx++, i++; | |
433 | } | |
434 | } | |
435 | ||
436 | return first + static_cast<difference_type>(*extremum_idx); | |
437 | } | |
438 | ||
439 | } // end detail namespace | |
440 | } // end compute namespace | |
441 | } // end boost namespace | |
442 | ||
443 | #endif // BOOST_COMPUTE_ALGORITHM_DETAIL_FIND_EXTREMA_WITH_REDUCE_HPP |